public
Description: Examples of using NumPy and SciPy to analyze data using MaskedArray and timeseries
Homepage: http://www.techguyinmidtown.com/
Clone URL: git://github.com/nodogbite/maskedarray_timeseries.git
maskedarray_timeseries / src / demo_timeseries_part1.py
100644 277 lines (239 sloc) 12.418 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""
File: demo_timeseries_part1.py
Author: Greg Harfst (greg at techguy in midtown.com)
Notes:
 
A demonstration of numpy arrays, masked arrays, and scikits.timeseries.
"""
 
import numpy as NP
import numpy.ma as MA
import time
import scikits.timeseries as TS
import random
import itertools as IT
from operator import itemgetter
import optparse
 
def processOptions():
    """Parse the command line with 'optparse' and return the options object.

    The options are declared in a table (flag, add_option keywords) and
    registered in table order, so '--help' lists them in that order.
    Positional arguments left over after parsing are discarded.
    """
    optionSpecs = [
        ("--numTickers",
         dict(type='int', default=4000,  # author's alternate value: 1400
              help="Number of fake tickers to generate")),
        ("--numDates",
         dict(type='int', default=300,  # author's alternate value: 2300
              help="Number of dates to generate")),
        ("--loadMA",
         dict(default=False, action="store_true",
              help="Test loading data into MaskedArrays")),
        ("--loadTS",
         dict(default=False, action="store_true",
              help="Test loading data into timeseries objects")),
        # note: this one is stored under 'dim', not 'timeMultDim'
        ("--timeMultDim",
         dict(type='int', default=1000, dest='dim',
              help="Number of rows and columns for our timed element-wise multiplication.")),
        ("--timeListMult",
         dict(default=False, action="store_true",
              help="Test element-wise multiplication of two 2d lists")),
        ("--timeNPMult",
         dict(default=False, action="store_true",
              help="Test element-wise multiplication of two 2d numpy ndarrays")),
        ("--timeMAMult",
         dict(default=False, action="store_true",
              help="Test element-wise multiplication of two 2d MaskedArrays")),
    ]

    parser = optparse.OptionParser()
    for flag, settings in optionSpecs:
        parser.add_option(flag, **settings)

    parsed = parser.parse_args()
    return parsed[0]
 
def print_timing(func):
    """Decorator that prints how long each call to 'func' takes.

    After every call that returns normally, one line of the form
    '<function name> took <elapsed> ms' is printed, with the elapsed
    wall-clock time (measured with time.time()) in milliseconds.  The
    wrapped function's return value is passed through unchanged.  If
    'func' raises, the exception propagates and nothing is printed.

    Positional and keyword arguments are both forwarded, and the wrapper
    keeps the name and docstring of 'func'.
    """
    # Imported here because functools is not among the module-level imports.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        res = func(*args, **kwargs)
        t2 = time.time()
        # A single parenthesised argument prints identically whether 'print'
        # is a statement or a function; '__name__' likewise works on both.
        print('%s took %0.3f ms' % (func.__name__, (t2 - t1) * 1000.0))
        return res
    return wrapper
 
def makeTwoDimMAArray(numRows, numCols):
    """Return the makeTwoDimList(numRows, numCols) grid as a MaskedArray.

    The nested list is first converted to a numpy array and then wrapped
    in a MaskedArray constructed with mask=False.
    """
    plainArray = NP.array(makeTwoDimList(numRows, numCols))
    return MA.MaskedArray(plainArray, mask=False)
 
def makeTwoDimNumpyArray(numRows, numCols):
    """Return the makeTwoDimList(numRows, numCols) grid as a numpy array."""
    nestedList = makeTwoDimList(numRows, numCols)
    return NP.array(nestedList)
 
def makeTwoDimList(numRows, numCols):
    """Return a list of 'numRows' lists, each holding 'numCols' numbers.

    The entry at row i, column j is the product i*j.
    """
    grid = []
    for rowIndex in range(numRows):
        grid.append([rowIndex * colIndex for colIndex in range(numCols)])
    return grid
 
@print_timing
def elementwiseMultiply(l1,l2):
    """Multiply two 2d lists element by element and return a new 2d list.

    The result has as many rows as 'l1' and as many columns as the first
    row of 'l1'; 'l2' must be at least that large in both dimensions.
    An empty 'l1' yields an empty list.
    """
    numRows = len(l1)
    # With no rows there is no l1[0] to measure, so guard the lookup.
    numCols = len(l1[0]) if numRows else 0
    return [[l1[i][j] * l2[i][j] for j in range(numCols)] for i in range(numRows)]
 
@print_timing
def elementwiseNPMultiply(a1,a2):
    """Return a1 * a2; used to time the product of two numpy arrays."""
    product = a1 * a2
    return product
 
@print_timing
def elementwiseMAMultiply(a1,a2):
    """Return a1 * a2; used to time the product of two MaskedArrays."""
    product = a1 * a2
    return product
 
def generateFakeStockData(numTickers=4000, numDates=300):
    """Generate random fake stock rows, each a list of four strings.

    This is a generator.  Each yielded row contains:
        [date string, ticker, daily return, daily trading volume]
    The start of the stream is ragged -- some of the tickers might not
    appear initially.  Also, there are periodic holidays, where there are
    no data.  Furthermore, there are randomly missing daily returns or
    daily volumes, which are represented by empty strings.  This
    approximates what we might get from a well-behaved, real world CSV file.

    numTickers -- size of the universe of three-letter combinations
                  (AAA, AAB, ...) that is scanned.  Every 7th combination
                  is kept as a ticker, so roughly numTickers / 7 tickers
                  are produced.
    numDates   -- number of date offsets to walk, starting at
                  TS.Date('B', '1/1/2000').  Every 60th offset (including
                  offset 0) is treated as a holiday and produces no rows.
    """
    # generate a list of tickers
    asciiRange = range(65,91)  # character codes for 'A' through 'Z'
    tickers = []
    for i, (a,b,c) in enumerate(((a,b,c) for a in asciiRange for b in asciiRange for c in asciiRange)):
        if i % 7 == 0:
            tickers.append(chr(a)+chr(b)+chr(c))
        if i >= numTickers:
            break

    # calculate some random ticker-specific characteristics
    tickerVolumeMeans = {}
    tickerStartOffsets = {}
    for ticker in tickers:
        tickerVolumeMeans[ticker] = int(1e3 + 1e5 * random.lognormvariate(1.0,1.5))
        tickerStartOffsets[ticker] = int(random.lognormvariate(0,2))

    # an iterator for the sub-lists. This nested loop contains a 'yield' statement.
    for dateIndex in range(numDates):
        if dateIndex % 60 == 0:
            continue # holiday -- skip before building a date we would not use

        date = TS.Date('B', '1/1/2000') + dateIndex
        dateStr = date.strftime('%m/%d/%Y')

        for ticker in tickers:
            if dateIndex < tickerStartOffsets[ticker]:
                continue # the ticker doesn't exist yet

            # daily return might be missing for no particular reason
            if int(random.uniform(0,5000)) < 1:
                dailyReturn = None
            else:
                dailyReturn = 1 - random.lognormvariate(0,0.03)

            # daily volume might be missing for no particular reason
            if int(random.uniform(0,4000)) < 1:
                dailyVolume = None
            else:
                volMean = tickerVolumeMeans[ticker]
                dailyVolume = int(random.normalvariate(volMean, volMean / 10))

            dailyReturnStr = '%0.4f' % dailyReturn if dailyReturn is not None else ''
            dailyVolumeStr = '%d' % dailyVolume if dailyVolume is not None else ''
            yield [dateStr, ticker, dailyReturnStr, dailyVolumeStr]
 
@print_timing
def loadDataIntoMaskedArrays(iter):
    """Convert rows shaped like those of 'generateFakeStockData()' into MaskedArray objects.

    'iter' is an iterable of [dateStr, ticker, dailyReturnStr, dailyVolumeStr]
    rows in which all rows for one date are adjacent.  (The parameter name
    shadows the builtin; it is kept so existing callers are unaffected.)

    Return a tuple, (dateStrs, tickers, dailyReturnsMA, dailyVolumesMA), where
      dateStrs is a list of the date strings, one per row of the 2d MaskedArrays
      tickers is the sorted list of tickers, one per column of the 2d MaskedArrays
      dailyReturnsMA is a MaskedArray containing the returns for all tickers on all of the dates.
      dailyVolumesMA is a MaskedArray containing the volumes for all tickers on all of the dates.

    A cell is masked (with 0 as its data value) when the field does not
    parse as a number, or when the ticker has no row for that date --
    whether the date is before the ticker first appears or after.
    """
    def parseField(text, convert):
        """Return (value, masked); fields that fail to convert become (0, True)."""
        try:
            return convert(text), False
        except ValueError:
            return 0, True

    dateStrs = []
    dailyReturns = {} # map ticker -> return (float) and mask (boolean)
    dailyVolumes = {} # map ticker -> volume (int) and mask (boolean)
    for dateCount, (dateStr, rowsForDate) in enumerate(IT.groupby(iter, itemgetter(0))):
        dateStrs.append(dateStr)
        for [_, ticker, dailyReturnStr, dailyVolumeStr] in rowsForDate:
            # handle ragged starts -- back-fill masked entries for the dates
            # that passed before this ticker first appeared
            if ticker not in dailyReturns:
                for d in [dailyReturns, dailyVolumes]:
                    d[ticker] = {'data': [0] * dateCount,
                                 'mask': [True] * dateCount}

            dailyReturnV, dailyReturnM = parseField(dailyReturnStr, float)
            dailyVolumeV, dailyVolumeM = parseField(dailyVolumeStr, int)

            dailyReturns[ticker]['data'].append(dailyReturnV)
            dailyReturns[ticker]['mask'].append(dailyReturnM)
            dailyVolumes[ticker]['data'].append(dailyVolumeV)
            dailyVolumes[ticker]['mask'].append(dailyVolumeM)

        # handle gaps -- a known ticker with no row for this date gets a
        # masked entry, so every column stays aligned with dateStrs
        numDatesSoFar = dateCount + 1
        for d in [dailyReturns, dailyVolumes]:
            for column in d.values():
                if len(column['data']) < numDatesSoFar:
                    column['data'].append(0)
                    column['mask'].append(True)

    tickers = sorted(dailyReturns.keys())
    numRows = len(dateStrs)
    dailyReturnsMA = MA.MaskedArray(data=[[dailyReturns[ticker]['data'][i] for ticker in tickers] for i in range(numRows)],
                                    mask=[[dailyReturns[ticker]['mask'][i] for ticker in tickers] for i in range(numRows)])
    dailyVolumesMA = MA.MaskedArray(data=[[dailyVolumes[ticker]['data'][i] for ticker in tickers] for i in range(numRows)],
                                    mask=[[dailyVolumes[ticker]['mask'][i] for ticker in tickers] for i in range(numRows)])

    return dateStrs, tickers, dailyReturnsMA, dailyVolumesMA
 
@print_timing
def loadDataIntoTimeseries(iter):
    """Convert rows shaped like those of 'generateFakeStockData()' into scikits.timeseries objects.

    'iter' is an iterable of [dateStr, ticker, dailyReturnStr, dailyVolumeStr]
    rows; consecutive rows sharing a date string are handled as one group.

    Return a tuple, (tickers, dailyReturnsTSGrid, dailyVolumesTSGrid), where
      tickers is the list of tickers that correspond to the columns of the grids
      dailyReturnsTSGrid is a timeseries containing the returns for all tickers on all of the dates.
      dailyVolumesTSGrid is a timeseries containing the volumes for all tickers on all of the dates.

    A field that does not parse as a number is stored as 0 and masked.
    """
    dailyReturns = {} # map ticker -> dates, returns (float) and masks (boolean)
    dailyVolumes = {} # map ticker -> dates, volumes (int) and masks (boolean)
    dateOfRow = itemgetter(0)

    for dateStr, rowsForDate in IT.groupby(iter, dateOfRow):
        date = TS.Date('B', dateStr)
        for row in rowsForDate:
            _, ticker, dailyReturnStr, dailyVolumeStr = row

            # first sighting of this ticker: start empty columns for it
            if ticker not in dailyReturns:
                dailyReturns[ticker] = {'dates': [], 'data': [], 'mask': []}
                dailyVolumes[ticker] = {'dates': [], 'data': [], 'mask': []}

            # parse both fields before storing either one
            try:
                returnValue, returnMasked = float(dailyReturnStr), False
            except ValueError:
                returnValue, returnMasked = 0, True
            try:
                volumeValue, volumeMasked = int(dailyVolumeStr), False
            except ValueError:
                volumeValue, volumeMasked = 0, True

            parsedFields = ((dailyReturns, returnValue, returnMasked),
                            (dailyVolumes, volumeValue, volumeMasked))
            for store, value, masked in parsedFields:
                column = store[ticker]
                column['dates'].append(date)
                column['data'].append(value)
                column['mask'].append(masked)

    tickersR, dailyReturnsTSGrid = makeTimeseriesGrid(dailyReturns)
    tickersV, dailyVolumesTSGrid = makeTimeseriesGrid(dailyVolumes)
    assert tickersR == tickersV

    return tickersR, dailyReturnsTSGrid, dailyVolumesTSGrid
 
def makeTimeseriesGrid(dailyDict):
    """Convert a dictionary of returns or volume data into a 2d scikits.timeseries series.

    'dailyDict' maps each ticker to a dict with 'dates', 'data' and 'mask'
    lists.  Return (tickers, dailyTSGrid), where 'tickers' is the sorted
    list of keys and 'dailyTSGrid' has one column per ticker, in that order.
    """
    tickers = sorted(dailyDict.keys())

    # one timeseries per ticker, with missing dates filled in
    dailyTSs = []
    for ticker in tickers:
        column = dailyDict[ticker]
        dateArray = TS.DateArray(column['dates'], freq='B')
        maskedValues = MA.MaskedArray(data=column['data'], mask=column['mask'])
        tickerTS = TS.time_series(dates=dateArray, data=maskedValues)
        dailyTSs.append(tickerTS.fill_missing_dates())

    # align all of the time series to fall on an identical set of dates;
    # TS.aligned() takes a variable-length argument list, hence the unpacking
    alignedTSs = TS.aligned(*dailyTSs)

    # build a 2d timeseries using all of the individual timeseries objects
    sharedDates = alignedTSs[0].dates
    stackedColumns = MA.column_stack([ts.series for ts in alignedTSs])
    dailyTSGrid = TS.time_series(dates=sharedDates, data=stackedColumns)
    return tickers, dailyTSGrid
 
def main():
    """Run whichever demos were selected on the command line.

    --loadMA / --loadTS generate fake stock data and load it into masked
    arrays / a timeseries.  --timeListMult, --timeNPMult and --timeMAMult
    each build two dim x dim grids (dim comes from --timeMultDim) and
    multiply them element-wise.  The loaders and multipliers print their
    own timings; their results are not used further here.
    """
    options = processOptions()

    # Each print below uses a single parenthesised argument, which prints
    # identically whether 'print' is a statement or a function.
    if options.loadMA:
        print("Load '%d' fake tickers over '%d' dates into a masked array" % (options.numTickers, options.numDates))
        stockRows = generateFakeStockData(numTickers=options.numTickers, numDates=options.numDates)
        dateStrs, tickers, dailyReturnsMA, dailyVolumesMA = loadDataIntoMaskedArrays(stockRows)

    if options.loadTS:
        print("Load '%d' fake tickers over '%d' dates into a timeseries" % (options.numTickers, options.numDates))
        # the generator is consumed by each loader, so make a fresh one
        stockRows = generateFakeStockData(numTickers=options.numTickers, numDates=options.numDates)
        tickers, dailyReturnsTSGrid, dailyVolumesTSGrid = loadDataIntoTimeseries(stockRows)

    n,m = options.dim,options.dim
    if options.timeListMult:
        print("Do elementwise multiplication of an %d x %d two dimensional list." % (n, m))
        l1 = makeTwoDimList(n,m)
        l2 = makeTwoDimList(n,m)
        elementwiseMultiply(l1,l2)
    if options.timeNPMult:
        print("Do elementwise multiplication of an %d x %d numpy array." % (n, m))
        a1 = makeTwoDimNumpyArray(n,m)
        a2 = makeTwoDimNumpyArray(n,m)
        elementwiseNPMultiply(a1,a2)
    if options.timeMAMult:
        print("Do elementwise multiplication of an %d x %d masked array." % (n, m))
        ma1 = makeTwoDimMAArray(n,m)
        ma2 = makeTwoDimMAArray(n,m)
        elementwiseMAMultiply(ma1,ma2)
 
 
# Run the demos only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()