In [1]:
import pandas as pd
import numpy as np
import time
import pickle

In [2]:
with open('market_factors.pkl','rb') as f:
    market_factors = pickle.load(f)

In [3]:
market_factors.columns

Index(['index', 'Unnamed: 0', 'time', 'assetCode', 'assetName', 'volume',
       'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
       'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
       'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
       'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
       'returnsOpenNextMktres10', 'universe', 'returnsVolumePrev1',
       'returnsVolumePrev10', 'CloseMA5', 'CloseMA10', 'CloseMA20',
       'returnsCloseMA5', 'returnsCloseMA10', 'returnsCloseMA20', 'VolumeMA5',
       'VolumeMA10', 'VolumeMA20', 'returnsVolumeMA5', 'returnsVolumeMA10',
       'returnsVolumeMA20'],
      dtype='object')

In [7]:
# Must first sort by assetCode and time
market_factors = market_factors.sort_values(['assetCode', 'time'], ascending=[True, True]).reset_index()

In [8]:
def truncated_return( x ):
    return x[-1]/x[0]-1.0 if x[0]>0.0 else float('nan')

# Relative return on score, measuring momentum
def relative_return( df, score, window ):
    df['return_Prev'+str(window)+'_'+score] = df[score].rolling(window+1).apply(truncated_return)
    print('Done calculating '+'return_Prev'+str(window)+'_'+score)
    
# Moving average of score
def moving_average( df, score, window):
    df['ma'+str(window)+'_'+score] = df[score].rolling(window).apply(np.mean)
    print('Done calculating '+'ma'+str(window)+'_'+score)
    
# Relative return on moving average, another measure of momentum
def moving_average_return( df, score, short_window, long_window ):
    tmp1 = df[score].rolling(short_window).apply(np.mean)
    tmp2 = df[score].rolling(long_window).apply(np.mean)
    df['return_ma'+str(short_window)+'_ma'+str(long_window)+'_'+score] = tmp1/tmp2 - 1.0
    print('Done calculating '+'return_ma'+str(short_window)+'_ma'+str(long_window)+'_'+score)

### Example usage

In [9]:
start_time = time.time()
relative_return( market_factors, 'volume', 1)
relative_return( market_factors, 'volume', 2)
relative_return( market_factors, 'volume', 5)
moving_average( market_factors, 'volume', 5)
moving_average( market_factors, 'volume', 10)
moving_average( market_factors, 'volume', 20)
moving_average_return( market_factors, 'volume', 1, 5)
moving_average_return( market_factors, 'volume', 5, 10)
moving_average_return( market_factors, 'volume', 10,20)
print('Done!')
print("--- %s seconds ---" % (time.time() - start_time))

  


Done calculating return_Prev1_volume
Done calculating return_Prev2_volume
Done calculating return_Prev5_volume


  # This is added back by InteractiveShellApp.init_path()


Done calculating ma5_volume
Done calculating ma10_volume
Done calculating ma20_volume


  app.launch_new_instance()


Done calculating return_ma1_ma5_volume
Done calculating return_ma5_ma10_volume
Done calculating return_ma10_ma20_volume
Done!
--- 226.73504281044006 seconds ---


In [11]:
market_factors = market_factors.sort_values(['time', 'assetCode'], ascending=[True, True]).reset_index(drop = True) 

In [13]:
# Rearrange to the original order
market_factors[['time','assetCode']+[score for score in market_factors.columns if 'volume' in score]].head(100)

Unnamed: 0,time,assetCode,volume,return_Prev1_volume,return_Prev2_volume,return_Prev5_volume,ma5_volume,ma10_volume,ma20_volume,return_ma1_ma5_volume,return_ma5_ma10_volume,return_ma10_ma20_volume
0,2007-02-01 22:00:00+00:00,A.N,2606900.0,,,,,,,,,
1,2007-02-01 22:00:00+00:00,AAI.N,2051600.0,-0.236187,-0.363833,-0.289472,2484507.4,2728857.9,2009839.60,-0.174243,-0.089543,0.357749
2,2007-02-01 22:00:00+00:00,AAP.N,1164800.0,5.324660,4.431721,8.089207,384673.6,287933.9,242943.85,2.028022,0.335979,0.185187
3,2007-02-01 22:00:00+00:00,AAPL.O,23747329.0,45.290819,54.048144,70.036195,5112135.2,2909132.0,1889793.80,3.645286,0.757272,0.539391
4,2007-02-01 22:00:00+00:00,ABB.N,1208600.0,2.934001,4.717232,5.542344,415506.0,389286.7,345792.35,1.908743,0.067352,0.125782
5,2007-02-01 22:00:00+00:00,ABC.N,1657300.0,-0.723746,-0.667878,-0.654593,4646620.6,5088440.1,6739285.40,-0.643332,-0.086828,-0.244959
6,2007-02-01 22:00:00+00:00,ABD.N,1186200.0,2.195599,3.245086,1.771962,536761.6,516689.2,581519.40,1.209920,0.038848,-0.111484
7,2007-02-01 22:00:00+00:00,ABM.N,301200.0,-0.817553,-0.838075,-0.807481,1134213.4,1873365.1,2056308.55,-0.734442,-0.394558,-0.088967
8,2007-02-01 22:00:00+00:00,ABT.N,5692300.0,33.280225,30.700322,38.939519,1278779.0,840188.8,650409.10,3.451356,0.522014,0.291785
9,2007-02-01 22:00:00+00:00,ABV.N,401800.0,-0.961534,-0.943014,-0.913136,6272090.0,7030432.4,8128479.65,-0.935938,-0.107866,-0.135086


### When testing these indicators, we should abandon first t days where t is the maximum time window used in calculating these indicators.