In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
#import tensorflow.contrib.eager as tfe
#tf.enable_eager_execution()

In [2]:
def trainDataLoad(local=False,market=True,news=True,sample=False,dataDir='./sampleData/'):
    '''
    Load the training data, from the Kaggle competition environment when it
    is usable and from local CSV files otherwise.

    local: if True, skip the Kaggle environment and read the local CSV files
    market: whether the market data is wanted
    news: whether the news data is wanted
    sample: local files only - read 'marketdata_sample.csv'/'news_sample.csv'
            instead of 'market_train.csv'/'news_train.csv'
    dataDir: directory holding the local CSV files
    return market_df when only market is requested, news_df when only news is
           requested, otherwise the tuple (market_df,news_df)
    raise ValueError when neither market nor news is requested
    '''
    import os

    if not (market or news):
        raise ValueError('At least one of market and news must be True.')

    market_df=None
    news_df=None
    fetched=False

    if not local:
        try:
            from kaggle.competitions import twosigmanews
            env = twosigmanews.make_env()
            (market_df, news_df) = env.get_training_data()
            print('Data fetched from kaggle with {} rows of market data and {} rows of news data'.format(market_df.shape, news_df.shape))
            fetched=True
        except Exception:
            # Best effort: any failure of the Kaggle route falls back to the local
            # files, but KeyboardInterrupt/SystemExit are no longer swallowed.
            print('failed to load data from kaggle, loading data from local directory.')
    else:
        print('loading data from local directory.')

    if not fetched:
        filename=['marketdata_sample.csv','news_sample.csv']
        if(not sample):
            filename=['market_train.csv','news_train.csv']
        # Only read the files that were actually asked for
        if(market):
            market_df=pd.read_csv(os.path.join(dataDir,filename[0]))
        if(news):
            news_df=pd.read_csv(os.path.join(dataDir,filename[1]))
        print('Train data loaded!')

    if(market and not news):
        return market_df
    if(news and not market):
        return news_df
    return (market_df,news_df)

In [3]:
def timeCut(df,timeStart,timeEnd, replace=True):
    '''
    Return the rows of df whose time lies strictly between timeStart and timeEnd.

    df: dataFrame with a column `time` holding datetimes or datetime strings
    timeStart: start of the window, anything pd.Timestamp accepts (exclusive)
    timeEnd: end of the window, anything pd.Timestamp accepts (exclusive)
    replace: if True, the `time` column of df itself is overwritten with the
             parsed datetimes; if False, df is left untouched.
             The rows of df are never removed in either case.
    return a new dataFrame holding the rows inside the window, with `time`
           parsed to datetime
    '''
    # Parse once into a separate series so the caller's frame is only
    # modified when that was asked for
    times=pd.to_datetime(df['time'])
    if replace:
        df['time']=times
    timeStart=pd.Timestamp(timeStart)
    timeEnd=pd.Timestamp(timeEnd)
    inWindow=(times>timeStart) & (times<timeEnd)
    # assign() puts the parsed times on the slice without touching df
    return df[inWindow].assign(time=times[inWindow])

def formatCodeSet(df,field):
    '''
    df: dataframe
    field: name of a column whose cells are strings in set format,
           e.g. "{'AAPL.O', 'A.N'}"
    return a series holding, for each row, the list of quoted codes found in the cell
    '''
    # Raw string: \w and \. are regex escapes, not Python string escapes
    return df[field].str.findall(r"'([\w\./]+)'")

# Embeddings

### Example for embedding lookups

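An embedding lookup is a row lookup in a matrix. The `params` input is a matrix in which each row is the embedding of one item, and the `ids` input is a list of row indices to fetch. When executed, the lookup returns a new matrix whose rows are the rows of `params` selected by those indices, in the order given.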

In [4]:
# Placeholder that receives the row indices to look up
input_ids = tf.placeholder(dtype=tf.int32, shape=[None])

# The embedding table: a fixed 5x3 matrix, one row per item.
# A tf.Variable (e.g. built from np.identity(5, dtype=np.int32)) could be used instead.
embedding = np.asarray([[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3], [4.1, 4.2, 4.3]])

# Fetch the rows of embedding selected by input_ids
input_embedding = tf.nn.embedding_lookup(embedding, input_ids)

# Use a context-managed session so it is closed once the demo has run;
# tf.reset_default_graph() is called further down and must not run while a
# session is still active.
with tf.Session() as sess:
    # Only needed when embedding is a tf.Variable; harmless otherwise
    sess.run(tf.global_variables_initializer())

    print(sess.run(input_embedding, feed_dict={input_ids: [1, 2, 3, 0, 3, 2, 1]}))

[[1.1 1.2 1.3]
 [2.1 2.2 2.3]
 [3.1 3.2 3.3]
 [0.1 0.2 0.3]
 [3.1 3.2 3.3]
 [2.1 2.2 2.3]
 [1.1 1.2 1.3]]


In [5]:
# Load the full (non-sample) market and news training sets
(market_train_df, news_train_df) = trainDataLoad(sample=False)

Loading the data... This could take a minute.
Done!
Data fetched from kaggle with (4072956, 16) rows of market data and (9328750, 35) rows of news data


To apply the embedding, we need to include every asset in the embedding table, and at the same time handle assets that are not in the current universe but may appear in the future. First we need to work out how many unique assets have to be embedded.

In [6]:
# Getting assets from the data
def getUnique(df,prop):
    '''Return the distinct values of column `prop` of df, as given by Series.unique().'''
    column=df[prop]
    return column.unique()
def getUniqueFromArrays(df,prop):
    '''
    Return the sorted unique items found across all cells of column `prop`.

    df: dataframe
    prop: name of a column whose cells are either strings in set format
          (e.g. "{'AAPL.O', 'A.N'}") or already lists of items.
          Which of the two is decided from the first non-missing cell.
    return a numpy array of the unique items (np.unique, hence sorted)
    '''
    # Missing cells contribute no items
    propLists = df[prop].dropna()
    # String cells are parsed into lists; cells that already hold lists are used as they are
    if len(propLists) and isinstance(propLists.iloc[0], str):
        propLists = propLists.str.findall(r"'([\w\./]+)'")
    return np.unique([item for sublist in propLists.tolist() for item in sublist])
def uniqueConcat(list1,list2):
    '''Return the sorted unique values of list1 and list2 joined together.'''
    combined=np.concatenate((list1,list2), axis=0)
    return np.unique(combined)

In [7]:
# Asset codes seen in the market data
market_assets = getUnique(market_train_df, 'assetCode')
# Asset codes mentioned anywhere in the news data
news_assets = getUniqueFromArrays(news_train_df, 'assetCodes')
# Union of both: every asset that needs an embedding
assets = uniqueConcat(market_assets, news_assets)

In [8]:
# Assets that appear in both the market data and the news data
print(
    'Number of assets found in both datasets',
    len(np.intersect1d(market_assets, news_assets)),
)

Number of assets found in both datasets 3663


In [9]:
# Size of the union of market and news assets
print(
    'Total number of assets mentioned',
    len(assets),
)

Total number of assets mentioned 14410


In [10]:
# Assets that appear in the market data
print(
    'Total number of assets in the market',
    len(market_assets),
)

Total number of assets in the market 3780


In [11]:
# Assets that appear in the news data
print(
    'Total number of assets in the news',
    len(news_assets),
)

Total number of assets in the news 14293


As observed, many assets appear in the news, but not all of them appear in the market data. Using asset embeddings instead of a fixed asset code should allow the system to discover more of the relationships between assets through news and market signals. The idea is to first initialise the embeddings of the assets in the market data with PCA, as an estimate of their similarities. The remaining assets are simply initialised with random vectors, and the model is left to learn their similarities.

In [12]:
# Peek at the first rows of the market data
market_train_df.head(n=5)

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe
0,2007-02-01 22:00:00+00:00,A.N,Agilent Technologies Inc,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.00186,0.000622,,,0.034672,1.0
1,2007-02-01 22:00:00+00:00,AAI.N,AirTran Holdings Inc,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803,0.0
2,2007-02-01 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433,1.0
3,2007-02-01 22:00:00+00:00,AAPL.O,Apple Inc,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425,1.0
4,2007-02-01 22:00:00+00:00,ABB.N,ABB Ltd,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994,1.0


The data start on 2007-02-01. Suppose we take the first 5 years of market data to calculate stock similarity. We calculate the similarity for the assets that appear in this observation period and initialise the rest with random vectors.

In [13]:
# Width of each asset embedding vector
embedded_dim = 50
# Observation window used to initialise the embeddings
initStart = '2007-02-01 22:00:00+00:00'
initEnd = '2012-02-01 22:00:00+00:00'

In [18]:
def PCA_embedding(df,timeStart,timeEnd,components):
    '''
    Cut df to the window timeStart..timeEnd and fit a PCA on the result.

    df: dataframe with one row per time; the index must be datetime or
        convertible with pd.to_datetime. The caller's dataframe is not modified.
    timeStart: start of the window, anything pd.Timestamp accepts
    timeEnd: end of the window, anything pd.Timestamp accepts
    components: number of principal components to fit
    return (pca, centre): the fitted sklearn PCA object and the list of
           column means of the windowed data
    '''
    from sklearn.decomposition import PCA
    if(not isinstance(df.index, pd.DatetimeIndex)):
        # Convert on a copy so the caller's index is left as it was
        df = df.copy()
        df.index = pd.to_datetime(df.index)
    if(not df.index.is_monotonic_increasing):
        # Label slicing below needs a sorted index; sort_index returns a new frame
        df = df.sort_index()
    T_s=pd.Timestamp(timeStart)
    T_e=pd.Timestamp(timeEnd)
    # Label-based slice on the datetime index (both ends included)
    df_T=df[T_s:T_e]
    pca = PCA(n_components=components)
    pca.fit(df_T)
    centre=list(df_T.mean())
    return pca,centre

In [19]:
# Clear TensorFlow's default graph before the embedding variable is built.
# NOTE(review): presumably this lets the cells below be re-run without
# tf.get_variable('assetEncodings', ...) clashing with an earlier run - confirm.
tf.reset_default_graph()

In [20]:
def initAssetEmbeddings(market_data,initStart,initEnd,assets,embedded_dim):
    '''
    Build the asset embedding table and the asset code -> row id lookup.

    market_data: market dataframe with columns time, assetCode and returnsClosePrevMktres10
    initStart: start of the initialisation window
    initEnd: end of the initialisation window
    assets: all asset codes that need an embedding
    embedded_dim: width of each embedding vector
    return (assetEncodings, assets_embedding, id_lookups): the tensorflow
           variable holding the table, the embedding_lookup op on it, and a
           dict mapping each asset code (plus 'unknown') to its row index
    '''
    # Time x asset view of returnsClosePrevMktres10 inside the window, gaps filled with 0
    window=timeCut(market_data,initStart,initEnd, replace=False)
    returns_by_asset=window.pivot(
        index='time',
        columns='assetCode',
        values='returnsClosePrevMktres10',
    ).fillna(0)

    # Assets seen in the window come first, in pivot column order; the rest
    # follow sorted, with a trailing 'unknown' row for codes never seen
    seen_assets=list(returns_by_asset.columns)
    unseen_assets=sorted(set(assets).difference(set(returns_by_asset.columns)))
    unseen_assets.append('unknown')
    ordered_assets=seen_assets+unseen_assets

    # PCA over the window; only the fitted components are used here
    pca,_=PCA_embedding(returns_by_asset,initStart,initEnd,embedded_dim)

    # One row per seen asset: its loadings on the principal components
    seen_rows=pca.components_.T
    # One random row per unseen asset (and for 'unknown')
    unseen_rows=np.random.rand(len(unseen_assets),embedded_dim)
    # Stack both parts in the same order as ordered_assets
    initial_table=np.concatenate((seen_rows,unseen_rows))

    # Tensorflow side: id placeholder, table variable, lookup op
    asset_ids = tf.placeholder(dtype=tf.int32, shape=[None])
    assetEncodings = tf.get_variable('assetEncodings',initializer=initial_table,dtype=tf.float64)
    assets_embedding = tf.nn.embedding_lookup(assetEncodings, asset_ids)

    # Row index of every asset code in the table
    id_lookups=dict(zip(ordered_assets,range(len(ordered_assets))))
    return assetEncodings,assets_embedding,id_lookups

In [21]:
# Build the embedding table, its lookup op and the asset code -> id mapping
assetEncodings, assets_embedding, asset_lookups = initAssetEmbeddings(
    market_data=market_train_df,
    initStart=initStart,
    initEnd=initEnd,
    assets=assets,
    embedded_dim=embedded_dim,
)

# Transform the input for embedding lookup
To enable the embedding lookup, we need to turn each assetCode into an integer asset id.

In [29]:
def lookUpTransform(valueType,lookup):
    '''
    Build a function that maps values to their ids in `lookup`.

    valueType: 'str' for a function taking a single value,
               'list' for a function taking an iterable of values
    lookup: dict mapping value -> id; must contain the key 'unknown',
            whose id is used for every value that is not in the dict
    return the mapping function
    raise ValueError when valueType is neither 'str' nor 'list'
    '''
    def lookupFunc(value):
        if value in lookup:
            return lookup[value]
        return lookup['unknown']

    if(valueType=='str'):
        return lookupFunc
    if(valueType=='list'):
        return lambda values:[lookupFunc(v) for v in values]
    # Fail here rather than handing None to a later .apply() call
    raise ValueError("valueType must be 'str' or 'list', got {!r}".format(valueType))

In [33]:
# One integer id per market row
market_train_df['assetID']=market_train_df.assetCode.apply(lookUpTransform('str',asset_lookups))
# One list of integer ids per news row; formatCodeSet parses the set-formatted
# assetCodes strings, so the regex is not repeated here
news_train_df['assetIDs']=formatCodeSet(news_train_df,'assetCodes').apply(lookUpTransform('list',asset_lookups))