In [67]:
import tensorflow as tf
import numpy as np
import pandas as pd
#import tensorflow.contrib.eager as tfe
#tf.enable_eager_execution()

In [68]:
def trainDataLoad(local=False,market=True,news=True,sample=False):
    '''
    Load the market and news training data.

    Unless ``local`` is set, the data is first requested from the Kaggle
    ``twosigmanews`` competition environment. If that fails (package not
    installed, environment unavailable, ...) or ``local`` is set, the data is
    read from CSV files under ``./sampleData/``.

    local: when True, skip the Kaggle environment and read the CSV files
    market: when reading CSV files, load the market table
    news: when reading CSV files, load the news table
    sample: when reading CSV files, use the ``*_sample.csv`` files instead of
        the ``*_train.csv`` files
    return market_df when only ``market`` is set, news_df when only ``news``
        is set, otherwise the tuple (market_df, news_df); a table that was not
        loaded is None in that tuple
    '''
    market_df = None
    news_df = None
    fetched = False

    if not local:
        try:
            # Imported lazily: the package is only expected to exist inside Kaggle kernels.
            from kaggle.competitions import twosigmanews

            env = twosigmanews.make_env()
            (market_df, news_df) = env.get_training_data()
            print('Data fetched from kaggle with {} rows of market data and {} rows of news data'.format(market_df.shape, news_df.shape))
            fetched = True
        except Exception:
            # Best-effort fallback to the local files. `Exception` (not a bare
            # except) so that KeyboardInterrupt / SystemExit still propagate.
            print('failed to load data from kaggle, loading data from local directory.')

    if not fetched:
        if sample:
            filename = ['marketdata_sample.csv', 'news_sample.csv']
        else:
            filename = ['market_train.csv', 'news_train.csv']
        if market:
            market_df = pd.read_csv('./sampleData/' + filename[0])
        if news:
            news_df = pd.read_csv('./sampleData/' + filename[1])
        print('Train data loaded!')

    if market and not news:
        return market_df
    if news and not market:
        return news_df
    return (market_df, news_df)

In [69]:
def timeCut(df,time, replace=True):
    '''
    Return the rows of df whose time is strictly later than the given time.

    df: dataFrame with a ``time`` column that pd.to_datetime can parse.
        Side effect: the column is converted to datetime in place on df.
    time: the cut-off as a string (anything pd.Timestamp accepts). A cut-off
        without timezone is interpreted in the timezone of the ``time``
        column when that column is timezone-aware.
    replace: kept for backward compatibility; it has no effect. The caller's
        df is never replaced by the slice - use the return value.
    return the slice of df with time > cut-off (rows exactly at the cut-off
        are dropped)
    '''
    df['time'] = pd.to_datetime(df['time'])
    cutoff = pd.Timestamp(time)

    # Only timezone-aware datetime dtypes carry a `tz` attribute.
    column_tz = getattr(df['time'].dtype, 'tz', None)
    if column_tz is not None and cutoff.tzinfo is None:
        # Ordering a tz-aware column against a naive timestamp raises
        # TypeError, so give the cut-off the column's timezone first.
        cutoff = cutoff.tz_localize(column_tz)

    return df[df['time'] > cutoff]

def formatCodeSet(df,field):
    '''
    Parse a column holding the string form of a set of codes.

    df: dataframe
    field: name of the column whose values are strings in set format,
        e.g. "{'AAPL.O', 'AAPL.OQ'}"
    return a Series holding, per row, the list of single-quoted codes found
        in the string, in order of appearance
    '''
    # Raw string instead of an f-string without placeholders: in a normal
    # literal "\w" and "\." are invalid escape sequences, which newer Python
    # versions warn about. The regex itself is unchanged - a single-quoted
    # run of word characters, dots and slashes.
    return df[field].str.findall(r"'([\w\./]+)'")

# Embeddings

### Example for embedding lookups

An embedding lookup is a row lookup in a matrix. The `params` argument is a matrix in which each row is the embedding of one item, and the `ids` argument is a list of row indices. On execution, the output matrix is built from the rows of `params` selected by `ids`, in the order the indices are given.

In [11]:
# Define a placeholder, input_ids, that will be fed the row indices to look up
input_ids = tf.placeholder(dtype=tf.int32, shape=[None])

# Define a known variable, embedding: a 5x5 identity (diagonal) matrix
# embedding = tf.Variable(np.identity(5, dtype=np.int32))

# Alternatively, use an arbitrary matrix (here a fixed 5x3 NumPy array, one row per item)
embedding = a = np.asarray([[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3], [4.1, 4.2, 4.3]])

# Look up the rows of embedding that correspond to the ids in input_ids
input_embedding = tf.nn.embedding_lookup(embedding, input_ids)

# The session is kept open in `sess`; this cell does not close it.
sess = tf.InteractiveSession()
# This cell itself creates no tf.Variable (embedding is a NumPy array); the
# initializer only matters when the tf.Variable alternative above is used.
sess.run(tf.global_variables_initializer())

# Feed seven indices; the result has one embedding row per index, in feed order
print(sess.run(input_embedding, feed_dict={input_ids: [1, 2, 3, 0, 3, 2, 1]}))



[[1.1 1.2 1.3]
 [2.1 2.2 2.3]
 [3.1 3.2 3.3]
 [0.1 0.2 0.3]
 [3.1 3.2 3.3]
 [2.1 2.2 2.3]
 [1.1 1.2 1.3]]


In [88]:
# Load both training tables; sample=False selects the *_train.csv files if
# the loader falls back to the local directory.
(market_train_df, news_train_df) = trainDataLoad(sample=False)

failed to load data from kaggle, loading data from local directory.
Train data loaded!


To apply the embedding, we need to include all assets in the universe and, at the same time, take care of assets that are not in the current universe but may appear in the future. First, we need to work out how many unique assets have to be embedded.

In [89]:
# Distinct asset codes that occur in the market training data
market_assets = market_train_df['assetCode'].unique()

In [90]:
# Unique assets mentioned in the news training data.
# The raw assetCodes column holds set-like strings. Parse it into lists only
# while it still contains strings, so that re-running this cell does not run
# the string parser over already-parsed lists (which would not give lists back
# and would break the flattening below).
if news_train_df['assetCodes'].map(lambda codes: isinstance(codes, str)).any():
    news_train_df['assetCodes'] = formatCodeSet(news_train_df, 'assetCodes')
news_assets = np.unique([code for codes in news_train_df['assetCodes'].tolist() for code in codes])
# Combined universe: every asset seen in the market data or in the news data
assets = np.unique(np.concatenate((market_assets, news_assets), axis=0))

In [96]:
# Count the asset codes that occur in the market data as well as in the news data
print('Number of assets found in both datasets', len(np.intersect1d(market_assets, news_assets)))

Number of assets found in both datasets 2805


In [97]:
# Size of the combined asset universe (market and news together)
print('Total number of assets mentioned', len(assets))

Total number of assets mentioned 10609


In [98]:
# How many distinct asset codes the market data contains
print('Total number of assets in the market', len(market_assets))

Total number of assets in the market 2860


In [99]:
# How many distinct asset codes the news data mentions
print('Total number of assets in the news', len(news_assets))

Total number of assets in the news 10554


As observed, many assets appear in the news, but not all of them appear in the market data. Using asset embeddings instead of a fixed asset code should allow the system to discover more of the relationships between assets through news and market signals.

In [101]:
# Display the combined asset universe built above (np.unique returns it
# sorted and de-duplicated).
assets

array(['000030.KS', '0005.HK', '005490.KS', ..., 'ZYNE.OQ', 'ZZ.N',
       'ZZC.N'], dtype=object)

In [102]:
# Wide table of the returnsClosePrevMktres10 values: one row per time, one
# column per assetCode.
pivot = market_train_df.pivot(
    index='time',
    columns='assetCode',
    values='returnsClosePrevMktres10',
)

In [104]:
# Show the pivot with one column per entry of `assets`; assets that have no
# market data appear as all-NaN columns.
# NOTE(review): reindex returns a new frame and the result is only displayed,
# so `pivot` itself keeps its original columns - confirm that is intended.
pivot.reindex(assets, axis='columns')

assetCode,000030.KS,0005.HK,005490.KS,015760.KS,017670.KS,030200.KS,0338.HK,034220.KS,0386.HK,03S.TG,...,ZTR.N,ZTS.N,ZU.O,ZUMZ.O,ZUMZ.OQ,ZX.N,ZYNE.O,ZYNE.OQ,ZZ.N,ZZC.N
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-12-31 22:00:00+00:00,,,,,,,,,,,...,,,,-0.015117,,,,,,
2013-01-02 22:00:00+00:00,,,,,,,,,,,...,,,,-0.072197,,,,,,
2013-01-03 22:00:00+00:00,,,,,,,,,,,...,,,,0.044425,,,,,,
2013-01-04 22:00:00+00:00,,,,,,,,,,,...,,,,-0.002159,,,,,,
2013-01-07 22:00:00+00:00,,,,,,,,,,,...,,,,0.019240,,,,,,
2013-01-08 22:00:00+00:00,,,,,,,,,,,...,,,,0.033704,,,,,,
2013-01-09 22:00:00+00:00,,,,,,,,,,,...,,,,-0.030560,,,,,,
2013-01-10 22:00:00+00:00,,,,,,,,,,,...,,,,-0.041177,,,,,,
2013-01-11 22:00:00+00:00,,,,,,,,,,,...,,,,-0.047996,,,,,,
2013-01-14 22:00:00+00:00,,,,,,,,,,,...,,,,-0.080143,,,,,,
