In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

# Plot styling: pick the darkgrid style, then set the seaborn font scale to 1.6.
sns.set_style('darkgrid')
sns.set(font_scale=1.6)

# Raise pandas' display limits (rows, columns, line width) for large frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [6]:
def trainDataLoad(market=True,news=True):
    '''
    Load the training data, preferring the Kaggle competition environment and
    falling back to the CSV files under ./sampleData/ when it cannot be used.

    market: whether the market dataframe is wanted
    news: whether the news dataframe is wanted
    return (market_df, news_df) when both are requested, otherwise only the
    requested dataframe
    raises ValueError when neither table is requested
    '''
    if not (market or news):
        raise ValueError('trainDataLoad needs at least one of market/news to be True')
    try:
        from kaggle.competitions import twosigmanews

        env = twosigmanews.make_env()
        (market_df, news_df) = env.get_training_data()
    except Exception:
        # `Exception` (not a bare except) so that Ctrl-C still interrupts the load.
        print('failed to load data from kaggle, loading data from local directory.')
        # Only read the files that were actually requested.
        market_df = pd.read_csv('./sampleData/market_train.csv') if market else None
        news_df = pd.read_csv('./sampleData/news_train.csv') if news else None
    print('Train data loaded!')
    # Honour the flags for both data sources.
    if not news:
        return market_df
    if not market:
        return news_df
    return (market_df,news_df)

In [None]:
def loadTokens(df_col):
    '''
    df_col: column (e.g. a pandas Series) whose entries are token lists stored
            as strings, such as "['a', 'b']"
    return a list with every entry parsed back into a Python object
    '''
    from ast import literal_eval
    # Iterate over the column that was passed in (values only, no index).
    return [literal_eval(content) for content in df_col]

In [3]:
def timeCut(df,time, replace=True):
    '''
    df: dataFrame with a `time` column; the column is converted to datetime in place
    time: the cut-off time, as a string (or anything pd.Timestamp accepts)
    replace: kept for backward compatibility only. It never affected the
             caller's frame (it merely rebound a local name), so it is ignored;
             use the returned slice instead.
    return df slice keeping only the rows strictly after the time provided
    '''
    df['time'] = pd.to_datetime(df['time'])
    cutoff = pd.Timestamp(time)
    # A naive cut-off cannot be compared with a tz-aware column, so interpret
    # it in the column's own timezone.
    column_tz = getattr(df['time'].dtype, 'tz', None)
    if column_tz is not None and cutoff.tzinfo is None:
        cutoff = cutoff.tz_localize(column_tz)
    return df[df['time'] > cutoff]

def formatCodeSet(df,field):
    '''
    df:dataframe
    field:name of the column holding codes as a string in set format,
          e.g. "{'AAPL.O', 'BRK/A'}"
    return a Series in which each entry is the list of quoted codes found in
    that string
    '''
    # Raw string: the pattern is unchanged, but `\w` and `\.` are no longer
    # invalid escape sequences in a normal (f-)string literal.
    return df[field].str.findall(r"'([\w\./]+)'")

# Part 2 - News data 

The news dataset already includes many engineered features for prediction. However, it would be worthwhile to explore the headlines further with different kinds of embeddings. To make use of the information in the news headlines, we can apply document embedding so that the model can "understand" the document contents and improve its predictions accordingly.

In [57]:
import nltk
from nltk.tokenize import RegexpTokenizer


def tokeniser_wrapper(tokeniser):
    '''
    A tokeniser wrapper proxy to handle exceptions.

    tokeniser: callable taking a text and returning its tokens
    return a callable with the same single-argument interface that reports a
    failed input and returns [] instead of raising
    '''
    def wrapped(text):
        try:
            return tokeniser(text)
        except Exception:
            # Only ordinary errors are absorbed (e.g. NaN headlines);
            # KeyboardInterrupt/SystemExit still stop a long run.
            print('Failed tokenisation on input:',text)
            return []
    return wrapped
    
    
def getTokens(tokeniser, textCol, iterator=True):
    '''
    Take in a text column and tokenise each of its entries.

    tokeniser: callable applied to every entry
    textCol: pandas Series of texts
    iterator: when True return a lazy map object, otherwise a list
    '''
    # `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
    tokens = map(tokeniser, textCol.values)
    return tokens if iterator else list(tokens)

#Tokenisers
# tknsr: nltk word tokeniser; tknsr_noPunc: keeps only runs of word characters.
tknsr=tokeniser_wrapper(nltk.word_tokenize)
# Raw string so that `\w` is not an invalid escape sequence.
tknsr_noPunc = tokeniser_wrapper(nltk.tokenize.RegexpTokenizer(r'\w+').tokenize)


In [59]:
# Tokenise every headline; Series.as_matrix() was removed in pandas 1.0, so
# iterate over the column's values directly.
news_train_df['headline_tokens']=list(map(tknsr,news_train_df['headline'].values))

Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenisation on input: nan
Failed tokenis

In [None]:
from ast import literal_eval

# Load only the news table.
news_train_df = trainDataLoad(market=False)

# The token lists are stored as strings; parse each one back into a Python object.
news_train_df['headline_tokens'] = news_train_df['headline_tokens'].map(literal_eval)

# Data Observations

Before planning what to do with the news, we first need to understand the data a bit. The sample is a smaller dataset with records starting from 2013.

In [58]:
def distribution(df,groupByCols,col,sort=True,percentage=True):
    '''
    Count the non-null entries of `col` within each group of `groupByCols`.

    df: dataframe
    groupByCols: list of column names to group by
    col: column whose entries are counted
    sort: when True return the `col` counts as a Series sorted ascending,
          otherwise return the grouped count frame unchanged
    percentage: when True express the counts as a percentage of len(df)
    '''
    selected = df[groupByCols + [col]]
    result = selected.groupby(groupByCols).count()
    if percentage:
        result = result * 100 / len(df)
    return result[col].sort_values() if sort else result

In [41]:
# Number of non-null provider entries.
news_train_df['provider'].count()

4054316

There are over 4M news articles.

In [50]:
# Percentage of rows per provider, smallest first.
distribution(news_train_df, groupByCols=['provider'], col='time')

provider
CSE      0.000271
DGP      0.002319
ACT      0.002836
EANS     0.003083
ICE      0.004168
TEN      0.004193
FSC      0.004218
CIS      0.005895
JCN      0.015465
AWP      0.018203
ACN      0.024295
EQS      0.052931
BSE      0.094023
RNS      0.416174
ONE      0.441776
CNW      0.450014
HIIS     1.168187
LSE      1.584731
MKW      1.760593
GNW      2.464830
BSW      6.776162
PRN      7.830889
RTRS    76.874743
Name: time, dtype: float64

As seen above, over 75% of the news is from RTRS, followed by 7.8% from PRN, 6.8% from BSW and 2.5% from GNW; each of the remaining providers contributes less than 2%, and together they account for roughly 6%.

In [74]:
# Percentage of rows per (provider, time) pair, counted on `urgency`.
# NOTE(review): the result is stored but not displayed, and it is grouped by
# time rather than by urgency level — confirm this matches the discussion below.
providerTime = distribution(news_train_df, groupByCols=['provider', 'time'], col='urgency')

For news providers other than RTRS, the only urgency level is 3. For RTRS, most of the news has urgency 3 or 1, with only a very small proportion at urgency level 2.

In [65]:
# Percentage of rows per (audiences, provider) pair, smallest first.
distribution(news_train_df, groupByCols=['audiences', 'provider'], col='time')

audiences                                                                                                                                            provider
['AEN', 'DNP', 'EMK']                                                                                                                                RTRS         0.000025
['UKI', 'Z', 'C', 'MTL', 'T', 'O', 'SOF', 'U', 'D', 'GRO', 'M', 'GFN', 'EMK', 'OIL', 'E', 'NAT']                                                     RTRS         0.000025
['PCO', 'PCU', 'O', 'DNP', 'PSC', 'PEN', 'U', 'RNP', 'EMK', 'NAW', 'OIL', 'CAN', 'E', 'N']                                                           RTRS         0.000025
['UKI', 'Z', 'C', 'MTL', 'T', 'O', 'SOF', 'U', 'D', 'GRO', 'M', 'GFN', 'EMK', 'J', 'OIL', 'NAW', 'E', 'NAT']                                         RTRS         0.000025
['UKI', 'Z', 'C', 'MTL', 'T', 'O', 'SOF', 'U', 'D', 'GRO', 'M', 'EMK', 'MF', 'OIL', 'J', 'E', 'NAT']                                                 RTRS     

We can embed `subjects`, `audiences`, `assetCodes` and `headline_tokens` into vector features for the model.
On the other hand, if assets are embedded, we can use the average of the asset vectors as a feature.

In [59]:
# Percentage of rows per (time, provider) pair, kept in group order (unsorted).
distribution(news_train_df, groupByCols=['time', 'provider'], col='urgency', sort=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,urgency
time,provider,Unnamed: 2_level_1
2012-12-30 22:05:01+00:00,RTRS,0.000049
2012-12-30 22:10:05+00:00,RTRS,0.000025
2012-12-30 23:00:24+00:00,RTRS,0.000025
2012-12-31 00:48:34+00:00,RTRS,0.000025
2012-12-31 00:52:32+00:00,RTRS,0.000025
2012-12-31 01:06:33+00:00,RTRS,0.000025
2012-12-31 01:09:12+00:00,RTRS,0.000025
2012-12-31 01:16:33+00:00,HIIS,0.000025
2012-12-31 01:16:58+00:00,HIIS,0.000025
2012-12-31 01:18:55+00:00,RTRS,0.000025


From the above, we can see that the news release times are irregular, which means that the number of news articles differs from one time period to another.

In order to structure the news input, we need to know the news-time distribution and address the problem of a varying number of news items in the same period. One solution could be to group the news into channels and calculate the mean/sum for each channel.

In order to deal with the varying input, we need to reduce the dimensions of assets and providers. For the providers, we group all providers after the top 4 into `Others`. For the assets, we apply linear embeddings, turning each asset into a vector of lower dimension (~30). Such a vector is used to estimate the similarity of price changes between assets, and is initialised by applying PCA to the stock prices over an initialisation period. Each news item is then represented as a feature vector, and all the news items are summed up, weighted according to their respective assets.

Let $F_{(T,i)}=[f_{(N,T,i)}^{1},...,f_{(N,T,i)}^{h}]$ be the feature vector of the $i$th news item at time step $T$ and $A_N=[a_{(N,T,i)}^{1},...,a_{(N,T,i)}^{M}]$ be the average of the vectors of the news subjects; then the news matrix is

$$N_{(T,i)}=[a_{(N,T,i)}^{1}F_{(T,i)},...,a_{(N,T,i)}^{M}F_{(T,i)}],$$

and for each asset $A=[a^{1},....,a^{M}]$ the effect of the news on the asset is,

$$(A \cdot A_N)  F_{(T,i)}=N_{(T,i)}A$$.



## Features engineering

## Headlines 

## Word embeddings

Embedding means turning categorical/text data into meaningful vectors that can be "ingested" by a machine learning model. Word embedding is a very common technique in NLP. By turning words into vectors, we can further compose a meaning-representation vector for each document.

There are different models that can further encode word embeddings into a document embedding; these are discussed later.

## Document embeddings

While some models create embeddings for words, there are also models that directly create embeddings for documents of various lengths.

To train for embeddings, we would first need to tokenise the headline sentences.

### Word embeddings - 1. Word2Vec
Original paper: https://arxiv.org/pdf/1310.4546.pdf

Blog reference: https://www.knime.com/blog/word-embedding-word2vec-explained

Further formalisation: https://papers.nips.cc/paper/5477-neural-word-embedding-as-implicit-matrix-factorization.pdf

Pretrained embeddings:

1. google news: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
2. freebase entity: https://docs.google.com/file/d/0B7XkCwpI5KDYaDBDQm1tZGNDRHc/edit


Word2Vec is one of the most famous word embeddings, first [published by Google in 2013](https://arxiv.org/pdf/1310.4546.pdf). It offers two architectures, CBOW and Skip-gram, to form an encoder. The learnt representation is an embedded vector which encodes the co-occurrence probability between words and their contexts. The model applies a number of techniques to simplify the calculations, such as `negative sampling` and `hierarchical softmax`, which are also applied in other embedding models developed afterwards.

It can be viewed as an auto-encoder model for context-word in terms of deep-learning or [a factorisation(approximation) of the context-words pointwise-mutual information matrix](https://papers.nips.cc/paper/5477-neural-word-embedding-as-implicit-matrix-factorization.pdf). 

Understanding the latter helps when combining the model with other mathematical/statistical models to generate quantified results.

##### Custom embedding - Model training

In [2]:
from gensim.models import Word2Vec

In [69]:
import os

# Reuse the saved custom Word2Vec model when it exists; otherwise train it once
# on the headline tokens and save it. Previously the model was always trained
# and then immediately replaced by the model loaded from disk.
# Delete the file to force retraining.
w2vCustomPath='./sampleData/word2vecCustom.model'
if os.path.exists(w2vCustomPath):
    model_w2vCustom=Word2Vec.load(w2vCustomPath)
else:
    model_w2vCustom = Word2Vec(sentences=news_train_df['headline_tokens'], size=100, window=5, min_count=5, workers=4, sg=0)
    model_w2vCustom.save(w2vCustomPath)

#### Pretrained-models

In [4]:
import gensim
import os

# Load Google's pre-trained Word2Vec model.
# The directory holding the downloaded vectors can be overridden with the
# PRETRAINED_DIR environment variable; the default is the original location.
# os.path.join(..., '') guarantees a trailing separator so that
# `fileDir+<file name>` keeps working.
fileDir=os.path.join(os.environ.get('PRETRAINED_DIR', 'C:/Users/CK/Downloads/'), '')
freebase_entity='knowledge-vectors-skipgram1000.bin'
google_news='GoogleNews-vectors-negative300.bin'
model_w2vGoogleNews = gensim.models.KeyedVectors.load_word2vec_format(fileDir+google_news, binary=True)

#### Transfer learning

In [None]:
# Continue training from the Google News vectors on the headline tokens of
# news_train_df (the frame whose 'headline_tokens' column holds the token lists).
# If the raw CSV has to be re-read, it was previously read with encoding "ISO-8859-1".
#Create the model
model_w2vGoogleNewsPlus=gensim.models.Word2Vec(size=300)
#Build the new vocabs
model_w2vGoogleNewsPlus.build_vocab(news_train_df['headline_tokens'])
#Read in the pretrained vectors
# NOTE(review): with gensim's default lockf=0.0 the imported vectors stay frozen
# during training; pass lockf=1.0 if they are meant to be fine-tuned — confirm intent.
model_w2vGoogleNewsPlus.intersect_word2vec_format(fileDir+google_news, binary=True)
# total_examples comes from this model's own vocabulary scan.
model_w2vGoogleNewsPlus.train(news_train_df['headline_tokens'],total_examples=model_w2vGoogleNewsPlus.corpus_count,epochs=10)
model_w2vGoogleNewsPlus.save('./sampleData/word2vecTransfer.model')

### Word embeddings - 2. FastText

Original paper: https://arxiv.org/pdf/1607.04606.pdf

Blog reference: https://towardsdatascience.com/word-embedding-with-word2vec-and-fasttext-a209c1d3e12c

Pre-trained embeddings:
    1. https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
    
FastText is very similar to word2vec, but instead of simply using the whole word, it splits words into subwords as inputs and sums their vectors as the embedding of the word. This method can generalise the meaning of words to unseen words.

##### Custom embedding - Model training

In [None]:
from gensim.models import FastText

In [69]:
import os

# Reuse the saved custom FastText model when it exists; otherwise train it on
# the headline tokens and save it. Delete the file to force retraining.
# (The reload path is the FastText file, not the Word2Vec one.)
ftCustomPath='./sampleData/FastTextCustom.model'
if os.path.exists(ftCustomPath):
    model_FTCustom=FastText.load(ftCustomPath)
else:
    model_FTCustom = FastText(sentences=news_train_df['headline_tokens'], size=100, window=5, min_count=5, workers=4, sg=0)
    model_FTCustom.save(ftCustomPath)

#### Pretrained-models

In [4]:
import gensim
# Load the pre-trained FastText vectors from cc.en.300.bin.
# load_fasttext_format takes the model file (and an optional encoding); it has
# no `binary` argument, so passing one raises TypeError.
model_FTgoogleNews = gensim.models.FastText.load_fasttext_format(fileDir+'cc.en.300.bin')

#### Transfer learning

In [None]:
#news_train_df=pd.read_csv('./sampleData/news_train.csv', encoding = "ISO-8859-1")
# load_fasttext_format is a classmethod that RETURNS the loaded model; calling
# it on a fresh FastText(size=300) instance and discarding the result left the
# model without any pretrained vectors. It also takes no `binary` argument.
model_FTgoogleNewsPlus=gensim.models.FastText.load_fasttext_format(fileDir+'cc.en.300.bin')
# update=True adds the headline vocabulary to the pretrained one instead of
# rebuilding the vocabulary from scratch.
model_FTgoogleNewsPlus.build_vocab(news_train_df['headline_tokens'], update=True)

In [33]:
# Continue training on the headline tokens. total_examples is taken from this
# model's own corpus count; `model_NewsPlus` is not defined anywhere in the notebook.
model_FTgoogleNewsPlus.train(news_train_df['headline_tokens'],total_examples=model_FTgoogleNewsPlus.corpus_count,epochs=10)

(1442456173, 5314372690)

### Document embedding - 1. Doc2Vec

Original paper: https://arxiv.org/pdf/1405.4053.pdf

blog summary: https://medium.com/scaleabout/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

Doc2vec is similar to word2vec. Both use the same techniques and encoding-decoding model, except that Doc2vec embeds the document vector as one of the input vectors for CBOW and uses only the document vector to reconstruct the context in skip-gram. The goal is to train the model to learn how to represent a document as a vector based on its content.

As the model learns a summarised document vector, the population distribution of the documents is also important: properly controlling the source of the training documents helps encode the meaning of the documents into vectors.

##### Custom embedding - Model training

In [64]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from ast import literal_eval
# One TaggedDocument per headline, tagged with its row index.
# Entries still stored as strings are parsed; entries that are already token
# lists are used as they are (literal_eval rejects non-string lists).
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
documents = [
    TaggedDocument(literal_eval(content) if isinstance(content, str) else content, [i])
    for i,content in news_train_df['headline_tokens'].items()
]

In [82]:
# Train the custom Doc2Vec model on the tagged headline documents.
model_d2vCustom = Doc2Vec(
    documents=documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Save the trained Doc2Vec model under ./sampleData/.
model_d2vCustom.save('./sampleData/model_d2vCustom100.model')