In [None]:
# In this notebook, we remove duplicated news items and then train per-stock top-story classifiers.

In [None]:
import re
import math

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, TimeDistributed,concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# clean raw texts
def clean_news(news_raws):
    """Normalize raw news strings before vectorization.

    Each item is converted with ``str``, every character outside
    ``a-z``/``A-Z`` is replaced by a space, the result is lower-cased,
    tokenized with ``word_tokenize``, lemmatized with ``WordNetLemmatizer``
    and re-joined with single spaces.

    Parameters
    ----------
    news_raws : iterable
        Raw texts (list, tuple, pandas Series, ...). Items are visited by
        value rather than by ``news_raws[i]``, so a Series whose index is not
        ``0..n-1`` is handled correctly.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    lemma = WordNetLemmatizer()
    non_letters = re.compile('[^a-zA-Z]')
    news_rows = []
    for raw in news_raws:
        # str() keeps non-string items usable (e.g. a float NaN becomes 'nan').
        letters_only = non_letters.sub(' ', str(raw))  # remove non-English characters
        tokens = word_tokenize(letters_only.lower())
        news_rows.append(' '.join(lemma.lemmatize(w) for w in tokens))  # lemmatize each word
    return news_rows

In [None]:
# Load the Reuters news table; column names are supplied explicitly via `names`.
NEWS_FILE = 'data/news_reuters.csv'
col_names = [
    'code',
    'name',
    'date',
    'headline',
    'article',
    'importance',
]
news = pd.read_csv(NEWS_FILE, names=col_names)

In [None]:
# Display the loaded frame as a quick sanity check of the parsed columns.
news

In [None]:
# Add a cleaned, lemmatized version of each headline as the 'text' column.
news['text'] = clean_news(news['headline'])

In [None]:
# use TF-IDF to vectorize text
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(news['text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    word = list(vectorizer.get_feature_names_out())
else:
    word = vectorizer.get_feature_names()

In [None]:
# remove the duplicated news
def remove_deplicates(news_file, rad=10, threshold=0.9):
    """Find news items that nearly duplicate a nearby top story.

    For every row whose ``importance`` equals ``'topStory'``, the rows at most
    ``rad`` positions before and after it are vectorized with the module-level
    ``vectorizer`` and compared to the top story by cosine similarity.

    Parameters
    ----------
    news_file : pandas.DataFrame
        Frame with a ``'text'`` and an ``'importance'`` column.
    rad : int, default 10
        Half-width of the window, in rows. The window is clipped at the start
        and end of the frame.
    threshold : float, default 0.9
        A neighbour is reported when its cosine similarity is strictly greater
        than this value.

    Returns
    -------
    set
        Index labels of ``news_file`` that are near-duplicates of a top story
        (the top story itself is never reported for its own window).

    Notes
    -----
    Rows with an all-zero vector (no known vocabulary) are skipped.
    NOTE(review): two top stories that match each other are each reported from
    the other's window, so a caller that drops the result drops both — confirm
    that this is intended.
    """
    texts = news_file['text']
    labels = news_file.index
    is_top = (news_file['importance'] == 'topStory').values
    n_rows = len(news_file)
    dp_list = set()
    for pos in np.flatnonzero(is_top):
        # Work with row positions and clip the window so stories close to either
        # end of the frame do not request rows that do not exist.
        lo = max(0, pos - rad)
        hi = min(n_rows, pos + rad + 1)
        win = vectorizer.transform([str(item) for item in texts.iloc[lo:hi]]).toarray()
        top = win[pos - lo]
        top_norm = math.sqrt(np.dot(top, top))
        if top_norm == 0:
            # The top story has no known words: cosine similarity is undefined.
            continue
        for offset, cur in enumerate(win):
            if lo + offset == pos:
                continue  # do not compare the top story with itself
            down = top_norm * math.sqrt(np.dot(cur, cur))
            if down == 0:
                continue
            if np.dot(top, cur) / down > threshold:
                dp_list.add(labels[lo + offset])
    print('need to drop %d news'%(len(dp_list)))
    return dp_list

In [None]:
# Collect the index labels of news items that nearly duplicate a nearby top story.
dp_list = remove_deplicates(news)

In [None]:
# Drop the detected duplicates and build the binary target (1 = top story, 0 = anything else).
news2 = news.drop(list(dp_list))
# Assign the label column in one step from news2's own rows. Chained assignment
# (`df[col].loc[mask] = ...`) triggers SettingWithCopyWarning and, with pandas
# copy-on-write, only modifies a temporary, which would leave every label at 0.
news2['value'] = (news2['importance'] == 'topStory').astype(int)
news2 = news2.drop(['name','headline','article','importance'],axis=1)
news2 = news2.reset_index(drop=True)

In [None]:
# Persist the de-duplicated, labelled news without writing the index column.
news2.to_csv('data/cleaned_news.csv',index=False)

In [None]:
# Row counts before and after dropping duplicates.
print(len(news),len(news2))

In [None]:
# Width of each embedding vector (second dimension of emb_matrix).
emb_size = 300
# Passed to Tokenizer(num_words=...) and used to cap the number of embedding rows.
max_features = 20000
# Sequence length fed to the models; pad_sequences pads/truncates every text to this length.
maxlen = 50

In [None]:
# Group the cleaned news by stock code.
# The codes are read from news2 (the de-duplicated frame) rather than from the raw
# frame, so every key maps to at least one row and no later step receives an empty frame.
codes = news2['code'].drop_duplicates()  # stock codes that still have news
news_by_code = {code: news2.loc[news2['code'] == code] for code in codes}

In [None]:
# Fit the tokenizer on the cleaned text of the de-duplicated news;
# max_features is handed to Keras as num_words to limit the vocabulary used later.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(news2['text']))
print('the number of different words:',len(tokenizer.word_index))

In [None]:
# Use GloVe word vectors
EMB_FILE = "tool/GloVe/glove.42B.300d.txt"
def get_coefs(word,*arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 numpy array."""
    return word,np.asarray(arr,dtype='float32')
# Read inside a context manager so the handle is closed once the dict is built,
# decode explicitly as UTF-8 instead of relying on the platform default encoding,
# and skip blank lines, which would otherwise call get_coefs() without a word.
with open(EMB_FILE, encoding='utf-8') as emb_file:
    emb_index = dict(
        get_coefs(*line.strip().split()) for line in emb_file if line.strip()
    )

In [None]:
# Build the embedding matrix: every row starts as a draw from N(emb_mean, emb_std)
# and is overwritten with the pre-trained vector when the word exists in emb_index.
# np.stack needs a real sequence; a dict view is rejected by current NumPy releases.
all_embs = np.stack(list(emb_index.values()))
emb_mean = all_embs.mean()
emb_std = all_embs.std()
word_index = tokenizer.word_index
ft_words = min(max_features,len(word_index))
# Row 0 is never assigned a word here (the loop only writes rows named in word_index),
# hence ft_words+1 rows.
emb_matrix = np.random.normal(emb_mean,emb_std,(ft_words+1,emb_size))
hit_count = 0
for word, i in word_index.items():
    if i > ft_words:
        continue
    emb_vector = emb_index.get(word)
    if emb_vector is not None:
        hit_count += 1
        emb_matrix[i] = emb_vector
# Fraction of the kept vocabulary found in the pre-trained vectors (guarded for an empty vocabulary).
hit_rate = hit_count/ft_words if ft_words else 0.0
print("the percentage of words in dictionary: ", hit_rate)

In [None]:
# Layers reused by every model that build_model() creates: a frozen embedding
# initialised from emb_matrix and one bidirectional GRU encoder.
# The embedding width is taken from emb_size so it always matches emb_matrix's
# second dimension instead of repeating the literal 300.
embedding_layer = Embedding(ft_words+1,emb_size,weights=[emb_matrix],trainable=False)
article_layer = Bidirectional(GRU(50, return_sequences=True),name='article')

def build_model(code='Default'):
    """Assemble and compile a binary classifier over padded token sequences.

    The module-level ``embedding_layer`` and ``article_layer`` objects encode
    the input, so they are shared by every model built here. The 50-unit tanh
    layer (named after ``code``), both dropout layers and the sigmoid output
    are created anew on each call.

    Parameters
    ----------
    code : str, default 'Default'
        Used as the name of the 50-unit dense layer.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, adam optimizer,
    accuracy metric) taking inputs of shape ``(maxlen,)``.
    """
    token_ids = Input(shape=(maxlen,))
    embedded = embedding_layer(token_ids)
    encoded = article_layer(embedded)
    # collapse the time axis by keeping the maximum of each feature
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.2)(pooled)
    hidden = Dense(50, activation='tanh', name=code)(pooled)
    hidden = Dropout(0.2)(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=token_ids, outputs=probability)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
# One compiled model per stock code, keyed by code; build_model() wires each of
# them to the same embedding_layer and article_layer objects.
model_bc = dict()
for code in codes:
    model_bc[code] = build_model(code)

In [None]:
def get_XY(df):
    """Convert a news frame into model inputs and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'value'`` column (labels) and a ``'text'`` column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)``: ``X`` holds the texts encoded by the module-level
        ``tokenizer`` and padded/truncated to ``maxlen`` (truncation removes
        the end of a sequence); ``Y`` holds the labels as a column vector of
        shape ``(n, 1)``.
    """
    targets = df['value'].tolist()
    sequences = tokenizer.texts_to_sequences(df['text'].tolist())
    padded = pad_sequences(sequences, maxlen=maxlen, truncating='post')
    features = np.array(padded)
    labels = np.array(targets).reshape(-1, 1)
    return (features, labels)

In [None]:
# Per-stock 80/20 train/test split with a fixed random_state, stored in dicts keyed by stock code.
X_train_bc, Y_train_bc = {}, {}
X_test_bc, Y_test_bc = {}, {}
for code in codes:
    X_tmp, Y_tmp = get_XY(news_by_code[code])
    (
        X_train_bc[code],
        X_test_bc[code],
        Y_train_bc[code],
        Y_test_bc[code],
    ) = train_test_split(X_tmp, Y_tmp, test_size=0.2, random_state=16)

In [None]:
# Inspect the architecture of one per-stock model (requires 'GOOGL' to be one of the codes).
model_bc['GOOGL'].summary()

In [None]:
# Round-robin training: in each of `turns` rounds, every stock model is fit for one epoch.
# All models reuse the same article_layer object, so each fit call also updates the
# encoder used by the others (embedding_layer is created with trainable=False).
# NOTE(review): presumably the interleaving keeps the shared encoder from being fit
# to one stock at a time — the notebook does not state the intent.
turns = 50
for i in range(turns):
    for code in codes:
        model_bc[code].fit(X_train_bc[code],Y_train_bc[code],batch_size=16,epochs=1,validation_split=0.1,verbose=1)

In [None]:
# Class balance per stock: prints (rows with value == 1, rows with value == 0).
for code in codes:
    print(len(news_by_code[code].loc[news_by_code[code]['value']==1]),len(news_by_code[code].loc[news_by_code[code]['value']==0]))

In [None]:
# Additional 20 epochs for the 'AAPL' model only (requires 'AAPL' to be one of the codes).
# NOTE(review): article_layer is the same object in every model, so this fit also
# changes the encoder the other stock models use — confirm that is acceptable.
model_bc['AAPL'].fit(X_train_bc['AAPL'],Y_train_bc['AAPL'],batch_size=16,epochs=20,validation_split=0.1,verbose=1)