In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# import lib to clear the news
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, TimeDistributed, concatenate
from keras.models import Model,Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Input CSV files, given as paths relative to the notebook's working directory.
# TXT_DATA_FILE is read into `txt_df` (news text), NUM_DATA_FILE into `num_df` (prices).
TXT_DATA_FILE = 'data2/news/output_GOOGL.csv'
NUM_DATA_FILE = 'data2/prices/stockPrices_GOOGL.csv'

In [None]:
# vocabulary size handed to the Keras Tokenizer (`num_words`)
MAX_FEATURES = 20000
# maximum length (in tokens) of one news item; sequences are padded/truncated to it
MAX_LEN = 30
# maximum number of news items kept per day (days are padded/truncated to it)
MAX_NEWS_NUM = 30
# number of days in the sliding window (default `days` of get_x_seqs_by_sw)
DATE_INTERVAL = 3

In [None]:
# Load the news CSV, parse and sort by date, keep 2016-01-01 <= date < 2019-03-01,
# and drop the `company` column.
txt_df = (
    pd.read_csv(TXT_DATA_FILE)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .loc[lambda frame: (frame['date'] < pd.Timestamp(2019, 3, 1))
                       & (frame['date'] >= pd.Timestamp(2016, 1, 1))]
    .drop(columns=['company'])
)

In [None]:
# Load the price CSV, parse the `Date` column and order the rows chronologically.
num_df = (
    pd.read_csv(NUM_DATA_FILE)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
)

In [None]:
# Chronological split of the price rows into three raw numpy arrays:
#   train: Date <  2018-09-01
#   dev:   2018-09-01 <= Date < 2019-01-01
#   test:  Date >= 2019-01-01
num_train = num_df[num_df['Date'] < pd.Timestamp(2018, 9, 1)].values  # train_set
num_dev = num_df[
    (num_df['Date'] >= pd.Timestamp(2018, 9, 1))
    & (num_df['Date'] < pd.Timestamp(2019, 1, 1))
].values  # development_set
num_test = num_df[num_df['Date'] >= pd.Timestamp(2019, 1, 1)].values  # test_set

In [None]:
# Sanity check: show the (rows, columns) of the filtered news frame and its first 10 rows.
print(txt_df.shape)
txt_df.head(10)

In [None]:
# Count the news items per date and plot them as a bar chart
# to see how densely the news covers the period.
attribute = 'text'
news_num_date = txt_df.groupby(txt_df['date']).count()
daily_counts = news_num_date[attribute]
plt.bar(daily_counts.index, daily_counts)
plt.xlabel('Day')
plt.ylabel('number')
plt.xticks(rotation=45)
plt.show()
# the counts are only needed for the figure
del news_num_date, daily_counts

In [None]:
# Clean one news text:
# drop possessives/apostrophes, lemmatize the words and remove non-letter characters.
def _clean_text(text):
    """Normalise a single news text before tokenisation.

    The value is converted to ``str`` and lower-cased, possessive ``'s`` suffixes
    and any remaining apostrophes are removed, every whitespace-separated token
    is lemmatized with ``WordNetLemmatizer``, and finally every character that
    is not an ASCII letter is replaced by a space.

    Args:
        text: the raw news text (any object; it is passed through ``str``).

    Returns:
        str: the cleaned text.
    """
    lemma = WordNetLemmatizer()
    text = str(text).lower()
    # Strings are immutable: the result of the substitution has to be re-bound.
    # (The original called text.replace(...) and discarded the result, so the
    # possessive "'s" later became a stray "s" token.)
    # Only a word-final "'s" is treated as a possessive, so an opening quote
    # such as 'smart' is left for the apostrophe removal below.
    text = re.sub(r"'s\b", '', text)
    text = text.replace('\'', '')
    # lemmatization of every whitespace-separated token
    text = ' '.join(lemma.lemmatize(w) for w in text.split())
    # TODO: digits are replaced by spaces here as well; decide how numbers should be handled.
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text

def clean_news(df):
    """Return the cleaned ``'text'`` entry of ``df``.

    ``df`` is indexed with the key ``'text'``; the notebook passes one
    DataFrame row at a time via ``DataFrame.apply(..., axis=1)``.
    """
    return _clean_text(df['text'])

In [None]:
# Clean every news text. Mapping over the column avoids building a Series per row
# (as DataFrame.apply(axis=1) does) and also works when the frame is empty.
txt_df['text'] = txt_df['text'].map(_clean_text)

In [None]:
# Convert the news DataFrame into a dict
# mapping: date (pd.Timestamp) -> list of news texts of that date
def df_to_dict(df, max_news_num=None):
    """Group the news texts by date and give every day the same number of items.

    Args:
        df: frame with a ``'date'`` and a ``'text'`` column.
        max_news_num: number of items every day ends up with. Defaults to the
            module constant ``MAX_NEWS_NUM`` (looked up at call time).

    Returns:
        dict: ``date -> list`` of exactly ``max_news_num`` texts, in the row
        order of ``df``. Days with fewer items are padded with empty strings;
        days with more items keep only the first ``max_news_num``.
    """
    if max_news_num is None:
        max_news_num = MAX_NEWS_NUM

    news_group_dict = dict()
    # zip over the two columns avoids the per-row Series that iterrows() builds
    for date, text in zip(df['date'], df['text']):
        news_group_dict.setdefault(date, list()).append(text)

    for news in news_group_dict.values():
        # drop the surplus items from the end (a random choice might be better)
        del news[max_news_num:]
        # pad short days with blank news
        news.extend([''] * (max_news_num - len(news)))
    return news_group_dict

In [None]:
# Chronological split of the news into three per-day dicts
# (same date boundaries as the price split):
#   train: date <  2018-09-01
#   dev:   2018-09-01 <= date < 2019-01-01
#   test:  date >= 2019-01-01
txt_test = df_to_dict(txt_df[txt_df['date'] >= pd.Timestamp(2019, 1, 1)])  # test_set
txt_dev = df_to_dict(txt_df[
    (txt_df['date'] < pd.Timestamp(2019, 1, 1))
    & (txt_df['date'] >= pd.Timestamp(2018, 9, 1))
])  # development_set
txt_train = df_to_dict(txt_df[txt_df['date'] < pd.Timestamp(2018, 9, 1)])  # train_set

In [None]:
# Turn text into integer sequences with Keras.
# The tokenizer is fitted on the training period only (date < 2018-09-01).
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(
    list(txt_df.loc[txt_df['date'] < pd.Timestamp(2018, 9, 1), 'text'])
)

def _text_to_sequences(alist):
    """Encode a list of texts with the fitted ``tokenizer``.

    Every text becomes a sequence of word indices that is padded to
    ``MAX_LEN`` entries; longer sequences are cut at the end
    (``truncating='post'``).

    Returns:
        The array produced by ``pad_sequences``, one row per text.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(alist),
        maxlen=MAX_LEN,
        truncating='post',
    )

def text_to_sequences_by_day(adict):
    """Replace every day's list of texts by its padded index sequences.

    The dict is modified in place and also returned. Calling this twice on
    the same dict would feed the already encoded arrays to the tokenizer again.
    """
    for date in list(adict):
        adict[date] = _text_to_sequences(adict[date])
    return adict

# Overwrite the per-day text lists with their padded index sequences.
# (text_to_sequences_by_day works in place and returns the same dict.)
txt_dev = text_to_sequences_by_day(txt_dev)
txt_test = text_to_sequences_by_day(txt_test)
txt_train = text_to_sequences_by_day(txt_train)
# The original cell bound the result to the misspelled name `text_train`;
# keep it as an alias in case other cells still reference it.
text_train = txt_train

In [None]:
from pandas.tseries.offsets import DateOffset
# Generate the structured model input with a sliding window over calendar days.
def get_x_seqs_by_sw(data_dict, days=DATE_INTERVAL):
    """Stack, for every target date, the news of the ``days`` preceding days.

    For each calendar date ``c_date`` from ``min(dates) + days`` to
    ``max(dates) + 1 day`` the window ``c_date - days .. c_date - 1`` is
    collected. Days without an entry in ``data_dict`` contribute an all-zero
    block of shape ``(MAX_NEWS_NUM, MAX_LEN)``.

    The original code ended the window at ``c_date - days``, so it covered
    ``c_date - 2*days + 1 .. c_date - days``. The first window then started
    before the first news date and the most recent ``days - 1`` days were
    never used. That contradicted both the min/max bounds chosen here and the
    earlier (disabled) implementation, which mapped the news of day ``d`` to
    the targets ``d + 1 .. d + days``.

    Args:
        data_dict: ``pd.Timestamp -> array`` of one day's encoded news
            (presumably shape ``(MAX_NEWS_NUM, MAX_LEN)`` - verify against caller).
        days: window length in days.

    Returns:
        dict: ``pd.Timestamp -> np.ndarray`` whose first axis has length
        ``days`` (oldest day first). Empty if ``data_dict`` is empty.
    """
    range_dict = dict()
    if not data_dict:
        return range_dict

    one_day = DateOffset(days=1)
    # first target whose whole window lies inside the data ...
    first_target = min(data_dict) + DateOffset(days=days)
    # ... and last target that can still see the most recent news day
    last_target = max(data_dict) + one_day
    # placeholder for a day without any news
    blank_day = np.zeros((MAX_NEWS_NUM, MAX_LEN))

    for c_date in pd.date_range(start=first_target, end=last_target):
        # the `days` calendar days immediately before c_date
        window = pd.date_range(end=c_date - one_day, periods=days)
        range_dict[c_date] = np.array([
            np.array(data_dict[p_date]) if p_date in data_dict else blank_day
            for p_date in window
        ])
    return range_dict

# ATTENTION: same as the function of the same name in 'Numerical'
def get_y(data_set):
    """Build two-element direction labels from consecutive rows.

    For every row except the first, element ``[1]`` is compared with element
    ``[1]`` of the previous row (presumably a price - verify against the CSV
    layout); element ``[0]`` of the row is used as the key.

    Returns:
        dict: ``row[0] -> [1, 0]`` when the relative change is ``<= 0``,
        otherwise ``[0, 1]``.
    """
    labels = dict()
    for prev_row, cur_row in zip(data_set[:-1], data_set[1:]):
        change = cur_row[1] / prev_row[1] - 1
        labels[cur_row[0]] = [1, 0] if change <= 0 else [0, 1]
    return labels

# ATTENTION: same as the function of the same name in 'Numerical'
def match_xy(x_dict, y_dict):
    """Pair up the entries of two dicts that share a key.

    Keys are visited in the iteration order of ``x_dict``; keys missing from
    ``y_dict`` are skipped.

    Returns:
        tuple: ``(x_arr, y_arr)`` - numpy arrays holding the matched values of
        ``x_dict`` and ``y_dict`` in the same order.
    """
    shared_keys = [key for key in x_dict if key in y_dict]
    x_arr = np.array([x_dict[key] for key in shared_keys])
    y_arr = np.array([y_dict[key] for key in shared_keys])
    return (x_arr, y_arr)

# Match x (news windows) with y (labels)
def get_xy_txt(news_data, num_data):
    """Return ``(x_arr, y_arr)`` for one data split.

    The news windows come from ``get_x_seqs_by_sw(news_data)``, the labels
    from ``get_y(num_data)``; ``match_xy`` keeps the dates present in both.
    """
    return match_xy(get_x_seqs_by_sw(news_data), get_y(num_data))

In [None]:
# Build the matched (x, y) arrays of the three splits.
x_train, y_train = get_xy_txt(txt_train, num_train)
x_test, y_test = get_xy_txt(txt_test, num_test)
x_dev, y_dev = get_xy_txt(txt_dev, num_dev)

# the baseline of word embedding
In the baseline, we simply use the pre-trained (GloVe) word-vector matrix as a frozen embedding layer, without any fine-tuning.

In [None]:
# use GloVe
EMB_FILE = "tool/GloVe/glove.42B.300d.txt"
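# NOTE: glove.840B cannot be used with this loader (presumably because some of its
# tokens contain spaces, which breaks the whitespace split below - TODO confirm).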
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, vector)``.

    Args:
        word: the token (first field of the line).
        *arr: the remaining fields, converted to a ``float32`` numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the embedding file into a dict: word -> float32 vector.
# The context manager closes the file handle (the original never closed it),
# the explicit encoding makes decoding independent of the platform default,
# and blank lines are skipped instead of crashing get_coefs().
with open(EMB_FILE, encoding='utf-8') as emb_file:
    emb_index = dict(
        get_coefs(*line.strip().split())
        for line in emb_file
        if line.strip()
    )

In [None]:
# Build the embedding matrix: one row per word index known to the tokenizer.
emb_size = 300
# np.stack requires a real sequence; a dict view is rejected by current NumPy,
# so materialise the vectors as a list first.
all_embs = np.stack(list(emb_index.values()))
emb_mean = all_embs.mean()
emb_std = all_embs.std()
word_index = tokenizer.word_index
ft_words = min(MAX_FEATURES, len(word_index))
# Words missing from the embedding file keep a random vector drawn from a
# normal distribution with the mean/std of all pre-trained values.
emb_matrix = np.random.normal(emb_mean, emb_std, (ft_words + 1, emb_size))
# Row 0 is zeroed: no word in word_index maps to it being overwritten below,
# and the all-zero placeholder days consist of index 0 only.
emb_matrix[0] = np.zeros(emb_size)
hit_count = 0
for word, i in word_index.items():
    if i > ft_words:
        continue
    emb_vector = emb_index.get(word)
    if emb_vector is not None:
        hit_count += 1
        emb_matrix[i] = emb_vector
# share of the kept vocabulary that has a pre-trained vector
hit_rate = hit_count / ft_words if ft_words else 0.0
print('Hit Rate is: ', hit_rate)

In [None]:
# A model that consists only of the frozen embedding layer; it is used purely as
# a lookup table via predict(), so optimizer and loss are placeholders.
emb_model = Sequential()
emb_model.add(Embedding(
    ft_words + 1,
    emb_size,
    weights=[emb_matrix],
    trainable=False,
    # The original `(,MAX_NEWS_NUM,MAX_LEN,)` was a SyntaxError. One sample
    # produced by get_x_seqs_by_sw has shape (days, news per day, words per
    # news), with days = DATE_INTERVAL by default.
    input_shape=(DATE_INTERVAL, MAX_NEWS_NUM, MAX_LEN),
))
emb_model.compile('rmsprop', 'mse')

In [None]:
# Replace the word indices of every split by their embedding vectors.
# NOTE: the inputs are overwritten, so this cell must be run exactly once
# after the x arrays have been built.
x_train = emb_model.predict(x_train)
x_dev = emb_model.predict(x_dev)
x_test = emb_model.predict(x_test)

# the baseline of news embedding
In the baseline, each news item is represented by the average of its word vectors.

In [None]:
def news_embedding_baseline(x_data):
    """Average ``x_data`` over its second-to-last axis.

    With word vectors on the last axis and the words of one news item on the
    axis before it, this yields one mean vector per news item.

    Returns:
        np.ndarray: ``x_data`` with the second-to-last axis removed.
    """
    word_axis = x_data.ndim - 2
    return np.mean(x_data, axis=word_axis)

In [None]:
# Collapse the word axis of every split into one vector per news item.
# NOTE: the inputs are overwritten, so this cell must be run exactly once.
x_train = news_embedding_baseline(x_train)
x_dev = news_embedding_baseline(x_dev)
x_test = news_embedding_baseline(x_test)