In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# import the libraries used to clean the news text
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, TimeDistributed, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Input CSV locations, given as relative paths.
TXT_DATA_FILE = 'data2/news/output_GOOGL.csv'  # news CSV; read into txt_df with pd.read_csv
NUM_DATA_FILE = 'data2/news/stockPrices_GOOGL.csv'  # presumably the matching stock prices; not read in this part of the notebook

In [None]:
MAX_FEATURES = 10000  # passed to the Keras Tokenizer as num_words
MAX_LEN = 30  # passed to pad_sequences as maxlen (length of every padded/truncated sequence)

In [None]:
# Load the news CSV, parse the dates, order the rows chronologically, keep
# only 2016-01-01 <= date < 2019-02-01 and drop the 'company' column.
txt_df = pd.read_csv(TXT_DATA_FILE)
txt_df = txt_df.assign(date=pd.to_datetime(txt_df['date'])).sort_values('date')
txt_df = txt_df[
    (txt_df['date'] >= pd.Timestamp(2016, 1, 1))
    & (txt_df['date'] < pd.Timestamp(2019, 2, 1))
]
txt_df = txt_df.drop(columns=['company'])

In [None]:
# Sanity check after loading: print (rows, columns), then display the first 10 rows.
print(txt_df.shape)
txt_df.head(10)

In [None]:
# Count the news items published on each date and draw them as a bar chart,
# to check how densely the news covers the period.
attribute = 'text'
news_num_date = txt_df.groupby('date')[attribute].count()
plt.bar(news_num_date.index, news_num_date)
plt.xlabel('Day')
plt.ylabel('number')
plt.xticks(rotation=45)
plt.show()
del news_num_date  # the per-date counts are only needed for this plot

In [None]:
# clean the news text:
# replace non-letter characters with spaces and lemmatize the words
def _clean_text(text):
    lemma=WordNetLemmatizer()
    text=str(text)
    text=re.sub('[^a-zA-Z\-\']', ' ',text)  # How to deal with 'NUMBER'?
    #text=[lemma.lemmatize(w) for w in word_tokenize(text)] 
    text=[lemma.lemmatize(w) for w in text.lower().split()]  # 词性还原
    text=' '.join(text)
    return text

def clean_news(df):
    """Clean the ``'text'`` field of a single record.

    Intended for row-wise use such as ``DataFrame.apply(clean_news, axis=1)``.

    Args:
        df: Mapping-like record (e.g. one DataFrame row) with a ``'text'`` key.

    Returns:
        The value of ``df['text']`` after ``_clean_text``.
    """
    return _clean_text(df['text'])

In [None]:
# Clean every news text. Mapping over the column avoids building a Series per
# row (as DataFrame.apply(axis=1) does) and still yields a Series when the
# frame is empty.
txt_df['text'] = txt_df['text'].map(_clean_text)

In [None]:
# convert the DataFrame into a dict
# dict: pd.Timestamp -> list of the news texts for that date
def df_to_dict(df):
    """Group the ``'text'`` values of ``df`` by their ``'date'`` value.

    Args:
        df: DataFrame with a ``'date'`` and a ``'text'`` column.

    Returns:
        dict: Maps each distinct date to the list of texts on that date.
        Keys appear in order of first occurrence in ``df`` and each list
        keeps the row order of ``df``.
    """
    news_group_dict = dict()
    # Iterate the two columns directly rather than with iterrows(), which
    # builds a Series for every row.
    for date, text in zip(df['date'], df['text']):
        news_group_dict.setdefault(date, []).append(text)
    return news_group_dict

In [None]:
# Divide the data into three groups by date:
#   train: date <  2018-09-01
#   dev:   2018-09-01 <= date < 2019-01-01
#   test:  date >= 2019-01-01
txt_train = df_to_dict(txt_df[txt_df['date'] < pd.Timestamp(2018, 9, 1)])  # train_set
txt_dev = df_to_dict(
    txt_df[
        (txt_df['date'] >= pd.Timestamp(2018, 9, 1))
        & (txt_df['date'] < pd.Timestamp(2019, 1, 1))
    ]
)  # development_set
txt_test = df_to_dict(txt_df[txt_df['date'] >= pd.Timestamp(2019, 1, 1)])  # test_set

In [None]:
# Turn text into integer sequences with Keras.
# The tokenizer is fitted only on texts dated before 2018-09-01 (the training
# period), so dev/test texts do not contribute to the vocabulary.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(
    list(txt_df.loc[txt_df['date'] < pd.Timestamp(2018, 9, 1), 'text'])
)

def _text_to_sequences(alist):
    """Convert texts to fixed-length integer sequences.

    The texts are encoded with the module-level ``tokenizer`` and then passed
    to ``pad_sequences`` with ``maxlen=MAX_LEN``; sequences longer than that
    are cut at the end (``truncating='post'``). The padding side is left at
    the ``pad_sequences`` default.

    Args:
        alist: List of texts.

    Returns:
        The result of ``pad_sequences`` for the encoded texts.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(alist),
        maxlen=MAX_LEN,
        truncating='post',
    )

def text_to_sequences_by_day(adict):
    """Replace each day's list of texts with its padded sequences, in place.

    Args:
        adict: Dict mapping a date to a list of texts. It is modified in
            place: every value is replaced by ``_text_to_sequences(value)``.

    Returns:
        The same ``adict`` object, after modification.
    """
    # Only existing keys are reassigned, so the dict keeps its size while it
    # is being iterated.
    for day in adict:
        adict[day] = _text_to_sequences(adict[day])
    return adict

# Overwrite each split with its sequence form. text_to_sequences_by_day
# modifies the dict in place and returns the same object.
txt_dev = text_to_sequences_by_day(txt_dev)
txt_test = text_to_sequences_by_day(txt_test)
txt_train = text_to_sequences_by_day(txt_train)
# The training result used to be bound only to the misspelled name
# `text_train`; keep that name as an alias in case later cells refer to it.
text_train = txt_train

In [None]:
from pandas.tseries.offsets import DateOffset
def get_x_seqs_by_sw(data_dict, days=3):
    """Collect each day's news under the ``days`` dates that follow it.

    For every ``date`` in ``data_dict`` its value is attached to the ``days``
    consecutive calendar dates starting the day after ``date``. The entry for
    a target date therefore holds the values of those of the preceding
    ``days`` dates that are present in ``data_dict``; the target date's own
    value is not included.

    Args:
        data_dict: Dict mapping a ``pd.Timestamp`` to that day's news
            (e.g. the per-day arrays from ``text_to_sequences_by_day``).
        days: Width of the sliding window, in days.

    Returns:
        dict: Maps each target date to a ``np.ndarray`` with one entry per
        contributing day, in ``data_dict`` iteration order. When all
        contributing days have the same shape they are stacked into one
        regular array; otherwise a 1-D ``object`` array holding each day's
        value is returned.
    """
    # with sliding_window
    range_dict = dict()
    for date, news_list in data_dict.items():
        for n_date in pd.date_range(start=date + DateOffset(days=1), periods=days):
            range_dict.setdefault(n_date, []).append(news_list)
    for key, window in range_dict.items():
        if len({np.shape(item) for item in window}) > 1:
            # Days with different shapes cannot be stacked, and np.array()
            # on such a list raises ValueError in recent NumPy releases.
            # Build the object array explicitly instead.
            ragged = np.empty(len(window), dtype=object)
            for i, item in enumerate(window):
                ragged[i] = item
            range_dict[key] = ragged
        else:
            range_dict[key] = np.array(window)
    return range_dict