### Import libraries


In [1]:
# for data pre-processing 
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.corpus import stopwords 
#from sklearn.manifold import TSNE
#from gensim.models import KeyedVectors
import gensim.models
from gensim.utils import lemmatize
import nltk 
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

# for classification modelling 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import string
import sklearn.metrics as metrics
from datetime import datetime
from datetime import timedelta

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

### Import dataset (news headlines and financial data)

In [2]:
# import news headlines
df = pd.read_csv('finalData.csv')
df = df.drop(['Unnamed: 2','Unnamed: 3'], axis=1)

In [3]:
# import financial data
# include column based on today's close to tomorrow's open (binary classification)
stock_prices = pd.read_csv('AAPLfinal.csv')
stock_prices = stock_prices.drop(['Unnamed: 0'], axis=1)
stock_prices["Date"] = pd.to_datetime(stock_prices["Date"])

for i in range(stock_prices.shape[0]-1):
    if stock_prices['Close'][i]<stock_prices['Open'][i+1]:
        stock_prices.loc[i,'response_variable']= 1
    else:
        stock_prices.loc[i,'response_variable']= -1


stock_prices=stock_prices[:stock_prices.shape[0]-1] # remove the last row, no target class

In [4]:
#stock_prices = stock_prices.drop(['Open','High','Low','Adj.Close'], axis=1)
stock_prices.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj.Close,Volume,Revenue,Changes.in.working.capital,Dividend.paid,Net.changes.in.cash,response_variable
0,2016-12-01,110.370003,110.940002,109.029999,109.489998,104.344177,37086900,46852,4127,-3902,2247,-1.0
1,2016-12-02,109.169998,110.089996,108.849998,109.900002,104.734901,26528000,46852,4127,-3902,2247,1.0
2,2016-12-05,110.0,110.029999,108.25,109.110001,103.982048,34324500,46852,4127,-3902,2247,1.0
3,2016-12-06,109.5,110.360001,109.190002,109.949997,104.782547,26195500,46852,4127,-3902,2247,-1.0
4,2016-12-07,109.260002,111.190002,109.160004,111.029999,105.811821,29998700,46852,4127,-3902,2247,-1.0


In [5]:
# feature selection 
df_selection = stock_prices.copy()
Y_selection = df_selection["response_variable"].tolist() # response variable 
final_df_selection = df_selection.drop(["response_variable","Date"], axis=1)
X_selection = final_df_selection.values.tolist() # regressor variables
X_selection, Y_selection = shuffle(X_selection, Y_selection, random_state=42)

In [6]:
# run random forest model for feature selection
x_train = []
y_train = []
x_test = []
y_test = []
x_train, x_test, y_train, y_test = train_test_split(X_selection, Y_selection, test_size=0.2, random_state=42)
rfc = RandomForestClassifier(random_state=32)
rfc.fit(x_train,y_train)

RandomForestClassifier(random_state=32)

In [7]:
# obtain features importance
feature_importance = rfc.feature_importances_
indices = np.argsort(feature_importance)[::-1]

for f in range(len(stock_prices.columns)-2): # minus 2, one for response variable, one for date
    print("%d. feature %s (%f)" % (f + 1, stock_prices.columns[indices[f]+1], feature_importance[indices[f]]))

1. feature Volume (0.175800)
2. feature Open (0.153157)
3. feature High (0.151942)
4. feature Close (0.149763)
5. feature Low (0.146649)
6. feature Adj.Close (0.140836)
7. feature Net.changes.in.cash (0.022347)
8. feature Dividend.paid (0.020834)
9. feature Revenue (0.019720)
10. feature Changes.in.working.capital (0.018951)


In [8]:
# select the top __ features based on the features importance
number = 3 # number of features
temp_stock_prices = stock_prices.copy()
for i in range(number, len(stock_prices.columns)-2):
    stock_prices = stock_prices.drop([str(temp_stock_prices.columns[indices[i]+1])], axis=1)
    
stock_prices.head()    

Unnamed: 0,Date,Open,High,Volume,response_variable
0,2016-12-01,110.370003,110.940002,37086900,-1.0
1,2016-12-02,109.169998,110.089996,26528000,1.0
2,2016-12-05,110.0,110.029999,34324500,1.0
3,2016-12-06,109.5,110.360001,26195500,-1.0
4,2016-12-07,109.260002,111.190002,29998700,-1.0


# Feature engineering 

1. VADER 
2. Word Embedding (Word2Vec)

### VADER

In [9]:
df_jj = df.copy()
df_jj = df_jj.groupby(['Date'])['Header'].apply(lambda x: ','.join(x.astype(str))).reset_index() # combine rows with the same data
df_jj["Date"] = pd.to_datetime(df_jj["Date"])
df_jj = df_jj.sort_values("Date")
df_jj = df_jj.reset_index().drop(["index"], axis=1)

In [10]:
ps=PorterStemmer()
lemmatizer=WordNetLemmatizer()
SA=SentimentIntensityAnalyzer()
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\edmun\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
stop_words=set(stopwords.words("english"))

###for headers
for i in range(df_jj.shape[0]):
  tokens=[]
  filtered_tokens=[]
  final = " "
  if(type(df_jj.loc[i,'Header'])==float):
    df_jj.loc[i,'filtered_header'] =final
  else:
    tokens.append((word_tokenize(df_jj.loc[i,'Header'])))
    #filter &stem tokens
    for w in tokens[0]:
      if w not in stop_words:
        result=pos_tag([w])
        if result[0][1].startswith('J'):
            tag=wordnet.ADJ
        elif result[0][1].startswith('V'):
            tag=wordnet.VERB
        elif result[0][1].startswith('N'):
            tag=wordnet.NOUN
        elif result[0][1].startswith('R'):
            tag=wordnet.ADV
        else:
            tag=wordnet.NOUN
        filtered_tokens.append(lemmatizer.lemmatize(word=result[0][0],pos=tag))                      
    for j in filtered_tokens:
      final+=j
      final+= " "
    df_jj.loc[i,'filtered_header'] =final

#stop_words.add() # remove those lines at the top using this / clean the data before running

"""
###for text
for i in range(#):
  tokens=[]
  filtered_tokens=[]
  final = " "
  if(type(news.loc[i,'Text'])==float):
    news.loc[i,'filtered_text'] =final
  else:
    tokens.append((word_tokenize(news.loc[i,'Text'])))
    #filter &stem tokens
    for w in tokens[0]:
      if w not in stop_words:
        result=pos_tag([w])
        if result[0][1].startswith('J'):
            tag=wordnet.ADJ
        elif result[0][1].startswith('V'):
            tag=wordnet.VERB
        elif result[0][1].startswith('N'):
            tag=wordnet.NOUN
        elif result[0][1].startswith('R'):
            tag=wordnet.ADV
        else:
            tag=wordnet.NOUN
        filtered_tokens.append(lemmatizer.lemmatize(word=result[0][0],pos=tag))                      
    for j in filtered_tokens:
      final+=j
      final+= " "
    print(final)
    news.loc[i,'filtered_text'] =final
"""

'\n###for text\nfor i in range(#):\n  tokens=[]\n  filtered_tokens=[]\n  final = " "\n  if(type(news.loc[i,\'Text\'])==float):\n    news.loc[i,\'filtered_text\'] =final\n  else:\n    tokens.append((word_tokenize(news.loc[i,\'Text\'])))\n    #filter &stem tokens\n    for w in tokens[0]:\n      if w not in stop_words:\n        result=pos_tag([w])\n        if result[0][1].startswith(\'J\'):\n            tag=wordnet.ADJ\n        elif result[0][1].startswith(\'V\'):\n            tag=wordnet.VERB\n        elif result[0][1].startswith(\'N\'):\n            tag=wordnet.NOUN\n        elif result[0][1].startswith(\'R\'):\n            tag=wordnet.ADV\n        else:\n            tag=wordnet.NOUN\n        filtered_tokens.append(lemmatizer.lemmatize(word=result[0][0],pos=tag))                      \n    for j in filtered_tokens:\n      final+=j\n      final+= " "\n    print(final)\n    news.loc[i,\'filtered_text\'] =final\n'

In [12]:
###for headers
for i in range(df_jj.shape[0]):
    if df_jj['filtered_header'][i] == " ":
        pol_score=0
    else:
        pol_score= SA.polarity_scores(df_jj['filtered_header'][i])

    df_jj.loc[i,'filtered_header_score']= pol_score.get('compound') # raw score
    df_jj.loc[i,'filtered_header_sentiments']=0

    if pol_score.get('compound')>0.1:
        df_jj.loc[i,'filtered_header_sentiments']=1
    elif pol_score.get('compound')<-0.1:
        df_jj.loc[i,'filtered_header_sentiments']=-1
        
df_jj_final = df_jj.drop(["Header","filtered_header","filtered_header_score"],axis=1) #  remove unneccessary columns, only need the date and filtered sentiment score

In [13]:
stock_prices = stock_prices.merge(df_jj_final,on="Date")
stock_prices.head()

Unnamed: 0,Date,Open,High,Volume,response_variable,filtered_header_sentiments
0,2016-12-12,113.290001,115.0,26374400,1.0,0.0
1,2016-12-13,113.839996,115.919998,43733800,-1.0,0.0
2,2016-12-14,115.040001,116.199997,34031800,1.0,-1.0
3,2016-12-16,116.470001,116.5,44351100,-1.0,-1.0
4,2016-12-19,115.800003,117.379997,27779400,1.0,0.0


### Word Embedding (Word2Vec)

In [14]:
df_ed = df.copy()
df_ed = df_ed.drop(["Date"],axis=1)

In [15]:
def operation_to_token(x):
    # tokenize each headline
    tokens = word_tokenize(x)
    #tokens=[wd.decode('utf-8').split('/')[0] for wd in lemmatize(x)]
    # make all lowercase
    tokens=[token.lower() for token in tokens]
    # remove punctuation
    words = [word for word in tokens if (word.isalpha() and word!='s' and word!='t')]
    # remove stop words 
    stop_words = stopwords.words("english")
    words = [w for w in words if not w in stop_words]
    # lemmatize
    return words    

In [16]:
new_data = df_ed["Header"].apply(operation_to_token)
all_sentences = []
for text in new_data:
    all_sentences.append(text)

In [17]:
model = gensim.models.Word2Vec(all_sentences, min_count=1,size=10,workers=4, window=3,sg=1) # building word embedding model 

In [131]:
model.most_similar("apple")

  """Entry point for launching an IPython kernel.


[('new', 0.9988701939582825),
 ('reality', 0.9986035823822021),
 ('gadfly', 0.9985471963882446),
 ('iphone', 0.9982105493545532),
 ('tax', 0.9979653358459473),
 ('eu', 0.9978306889533997),
 ('tech', 0.9978207945823669),
 ('could', 0.9977580904960632),
 ('jobs', 0.9975736141204834),
 ('million', 0.9975494146347046)]

In [21]:
def get_vectors(x):
    
    length = len(x)
    temp_vector = model[x[0]]
    temp_vector = temp_vector.copy()
    for i in range(1,length):
        temp_vector+=model[x[i]]
    return temp_vector
    #return temp_vector/length
    

In [23]:
# clean dataset, combine str with same date index
df_ed = df.copy()
df_ed = df_ed.groupby(['Date'])['Header'].apply(lambda x: ','.join(x.astype(str))).reset_index()
df_ed["Date"] = pd.to_datetime(df_ed["Date"])
df_ed = df_ed.sort_values("Date")
df_ed = df_ed.reset_index().drop(["index"], axis=1)

df_ed["tokens"] = df_ed["Header"].apply(operation_to_token)
df_ed["embedding"] = df_ed["tokens"].apply(get_vectors)
#df_ed['check_nan'] = df_ed['embedding'].isnull()
#df_ed = df_ed[df_ed['check_nan'] == False]

df_ed_final = pd.DataFrame(df_ed["embedding"].to_list(), columns=list(range(1,11)))
df_ed_final.insert(0,"Date",df_ed["Date"])

'''
df_ed["tokens"] = df_ed["Header"].apply(operation_to_token)
df_ed["embedding"] = df_ed["tokens"].apply(get_vectors)
df_ed_final = pd.DataFrame(df_ed["embedding"].to_list(), columns=['coordinate_1', 'coordinate_2','coordinate_3','coordinate_4','coordinate_5','coordinate_6', 'coordinate_7','coordinate_8','coordinate_9','coordinate_10'])
df_ed_final.insert(0,"Date",df_ed["Date"])
'''

  after removing the cwd from sys.path.
  import sys


'\ndf_ed["tokens"] = df_ed["Header"].apply(operation_to_token)\ndf_ed["embedding"] = df_ed["tokens"].apply(get_vectors)\ndf_ed_final = pd.DataFrame(df_ed["embedding"].to_list(), columns=[\'coordinate_1\', \'coordinate_2\',\'coordinate_3\',\'coordinate_4\',\'coordinate_5\',\'coordinate_6\', \'coordinate_7\',\'coordinate_8\',\'coordinate_9\',\'coordinate_10\'])\ndf_ed_final.insert(0,"Date",df_ed["Date"])\n'

In [66]:
final_df_full = stock_prices.merge(df_ed_final,on="Date")
#final_df_full = stock_prices.copy()
final_df_full.head()

Unnamed: 0,Date,Open,High,Volume,response_variable,filtered_header_sentiments,1,2,3,4,5,6,7,8,9,10
0,2016-12-12,113.290001,115.0,26374400,1.0,0.0,-1.631669,-1.135485,-0.934448,-0.548317,2.137584,0.384222,0.590332,0.197387,-1.625826,3.431614
1,2016-12-13,113.839996,115.919998,43733800,-1.0,0.0,-2.054861,-1.312573,-0.983634,-0.847616,2.698654,0.562702,0.820525,0.241145,-1.921839,4.282112
2,2016-12-14,115.040001,116.199997,34031800,1.0,-1.0,-2.25212,-1.525895,-1.139843,-0.912548,2.972388,0.542293,0.923129,0.352002,-2.09227,4.723604
3,2016-12-16,116.470001,116.5,44351100,-1.0,-1.0,-1.377172,-0.86922,-0.627053,-0.620866,1.923679,0.326583,0.651395,0.07815,-1.31163,2.924962
4,2016-12-19,115.800003,117.379997,27779400,1.0,0.0,-1.559828,-1.134874,-0.766531,-0.629012,2.249535,0.379297,0.713715,0.207526,-1.593646,3.515619


# Building Classification Model

1. Support-Vector Machine
2. Random Forest 
3. K-Nearest Neighbor

In [67]:
# convert regressors and response into list format 
Y = final_df_full["response_variable"].tolist() # response variable 
final_df_full = final_df_full.drop(["response_variable","Date"], axis=1)
X = final_df_full.values.tolist() # regressor variables
X, Y = shuffle(X, Y)

### Support-Vector Machine

In [122]:
x_train = []
y_train = []
x_test = []
y_test = []
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

scaling = MinMaxScaler(feature_range=(-1,1)).fit(x_train)
x_train = scaling.transform(x_train)
x_test = scaling.transform(x_test)
# y_train = y_train.reshape(90,1)
# y_test = y_test.reshape(10,1)
clf = SVC(kernel='linear')
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("============== Support-Vector Machine ===============")
#print("Confusion Matrix:")
#print(confusion_matrix(y_test, y_pred))
#print('\n')
print("Classification Report:")
print(classification_report(y_test, y_pred,zero_division=0))
print('Accuracy score: ' + str(accuracy_score(y_test,y_pred)))
print("=====================================================")

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00        37
         1.0       0.65      1.00      0.79        70

    accuracy                           0.65       107
   macro avg       0.33      0.50      0.40       107
weighted avg       0.43      0.65      0.52       107

Accuracy score: 0.6542056074766355


### Random Forest

In [125]:
x_train = []
y_train = []
x_test = []
y_test = []
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
y_pred = rfc.predict(x_test)
temp=accuracy_score(y_test,y_pred)

print("================== Random Forest ===================")
#print("Confusion Matrix:")
#print(confusion_matrix(y_test, y_pred))
#print('\n')
print("Classification Report:")
print(classification_report(y_test, y_pred,zero_division=0))
print('Accuracy score: ' + str(accuracy_score(y_test,y_pred)))
print("=====================================================")

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.57      0.57      0.57        49
         1.0       0.64      0.64      0.64        58

    accuracy                           0.61       107
   macro avg       0.60      0.60      0.60       107
weighted avg       0.61      0.61      0.61       107

Accuracy score: 0.6074766355140186


### K Nearest Neighbor

In [129]:
x_train = []
y_train = []
x_test = []
y_test = []
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
temp=accuracy_score(y_test,y_pred)

print("================ K-Nearest Neighbor =================")
#print("Confusion Matrix:")
#print(confusion_matrix(y_test, y_pred))
#print('\n')
print("Classification Report:")
print(classification_report(y_test, y_pred))
print('Accuracy score: ' + str(accuracy_score(y_test,y_pred)))
print("=====================================================")

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.53      0.66      0.59        35
         1.0       0.81      0.72      0.76        72

    accuracy                           0.70       107
   macro avg       0.67      0.69      0.68       107
weighted avg       0.72      0.70      0.71       107

Accuracy score: 0.7009345794392523


### Logistic Regression

In [128]:
x_train = []
y_train = []
x_test = []
y_test = []
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

logr=LogisticRegression(solver='saga',max_iter=800,multi_class='ovr')
logr.fit(x_train,y_train)
y_pred = logr.predict(x_test)
temp=accuracy_score(y_test,y_pred)

print("================ Logistic Regression ================")
#print("Confusion Matrix:")
#print(confusion_matrix(y_test, y_pred))
#print('\n')
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print('Accuracy score: ' + str(accuracy_score(y_test,y_pred)))
print("=====================================================")





Classification Report:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00        32
         1.0       0.70      1.00      0.82        75

    accuracy                           0.70       107
   macro avg       0.35      0.50      0.41       107
weighted avg       0.49      0.70      0.58       107

Accuracy score: 0.7009345794392523




# The End 