In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')


### Load datasets for Apple stock prices, news articles and tweets about Apple

In [2]:
# Apple (AAPL) price table; its 'Date' and 'Open' columns seed the result frame later on.
df = pd.read_csv('./datasets/AAPL.csv')
# News and tweet tables; the 'News' / 'Tweets' text of each row is later split on "|"
# into individual items, and the two tables are paired row by row.
final_news = pd.read_csv('./datasets/news_clean.csv')
final_tweets = pd.read_csv('./datasets/tweets_clean.csv')

### Load the saved vectorizer

In [None]:
# Restore the previously saved vectorizer (presumably a fitted TF-IDF vectorizer, going
# by the file name). joblib's `load` unpickles the file, so only open artifacts you trust.
vectorizer = load('./models/tfidf.xz')

### The getSentiment function uses a given model to classify the tweets and news as positive or negative and then returns the avg value of the news and tweets

For example if there are 3 news articles and 2/3 are positive, the sentiment score for the news articles for that date becomes 0.6667

Similarly if there are 20 tweets and 15 of them are positive, the sentiment score for the tweets for that date becomes 0.75

So the average sentiment score for the day is 0.708, which is an average of the sentiment scores for the tweets and news

In [4]:
# Seed the per-day result table with the date and opening price columns.
df_sent = df[['Date', 'Open']].copy()
def _positive_share(texts, clf):
    """Return the fraction of ``texts`` that ``clf`` labels as positive.

    The texts are transformed with the module-level ``vectorizer`` and every
    prediction is coerced with ``int`` before summing, so a label of 1 counts
    as one positive item and a label of 0 counts as none.
    """
    labels = clf.predict(vectorizer.transform(texts))
    return sum(int(label) for label in labels) / len(texts)


def getSentiment(classifier=None):
    """Score each row's news and tweets and write the results into ``df_sent``.

    Rows of ``final_news['News']`` and ``final_tweets['Tweets']`` are paired
    positionally (``zip`` stops at the shorter of the two). Each cell is split
    on "|" into individual texts, and the share of texts classified as positive
    becomes that row's score.

    Parameters
    ----------
    classifier : object with a ``predict`` method, optional
        Model used for classification. When omitted, the module-level ``model``
        variable is used, which keeps existing ``getSentiment()`` calls working.

    Returns
    -------
    pandas.DataFrame
        The module-level ``df_sent`` frame, updated in place with the columns
        'News', 'Tweets' and 'Avg' (row-wise mean of the two scores).
    """
    # Fall back to the notebook-global model only when no classifier is passed.
    clf = model if classifier is None else classifier

    news_scores = []
    tweet_scores = []
    for news_blob, tweet_blob in zip(final_news['News'], final_tweets['Tweets']):
        news_scores.append(_positive_share(news_blob.split("|"), clf))
        tweet_scores.append(_positive_share(tweet_blob.split("|"), clf))

    df_sent['News'] = news_scores
    df_sent['Tweets'] = tweet_scores
    df_sent['Avg'] = df_sent[['Tweets', 'News']].mean(axis=1)
    return df_sent

## Load each model, get the avg sentiment score, and then save the output to a csv file

### 1. Linear regression

In [5]:
# NOTE: pickle.load can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle as soon as the model has been read.
with open('./models/linear_regression.sav', 'rb') as model_file:
    model = pickle.load(model_file)
getSentiment()  # fills df_sent using the global `model` loaded above
df_sent.to_csv('./datasets/AAPL_regression.csv', index=False)

### 2. SVM

In [7]:
# NOTE: pickle.load can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle as soon as the model has been read.
with open('./models/linear_svc.sav', 'rb') as model_file:
    model = pickle.load(model_file)
getSentiment()  # fills df_sent using the global `model` loaded above
df_sent.to_csv('./datasets/AAPL_svc.csv', index=False)

### 3. Naive Bayes

In [8]:
# NOTE: pickle.load can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle as soon as the model has been read.
with open('./models/multinomial_NB.sav', 'rb') as model_file:
    model = pickle.load(model_file)
getSentiment()  # fills df_sent using the global `model` loaded above
df_sent.to_csv('./datasets/AAPL_nb.csv', index=False)

### 4. LSTM

In [9]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [10]:
# Training corpus; its 'text' column is used further down to fit the Keras tokenizer.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (superseded by `on_bad_lines='skip'`) — confirm which pandas version this notebook targets.
df_train = pd.read_csv("./datasets/clean_sent_160k_train.csv",low_memory=False,error_bad_lines=False)
df_train.head()

Unnamed: 0,sentiment,text
0,0,thats bummer shoulda got david carr third day
1,0,upset cant update facebook texting might cry r...
2,0,dived many times ball managed save 50 rest go ...
3,0,whole body feels itchy like fire
4,0,behaving im mad cant see


In [11]:
# Discard rows containing missing values and renumber the index from zero.
df_train = df_train.dropna().reset_index(drop=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1592046 entries, 0 to 1592045
Data columns (total 2 columns):
sentiment    1592046 non-null int64
text         1592046 non-null object
dtypes: int64(1), object(1)
memory usage: 24.3+ MB


In [12]:
# Keras tokenizer created with num_words=20000 (presumably a cap on the vocabulary
# used when texts are converted to sequences — confirm against the Keras docs).
tokenizer = Tokenizer(num_words=20000)

# Sequence length (in tokens) for the LSTM inputs; it must agree with the padding
# length used in getSentimentLSTM.
max_words = 20

# Build the tokenizer's word index from the training texts.
tokenizer.fit_on_texts(df_train['text'].values)

### The input format for LSTM is different from the other models, so it requires a different getSentiment method

In [30]:
def _lstm_positive_share(texts, net, maxlen):
    """Return the fraction of ``texts`` whose predicted score exceeds 0.5.

    The texts are converted to integer sequences with the module-level
    ``tokenizer``, padded to ``maxlen`` with zeros, and scored one sample per
    batch, matching how the notebook has always called ``predict``.
    """
    padded = pad_sequences(tokenizer.texts_to_sequences(texts),
                           maxlen=maxlen, dtype='int32', value=0)
    scores = net.predict(padded, batch_size=1, verbose=2)
    # Each element of `scores` is one sample's output; above 0.5 counts as positive.
    return sum(1 for score in scores if score > 0.5) / len(padded)


def getSentimentLSTM(lstm_model=None, maxlen=None):
    """Score each row's news and tweets with the LSTM and update ``df_sent``.

    Rows of ``final_news['News']`` and ``final_tweets['Tweets']`` are paired
    positionally (``zip`` stops at the shorter of the two). Each cell is split
    on "|" into individual texts, and the share of texts scored above 0.5
    becomes that row's score.

    Parameters
    ----------
    lstm_model : object with a Keras-style ``predict`` method, optional
        Model used for scoring. When omitted, the module-level ``model``
        variable is used, which keeps existing ``getSentimentLSTM()`` calls
        working.
    maxlen : int, optional
        Padding length for the token sequences. Defaults to the notebook's
        ``max_words`` constant, so the value is defined in one place instead
        of being repeated here as a literal.

    Returns
    -------
    pandas.DataFrame
        The module-level ``df_sent`` frame, updated in place with the columns
        'News', 'Tweets' and 'Avg' (row-wise mean of the two scores).
    """
    # Fall back to the notebook-global model / sequence length when not given.
    net = model if lstm_model is None else lstm_model
    if maxlen is None:
        maxlen = max_words

    news_scores = []
    tweet_scores = []
    for news_blob, tweet_blob in zip(final_news['News'], final_tweets['Tweets']):
        news_scores.append(_lstm_positive_share(news_blob.split("|"), net, maxlen))
        tweet_scores.append(_lstm_positive_share(tweet_blob.split("|"), net, maxlen))

    df_sent['News'] = news_scores
    df_sent['Tweets'] = tweet_scores
    df_sent['Avg'] = df_sent[['Tweets', 'News']].mean(axis=1)
    return df_sent

In [31]:
# NOTE: pickle.load can execute arbitrary code, so only load model files you trust.
# NOTE(review): this loads what appears to be a Keras model through pickle — confirm the
# artifact was really written with pickle rather than Keras' own save format.
# The context manager closes the file handle as soon as the model has been read.
with open('./models/LSTM.sav', 'rb') as model_file:
    model = pickle.load(model_file)
getSentimentLSTM()  # fills df_sent using the global `model` loaded above
df_sent.to_csv('./datasets/AAPL_LSTM.csv', index=False)
# df_sent.describe()

Unnamed: 0,Open,News,Tweets,Avg
count,2770.0,2770.0,2770.0,2770.0
mean,86.103062,0.400624,0.542588,0.471606
std,52.221889,0.311516,0.127341,0.167283
min,11.341429,0.0,0.0,0.045455
25%,43.960716,0.166667,0.45,0.342857
50%,80.154999,0.4,0.55,0.466667
75%,116.7875,0.571429,0.623512,0.575
max,230.779999,1.0,0.95,0.925


In [33]:
# Preview the first rows of the sentiment table as left by the most recent scoring run.
df_sent.head()

Unnamed: 0,Date,Open,News,Tweets,Avg
0,2008-02-01,19.462856,0.333333,0.65,0.491667
1,2008-02-04,19.172857,0.0,0.25,0.125
2,2008-02-05,18.632856,0.0,0.4,0.2
3,2008-02-06,18.690001,0.0,0.65,0.325
4,2008-02-07,17.138571,0.75,0.5,0.625
