In [12]:
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import naive_bayes, svm, metrics
import pandas as pd
# Reset the column-width display option to its pandas default, so that running
# all cells always starts from the same display settings.
pd.reset_option('display.max_colwidth')

#### Load dataset and group by day (all tweets and corresponding stock prices)

In [3]:
# Load the merged dataset (tweets plus the stock prices of the same day) from JSON.
data = pd.read_json('processed_data/data_merged.json')
# Kept for reference only: removes columns that were unexpectedly generated during saving.
# data.drop(columns=['level_0', 'index'], inplace=True)
# Preview the last three rows.
data.tail(3)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
4946,2018-06-04,"[Tesla, Model3pic]",Report: Tesla Has Refunded 23% of Model 3 Depo...,Tesla Motors Club,91,13,13,294.34,296.74,True
4947,2018-06-04,"[Tesla, NikolaTeslapic]",Make electricity free again! #Tesla #NikolaTes...,Mr A 🎶🎵 ♏,2,1,1,294.34,296.74,True
4948,2018-06-04,"[Tesla, ElonMusk, CFD, forex, equitypic]",#Tesla #ElonMusk #CFD #forex #equitypic.twitte...,The Utkarsh,2,1,1,294.34,296.74,True


In [4]:
# Group the rows by their 'timestamp' value, i.e. one group per day.
# NOTE(review): as_index=False is expected to keep 'timestamp' as a regular column
# in aggregated results (as in the recorded output) -- confirm for the pandas version in use.
daily_data = data.groupby(data['timestamp'], as_index=False)
# Display the first non-null entry of each column within every group.
daily_data.first()

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,"[Tesla, ModelS]","In the past 2 years, I've driven 18,823 miles ...",Ben Sullins 💪,110,6,10,312.00,320.53,True
1,2018-01-03,[Tesla],Día de piernas ... Estrenando mallas...#Tesla ...,El CaZador,107,3,4,321.00,317.25,False
2,2018-01-04,"[Innovation, Tesla, electricvehicles, Cars, br...",This is awesome! New brand technology #Innovat...,Gabriela Mascaró,6,1,3,312.87,314.62,True
3,2018-01-05,"[Tesla, TeslaModel3]",#Tesla #TeslaModel3 hahapic.twitter.com/DMlxOf...,WtaFiGO,19,1,7,316.62,316.58,False
4,2018-01-08,[Tesla],Tesla Planning Supercharger Station With ‘Old ...,Tesla Motors Club,132,4,16,316.00,336.41,True
...,...,...,...,...,...,...,...,...,...,...
98,2018-05-29,"[Tesla, Elektroautopic]","#Tesla Model 3: Europa-Start ""erste Jahreshälf...",ecomento.de,3,2,1,278.51,283.76,True
99,2018-05-30,[Tesla],Oh well... in case you were wondering why @Tes...,Safer Vehicles Proved,27,4,9,283.29,291.72,True
100,2018-05-31,[Tesla],Take comfort #Tesla friends. All the alleged ...,Groggy T. Bear,22,4,3,287.21,284.73,False
101,2018-06-01,"[FBI, Tesla]",Según documento desclasificado del #FBI Niko...,Misterio Desconocido,40,3,14,285.86,291.82,True


In [5]:
# Count the non-null 'text' entries (tweets) of every day to check
# that the tweets are reasonably evenly distributed over the days.
tweets_per_day = daily_data['text'].count()
# Summary statistics of the per-day tweet counts.
tweets_per_day.describe()

Unnamed: 0,text
count,103.0
mean,48.048544
std,34.30497
min,6.0
25%,34.5
50%,40.0
75%,51.0
max,266.0


On average we have almost 50 tweets per day, with a minimum of 6 tweets, which should be OK. The standard deviation is quite high as well, but since we are so far only looking at individual tweets, this is not a problem. Even if we later average the tweets of a single day, it should still be fine.

In [6]:
# Collect the group keys (one per day) in the order in which the groupby yields them.
# The former bare `daily_data.groups.keys()` statement was dropped: its result was
# neither stored nor displayed, so it had no effect.
groups = [name for name, _ in daily_data]
# Show the key of the first group.
groups[0]

Timestamp('2018-01-02 00:00:00')

In [7]:
# Select all rows (tweets) that belong to the first day.
first_day_data = daily_data.get_group(groups[0])
# Preview the first rows of that day.
first_day_data.head()

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,"[Tesla, ModelS]","In the past 2 years, I've driven 18,823 miles ...",Ben Sullins 💪,110,6,10,312.0,320.53,True
1,2018-01-02,"[Tesla, P90D, Blog, Youtube]",Ya estamos en @louesfera probando un #Tesla #P...,Fco Javier,2,1,2,312.0,320.53,True
2,2018-01-02,"[Snapchat, Uber, Twitter, Facebook, Tesla, Goo...",Here's how old these companies will be turning...,Imran,53,7,41,312.0,320.53,True
3,2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.0,320.53,True
4,2018-01-02,"[Braunschweig, VW, Tesla]",In #Braunschweig produziert #VW seine Batterie...,HAZ,5,3,2,312.0,320.53,True


## A first very simple classifier using Word Vectors from spacy
Use each individual tweet to predict whether it was written on a day on which the stock price rose (PriceUp == True) or not.



In [8]:
# Split the individual tweets and their PriceUp labels into a training part and
# a held-out test part (20%); the fixed seed makes the shuffled split repeatable.
(tweets_train, tweets_test,
 labels_train, labels_test) = train_test_split(
    data['text'],
    data['PriceUp'],
    shuffle=True,
    test_size=0.2,
    random_state=333,
)
# Number of training tweets.
len(tweets_train)

3959

### Continue here...
Vectorize the data... create a train and test matrix using word vectors from spacy... check if classifiers overfit to the training data... then evaluate on test data using the code below

In [10]:
# Load the spaCy pipeline. The medium model is preferred; when it is not installed
# spacy.load raises OSError and the small model is used instead. Only OSError is
# caught so that unrelated failures (or a keyboard interrupt) are not silently hidden.
# NOTE(review): the recorded 96-dimensional vectors suggest that the small-model
# fallback was the one actually used -- confirm which model is installed.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    nlp = spacy.load("en_core_web_sm")


def average_token_vector(tweet, pipeline, vector_size):
    """Return the mean of the token vectors of a single tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    pipeline : callable
        spaCy pipeline; calling it on `tweet` yields a sequence of tokens
        that each expose a `.vector` array.
    vector_size : int
        Length of one token vector; used for tweets that contain no tokens.

    Returns
    -------
    numpy.ndarray
        1-D array of length `vector_size`. A tweet without any token is mapped
        to the zero vector instead of raising an IndexError.
    """
    tokens = pipeline(tweet)
    if len(tokens) == 0:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean([token.vector for token in tokens], axis=0)


# Width of a single token vector, probed once on a sample word so that empty
# tweets can be represented by a zero vector of matching size.
vector_size = nlp("Tesla")[0].vector.shape[0]

# Plain Python lists on purpose: a later cell repeats the training examples
# with `train_matrix*n`, which relies on list repetition.
train_matrix = [average_token_vector(tweet, nlp, vector_size) for tweet in tweets_train]
test_matrix = [average_token_vector(tweet, nlp, vector_size) for tweet in tweets_test]


In [29]:
# Sanity checks on the vectorised data: the number of vectors has to match the
# number of labels, and every row is a numpy vector.
print(len(train_matrix), len(labels_train))
print(len(test_matrix), len(labels_test))
print(type(train_matrix[0]))
print(train_matrix[0].shape)
# Show one example vector.
print("One vector: ", train_matrix[1])

3959 3959
990 990
<class 'numpy.ndarray'>
(96,)
One vector:  [ 0.63146317 -0.15021296 -0.69044656  0.65407705  1.191257    0.9622612
 -0.02600448  0.19858395  1.0466805   1.1214075   0.92656946 -0.7156148
  0.4132349  -1.0630354  -1.1714144  -0.656997   -0.30681074  0.38227725
  0.04336251 -0.32191426  0.01133227 -0.8395444   0.5322146  -0.18918706
 -0.6189851  -0.27362013  0.46848962 -1.6048694   0.26084775 -1.0112982
 -0.02537703 -0.02159977  0.17876668 -0.75421256 -0.6825408  -1.095034
  2.3707743  -1.1760559  -0.69888246  0.3179667   1.7626355   0.10533053
  0.5835901  -2.1097708  -0.9751415   0.36869898  0.457877   -0.26329026
 -0.8081718   1.5279149   0.17385235 -1.009767   -0.07693577 -0.22660692
 -2.05115     1.1694368   0.25987887  1.5791965  -0.3060168  -0.1282751
 -0.57015914 -0.14959374  1.0731122   0.07965691  0.9205019   0.07158332
 -0.10366537 -0.91832733 -0.91200864  0.18523099 -0.08648715  0.32651722
 -0.07397693  0.4862367  -0.5852232   0.32568955  1.3672144   0.08566

In [14]:
# This block is only useful if we want to use all tweets of a day
# to predict whether the stock price will close higher than it has opened.
# It is still a scaffold: it prints the first tweet of the first day and stops.

for day in groups:
    # all tweets and PriceUp labels of a single day, restricted to the columns
    # that are relevant for the classifier
    data_single_day = daily_data.get_group(day)[['text', 'PriceUp']]
    # walk over the individual tweets of that day
    for tweet, price_up in zip(data_single_day['text'], data_single_day['PriceUp']):
        # TODO: tokenize and get word vectors here
        # TODO: take the mean of all word vectors to get a vector representation of the tweet
        print('Tweet:\n', tweet)
        print('Corresponding PriceUp: ', price_up)
        break
    break

Tweet:
 In the past 2 years, I've driven 18,823 miles and paid $1,120 in fuel costs, giving me a cost per mile of $0.06 in my #Tesla #ModelS
Corresponding PriceUp:  True


### Train and Test classifiers

In [24]:
# Inspect the container types of features and labels; they matter for the next
# cell, which repeats the list with `*` and the labels with pd.concat.
print(type(train_matrix))
print(type(labels_train))

<class 'list'>
<class 'pandas.core.series.Series'>


In [39]:
# Enlarge the training set by repeating every example n times
# (features and labels are repeated in the same order).
n = 2
train_matrix_n_times = [vector for _ in range(n) for vector in train_matrix]
labels_train_n_times = pd.concat([labels_train for _ in range(n)], ignore_index=True)

In [37]:
# Fit a linear SVM on the (repeated) training data.
# The solver of LinearSVC draws random numbers, so the seed is fixed (same value
# as for the train/test split) to make the fitted model reproducible across runs.
svm_classifier = svm.LinearSVC(max_iter=int(1e6), random_state=333)
svm_classifier.fit(train_matrix_n_times, labels_train_n_times)

# Fit a Gaussian Naive Bayes classifier on the same data; the fitted estimator
# is the cell output.
nb_classifier = naive_bayes.GaussianNB()
nb_classifier.fit(train_matrix_n_times, labels_train_n_times)

GaussianNB(priors=None, var_smoothing=1e-09)

#### Test if training was successful
As we do not have enough data, a working classifier should overfit to the training data and hence predict the labels of the training set perfectly.

In [38]:
# Sanity check: score both fitted models on the very data they were trained on
# to see whether they have overfitted to it.
preds_svm = svm_classifier.predict(train_matrix)
preds_nb = nb_classifier.predict(train_matrix)

# Accuracy of each model on the training set.
svm_acc = metrics.accuracy_score(y_true=labels_train, y_pred=preds_svm)
nb_acc = metrics.accuracy_score(y_true=labels_train, y_pred=preds_nb)

# Precision, recall and F-measure with PriceUp == True as the positive class.
svm_prec, svm_rec, svm_fscore, svm_sup = metrics.precision_recall_fscore_support(
    y_true=labels_train, y_pred=preds_svm, average='binary', pos_label=True)
nb_prec, nb_rec, nb_fscore, nb_sup = metrics.precision_recall_fscore_support(
    y_true=labels_train, y_pred=preds_nb, average='binary', pos_label=True)

# Side-by-side report, one line per metric.
print('\n'.join([
    '   \t\tSVM \t\tNaive Bayes',
    'Acc \t\t {0:.3f} \t\t {1:.3f}'.format(svm_acc, nb_acc),
    'Prec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_prec, nb_prec),
    'Rec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_rec, nb_rec),
    'FMeas \t\t {0:.3f} \t\t {1:.3f}'.format(svm_fscore, nb_fscore),
]))

   		SVM 		Naive Bayes
Acc 		 0.581 		 0.538
Prec 		 0.588 		 0.566
Rec 		 0.685 		 0.524
FMeas 		 0.633 		 0.544


# Observations:
Both classifiers perform quite poorly even when evaluated on the training set. This could mean that we are not overfitting and can hope for at least some generalization, but we believe there are other problems that explain this observation:
* The language model was trained on different data... The dimensions of the vectors all carry a meaning that is irrelevant to our task.
* The average (over the word vectors) of a tweet loses its meaning completely.

In [40]:
# Evaluate both fitted models on the held-out test set.
preds_svm = svm_classifier.predict(test_matrix)
preds_nb = nb_classifier.predict(test_matrix)

# Accuracy of each model on the test set.
svm_acc = metrics.accuracy_score(y_true=labels_test, y_pred=preds_svm)
nb_acc = metrics.accuracy_score(y_true=labels_test, y_pred=preds_nb)

# Precision, recall and F-measure with PriceUp == True as the positive class.
svm_prec, svm_rec, svm_fscore, svm_sup = metrics.precision_recall_fscore_support(
    y_true=labels_test, y_pred=preds_svm, average='binary', pos_label=True)
nb_prec, nb_rec, nb_fscore, nb_sup = metrics.precision_recall_fscore_support(
    y_true=labels_test, y_pred=preds_nb, average='binary', pos_label=True)

# Side-by-side report, one line per metric.
print('\n'.join([
    '   \t\tSVM \t\tNaive Bayes',
    'Acc \t\t {0:.3f} \t\t {1:.3f}'.format(svm_acc, nb_acc),
    'Prec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_prec, nb_prec),
    'Rec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_rec, nb_rec),
    'FMeas \t\t {0:.3f} \t\t {1:.3f}'.format(svm_fscore, nb_fscore),
]))

   		SVM 		Naive Bayes
Acc 		 0.501 		 0.516
Prec 		 0.520 		 0.541
Rec 		 0.578 		 0.482
FMeas 		 0.548 		 0.510


To make a statement about the results, we first have to look at the distribution of labels in the test dataset.
An even simpler baseline we can use is a classifier that constantly predicts the class that is most common in the test set. 

In [17]:
# Majority-class baseline on the test set.
# value_counts() orders its result by frequency, not by label, so unpacking it as
# (trues, falses) silently swaps the two whenever False is the more common class
# and fails outright when only one class is present. Count each label explicitly.
is_up = labels_test.astype(bool)
num_trues = int(is_up.sum())
num_falses = int((~is_up).sum())

# The baseline always predicts the more frequent label (True on a tie).
majority_label = num_trues >= num_falses
majority_share = max(num_trues, num_falses) / len(is_up)
print("A classifier that always predicts '%s' would get an accuracy of: %.3f" % (majority_label, majority_share))

A classifier that always predicts 'True' would get an accuracy of: 0.522
