In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn import naive_bayes, svm, metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
# Reset the display.max_colwidth option to its default so that a leftover
# setting does not carry over when all cells are re-run.
pd.reset_option('display.max_colwidth')

#### Load dataset and group by day (all tweets and corresponding stock prices)

In [2]:
# Load the merged tweet / stock-price records from JSON.
data = pd.read_json('processed_data/data_merged.json')
# Disabled clean-up step: drops columns that were unexpectedly generated
# during saving. Re-enable it if 'level_0' / 'index' show up in the frame.
# data.drop(columns=['level_0', 'index'], inplace=True)
# Preview the last three rows.
data.tail(3)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
4946,2018-06-04,"[Tesla, Model3pic]",Report: Tesla Has Refunded 23% of Model 3 Depo...,Tesla Motors Club,91,13,13,294.34,296.74,True
4947,2018-06-04,"[Tesla, NikolaTeslapic]",Make electricity free again! #Tesla #NikolaTes...,Mr A 🎶🎵 ♏,2,1,1,294.34,296.74,True
4948,2018-06-04,"[Tesla, ElonMusk, CFD, forex, equitypic]",#Tesla #ElonMusk #CFD #forex #equitypic.twitte...,The Utkarsh,2,1,1,294.34,296.74,True


In [3]:
# group data by day
# Group all rows by their 'timestamp' value, i.e. one group per distinct
# timestamp (the previews in this notebook show one timestamp per day).
# as_index=False asks pandas not to use the group key as the result index.
daily_data = data.groupby(data['timestamp'], as_index=False)
# Show the first row of every group as a quick sanity check.
daily_data.first()

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,"[Tesla, ModelS]","In the past 2 years, I've driven 18,823 miles ...",Ben Sullins 💪,110,6,10,312.00,320.53,True
1,2018-01-03,[Tesla],Día de piernas ... Estrenando mallas...#Tesla ...,El CaZador,107,3,4,321.00,317.25,False
2,2018-01-04,"[Innovation, Tesla, electricvehicles, Cars, br...",This is awesome! New brand technology #Innovat...,Gabriela Mascaró,6,1,3,312.87,314.62,True
3,2018-01-05,"[Tesla, TeslaModel3]",#Tesla #TeslaModel3 hahapic.twitter.com/DMlxOf...,WtaFiGO,19,1,7,316.62,316.58,False
4,2018-01-08,[Tesla],Tesla Planning Supercharger Station With ‘Old ...,Tesla Motors Club,132,4,16,316.00,336.41,True
...,...,...,...,...,...,...,...,...,...,...
98,2018-05-29,"[Tesla, Elektroautopic]","#Tesla Model 3: Europa-Start ""erste Jahreshälf...",ecomento.de,3,2,1,278.51,283.76,True
99,2018-05-30,[Tesla],Oh well... in case you were wondering why @Tes...,Safer Vehicles Proved,27,4,9,283.29,291.72,True
100,2018-05-31,[Tesla],Take comfort #Tesla friends. All the alleged ...,Groggy T. Bear,22,4,3,287.21,284.73,False
101,2018-06-01,"[FBI, Tesla]",Según documento desclasificado del #FBI Niko...,Misterio Desconocido,40,3,14,285.86,291.82,True


In [4]:
# count tweets per day to see if they're ok'ish distributed
# Count the tweets in each daily group to check that they are reasonably
# evenly distributed over the days.
tweets_per_day = daily_data['text'].count()
# Summary statistics of the per-day counts.
tweets_per_day.describe()

Unnamed: 0,text
count,103.0
mean,48.048544
std,34.30497
min,6.0
25%,34.5
50%,40.0
75%,51.0
max,266.0


On average we have almost 50 tweets per day, with a minimum of 6 tweets, which should be fine. The standard deviation is quite high as well, but since we are so far only looking at individual tweets, this is not a problem. Even if we later average the tweets of a single day, it should still be fine.

In [24]:
# get groups' names
# Collect the group keys (one per day) in iteration order so that the rows of
# a single day can be looked up later with daily_data.get_group(...).
# (The former bare `daily_data.groups.keys()` expression was discarded and
# never displayed, so it has been removed.)
groups = [name for name, _ in daily_data]
# Show the first key as a sanity check.
groups[0]

Timestamp('2018-01-02 00:00:00')

In [6]:
# get all tweets from the first day
# Fetch every row (tweet) that belongs to the first group key, i.e. the first day.
first_day_data = daily_data.get_group(groups[0])
# Preview the first few tweets of that day.
first_day_data.head()

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,"[Tesla, ModelS]","In the past 2 years, I've driven 18,823 miles ...",Ben Sullins 💪,110,6,10,312.0,320.53,True
1,2018-01-02,"[Tesla, P90D, Blog, Youtube]",Ya estamos en @louesfera probando un #Tesla #P...,Fco Javier,2,1,2,312.0,320.53,True
2,2018-01-02,"[Snapchat, Uber, Twitter, Facebook, Tesla, Goo...",Here's how old these companies will be turning...,Imran,53,7,41,312.0,320.53,True
3,2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.0,320.53,True
4,2018-01-02,"[Braunschweig, VW, Tesla]",In #Braunschweig produziert #VW seine Batterie...,HAZ,5,3,2,312.0,320.53,True


## A first very simple classifier using Word Vectors from spacy
Use each individual tweet to predict whether it was written on a day on which the stock price went up (PriceUp == True) or not.



In [13]:
# generate the train and test sets
# Hold out 20% of the tweets for testing; the fixed seed keeps the shuffled
# split identical between runs.
tweets_train, tweets_test, labels_train, labels_test = train_test_split(
    data['text'],
    data['PriceUp'],
    test_size=0.2,
    random_state=333,
    shuffle=True,
)
# Number of training tweets.
len(tweets_train)

3959

### Continue here...
Vectorize the data... create a train and test matrix using word vectors from spacy... check if classifiers overfit to the training data... then evaluate on test data using the code below

In [17]:
nlp = spacy.load("en_core_web_md")


def average_word_vectors(texts):
    """Return one feature vector per text: the mean of its spaCy token vectors.

    Parameters
    ----------
    texts : iterable of str
        The tweets to vectorize, in the order the rows should appear.

    Returns
    -------
    list
        One vector per input text, in input order. A text that produces no
        tokens (e.g. an empty string) is represented by an all-zero vector of
        the vocabulary's vector length instead of raising an IndexError.
    """
    matrix = []
    # nlp.pipe streams the texts through the pipeline and yields the Docs in
    # input order, which avoids one separate nlp(...) call per tweet.
    for doc in nlp.pipe(texts):
        if len(doc) == 0:
            # Nothing to average: fall back to a zero vector so the row count
            # still matches the labels.
            matrix.append(np.zeros(nlp.vocab.vectors_length, dtype="float32"))
            continue
        matrix.append(sum(token.vector for token in doc) / len(doc))
    return matrix


# The same helper builds both matrices, so train and test features are
# guaranteed to be computed identically.
train_matrix = average_word_vectors(tweets_train)
test_matrix = average_word_vectors(tweets_test)


In [18]:
# Sanity check: every split must have exactly one feature vector per label.
for split_features, split_labels in ((train_matrix, labels_train), (test_matrix, labels_test)):
    print(len(split_features), len(split_labels))

3959 3959
990 990


In [57]:
# vectorize train and test data with word vectors from spacy
for day in groups:
    # all tweets and priceUp's for a single day
    data_single_day = daily_data.get_group(day)
    # take only the relevant columns for the classifier
    data_single_day = data_single_day[['text', 'PriceUp']]
    rows, cols = data_single_day.shape
    # individual tweets per day
    for row in range(rows):
        tweet, price_up = data_single_day.iloc[row][['text','PriceUp']]
        # tokenize and get word vectors here
        # get the mean of all word vectors to get a vector representation of the tweet
        print('Tweet:\n', tweet)
        print('Corresponding PriceUp: ', price_up)        
        break
    break

Tweet:
 In the past 2 years, I've driven 18,823 miles and paid $1,120 in fuel costs, giving me a cost per mile of $0.06 in my #Tesla #ModelS
Corresponding PriceUp:  True


### As soon as we have a train and test matrix, we can just reuse the classifier code from the baseline notebook

In [20]:


# Linear SVM on the averaged word vectors. The generous iteration cap is kept
# from the original cell; random_state pins the solver's internal random
# number generator so repeated runs give the same model (the same seed is used
# for the train/test split).
svm_classifier = svm.LinearSVC(max_iter=int(1e6), random_state=333)
svm_classifier.fit(train_matrix, labels_train)

# Gaussian Naive Bayes on the same features, for comparison.
nb_classifier = naive_bayes.GaussianNB()
nb_classifier.fit(train_matrix, labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
# test the classifiers
# Predict on the held-out tweets with both classifiers.
preds_svm = svm_classifier.predict(test_matrix)
preds_nb = nb_classifier.predict(test_matrix)

# Plain accuracy.
svm_acc = metrics.accuracy_score(labels_test, preds_svm)
nb_acc = metrics.accuracy_score(labels_test, preds_nb)

# Precision / recall / F-score with PriceUp == True as the positive class.
svm_prec, svm_rec, svm_fscore, svm_sup = metrics.precision_recall_fscore_support(
    labels_test, preds_svm, pos_label=True, average='binary'
)
nb_prec, nb_rec, nb_fscore, nb_sup = metrics.precision_recall_fscore_support(
    labels_test, preds_nb, pos_label=True, average='binary'
)

# Side-by-side report: one row per metric, SVM first, Naive Bayes second.
print('   \t\tSVM \t\tNaive Bayes')
for row_template, svm_value, nb_value in (
    ('Acc \t\t {0:.3f} \t\t {1:.3f}', svm_acc, nb_acc),
    ('Prec \t\t {0:.3f} \t\t {1:.3f}', svm_prec, nb_prec),
    ('Rec \t\t {0:.3f} \t\t {1:.3f}', svm_rec, nb_rec),
    ('FMeas \t\t {0:.3f} \t\t {1:.3f}', svm_fscore, nb_fscore),
):
    print(row_template.format(svm_value, nb_value))

   		SVM 		Naive Bayes
Acc 		 0.532 		 0.487
Prec 		 0.549 		 0.514
Rec 		 0.590 		 0.329
FMeas 		 0.568 		 0.401


To make a statement about the results, we first have to look at the distribution of labels in the test dataset.
An even simpler baseline we can use is a classifier that constantly predicts the class that is most common in the test set. 

In [None]:
# Accuracy of a constant "always True" classifier = share of True labels in
# the test set.
# Count the True labels explicitly: value_counts() orders its result by
# frequency, so unpacking it positionally would put the False count into
# num_trues whenever False is the more common label (and would fail outright
# if only one class were present).
num_trues = int(labels_test.sum())
num_falses = int(labels_test.count()) - num_trues
print("A classifier that always predicts 'True' would get an accuracy of: %.3f" % (num_trues/labels_test.count()))