In [1]:
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import naive_bayes, svm, metrics
import pandas as pd
# reset colwitdth options when running all cells 
pd.reset_option('display.max_colwidth')

#### Load dataset and group by day (all tweets and corresponding stock prices)

In [2]:
data = pd.read_json('processed_data/data_final_merged.json')
# remove columns that were unexpectedly generated during saving
# data.drop(columns=['level_0', 'index'], inplace=True)
data.tail(3)

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
23516,2018-06-04,[],So Tesla treats manufacturing people like shit...,Mark B. Spiegel,35,5,4,294.34,296.74,True
23517,2018-06-04,[],Did you know Nikola Tesla has a whole ass book...,SEBEK RA,23,3,6,294.34,296.74,True
23518,2018-06-04,"[Podsećanje, Insajder]",Grad se zbog koncesije bez naknade odrekao par...,Insajder,63,2,20,294.34,296.74,True


In [3]:
# group data by day
daily_data = data.groupby(data['timestamp'], as_index=False)
daily_data.first()

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.00,320.53,True
1,2018-01-03,[Tesla],#Tesla just released record delivery numbers f...,InsideEVs Forum,11,1,5,321.00,317.25,False
2,2018-01-04,[],Tesla struggles with Model 3 production pic.t...,Automotive News,5,0,5,312.87,314.62,True
3,2018-01-05,[munilandhttps],Head of Puerto Rico electric utility says they...,Cate Long,17,4,9,316.62,316.58,False
4,2018-01-08,[],“Bırakın doğruları gelecek söylesin ve herkesi...,[n]Beyin,324,2,86,316.00,336.41,True
...,...,...,...,...,...,...,...,...,...,...
98,2018-05-29,[],You know Erin let us forget for a short period...,Darji,23,1,4,278.51,283.76,True
99,2018-05-30,[],Tesla Autopilot blamed for crash with parked p...,BBC News Technology,11,3,11,283.29,291.72,True
100,2018-05-31,[Tesla],Weekly #Tesla short update. $TSLA short intere...,Ihor Dusaniwsky,12,4,8,287.21,284.73,False
101,2018-06-01,[1u],Tesla and Elon Musk face tough questions from ...,Minnesota AFL-CIO,19,0,10,285.86,291.82,True


In [4]:
# count tweets per day to see if they're ok'ish distributed
tweets_per_day = daily_data['text'].count()
tweets_per_day.describe()

Unnamed: 0,text
count,103.0
mean,228.339806
std,167.414922
min,26.0
25%,159.0
50%,184.0
75%,235.5
max,1376.0


We have in average almost 50 tweets per day with a minimum of 6 tweets, which should be ok. The standard deviation is quite high too, but since we're so far only looking at individual tweets, this is absolutely ok. Even when we go for averaging the tweets of a single day, it should still be fine.

In [5]:
# get groups' names
daily_data.groups.keys()
groups = [name for name, _ in daily_data]
groups[0]

Timestamp('2018-01-02 00:00:00')

In [6]:
# get all tweets from the first day
first_day_data = daily_data.get_group(groups[0])
first_day_data.head()

Unnamed: 0,timestamp,hashtags,text,username,likes,replies,retweets,Open,Close,PriceUp
0,2018-01-02,[Muskwatchpic],"From SpaceX to Tesla, here are our biggest que...",Nerdist,37,5,10,312.0,320.53,True
1,2018-01-02,"[Snapchat, Uber, Twitter, Facebook, Tesla, Goo...",Here's how old these companies will be turning...,Imran,53,7,41,312.0,320.53,True
2,2018-01-02,"[Model3, Autopilot2, pasatealoelectrico, Tesla]","Primera prueba del @Tesla #Model3 en la nieve,...",PasatealoElectrico,23,0,6,312.0,320.53,True
3,2018-01-02,[],Know the whirr sound a Tesla makes?\n\nThat's ...,Elon Musk News,8,0,5,312.0,320.53,True
4,2018-01-02,[],"In Norway, @Tesla finished Q4 with 3,753 Model...",Tesla Daily,28,0,6,312.0,320.53,True


## A first very simple classifier using Word Vectors from spacy
Use each tweet and predict whether it was written on a day where stock price has grown (PriceUp == True) or not



In [7]:
try:
    nlp = spacy.load("en_core_web_md")
except:
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    
tweet_vectors = []

empty_tweets = []

# Vectorize each tweet
for i,tweet in enumerate(data.text):
    tokens = nlp(tweet)
    if len(tokens) > 0:
        average_token= tokens[0].vector
        summed_token_count = 1
        for token in tokens[1:]:
            # Only add to our sentence average token is token is not a stop word or if it's a negation
            if token.dep_ == 'neg' or not token.is_stop:
                average_token = average_token + token.vector
                summed_token_count += 1
        average_token = average_token / summed_token_count
        tweet_vectors.append(average_token)
    else:
        empty_tweets.append(i)
data = data.drop(index=empty_tweets

SyntaxError: unexpected EOF while parsing (<ipython-input-7-3c5cfd5b2d8a>, line 26)

In [None]:
# generate the train and test sets
tweets_train, tweets_test, labels_train, labels_test = train_test_split(tweet_vectors, data['PriceUp'], 
                                                   test_size=0.2, random_state=333, shuffle=True)
len(tweets_train)

### Continue here...
Vectorize the data... create a train and test matrix using word vectors from spacy... check if classifiers overfit to the training data... then evaluate on test data using the code below

In [None]:
print(len(tweets_train),len(labels_train))
print(len(tweets_test),len(labels_test))
print(type(tweets_train[0]))
print(np.shape(tweets_train[0]))
print("One vector: ", tweets_train[2])

### Train and Test classifiers

In [None]:
print(type(tweets_train))
print(type(labels_train))

In [None]:
# increase number of training examples (by repeating them n times)
n = 2
train_matrix_n_times = tweets_train*n
labels_train_n_times = pd.concat([labels_train]*n, ignore_index=True)

In [None]:
svm_classifier = svm.LinearSVC(max_iter=int(1e6))
svm_classifier.fit(train_matrix_n_times, labels_train_n_times)

nb_classifier = naive_bayes.GaussianNB()
nb_classifier.fit(train_matrix_n_times, labels_train_n_times)

#### Test if training was successful
As we have not enough data, a working classifier should overfit to the training data and hence perfectly predict the labels of the training set.

In [None]:
# check if classifier has really overfitted to the data by testing it on the training data
preds_svm = svm_classifier.predict(tweets_train)
svm_acc = metrics.accuracy_score(labels_train, preds_svm)

preds_nb = nb_classifier.predict(tweets_train)
nb_acc = metrics.accuracy_score(labels_train, preds_nb)


svm_prec, svm_rec, svm_fscore, svm_sup = \
metrics.precision_recall_fscore_support(labels_train, preds_svm, pos_label=True, average='binary')

nb_prec, nb_rec, nb_fscore, nb_sup = \
metrics.precision_recall_fscore_support(labels_train, preds_nb, pos_label=True, average='binary')

print('   \t\tSVM \t\tNaive Bayes')
print('Acc \t\t {0:.3f} \t\t {1:.3f}'.format(svm_acc, nb_acc))
print('Prec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_prec, nb_prec))
print('Rec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_rec, nb_rec))
print('FMeas \t\t {0:.3f} \t\t {1:.3f}'.format(svm_fscore, nb_fscore))

# Observations:
Both classifiers reached quite poor performance when evaluating on even the training set. This could mean, we have no overfitting and have a hope for getting a bit generalization, but we believe there are other problems that can explain this observation: 
* das Language Model wurde auf anderen Daten trainiert... Die Dimensionen der Vektoren haben alle eine für unsere Aufgabe irrelevante Bedeutung.
* Durschnitt eines Tweets verliert seine Bedeutung komplett 

In [None]:
# test the classifiers
preds_svm = svm_classifier.predict(tweets_test)
svm_acc = metrics.accuracy_score(labels_test, preds_svm)

preds_nb = nb_classifier.predict(tweets_test)
nb_acc = metrics.accuracy_score(labels_test, preds_nb)


svm_prec, svm_rec, svm_fscore, svm_sup = \
metrics.precision_recall_fscore_support(labels_test, preds_svm, pos_label=True, average='binary')

nb_prec, nb_rec, nb_fscore, nb_sup = \
metrics.precision_recall_fscore_support(labels_test, preds_nb, pos_label=True, average='binary')

print('   \t\tSVM \t\tNaive Bayes')
print('Acc \t\t {0:.3f} \t\t {1:.3f}'.format(svm_acc, nb_acc))
print('Prec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_prec, nb_prec))
print('Rec \t\t {0:.3f} \t\t {1:.3f}'.format(svm_rec, nb_rec))
print('FMeas \t\t {0:.3f} \t\t {1:.3f}'.format(svm_fscore, nb_fscore))

To make a statement about the results, we first have to look at the distribution of labels in the test dataset.
An even simpler baseline we can use is a classifier that constantly predicts the class that is most common in the test set. 

In [None]:
num_trues, num_falses = labels_test.value_counts()
print("A classifier that always predicts 'True' would get an accuracy of: %.3f" % (num_trues/labels_test.count()))

## Stock Market Simulation:


Use our classifier with the simulation tool

In [None]:
from stock_market_simulation import load_stock_prices_from_json, Trader

stock_prices = load_stock_prices_from_json()


# Impact of the tweet of the classification of each tweet result
data['impact'] = data.likes + data.replies + data.retweets
# Recommendation obtained by the classifier
data['recommendation'] = svm_classifier.predict(tweet_vectors)

# Decide on the action of the day, based on the recommendations per tweet weighted by their impact.
actions = []
for day, group in data.groupby(data['timestamp'], as_index=False):
     
    weighted_recommendation = 0
    # polarize the impact with the recommendation obtained by the classifier
    for impact, recommendation in zip(group.impact, group.recommendation):
        if recommendation:
            weighted_recommendation += impact
        else:
            weighted_recommendation -= impact
    # Normalize into [-1; 1] range
    weighted_recommendation /= sum(group.impact)
    # Experiment with thresholds for the actions on the stock market. If 3/4 of the impact are buy we buy, more impact is sell than buy, we sell.
    actions.append([day, 1 if weighted_recommendation > 0.5 else -1 if weighted_recommendation < 0 else 0])

actions = pd.DataFrame(actions, columns=['day', 'action'])
actions = actions.set_index('day').resample('D').asfreq(fill_value=0)
actions = actions.loc[stock_prices.index]['action'].to_list()

print(actions)
print(len(stock_prices))
print(len(actions))

simulation = Trader(stock_prices, actions)
simulation.stock_action()


In [None]:
# Impact of the tweet of the classification of each tweet result
data['impact'] = data.likes + data.replies + data.retweets
# Recommendation obtained by the classifier
data['recommendation'] = nb_classifier.predict(tweet_vectors)

# Decide on the action of the day, based on the recommendations per tweet weighted by their impact.
actions = []
for day, group in data.groupby(data['timestamp'], as_index=False):
     
    weighted_recommendation = 0
    for impact, recommendation in zip(group.impact, group.recommendation):
        if recommendation:
            weighted_recommendation += impact
        else:
            weighted_recommendation -= impact
    weighted_recommendation /= sum(group.impact)
    # Experiment with thresholds for the actions on the stock market
    actions.append([day, 1 if weighted_recommendation > 0.5 else -1 if weighted_recommendation < 0 else 0])

actions = pd.DataFrame(actions, columns=['day', 'action'])
actions = actions.set_index('day').resample('D').asfreq(fill_value=0)
actions = actions.loc[stock_prices.index]['action'].to_list()

print(actions)
print(len(stock_prices))
print(len(actions))

simulation = Trader(stock_prices, actions)
simulation.stock_action()

In [None]:
from stock_market_simulation import load_stock_prices_from_yahoo_finance
display(load_stock_prices_from_yahoo_finance())