# Modelling bitcoin price using Twitter data

## 1.0 Imports

In [1]:
# Basic Python imports
import pandas as pd
import numpy as np

# Word lemmatisation imports
from nltk import WordNetLemmatizer
lem = WordNetLemmatizer()

# sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Modelling imports
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

# Stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from datetime import timedelta
import time

import ast

# Graph imports
import matplotlib.pyplot as plt
import seaborn as sns

import itertools



In [None]:
import warnings
warnings.filterwarnings('ignore')

## 2.0 Functions

### 2.1 Text cleaning functions

In [None]:
# Function to get lemma of word
def word_lemma(text):
    
    text_ls = text.split(' ')
    
    #lemmatised_words = [lem.lemmatize(word, 'v') for word in text]
    lemmatised_words = [lem.lemmatize(word, 'v') for word in text_ls]
    
    return lemmatised_words

In [None]:
# function to find and replace words in a string using a dictionary
def find_replace(string, dictionary):
    
    replacement_list = []
    
    # is the item in the dict?
    for item in string.split():
        # iterate by keys
        if item in dictionary.keys():
            # look up and replace
            replacement_list.append(item)
            #string.replace(item, dictionary[item])
            
    for word in replacement_list:
        
        string = string.replace(word, dictionary[word])
    # return updated string
    return string

In [None]:
# Dictionary of apostrophe conversions to find and replace on
appos = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"
}

### 2.2 Sentiment function

In [None]:
def sentiment_analyser(text):
    sentiment_scores = analyzer.polarity_scores(str(text))
    
    return sentiment_scores

### 2.3 Positive price change column function

In [None]:
def positive_5_mins(x):
    if x > 0:
        postive_change = 1
    else:
        postive_change = 0
        
    return postive_change

### 2.3 Function for plotting confusion matrix

In [2]:
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Reds):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

## 3.0 Read in the data

In [None]:
master_bitcoin_data = pd.read_csv('2.Processed_data/master_bitcoin_data_20180211.csv', index_col=0)
tweets_master = pd.read_csv('2.Processed_data/Tweets_master_20180211.csv', index_col=0)

In [None]:
# Look at shape of dataframes
print(f"master_bitcoin_data shape: {master_bitcoin_data.shape}")
print(f"tweets_master shape: {tweets_master.shape}")

In [None]:
# Check data for nulls
tweets_master.isnull().sum()

In [None]:
master_bitcoin_data.isnull().sum()

## 4.0 Pre-modelling data processing

In [None]:
# Check to see whether the tweets column has any duplicates
print(tweets_master.shape)
print(tweets_master.drop_duplicates(subset='tweet_text').shape)

In [None]:
# drop the duplicates
tweets_master.drop_duplicates(subset='tweet_text', inplace=True)

### 4.1 Merging bitcoin data to the tweets

In [None]:
master_bitcoin_data.rename(columns={'Timestamp_DT' : 'Timestamp_DT_mins'}, inplace=True)

In [None]:
tweets_with_BTC_data = tweets_master.merge(master_bitcoin_data, on='Timestamp_DT_mins', how='left')

In [None]:
# check shapes to confirm merge was successful
print(f"tweets_master shape: {tweets_master.shape}")
print(f"tweets_with_BTC_data shape: {tweets_with_BTC_data.shape}")

In [None]:
# Look at first few rows of merged data
tweets_with_BTC_data.head(3)

In [None]:
# Check for null values
tweets_with_BTC_data.isnull().sum()

In [None]:
# drop rows with nulls in the tweet text
tweets_BTC_modelling_df = tweets_with_BTC_data.dropna(subset=['tweet_text'])

In [None]:
# # drop rows with nulls in the weighted price
tweets_BTC_modelling_df.dropna(subset=['Weighted_Price'], inplace=True)

### 4.2 Text cleaning

In [None]:
# Set tweet text to lower
tweets_BTC_modelling_df['text_clean'] = tweets_BTC_modelling_df['tweet_text'].map(lambda x: str(x).lower())

In [None]:
# Remove commas, and replace odd formatted apostrophes with normal ones
tweets_BTC_modelling_df['text_clean'] = tweets_BTC_modelling_df['text_clean'].map(lambda x: x.replace(',', ' ').replace('’',"'"))

In [None]:
# Run a find and replace using the apostrophe dictionary
tweets_BTC_modelling_df['text_clean'] = tweets_BTC_modelling_df['text_clean'].map(lambda x: find_replace(str(x), appos))

In [None]:
tweets_BTC_modelling_df['text_lemma'] = tweets_BTC_modelling_df['text_clean'].map(lambda x: word_lemma(str(x)))

### 4.3 Add in sentiment scores

In [None]:
tweets_BTC_modelling_df['sentiment_scores'] = tweets_BTC_modelling_df['tweet_text'].map(lambda x: sentiment_analyser(str(x)))
tweets_BTC_modelling_df['overall_sentiment'] = tweets_BTC_modelling_df['sentiment_scores'].map(lambda x: x['compound'])

### 4.4 Add price change columns

#### 4.41 Add in 'price 5 mins later'

In [None]:
# Change to timestamp columns to datetime format
tweets_BTC_modelling_df['Timestamp_DT'] = pd.to_datetime(tweets_BTC_modelling_df['Timestamp_DT'])
tweets_BTC_modelling_df['Timestamp_DT_mins'] = pd.to_datetime(tweets_BTC_modelling_df['Timestamp_DT_mins'])

In [None]:
# Add in column of the time 5 ins later
tweets_BTC_modelling_df['Time_5_mins_later'] = tweets_BTC_modelling_df['Timestamp_DT_mins'].map(lambda x: x + timedelta(seconds=300))

In [None]:
# create copy dataframe to match against
later_date_matching_df = tweets_BTC_modelling_df[['Timestamp_DT_mins','Weighted_Price']]

In [None]:
# Rename columns in new dataframe
later_date_matching_df.rename(columns={'Timestamp_DT_mins' : 'merge_time', 'Weighted_Price':'Weighted_Price_5minsL'}, inplace=True)

In [None]:
# Merge in new dataframe to tweets data using the 5_mins_later column to get price at that time
tweets_BTC_modelling_df = tweets_BTC_modelling_df.merge(later_date_matching_df, left_on='Time_5_mins_later', right_on='merge_time', how='inner')

In [None]:
# drop out 'merge_time' column since it's no longer needed
tweets_BTC_modelling_df.drop(columns=['merge_time'], inplace=True)

#### 4.42 Add in price change

In [None]:
# subtract current price from price 5 mins in future to get the price change
tweets_BTC_modelling_df['price_change_5_mins'] = tweets_BTC_modelling_df['Weighted_Price_5minsL'] - tweets_BTC_modelling_df['Weighted_Price']

#### 4.43 Add in categorical positive price change column

In [None]:
tweets_BTC_modelling_df['positive_change_5mins'] = tweets_BTC_modelling_df['price_change_5_mins'].map(lambda x: positive_5_mins(x))

### 4.5 Create dummy columns

In [None]:
# create dummies on 'user' column
user_dummies = pd.get_dummies(tweets_BTC_modelling_df['user'])

In [None]:
# merge dummy columns into tweets dataframe
tweets_BTC_modelling_df = pd.concat([tweets_BTC_modelling_df,user_dummies], axis=1)

In [None]:
# Save to excel...
tweets_BTC_modelling_df.to_excel('Modeeling Dataframe.xlsx')

## 5.0 Modelling

### Load saved data

In [3]:
# Load df
tweets_BTC_modelling_df = pd.read_excel('Modeeling Dataframe.xlsx')

In [4]:
# Change into list
tweets_BTC_modelling_df['text_lemma'] = tweets_BTC_modelling_df['text_lemma'].map(lambda x: ast.literal_eval(x))

In [5]:
tweets_BTC_modelling_df.isnull().sum()

date                      0
exchange                  0
influencer                0
news                      0
project                   0
retweets                  0
tweet_text               22
user                      0
Timestamp_DT              0
Timestamp_DT_mins         0
Timestamp                 0
Open                      0
High                      0
Low                       0
Close                     0
Volume_(BTC)              0
Volume_(Currency)         0
Weighted_Price            0
text_clean               22
text_lemma                0
sentiment_scores          0
overall_sentiment         0
Time_5_mins_later         0
Weighted_Price_5minsL     0
price_change_5_mins       0
positive_change_5mins     0
@0xProject                0
@AriDavidPaul             0
@Aurora_dao               0
@BitMEXdotcom             0
                         ..
@VitalikButerin           0
@aantonop                 0
@adam3us                  0
@alexsunnarborg           0
@aradchenko1        

In [6]:
# drop nulls
print(tweets_BTC_modelling_df.shape)
tweets_BTC_modelling_df.dropna(inplace=True)
print(tweets_BTC_modelling_df.shape)

(98244, 81)
(98222, 81)


In [7]:
# Ensure it is a string
tweets_BTC_modelling_df['type_string'] = tweets_BTC_modelling_df.tweet_text.map(lambda x: True if type(x) == str else False)
tweets_BTC_modelling_df['type_string'].value_counts()

True     98221
False        1
Name: type_string, dtype: int64

In [8]:
tweets_BTC_modelling_df = tweets_BTC_modelling_df.loc[tweets_BTC_modelling_df['type_string']].copy()

### 5.1 TFID

In [11]:
additional_stopwords = ['https','http'] 

In [12]:
add_stop_words = ENGLISH_STOP_WORDS.union(additional_stopwords)

In [None]:
vect = TfidfVectorizer(max_features=5000, ngram_range=(3,5),stop_words=add_stop_words)

vect.fit(tweets_BTC_modelling_df['tweet_text'])

text_X = vect.transform(tweets_BTC_modelling_df['tweet_text']).toarray()

In [None]:
text_X.shape

In [None]:
# Set up model columns
feat_model_cols = [col for col in tweets_BTC_modelling_df.columns if col.startswith('@')]

feature_cols = list(tweets_BTC_modelling_df.columns[2:6]) + feat_model_cols + ['overall_sentiment']

In [None]:
non_text_features = tweets_BTC_modelling_df[feature_cols].as_matrix()

In [None]:
%%time
x = np.hstack((text_X, non_text_features))

### 5.2 Logistic Regression

#### 5.21 Predicting positive change 5 mins later using cleaned tweet text

In [None]:
%%time
# Run gridsearch
lr = LogisticRegression()
y = tweets_BTC_modelling_df['positive_change_5mins']

params = {'C' : [0.01, 0.1, 0.9],
         'penalty' : ['l1']}

gs_lr = GridSearchCV(lr, param_grid=params, verbose=1, n_jobs=-1)
gs_lr.fit(x, y)

In [None]:
gs_lr.best_score_

In [None]:
gs_lr.best_params_

In [None]:
%%time
# Model
lr = LogisticRegression(C= 0.01, penalty= 'l1')

y = tweets_BTC_modelling_df['positive_change_5mins']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

lr.fit(x_train, y_train)

pred = lr.predict(x_test)

score = lr.score(x_test, y_test)
print(score)

In [None]:
score

In [None]:
# Look into model coefficients
coeffs_df = pd.DataFrame({'Coefficients' : lr.coef_[0], 'Features' : vect.get_feature_names() + feature_cols}).sort_values('Coefficients', ascending = False)
coeffs_df.head(20)

In [None]:
# Quick look at the tail
coeffs_df.tail(20)

In [None]:
# Look specifically at the coefficients for each user account
coeffs_df['user'] = coeffs_df.Features.map(lambda x: 1 if x.startswith('@') else 0)
coeffs_df.loc[coeffs_df['user']  == 1]

In [None]:
len(pred)

##### 5.211 Evaluate model

In [None]:
# create a new df for test scores
model_output = tweets_BTC_modelling_df.loc[y_test.index]

model_output['prediction'] = pred

In [None]:
# Take a look at confusion matrix
conmat = metrics.confusion_matrix(model_output['positive_change_5mins'].values, model_output['prediction'].values)
classes = np.array([ 0.,  1.])
plot_confusion_matrix(conmat, classes)

In [None]:
print(classification_report(model_output['positive_change_5mins'], model_output['prediction']))

#### 5.22 Predicting positive change 5 mins later using lemmatized words

In [9]:
# Change the column
tweets_BTC_modelling_df['lemma_tweet'] = tweets_BTC_modelling_df['text_lemma'].map(lambda x: ' '.join(x) if len(x) > 0 else '')

In [13]:
# Increase max_features to get better results
vect_lem = TfidfVectorizer(max_features=5000, ngram_range=(3,5),stop_words=add_stop_words)

vect_lem.fit(tweets_BTC_modelling_df['lemma_tweet'])

text_lemma = vect_lem.transform(tweets_BTC_modelling_df['lemma_tweet']).toarray()

In [14]:
# Set up model columns
feat_model_cols = [col for col in tweets_BTC_modelling_df.columns if col.startswith('@')]

feature_cols = list(tweets_BTC_modelling_df.columns[2:6]) + feat_model_cols + ['overall_sentiment']

In [15]:
non_text_features = tweets_BTC_modelling_df[feature_cols].as_matrix()

In [16]:
%%time
x = np.hstack((text_lemma, non_text_features))

Wall time: 6.01 s


In [17]:
%%time
# Model
lr = LogisticRegression()

y = tweets_BTC_modelling_df['positive_change_5mins']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

lr.fit(x_train, y_train)

pred = lr.predict(x_test)

score = lr.score(x_test, y_test)
print(score)

(68754, 5060) (29467, 5060)
(68754,) (29467,)
0.5162045678216309
Wall time: 13.4 s


In [19]:
# Look into model coefficients
coeffs_df = pd.DataFrame({'Coefficients' : lr.coef_[0], 'Features' : vect_lem.get_feature_names() + feature_cols}).sort_values('Coefficients', ascending = False)
coeffs_df.head(10)

Unnamed: 0,Coefficients,Features
2933,1.378523,long term investment
2472,1.318756,good time buy
3441,1.284843,position ow ly
800,1.283294,bitcoinhttp bit ly
134,1.280258,2018 bit ly
1147,1.273334,china bitcoin exchange
3899,1.266846,sound like fun
2692,1.241147,index php topic 421615
2715,1.234962,initial coin offer
1793,1.234869,content uploads 2016


In [20]:
coeffs_df.tail(10)

Unnamed: 0,Coefficients,Features
3092,-1.31049,mgt capital investments
1033,-1.323051,business buff ly
4677,-1.35309,won twitter com
3135,-1.397428,monero research lab
4973,-1.400102,youtu kljuvc22l7y 47m30s
4724,-1.459987,www businessinsider com
2402,-1.491776,genesis block address
458,-1.50374,australian securities exchange
2863,-1.513628,like twitter com
2929,-1.745479,long term coin


### 5.3 Linear Regression

#### 5.31 Weighted Price

In [21]:
# Set x to all feature columns, with one of the dummy columns removed
y = tweets_BTC_modelling_df['Weighted_Price']

y_train = y.loc[y_train.index]
y_test = y.loc[y_test.index]

#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print(len(x_train), len(x_test))
print(len(y_train), len(y_test))

68754 29467
68754 29467


In [22]:
model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


y_pred_mean = [y_train.mean()] * len(y_test)

print('RMSE (model):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('RMSE: (dumb model):', np.sqrt(metrics.mean_squared_error(y_test, y_pred_mean)))

MAE: 12560635.837167066
MSE: 5.890714810449683e+17
RMSE: 767509922.4407254
RMSE (model): 767509922.4407254
RMSE: (dumb model): 4298.632611289572


In [23]:
model = LinearRegression()

cv_scores = cross_val_score(model, x_train, y_train, scoring="neg_mean_squared_error", cv=4)

print(cv_scores)

[-6.18069471e+19 -2.15173937e+19 -1.43553285e+07 -1.40856371e+07]


#### 5.32 Weighted price change

In [24]:
y = tweets_BTC_modelling_df['price_change_5_mins']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print(len(x_train), len(x_test))
print(len(y_train), len(y_test))

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


y_pred_mean = [y_train.mean()] * len(y_test)

print('RMSE (model):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('RMSE: (dumb model):', np.sqrt(metrics.mean_squared_error(y_test, y_pred_mean)))

68754 29467
68754 29467
MAE: 41318.86703133986
MSE: 6373458585106.915
RMSE: 2524570.9705030904
RMSE (model): 2524570.9705030904
RMSE: (dumb model): 33.36516402285933


In [25]:
model = LinearRegression()

cv_scores = cross_val_score(model, x_train, y_train, scoring="neg_mean_squared_error", cv=4)

print(cv_scores)

[-5.27297522e+15 -1.70345236e+15 -1.05324015e+03 -1.36864368e+03]


## 6.0 Evaluate model

### 6.1 Calculate baseline

In [26]:
tweets_BTC_modelling_df['positive_change_5mins'].value_counts()

1    50025
0    48196
Name: positive_change_5mins, dtype: int64

In [27]:
(tweets_BTC_modelling_df['positive_change_5mins'].value_counts()[1]/(tweets_BTC_modelling_df['positive_change_5mins'].value_counts()[1]+tweets_BTC_modelling_df['positive_change_5mins'].value_counts()[0]))*100

50.93106362183239