In [224]:
import pandas as pd
import os
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from nltk.stem import PorterStemmer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nelsonlim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [176]:
# change
BINANCE_DIR = "/Users/nelsonlim/Documents/Trinity College Dublin/Semester 1/Machine Learning/Assignment/Week 5/data/binance/processed/day_discrete/"
TWITTER_DIR = "/Users/nelsonlim/Documents/Trinity College Dublin/Semester 1/Machine Learning/Assignment/Week 5/data/twitter"
REDDIT_DIR = "/Users/nelsonlim/Documents/Trinity College Dublin/Semester 1/Machine Learning/Assignment/Week 5/data/reddit"

TWITTER_FILE = "twitter.csv"
REDDIT_FILE  = "reddit.csv"

# dont change
BINANCE_EXTENSION = "csv"

START_DATE = "2018-01-01"
END_DATE = "2021-10-31"

In [177]:
def concatenate_content_values(content_list):
	return content_list.str.cat(sep=" ")

# Dataset

### Binance

In [178]:
# load binance data
df_binance_list = []
for filename in sorted(os.listdir(BINANCE_DIR)):
    if filename.endswith(BINANCE_EXTENSION):
        df_binance_list.append(pd.read_csv( os.path.join(BINANCE_DIR,filename)))
df_binance = pd.concat(df_binance_list,ignore_index=True)

# convert time to date obj
df_binance['time'] = pd.to_datetime(df_binance['time'])

# change index to time
df_binance = df_binance.set_index('time')

# filter date range
df_binance = df_binance.loc[START_DATE:END_DATE]

### Reddit

In [179]:
# read reddit
df_reddit = pd.read_csv(  os.path.join(REDDIT_DIR, REDDIT_FILE), lineterminator='\n' )

# convert to date obj
df_reddit['date'] = pd.to_datetime(df_reddit['date'])

# ignore reddit posts after binance last date
df_reddit = df_reddit[ df_reddit['date'] <= binance_last_date ]

# Join all reddit posts for each day into one big string
df_reddit = df_reddit.groupby("date").aggregate({
    "content": concatenate_content_values,
    "popularity" : "sum"
})

# filter date range
df_reddit = df_reddit.loc[START_DATE:END_DATE]

### Twitter

In [180]:
# load twitter data
df_twitter = pd.read_csv( os.path.join(TWITTER_DIR, TWITTER_FILE), lineterminator="\n")

# convert to date obj
df_twitter['date'] = pd.to_datetime(df_twitter['date'])

# ignore tweets after binance last date
df_twitter = df_twitter[ df_twitter['date'] <= binance_last_date ]

# Join all tweets for each day into one big string
df_twitter = df_twitter.groupby("date").aggregate({
    "content": concatenate_content_values,
    "popularity" : "sum"
})

# filter date range
df_twitter = df_twitter.loc[START_DATE:END_DATE]

In [188]:
daily_content = df_twitter.content

## Prunning

In [189]:
stemmer = PorterStemmer()

# Adding stemmming with CountVectorizer
# https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

# prunning
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=nltk.corpus.stopwords.words("english"),
    min_df=0.1,
    max_df=1.0,
    analyzer=stemmed_words
)

x = vectorizer.fit_transform(daily_content)

In [190]:
print(f"{len(vectorizer.get_feature_names())} features")

2889 features


In [191]:
y = df_binance.label

### Training

In [None]:
y[y == 0] = 1

In [219]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

In [216]:
svm_model = []
mean_error = []
std_error = []
Ci_range = [0.001, 1.0, 100.0]

for index, Ci in enumerate(Ci_range):    
    svm_model.append( LinearSVC(C=Ci, max_iter=10000) )
    scores = cross_val_score(svm_model[index], Xtrain, ytrain, cv=5, scoring='f1')
    mean_error.append(np.array(scores).mean())
    std_error.append(np.array(scores).std())    
plt.errorbar(Ci_range, mean-error, yerr=std_error, linewidth=3)
plt.xlabel('Ci')
plt.ylabel('F1 Score')
plt.show()

Traceback (most recent call last):
  File "/Users/nelsonlim/.virtualenv/tcd_ml/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/nelsonlim/.virtualenv/tcd_ml/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 88, in __call__
    *args, **kwargs)
  File "/Users/nelsonlim/.virtualenv/tcd_ml/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 243, in _score
    **self._kwargs)
  File "/Users/nelsonlim/.virtualenv/tcd_ml/lib/python3.7/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/nelsonlim/.virtualenv/tcd_ml/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1074, in f1_score
    zero_division=zero_division)
  File "/Users/nelsonlim/.virtualenv/tcd_ml/lib/python3.7/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/nelsonlim/.virtua

KeyboardInterrupt: 

In [226]:
FIXED_C = 1

model = LinearSVC(C=FIXED_C, max_iter=10000)
model.fit(Xtrain,ytrain)
ypred = model.predict(Xtest)
print(classification_report(ytest,ypred))
print(confusion_matrix(ytest,ypred))

              precision    recall  f1-score   support

        -1.0       0.57      0.51      0.54       131
         1.0       0.61      0.66      0.63       149

    accuracy                           0.59       280
   macro avg       0.59      0.59      0.59       280
weighted avg       0.59      0.59      0.59       280

[[67 64]
 [50 99]]
