In [53]:
import pandas as pd
import os
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from nltk.stem import PorterStemmer
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nelsonlim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# change
BINANCE_DIR = "/Users/nelsonlim/Documents/Trinity College Dublin/Semester 1/Machine Learning/Assignment/Week 5/data/binance/processed/day_discrete/"
TWITTER_DIR = "/Users/nelsonlim/Documents/Trinity College Dublin/Semester 1/Machine Learning/Assignment/Week 5/data/twitter"
REDDIT_DIR = "/Users/nelsonlim/Documents/Trinity College Dublin/Semester 1/Machine Learning/Assignment/Week 5/data/reddit"

TWITTER_FILE = "twitter.csv"
REDDIT_FILE  = "reddit.csv"

# dont change
BINANCE_EXTENSION = "csv"

START_DATE = "2018-01-01"
END_DATE = "2021-10-31"

In [35]:
def concatenate_content_values(content_list):
	return content_list.str.cat(sep=" ")

# Dataset

### Binance

In [36]:
# load binance data
df_binance_list = []
for filename in sorted(os.listdir(BINANCE_DIR)):
    if filename.endswith(BINANCE_EXTENSION):
        df_binance_list.append(pd.read_csv( os.path.join(BINANCE_DIR,filename)))
df_binance = pd.concat(df_binance_list,ignore_index=True)

# convert time to date obj
df_binance['time'] = pd.to_datetime(df_binance['time'])

# change index to time
df_binance = df_binance.set_index('time')

# filter date range
df_binance = df_binance.loc[START_DATE:END_DATE]

### Reddit

In [37]:
# read reddit
df_reddit = pd.read_csv(  os.path.join(REDDIT_DIR, REDDIT_FILE), lineterminator='\n' )

# convert to date obj
df_reddit['date'] = pd.to_datetime(df_reddit['date'])

# Join all reddit posts for each day into one big string
df_reddit = df_reddit.groupby("date").aggregate({
    "content": concatenate_content_values,
    "popularity" : "sum"
})

# filter date range
df_reddit = df_reddit.loc[START_DATE:END_DATE]

In [38]:
df_reddit.head()

Unnamed: 0_level_0,content,popularity
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,how you do it? i cant change the onboard to pc...,2.104167
2018-01-02,"If you have nvidia experience, it'll prompt yo...",2.432655
2018-01-03,&gt; I'm getting pretty low hashrates on my ri...,1.912791
2018-01-04,http://brianford.tech/2017-10-10-etherMiningRi...,2.580911
2018-01-05,Currently I'm on wifi. I hear ethermine mines ...,2.102229


### Twitter

In [39]:
# load twitter data
df_twitter = pd.read_csv( os.path.join(TWITTER_DIR, TWITTER_FILE), lineterminator="\n")

# convert to date obj
df_twitter['date'] = pd.to_datetime(df_twitter['date'])

# Join all tweets for each day into one big string
df_twitter = df_twitter.groupby("date").aggregate({
    "content": concatenate_content_values,
    "popularity" : "sum"
})

# filter date range
df_twitter = df_twitter.loc[START_DATE:END_DATE]

In [40]:
df_twitter.head()

Unnamed: 0_level_0,content,popularity
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,#Ethereum trade bot is attempting to sell. Cur...,0.049917
2018-01-02,#AIRDROP #EtherFlip #FLIP #LIVE @Ethereum_Flip...,0.012856
2018-01-03,#AIRDROP #EtherFlip #FLIP #LIVE @Ethereum_Flip...,0.0681
2018-01-04,Estonian enterprises will use the Ethereum blo...,0.025132
2018-01-05,Update: took 10 hours for my Ethereum payment ...,0.077095


In [41]:
input_data = pd.concat([df_reddit,df_twitter]).groupby("date").aggregate({
    "content": concatenate_content_values,
    "popularity" : "sum"
})

In [64]:
daily_content = input_data.content

## Preprocess Data

In [65]:
# check if there exist any empty column
daily_content.isna().sum()

0

## Prunning

In [66]:
stemmer = PorterStemmer()

# Adding stemmming with CountVectorizer
# https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn
analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
    words = []
    for w in analyzer(doc):
        
        # remove words with number
        if len(re.findall('\d+', w)) > 0:
            continue
        
        # only english letters
        w=re.sub('[^a-zA-Z]','',w)    
        if w == '':
            continue
        
        # stemming 
        stemmed_word = stemmer.stem(w)
        
        words.append(stemmed_word)
    return words

# prunning
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=nltk.corpus.stopwords.words("english"),
    min_df=0.1,
    max_df=1.0,
    analyzer=stemmed_words
)

x = vectorizer.fit_transform(daily_content)

In [67]:
print(f"{len(vectorizer.get_feature_names())} features")

7176 features


In [68]:
vectorizer.get_feature_names()

['a',
 'aa',
 'aantonop',
 'aapl',
 'aav',
 'ab',
 'abandon',
 'abc',
 'abid',
 'abil',
 'abl',
 'aboard',
 'about',
 'abov',
 'abra',
 'abraaj',
 'abroad',
 'absenc',
 'absolut',
 'absorb',
 'abstract',
 'absurd',
 'abund',
 'abus',
 'ac',
 'academ',
 'academi',
 'acc',
 'acceler',
 'accept',
 'access',
 'accid',
 'accident',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'accredit',
 'accru',
 'acct',
 'accumul',
 'accur',
 'accuraci',
 'accus',
 'ach',
 'achiev',
 'acknowledg',
 'acquaint',
 'acquir',
 'acquisit',
 'acronym',
 'across',
 'act',
 'action',
 'activ',
 'actor',
 'actual',
 'ad',
 'ada',
 'adam',
 'adapt',
 'adblock',
 'add',
 'addict',
 'addit',
 'addon',
 'address',
 'adelaid',
 'adequ',
 'adher',
 'adjust',
 'admin',
 'administr',
 'admir',
 'admit',
 'admittedli',
 'adopt',
 'adrenalin',
 'adress',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'adversari',
 'advert',
 'advertis',
 'advic',
 'advis',
 'advisor',
 'advisori',
 'adv

In [None]:
y = df_binance.label

### Training

In [None]:
y[y == 0] = 1

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

In [None]:
svm_model = []
mean_error = []
std_error = []
Ci_range = [0.001, 1.0, 100.0]

for index, Ci in enumerate(Ci_range):    
    svm_model.append( LinearSVC(C=Ci, max_iter=10000) )
    scores = cross_val_score(svm_model[index], Xtrain, ytrain, cv=5, scoring='f1')
    mean_error.append(np.array(scores).mean())
    std_error.append(np.array(scores).std())    
plt.errorbar(Ci_range, mean-error, yerr=std_error, linewidth=3)
plt.xlabel('Ci')
plt.ylabel('F1 Score')
plt.show()

In [None]:
FIXED_C = 1

model = LinearSVC(C=FIXED_C, max_iter=10000)
model.fit(Xtrain,ytrain)
ypred = model.predict(Xtest)
print(classification_report(ytest,ypred))
print(confusion_matrix(ytest,ypred))