# Training a classifier

In this part, I train a classifier on the sentiment scores as well as on vectorized representations of the respective Reddit posts.

To limit the complexity, this time I restrict the study to the three most discussed stocks and use all subreddit posts for these stocks.

I choose GME, AMZN and AMC, since they are the most discussed stocks of 2021 (based on the dataset).

In [1]:
import pandas as pd
import os

# Ticker whose posts are kept, and the name of the CSV file the
# resulting feature table is saved under.
stock = 'AMZN'
filename = f'classifier_dataset_{stock}.csv'

def contains_stock(x, stock):
    """Return True if ``stock`` is one of the tickers listed in ``x``.

    Parameters
    ----------
    x : str
        Comma-separated ticker list, e.g. ``"GME,AMZN"``. Any non-string
        value (such as the NaN pandas uses for a missing cell, or None)
        is treated as "no tickers" instead of raising.
    stock : str
        Ticker symbol to look for.

    Returns
    -------
    bool
        True only on an exact match with one list entry; a ticker that is
        merely a substring of another entry does not count.
    """
    # Missing cells are floats/None and have no .split(); they mention nothing.
    if not isinstance(x, str):
        return False
    # Strip each entry so that "GME, AMZN" matches the same way as "GME,AMZN".
    return stock in [ticker.strip() for ticker in x.split(',')]

def idx_containts_stock(series, stock):
    """Build a boolean mask over ``series`` marking entries that list ``stock``.

    Every element of ``series`` is handed to ``contains_stock`` together
    with ``stock``; the per-element results are returned as a Series.
    """
    return series.apply(contains_stock, args=(stock,))

# Read the sentiment-annotated posts (first CSV column becomes the index)
# and keep only the rows whose 'stock' list contains the selected ticker.
df = (
    pd.read_csv(os.path.join(os.getcwd(), 'data', 'with_sentiments.csv'), index_col=0)
    .loc[lambda posts: idx_containts_stock(posts['stock'], stock)]
)
df.head()


Unnamed: 0,created,title,selftext,upvote_ratio,score,gilded,total_awards_received,num_comments,stock,subreddit,...,nltk_text_neg,nltk_text_neu,nltk_text_pos,nltk_text_comb,finbert_title_pos,finbert_title_neg,finbert_title_neu,finbert_text_pos,finbert_text_neg,finbert_text_neu
28,2021-01-30,🚀100% way to know shorts are covering🚀,Stop thinking 🥜1000 - 5000 🥜 .You would know w...,0.93,12,1,1,1,"GME,AMZN",gme,...,0.049,0.839,0.112,0.8348,0.033798,0.037089,0.929113,0.038002,0.049384,0.912615
38,2021-02-01,ROBINHOOD IS LIMITING SHARES! Get this now!,\n\nBest Robinhood alternative so you can down...,0.57,1,0,0,0,"GME,TSLA,AMZN",gme,...,0.0,0.667,0.333,0.9323,0.032801,0.343509,0.623689,0.312908,0.012323,0.674769
91,2021-02-10,Will GME be recognized as an ecommerce company...,"Just an idiot trying to think, which is never ...",0.96,23,0,0,8,"GME,AMZN",gme,...,0.096,0.891,0.013,-0.892,0.313173,0.009026,0.677801,0.079225,0.045461,0.875313
208,2021-02-19,"“Whenever there’s a lot of skepticism, it usua...","\n""When Ryan Cohen was launching online pet re...",1.0,18,0,0,4,AMZN,gme,...,0.0,0.893,0.107,0.7096,0.049664,0.030498,0.919839,0.104473,0.014842,0.880685
323,2021-02-24,RetardedApe420 - YOLO 24/02/2021,"First of all, let me introduce myself. I'm no ...",1.0,15,0,0,5,"GME,AMZN",gme,...,0.082,0.793,0.125,0.9841,0.11691,0.014526,0.868565,0.11741,0.032179,0.850412


The next steps are to:
- Drop unnecessary columns
- Find a way to aggregate the data for each day

In [34]:
# List the column names of the filtered frame.
df.columns

Index(['created', 'title', 'selftext', 'upvote_ratio', 'score', 'gilded',
       'total_awards_received', 'num_comments', 'stock', 'subreddit',
       'nltk_title_neg', 'nltk_title_neu', 'nltk_title_pos', 'nltk_title_comb',
       'nltk_text_neg', 'nltk_text_neu', 'nltk_text_pos', 'nltk_text_comb',
       'finbert_title_pos', 'finbert_title_neg', 'finbert_title_neu',
       'finbert_text_pos', 'finbert_text_neg', 'finbert_text_neu'],
      dtype='object')

In [35]:
# The rows were already filtered down to posts mentioning the selected
# ticker (see `stock`), so the 'stock' column is no longer needed.
df = df.drop(columns='stock')

Next, I want to vectorize the texts and titles of the reddit posts. First, I combine the text and the title into a single column (and separate the two parts with a dot)

In [36]:
# Merge title and body into a single 'text' column, separated by '. '.
# Missing titles/bodies are replaced by empty strings first: concatenating
# a string with NaN yields NaN, and a NaN 'text' cannot be vectorized or
# embedded in the following steps.
df['text'] = df['title'].fillna('') + '. ' + df['selftext'].fillna('')
# The separate source columns are redundant once 'text' exists.
df = df.drop(columns=['title', 'selftext'])


In [37]:
# Preview the frame now that title and selftext are merged into 'text'.
df.head()

Unnamed: 0,created,upvote_ratio,score,gilded,total_awards_received,num_comments,subreddit,nltk_title_neg,nltk_title_neu,nltk_title_pos,...,nltk_text_neu,nltk_text_pos,nltk_text_comb,finbert_title_pos,finbert_title_neg,finbert_title_neu,finbert_text_pos,finbert_text_neg,finbert_text_neu,text
28,2021-01-30,0.93,12,1,1,1,gme,0.0,1.0,0.0,...,0.839,0.112,0.8348,0.033798,0.037089,0.929113,0.038002,0.049384,0.912615,🚀100% way to know shorts are covering🚀. Stop t...
38,2021-02-01,0.57,1,0,0,0,gme,0.0,0.63,0.37,...,0.667,0.333,0.9323,0.032801,0.343509,0.623689,0.312908,0.012323,0.674769,ROBINHOOD IS LIMITING SHARES! Get this now!. \...
91,2021-02-10,0.96,23,0,0,8,gme,0.0,1.0,0.0,...,0.891,0.013,-0.892,0.313173,0.009026,0.677801,0.079225,0.045461,0.875313,Will GME be recognized as an ecommerce company...
208,2021-02-19,1.0,18,0,0,4,gme,0.074,0.772,0.154,...,0.893,0.107,0.7096,0.049664,0.030498,0.919839,0.104473,0.014842,0.880685,"“Whenever there’s a lot of skepticism, it usua..."
323,2021-02-24,1.0,15,0,0,5,gme,0.0,0.414,0.586,...,0.793,0.125,0.9841,0.11691,0.014526,0.868565,0.11741,0.032179,0.850412,RetardedApe420 - YOLO 24/02/2021. First of all...


Now, I want to vectorize the text

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
# fit_transform learns the vocabulary and builds the document-term matrix
# in one pass; a separate fit() followed by transform() on the same data
# would tokenize the whole corpus twice for the same result.
vectorized = vectorizer.fit_transform(df['text'])


In [39]:
# Report the dimensions and container type of the bag-of-words matrix.
print(vectorized.shape, type(vectorized))

(1584, 35639) <class 'scipy.sparse.csr.csr_matrix'>


This results in a matrix that is far too large, so large that it has to be represented as a sparse matrix. Based on this, I think it is better to use a document embedding, so I use spaCy to create document embeddings.

In [40]:
import spacy

# Load spaCy's 'en_core_web_lg' pipeline; it is used below to turn text
# into document vectors.
nlp = spacy.load('en_core_web_lg')

In [41]:
# Sanity check on a toy sentence: inspect the shape of the document
# vector the pipeline produces.
doc = nlp('There is a fire by the sea')
doc.vector.shape

(300,)

In [42]:
from tqdm.notebook import tqdm
# Enable the tqdm-backed `progress_apply` on pandas objects; this call
# must come before `progress_apply` is used.
tqdm.pandas()

# Embed every post: run the spaCy pipeline on each text and keep its
# document vector. Despite the name, `embeddings_df` holds one vector per
# row at this point; it is expanded into a DataFrame in a later cell.
embeddings_df = df['text'].progress_apply(lambda text : nlp(text).vector)
    

  0%|          | 0/1584 [00:00<?, ?it/s]

In [43]:
# Expand the per-post vectors into a DataFrame with one row per post.
# NOTE: this rebinds `embeddings_df`, so re-running this cell on its own
# operates on the already-expanded frame rather than on the raw vectors.
embeddings_df = pd.DataFrame.from_records(embeddings_df.values)

In [44]:
# Replace the raw 'text' column by its embedding columns. Both frames get
# a fresh 0..n-1 index so that they line up row by row when concatenated.
df = pd.concat(
    [
        df.drop(columns='text').reset_index(drop=True),
        embeddings_df.reset_index(drop=True),
    ],
    axis=1,
)
df.head()

Unnamed: 0,created,upvote_ratio,score,gilded,total_awards_received,num_comments,subreddit,nltk_title_neg,nltk_title_neu,nltk_title_pos,...,290,291,292,293,294,295,296,297,298,299
0,2021-01-30,0.93,12,1,1,1,gme,0.0,1.0,0.0,...,-0.216281,0.088456,-0.089577,-0.116207,-0.05092,0.007032,-0.094976,-0.079962,0.091473,0.07604
1,2021-02-01,0.57,1,0,0,0,gme,0.0,0.63,0.37,...,-0.1614,0.014059,0.034371,-0.052565,0.103239,-0.056483,-0.031164,-0.127904,0.031145,0.155472
2,2021-02-10,0.96,23,0,0,8,gme,0.0,1.0,0.0,...,-0.17454,0.021262,-0.028193,-0.033913,0.01175,-0.039136,-0.062851,-0.055079,0.031028,0.077873
3,2021-02-19,1.0,18,0,0,4,gme,0.074,0.772,0.154,...,-0.173574,0.017097,0.001084,-0.049554,-0.020751,-0.002842,-0.06798,-0.069997,-0.01172,0.057062
4,2021-02-24,1.0,15,0,0,5,gme,0.0,0.414,0.586,...,-0.187825,0.06914,-0.078737,-0.09065,0.007986,-0.015818,-0.051887,-0.068453,0.041442,0.09452


In [45]:
# Show columns 1-24: the metadata and sentiment columns followed by the
# first embedding columns, which carry integer labels.
print(df.columns[1:25])

Index([         'upvote_ratio',                 'score',
                      'gilded', 'total_awards_received',
                'num_comments',             'subreddit',
              'nltk_title_neg',        'nltk_title_neu',
              'nltk_title_pos',       'nltk_title_comb',
               'nltk_text_neg',         'nltk_text_neu',
               'nltk_text_pos',        'nltk_text_comb',
           'finbert_title_pos',     'finbert_title_neg',
           'finbert_title_neu',      'finbert_text_pos',
            'finbert_text_neg',      'finbert_text_neu',
                             0,                       1,
                             2,                       3],
      dtype='object')


In [46]:
# Persist the feature table as data/<filename>. The row index is written
# too, so reading the file back without `index_col` yields an extra
# 'Unnamed: 0' column.
df.to_csv(os.path.join(os.getcwd(), 'data', filename))

In [2]:
import pandas as pd
import os

# Reload the saved feature table. `filename` is defined in the first cell,
# which therefore has to be run before this one.
# NOTE(review): no `index_col` is given, so the saved index comes back as
# an 'Unnamed: 0' column; confirm whether later steps expect that column.
df = pd.read_csv(os.path.join(os.getcwd(), 'data', filename))

In [4]:
# Inspect the reloaded columns; after the CSV round trip the embedding
# column labels are strings ('0' ... '299').
print(df.columns)

Index(['Unnamed: 0', 'created', 'upvote_ratio', 'score', 'gilded',
       'total_awards_received', 'num_comments', 'subreddit', 'nltk_title_neg',
       'nltk_title_neu',
       ...
       '290', '291', '292', '293', '294', '295', '296', '297', '298', '299'],
      dtype='object', length=322)
