# ML4F Semester Project

Current blockers/issues: 
- What time precision do we want to operate on? The created_utc column (date of the comment) is precise to the second. IMO we could group by hour, which would give us more observations than aggregating and predicting daily.
- We need to find stock data that matches the frequency we choose.
- Check out the PreProcessing class; there may be more interesting columns we could use now that we can really inspect the data.
- Note: I have mostly used nltk so far; I will add some spaCy functionality soon (part-of-speech tags etc.).

In [143]:
import os
import re  
import pandas as pd  
from time import time 
import sys

import spacy  
import logging  
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('vader_lexicon')
import multiprocessing

# Root folder of all input data, given relative to the current working directory.
DATA_DIR = "../data/"
# CSV file holding the raw wallstreetbets comments.
wsb_data_path = os.path.join(DATA_DIR, 'wsb_comments/wsb_comments_raw.csv') #note: wsb=wallstreetbets
# Folder holding the GME price files; load_stock_data reads every file inside it.
stock_data_path = os.path.join(DATA_DIR, 'GME')

def load_wsb_data(data_path, nrows=None):
    """Load the wallstreetbets comments CSV into a dataframe.

    :param data_path: path of the comma-separated comments file
    :param nrows: int or None, number of rows to read; None reads all rows
    :return: pandas dataframe with one row per comment
    """
    # Read from the path that was passed in; the previous version ignored the
    # argument and always read the module-level wsb_data_path.
    return pd.read_csv(data_path, nrows=nrows, delimiter=',')

# Development sample: only the first 1000 comments are read.
wsb_df = load_wsb_data(wsb_data_path, nrows=1000)
# Keep an untouched copy of the comment text; 'body' is overwritten later by the cleaning step.
wsb_df['raw'] = wsb_df['body']
display(wsb_df.head())

# Scraped from IB via https://gist.github.com/wrighter/dd201adb09518b3c1d862255238d2534
def load_stock_data(data_path):
    """Load GME stock price data from every file in a folder.

    :param data_path: folder whose regular files are all read as
        comma-separated price files; sub-folders are skipped
    :return: pandas dataframe with the rows of all files stacked and a
        fresh 0..n-1 index
    :raises ValueError: raised by pandas.concat when the folder holds no files
    """
    # Use data_path consistently; the previous version listed the
    # module-level stock_data_path while joining names onto data_path.
    # Sorting makes the concatenation order independent of os.listdir's
    # arbitrary ordering.
    file_paths = [os.path.join(data_path, f) for f in sorted(os.listdir(data_path))]
    gme_dfs = [pd.read_csv(p, delimiter=',') for p in file_paths if os.path.isfile(p)]
    return pd.concat(gme_dfs, ignore_index=True)

# Stack every GME price file found in the stock data folder and preview the result.
stock_df = load_stock_data(stock_data_path)
display(stock_df.head())

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,total_awards_received,treatment_tags,top_awarded_type,edited,distinguished,comment_type,author_cakeday,editable,media_metadata,raw
0,[],,LazyMeal,,,[],,,,text,...,0,,,,,,,,,We’re retarded and claim to be often. If you l...
1,[],,math_salts,,,[],,,,text,...,0,,,,,,,,,Yes
2,[],,Legendary_Squirrel,,,[],,,,text,...,0,,,,,,,,,markets been open for 13 min...
3,[],,WSBMORONICTRADER,,,[],,,,text,...,0,,,,,,,,,Spy can fuck around all it wants just as long ...
4,[],,[deleted],,,,,,dark,,...,0,,,,,,,,,[removed]


Unnamed: 0,date,open,high,low,close,volume,barCount,average
0,20210416 10:00:00,158.58,159.88,157.8,159.88,6,5,158.97
1,20210416 10:15:00,160.47,160.47,160.4,160.4,2,2,160.435
2,20210416 10:30:00,159.33,160.0,159.33,160.0,4,3,159.783
3,20210416 10:45:00,160.0,160.0,160.0,160.0,1,1,160.0
4,20210416 11:00:00,160.0,160.0,160.0,160.0,0,0,160.0


In [77]:
class PreProcessing:
    """Validate a wallstreetbets comment dataframe and clean its text columns.

    Every constructor argument is stored through a validating property, so
    re-assigning an attribute later runs the same checks again.
    """

    def __init__(self, wsb_data, lemmatize=True, lower_case=True, rem_stopwords=True, rem_punctuation=True, tokenize=True, stem=False):
        """
        Initialise all class parameters

        :param wsb_data: nonempty pandas dataframe, wsb dataframe
        :param lemmatize: bool, whether to perform lemmatization
        :param lower_case: bool, whether to lowercase
        :param rem_stopwords: bool, whether to remove stopwords
        :param rem_punctuation: bool, whether to drop tokens that are not purely alphanumeric
        :param tokenize: bool, whether to tokenize; when False the tokens are joined back into one string
        :param stem: bool, whether to stem the tokens; only applied when lemmatize is False
        """

        self.wsb_data = wsb_data
        self.lemmatize = lemmatize
        self.lower_case = lower_case
        self.rem_stopwords = rem_stopwords
        self.rem_punctuation = rem_punctuation
        self.tokenize = tokenize
        self.stem = stem

    @staticmethod
    def _require_bool(name, value):
        "Return value unchanged if it is a bool, otherwise raise with the parameter name"
        if not isinstance(value, bool):
            raise TypeError(f'{name} must be provided as a boolean parameter (True/False) to the class')
        return value

    ### Ensure Parameter types
    #K: need to add to this one gradually as we add columns that we use etc.
    @property
    def wsb_data(self):
        return self._wsb_data

    @wsb_data.setter
    def wsb_data(self, wsb_data):
        """
        Validate the dataframe and convert its date columns to datetimes.

        Note: the date conversion writes into the dataframe that was passed
        in, no copy is taken.
        """
        req_columns = ['author', 'body', 'created_utc']
        str_columns = ['body']
        date_columns = ['created_utc']

        # Ensure the provided object is a dataframe
        if not isinstance(wsb_data, pd.DataFrame):
            raise TypeError("The provided data must be a pandas Dataframe")

        # Ensure wsb dataframe is non empty
        if len(wsb_data) == 0:
            raise ValueError("Provided Dataframe is empty")

        # Ensure all required columns are provided
        missing_columns = set(req_columns).difference(wsb_data.columns)
        if missing_columns:
            raise ValueError(f"The columns {missing_columns} are missing from the provided dataframe!")

        # Ensure all column names don't have unexpected periods
        # (str() so that non-string column labels do not crash the check)
        if any('.' in str(col) for col in wsb_data.columns):
            raise ValueError("All Column names must not include periods :'.'")

        # Ensure all string columns are strings
        object_columns = set(wsb_data.select_dtypes(include='object').columns)
        non_str_columns = set(str_columns).difference(object_columns)
        if non_str_columns:
            raise TypeError(f'The columns {non_str_columns} are expected as string (pandas object) columns.')

        # Ensure dates are interpretable
        for date_col in date_columns:
            # The conversion itself is what can fail, so it has to sit inside
            # the try; previously it ran unguarded before the try block.
            try:
                converted = pd.to_datetime(wsb_data[date_col], unit='s')
            except (ValueError, TypeError, OverflowError) as err:
                raise ValueError(f"{date_col} must be a valid unixtimestamp format") from err
            # A column with missing timestamps is left unconverted, as before
            if converted.notnull().all():
                wsb_data[date_col] = converted

        self._wsb_data = wsb_data

    @property
    def lemmatize(self):
        return self._lemmatize

    @lemmatize.setter
    def lemmatize(self, lemmatize):
        self._lemmatize = self._require_bool('lemmatize', lemmatize)

    @property
    def lower_case(self):
        return self._lower_case

    @lower_case.setter
    def lower_case(self, lower_case):
        self._lower_case = self._require_bool('lower_case', lower_case)

    @property
    def rem_stopwords(self):
        return self._rem_stopwords

    @rem_stopwords.setter
    def rem_stopwords(self, rem_stopwords):
        self._rem_stopwords = self._require_bool('rem_stopwords', rem_stopwords)

    @property
    def rem_punctuation(self):
        return self._rem_punctuation

    @rem_punctuation.setter
    def rem_punctuation(self, rem_punctuation):
        self._rem_punctuation = self._require_bool('rem_punctuation', rem_punctuation)

    @property
    def tokenize(self):
        return self._tokenize

    @tokenize.setter
    def tokenize(self, tokenize):
        self._tokenize = self._require_bool('tokenize', tokenize)

    @property
    def stem(self):
        return self._stem

    @stem.setter
    def stem(self, stem):
        self._stem = self._require_bool('stem', stem)

    ### Column-wise cleaning steps; each one overwrites the column in place
    def _lower_case_fn(self, col_name):
        self.wsb_data[col_name] = self.wsb_data[col_name].str.lower()

    def _lemmatize_fn(self, col_name):
        "Split on whitespace and lemmatize every token"
        w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
        lemmatizer = nltk.stem.WordNetLemmatizer()
        self.wsb_data[col_name] = self.wsb_data[col_name].apply(
            lambda text: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

    def _stemming_fn(self, col_name):
        "Split on whitespace and Porter-stem every token"
        w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
        stemmer = nltk.stem.porter.PorterStemmer()
        self.wsb_data[col_name] = self.wsb_data[col_name].apply(
            lambda text: [stemmer.stem(w) for w in w_tokenizer.tokenize(text)])

    def _tokenize_fn(self, col_name):
        "Split on whitespace only"
        w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
        self.wsb_data[col_name] = self.wsb_data[col_name].apply(
            lambda text: list(w_tokenizer.tokenize(text)))

    def _rem_punctuation_fn(self, col_name):
        "Keep only tokens made up entirely of alphanumeric characters"
        self.wsb_data[col_name] = self.wsb_data[col_name].apply(
            lambda tokens: [w for w in tokens if w.isalnum()])

    def _rem_stopwords_fn(self, col_name):
        "stopwords dictionary considered English, wsb is an english forum"
        remove_elements = set(nltk.corpus.stopwords.words('english'))
        self.wsb_data[col_name] = self.wsb_data[col_name].apply(
            lambda tokens: [w for w in tokens if w not in remove_elements])

    def _remove_tokenization(self, col_name):
        "Necessary as final step to untokenize in case desired, tokenization required for other functions to not break"
        self.wsb_data[col_name] = self.wsb_data[col_name].apply(' '.join)

    def clean_textual_data(self, textual_columns):
        """
        Clean the given text columns in place according to the class flags.

        Order per column: lowercase, then lemmatize / stem / plain whitespace
        tokenization, then punctuation removal, stopword removal and, when
        tokenize is False, joining the tokens back into one string.

        :param textual_columns: column name or list of column names to clean
        :return: the wsb dataframe with the cleaned columns overwritten
        """

        ### Ensure the provided textual columns exist, and if single string column name convert it into a list
        if len(textual_columns) < 1:
            raise ValueError('The number of textual columns to clean must be greater than 0')
        if isinstance(textual_columns, str):
            textual_columns = [textual_columns]
        missing_columns = set(textual_columns).difference(self.wsb_data.columns)
        if missing_columns:
            raise ValueError(f"The columns {missing_columns} to clean are missing from the wsb dataframe!")

        for textual_col in textual_columns:

            if self.lower_case:
                self._lower_case_fn(textual_col)

            # lemmatize tokens if requested, otherwise stem them if requested,
            # otherwise just tokenize. The stemming branch used to repeat the
            # lemmatize condition and could never run.
            if self.lemmatize:
                self._lemmatize_fn(textual_col)
            elif self.stem:
                self._stemming_fn(textual_col)
            else:
                self._tokenize_fn(textual_col)

            if self.rem_punctuation:
                self._rem_punctuation_fn(textual_col)
            if self.rem_stopwords:
                self._rem_stopwords_fn(textual_col)
            if not self.tokenize:
                self._remove_tokenization(textual_col)

        return self.wsb_data

    # to later remove: for development
    def output_data(self):
        return self.wsb_data

In [144]:
# Run the full cleaning pipeline (all flags on) over the comment text.
testPreProcessing = PreProcessing(wsb_df, lemmatize=True, lower_case=True, rem_stopwords=True, rem_punctuation=True, tokenize=True)
# Overwrites 'body' with the cleaned token lists; 'raw' still holds the original text.
testPreProcessing.clean_textual_data('body')
# Columns kept for the rest of the notebook.
useful_columns = ['author','raw', 'body','created_utc', 'score', 'link_id', 'is_submitter']
data_test = testPreProcessing.output_data()[useful_columns]

In [145]:
# Display the selected columns after cleaning ('body' holds token lists, 'raw' the original text).
data_test

Unnamed: 0,author,raw,body,created_utc,score,link_id,is_submitter
0,LazyMeal,We’re retarded and claim to be often. If you l...,"[retarded, claim, listen, doe, make]",2020-03-25 08:11:50,1,t3_fom9g6,False
1,math_salts,Yes,[yes],2020-03-25 08:11:49,1,t3_fod66b,False
2,Legendary_Squirrel,markets been open for 13 min...,"[market, open, 13]",2020-03-25 08:11:45,1,t3_fod66b,False
3,WSBMORONICTRADER,Spy can fuck around all it wants just as long ...,"[spy, fuck, around, want, long, 220, put, prin...",2020-03-25 08:11:41,1,t3_fod66b,False
4,[deleted],[removed],[],2020-03-25 08:11:37,1,t3_fom0hg,False
...,...,...,...,...,...,...,...
995,Jujubewise,Haha 😆,[haha],2020-12-05 20:08:36,7,t3_k6tl0d,False
996,[deleted],[deleted],[],2020-12-05 20:08:33,1,t3_k7eh32,False
997,The_Ron_Swansonson,I’m in,[],2020-12-05 20:08:33,1,t3_k6tl0d,False
998,steve_pops_01,Real,[real],2020-12-05 20:08:33,5,t3_k6tl0d,False


## Data Cleaning
- Get rid of deleted comments and rows with empty or NaN values
- Add POS, Tag, Dep, Shape
- Remove URL and @

In [257]:
import numpy as np  # needed for np.nan below; it was used without being imported
import spacy
nlp = spacy.load("en_core_web_sm") # efficiency and accuracy --> "en_core_web_trf"

# Drop comments whose author was deleted. The explicit copy makes the column
# assignments below act on an independent frame instead of a filtered slice.
data_test = data_test[data_test['author'] != '[deleted]'].copy()

# Compiled once instead of on every row. Raw strings keep the backslash
# escapes in the pattern from being read as string escapes.
url_pattern = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

raw = []
pos = []
tag = []
dep = []
shape = []

for sentence in data_test['raw']:
    # Strip URLs first, then @mentions, from the original comment text
    sentence = url_pattern.sub('', sentence)
    sentence = mention_pattern.sub('', sentence)
    raw.append(sentence)

    # One list per comment for each spaCy token attribute
    doc = nlp(sentence)
    pos.append([token.pos_ for token in doc])
    tag.append([token.tag_ for token in doc])
    dep.append([token.dep_ for token in doc])
    shape.append([token.shape_ for token in doc])

data_test['Pos'] = pos
data_test['Tag'] = tag
data_test['Dep'] = dep
data_test['Shape'] = shape

data_test['raw'] = raw

# Turn empty strings into NaN so that dropna removes those rows as well
data_test = data_test.replace('', np.nan)
data_test = data_test.dropna(how='any', axis=0)
# Drop comments whose cleaned token list ended up empty
data_test = data_test[data_test['body'].map(len) > 0]

## Sentiment Analyser
- Flair
- Vader
- Blob

In [138]:
from flair.models import TextClassifier
from flair.data import Sentence
# Load flair's 'en-sentiment' text classifier once; the sentiment cell below reuses it for every comment.
classifier = TextClassifier.load('en-sentiment')
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [268]:
fler = []
vader = []
blob = []

# One analyzer serves every comment; it used to be re-created on each iteration.
analyzer = SentimentIntensityAnalyzer()

# FLAIR scores the cleaned tokens in 'body'; VADER and TextBlob score the 'raw' text.
for raw_text, tokens in zip(data_test['raw'], data_test['body']):
    # FLAIR
    s = Sentence(tokens)
    classifier.predict(s)
    total_sentiment = str(s.labels[0])
    # The score is parsed from the label's text form; anything not labelled
    # POSITIVE gets a negative sign.
    num = float(re.findall(r'\d+\.\d+', total_sentiment)[0])
    if 'POSITIVE' not in total_sentiment:
        num = -num
    fler.append(num)

    # VADER
    vader.append(float(analyzer.polarity_scores(raw_text)['compound']))

    # BLOB
    blob.append(float(TextBlob(raw_text).sentiment.polarity))

# assign returns a new frame, so nothing is written into a filtered slice
data_test = data_test.assign(FLAIR=fler, VADER=vader, BLOB=blob)


In [269]:
# Display the frame including the tag columns and the three sentiment scores.
data_test

Unnamed: 0,author,raw,body,created_utc,score,link_id,is_submitter,Pos,Tag,Dep,Shape,FLAIR,VADER,BLOB
0,LazyMeal,We’re retarded and claim to be often. If you l...,"[retarded, claim, listen, doe, make]",2020-03-25 08:11:50,1,t3_fom9g6,False,"[PRON, VERB, ADJ, CCONJ, VERB, PART, AUX, ADV,...","[PRP, VBP, JJ, CC, VBP, TO, VB, RB, ., IN, PRP...","[nsubj, ROOT, acomp, cc, conj, aux, xcomp, aco...","[Xx, ’xx, xxxx, xxx, xxxx, xx, xx, xxxx, ., Xx...",-0.9713,-0.5719,-0.8000
1,math_salts,Yes,[yes],2020-03-25 08:11:49,1,t3_fod66b,False,[INTJ],[UH],[ROOT],[Xxx],0.9918,0.4019,0.0000
2,Legendary_Squirrel,markets been open for 13 min...,"[market, open, 13]",2020-03-25 08:11:45,1,t3_fod66b,False,"[NOUN, VERB, ADJ, ADP, NUM, NOUN, PUNCT]","[NNS, VBD, JJ, IN, CD, NN, .]","[nsubj, ROOT, acomp, prep, nummod, pobj, punct]","[xxxx, xxxx, xxxx, xxx, dd, xxx, ...]",0.8276,0.0000,0.0000
3,WSBMORONICTRADER,Spy can fuck around all it wants just as long ...,"[spy, fuck, around, want, long, 220, put, prin...",2020-03-25 08:11:41,1,t3_fod66b,False,"[PROPN, AUX, VERB, ADP, PRON, PRON, VERB, ADV,...","[NNP, MD, VB, RP, DT, PRP, VBZ, RB, RB, RB, IN...","[nsubj, aux, ROOT, prt, dobj, nsubj, relcl, ad...","[Xxx, xxx, xxxx, xxxx, xxx, xx, xxxx, xxxx, xx...",-0.9948,-0.5423,-0.0625
5,madamlazonga,"you lost me at ""bulls fucked""",[lost],2020-03-25 08:11:36,1,t3_fod66b,False,"[PRON, VERB, PRON, ADP, PUNCT, NOUN, VERB, PUNCT]","[PRP, VBD, PRP, IN, ``, NNS, VBN, '']","[nsubj, ROOT, dobj, prep, punct, pobj, acl, pu...","[xxx, xxxx, xx, xx, "", xxxx, xxxx, ""]",-0.9932,-0.7717,-0.6000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,plimsickins16978,ANAL 12/11 $50,[anal],2020-12-05 20:08:38,4,t3_k6tl0d,False,"[PROPN, NUM, SYM, NUM]","[NNP, CD, $, CD]","[ROOT, nummod, nmod, nummod]","[XXXX, dd/dd, $, dd]",0.8674,0.0000,0.0000
994,PlentyC,Mmm fuck yeah,"[mmm, fuck, yeah]",2020-12-05 20:08:37,7,t3_k6tl0d,False,"[INTJ, NOUN, INTJ]","[UH, NN, UH]","[intj, ROOT, ROOT]","[Xxx, xxxx, xxxx]",0.9925,-0.3182,-0.4000
995,Jujubewise,Haha 😆,[haha],2020-12-05 20:08:36,7,t3_k6tl0d,False,"[PROPN, NOUN]","[NNP, NN]","[ROOT, punct]","[Xxxx, 😆]",-0.9691,0.4588,0.2000
998,steve_pops_01,Real,[real],2020-12-05 20:08:33,5,t3_k6tl0d,False,[ADJ],[JJ],[ROOT],[Xxxx],0.9993,0.0000,0.2000


## Posts Cluster
- By Cluster
- By Time 

In [293]:
# Keep the comments whose raw text mentions any of the search terms.
# NOTE: the match is case-sensitive, so a spelling such as 'AAPL' is not matched.
searchfor = ['aapl', 'apple', 'Apple']
ticker_posts = data_test[data_test['raw'].str.contains('|'.join(searchfor))]

# TODO: clustering is not implemented yet. The original line was the
# incomplete assignment `ticker_cluster =`, a SyntaxError that stopped the
# whole cell from running; a placeholder keeps the name defined.
ticker_cluster = None


Unnamed: 0,author,raw,body,created_utc,score,link_id,is_submitter,Pos,Tag,Dep,Shape,FLAIR,VADER,BLOB
107,philmacrack123,"Y'all see amazon, if apple is bad, we will sel...","[see, apple, gonna, need, appl, easily, carry,...",2020-04-30 20:11:50,6,t3_gaszeo,False,"[PROPN, VERB, NOUN, PUNCT, SCONJ, NOUN, AUX, A...","[NNP, VBP, NN, ,, IN, NN, VBZ, JJ, ,, PRP, MD,...","[nsubj, ccomp, dobj, punct, mark, nsubj, advcl...","[X'xxx, xxx, xxxx, ,, xx, xxxx, xx, xxx, ,, xx...",-0.9823,0.359,0.038294
112,originalmuggins,Lol Apple with the slow roll. You gotta respec...,"[lol, apple, slow, gotta, respect]",2020-04-30 20:11:46,2,t3_gaszeo,False,"[PROPN, PROPN, ADP, DET, ADJ, NOUN, PUNCT, PRO...","[NNP, NNP, IN, DT, JJ, NN, ., PRP, VBP, TO, VB...","[compound, ROOT, prep, det, amod, pobj, punct,...","[Xxx, Xxxxx, xxxx, xxx, xxxx, xxxx, ., Xxx, xx...",0.9679,0.7096,0.25
196,facehuggerpoop,bought one apple call before close.\n\nI think...,"[bought, one, apple, call, think, going, lose,...",2020-04-30 20:11:08,1,t3_gaszeo,False,"[VERB, NUM, NOUN, NOUN, ADP, NOUN, PUNCT, SPAC...","[VBD, CD, NN, NN, IN, NN, ., _SP, PRP, VBP, PR...","[ROOT, nummod, compound, dobj, prep, pobj, pun...","[xxxx, xxx, xxxx, xxxx, xxxx, xxxx, ., \n\n, X...",-0.9975,-0.4019,0.285714
387,strawberry-jam-boy,Spce tsla se lulu plug wmt upwk amd aapl msft ...,"[spce, tsla, se, lulu, plug, wmt, upwk, amd, a...",2020-07-12 20:09:20,6,t3_hq11ao,False,"[PROPN, NOUN, X, PROPN, PROPN, PROPN, ADV, PRO...","[NNP, NN, FW, NNP, NNP, NNP, RB, NNP, NNP, NN,...","[compound, nsubj, compound, compound, compound...","[Xxxx, xxxx, xx, xxxx, xxxx, xxx, xxxx, xxx, x...",0.9968,0.3612,0.0
532,Agent248,Holding apple calls is making me gay from last...,"[holding, apple, call, making, gay, last, 2, day]",2020-09-23 20:11:29,2,t3_iyhxsc,False,"[VERB, NOUN, NOUN, AUX, VERB, PRON, ADJ, ADP, ...","[VBG, NN, NNS, VBZ, VBG, PRP, JJ, IN, JJ, CD, ...","[amod, compound, nsubj, aux, ROOT, nsubj, ccom...","[Xxxxx, xxxx, xxxx, xx, xxxx, xx, xxx, xxxx, x...",-0.9977,0.0,0.208333
880,LegNest,oof aapl **109.26**,"[oof, aapl]",2020-10-30 08:06:39,1,t3_jkhkw8,False,"[NOUN, PROPN, SPACE, PUNCT, PUNCT, NUM, PUNCT,...","[NN, NNP, _SP, NFP, NFP, CD, NFP, NFP]","[ROOT, prep, dep, punct, punct, appos, punct, ...","[xxx, xxxx, , *, *, ddd.dd, *, *]",-0.9887,0.0,0.0


In [294]:
# Show every comment whose link_id is 't3_gaszeo'.
data_test[data_test['link_id'] == 't3_gaszeo']

Unnamed: 0,author,raw,body,created_utc,score,link_id,is_submitter,Pos,Tag,Dep,Shape,FLAIR,VADER,BLOB
100,TheD-,let me add amazon spreads on the list of thing...,"[let, add, amazon, spread, list, thing, pussied]",2020-04-30 20:11:53,2,t3_gaszeo,False,"[VERB, PRON, VERB, NOUN, VERB, ADP, DET, NOUN,...","[VB, PRP, VB, NN, VBZ, IN, DT, NN, IN, NNS, PR...","[ROOT, nsubj, ccomp, dobj, dobj, prep, det, po...","[xxx, xx, xxx, xxxx, xxxx, xx, xxx, xxxx, xx, ...",-0.9974,0.1779,0.0
103,Domgrath42,[$TEAM]( Reports Q3 (Mar) earnings of $0.25 p...,"[report, q3, earnings, per, better, capital, i...",2020-04-30 20:11:51,1,t3_gaszeo,False,"[SPACE, PUNCT, SYM, NOUN, PUNCT, PUNCT, PROPN,...","[_SP, -LRB-, $, NN, -RRB-, -LRB-, NNPS, NNP, -...","[dep, punct, nmod, npadvmod, punct, punct, nmo...","[ , [, $, XXXX, ], (, Xxxxx, Xd, (, Xxx, ), xx...",-0.9901,0.6249,0.55
104,mvpatlife,most stop trading at 4:15,"[stop, trading]",2020-04-30 20:11:51,1,t3_gaszeo,False,"[ADJ, VERB, VERB, ADP, NUM]","[JJS, VB, VBG, IN, CD]","[advmod, ROOT, xcomp, prep, pobj]","[xxxx, xxxx, xxxx, xx, d:dd]",-0.9952,-0.3597,0.5
107,philmacrack123,"Y'all see amazon, if apple is bad, we will sel...","[see, apple, gonna, need, appl, easily, carry,...",2020-04-30 20:11:50,6,t3_gaszeo,False,"[PROPN, VERB, NOUN, PUNCT, SCONJ, NOUN, AUX, A...","[NNP, VBP, NN, ,, IN, NN, VBZ, JJ, ,, PRP, MD,...","[nsubj, ccomp, dobj, punct, mark, nsubj, advcl...","[X'xxx, xxx, xxxx, ,, xx, xxxx, xx, xxx, ,, xx...",-0.9823,0.359,0.03829365
110,FeralHat,My 280p would love that,"[280p, would, love]",2020-04-30 20:11:47,1,t3_gaszeo,False,"[PRON, NOUN, AUX, VERB, PRON]","[PRP$, NNS, MD, VB, DT]","[poss, nsubj, aux, ROOT, dobj]","[Xx, dddx, xxxx, xxxx, xxxx]",0.9527,0.6369,0.5
111,DonnySmallHandsTrump,they did meet rev..,[meet],2020-04-30 20:11:46,1,t3_gaszeo,False,"[PRON, AUX, VERB, PROPN, PUNCT]","[PRP, VBD, VB, NNP, .]","[nsubj, aux, ROOT, dobj, punct]","[xxxx, xxx, xxxx, xxx, ..]",0.9902,0.0,0.0
112,originalmuggins,Lol Apple with the slow roll. You gotta respec...,"[lol, apple, slow, gotta, respect]",2020-04-30 20:11:46,2,t3_gaszeo,False,"[PROPN, PROPN, ADP, DET, ADJ, NOUN, PUNCT, PRO...","[NNP, NNP, IN, DT, JJ, NN, ., PRP, VBP, TO, VB...","[compound, ROOT, prep, det, amod, pobj, punct,...","[Xxx, Xxxxx, xxxx, xxx, xxxx, xxxx, ., Xxx, xx...",0.9679,0.7096,0.25
121,ch1p_skylark,"AAPL at 4:30, UAL 4:15 EST","[aapl, ual, est]",2020-04-30 20:11:42,1,t3_gaszeo,False,"[PROPN, ADP, NUM, PUNCT, PROPN, NUM, NOUN]","[NNP, IN, CD, ,, NNP, CD, NN]","[ROOT, prep, pobj, punct, compound, nummod, ap...","[XXXX, xx, d:dd, ,, XXX, d:dd, XXX]",0.7741,0.0,0.0
124,ottermodee,Yes pls,"[yes, pls]",2020-04-30 20:11:39,1,t3_gaszeo,False,"[INTJ, INTJ]","[UH, UH]","[ROOT, intj]","[Xxx, xxx]",0.987,0.4588,0.0
126,agoodnightasleep,Stock had run up...but their revenue is fuckin...,"[stock, run, revenue, fucking]",2020-04-30 20:11:38,2,t3_gaszeo,False,"[PROPN, AUX, VERB, ADP, PUNCT, CCONJ, PRON, NO...","[NNP, VBD, VBN, RP, :, CC, PRP$, NN, VBZ, VBG,...","[nsubj, aux, ROOT, prt, punct, cc, poss, nsubj...","[Xxxxx, xxx, xxx, xx, ..., xxx, xxxx, xxxx, xx...",-0.9885,0.0,1.0


In [285]:
# Boolean mask: True for comments whose raw text contains 'lost'.
data_test['raw'].str.contains('lost')

0      False
1      False
2      False
3      False
5       True
       ...  
993    False
994    False
995    False
998    False
999    False
Name: raw, Length: 883, dtype: bool

In [280]:
# Inspect the cleaned token lists.
data_test['body']

0                   [retarded, claim, listen, doe, make]
1                                                  [yes]
2                                     [market, open, 13]
3      [spy, fuck, around, want, long, 220, put, prin...
5                                                 [lost]
                             ...                        
993                                               [anal]
994                                    [mmm, fuck, yeah]
995                                               [haha]
998                                               [real]
999                                                [lol]
Name: body, Length: 883, dtype: object

In [None]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


## Fetch Ticker Data 
- Ticker
- Frequency

In [None]:
import yfinance as yf
# NOTE(review): `ticker` and `frequency` are not defined in the visible part of
# the notebook; they need to be set before this cell can run. Confirm.
data = yf.download(ticker, interval=frequency, start="2012-01-31", end="2021-02-16")

In [None]:
# Hold out 10% of the rows as a test set, seeded for reproducibility.
# NOTE(review): `df` and `RANDOM_SEED` are not defined in the visible part of
# the notebook; they need to be set before this cell can run. Confirm.
df_train, df_test = train_test_split(
  df,
  test_size=0.1,
  random_state=RANDOM_SEED)