# ML4F Semester Project

Current blocks/issues: 
- What precision do we want to operate on? The created_utc column (date of the comment) has precision up to the seconds, imo we could group by hour and create more observations this way, than if we simply aggregated and predicted daily
- Need to find stock data depending on the frequency we desire
- Check out Preprocessing function, maybe more interesting columns we could use too now that we can really inspect the data
- Note: I used mostly nltk so far, will add some spacy functionalities soon for PartsOfSpeech tags etc

In [16]:
import os
import re  
import pandas as pd  
from time import time 
import sys

import spacy  
import logging  
import nltk
import multiprocessing

DATA_DIR = "../data/"
wsb_data_path = os.path.join(DATA_DIR, 'wsb_comments/wsb_comments_raw.csv')
stock_data_path = os.path.join(DATA_DIR, 'GME')

def load_wsb_data(data_path, nrows=None):
    "Load wsb data, nrows None indicates all rows, otherwise specified integer of rows"
    return pd.read_csv(wsb_data_path, nrows = nrows, delimiter=',')

wsb_df = load_wsb_data(wsb_data_path, nrows=1000)
display(wsb_df.head())

# Scraped from IB via https://gist.github.com/wrighter/dd201adb09518b3c1d862255238d2534
def load_stock_data(data_path):
    "Load GME stock price data"
    gme_dfs = [pd.read_csv(os.path.join(data_path, f), delimiter=',') for f in os.listdir(stock_data_path) if os.path.isfile(os.path.join(stock_data_path, f))]
    gme_final_df = pd.concat(gme_dfs, ignore_index=True)
    return gme_final_df

stock_df = load_stock_data(stock_data_path)
display(stock_df.head())

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,subreddit_id,total_awards_received,treatment_tags,top_awarded_type,edited,distinguished,comment_type,author_cakeday,editable,media_metadata
0,[],,LazyMeal,,,[],,,,text,...,t5_2th52,0,,,,,,,,
1,[],,math_salts,,,[],,,,text,...,t5_2th52,0,,,,,,,,
2,[],,Legendary_Squirrel,,,[],,,,text,...,t5_2th52,0,,,,,,,,
3,[],,WSBMORONICTRADER,,,[],,,,text,...,t5_2th52,0,,,,,,,,
4,[],,[deleted],,,,,,dark,,...,t5_2th52,0,,,,,,,,


Unnamed: 0,date,open,high,low,close,volume,barCount,average
0,20201224 10:45:00,21.18,21.18,21.18,21.18,1,1,21.18
1,20201224 11:00:00,21.18,21.18,21.18,21.18,0,0,21.18
2,20201224 11:15:00,21.18,21.18,21.18,21.18,0,0,21.18
3,20201224 11:30:00,21.18,21.18,21.18,21.18,0,0,21.18
4,20201224 11:45:00,20.8,20.8,20.8,20.8,1,1,20.8


In [2]:
class PreProcessing: 
    
    def __init__(self, wsb_data, lemmatize=True, lower_case=True, rem_stopwords=True, rem_punctuation=True, tokenize=True):
        """
        Initialise all class parameters
        
        :param data: nonempty pandas dataframe, wsb dataframe 
        :param lemmatize: bool, whether to perform lemmatization
        :param lower_case: bool, whether to lowercase
        :param rem_stopwords: bool, whether to remove stopwords
        :param tokenize: bool, whether to tokenize
        """
        
        self.wsb_data = wsb_data
        self.lemmatize = lemmatize
        self.lower_case = lower_case
        self.rem_stopwords = rem_stopwords
        self.rem_punctuation = rem_punctuation
        self.tokenize = tokenize
        
    ### Ensure Parameter types 
    #K: need to add to this one gradually as we add columns that we use etc.
    @property
    def wsb_data(self):
        return self._wsb_data
    @wsb_data.setter
    def wsb_data(self, wsb_data):
        
        req_columns = ['author','body','created_utc']
        str_columns = ['body']
        date_columns = ['created_utc']

        # Ensure the provided object is a dataframe
        if not isinstance(wsb_data, pd.DataFrame):
            raise Exception("The provided data must be a pandas Dataframe")
        
        # Ensure wsb dataframe is non empty
        if wsb_data.shape[0] == 0: 
            raise Exception("Provided Dataframe is empty")
        
        # Ensure all required columns are provided
        missing_columns = set(req_columns).difference(set(wsb_data.columns.tolist()))
        if len(missing_columns) > 0:
            raise Exception(f"The columns {missing_columns} are missing from the provided dataframe!")
            
        # Ensure all column names don't have unexpected periods
        if '.' in list(''.join(wsb_data.columns.tolist())):
            raise Exception("All Column names must not include periods :'.'")
            
        # Ensure all string columns are strings
        non_str_columns = set(str_columns).difference(set(wsb_data.select_dtypes(include='object')))
        if len(non_str_columns) > 0:
            raise Exception(f'The columns {non_str_columns} are expected as string (pandas object) columns.')
        
        # Ensure dates are interpretable
        for date_col in date_columns: 
            if pd.to_datetime(wsb_data[date_col], format='%Y-%m-%d %H:%M:%S', errors='coerce').notnull().all():
                try:
                    # Otherwise Convert date using unixtimestamp to datetime object
                    wsb_data[date_col] = pd.to_datetime(wsb_data[date_col], format='%Y-%m-%d %H:%M:%S')
                except: 
                    raise Exception(f"{date_col} must be a valid unixtimestamp format")
                    
        self._wsb_data = wsb_data
        
    @property
    def lemmatize(self):
        return self._lemmatize
    @lemmatize.setter
    def lemmatize(self, lemmatize):
        if not isinstance(lemmatize, bool): 
            raise Exception('lemmatize must be provided as a boolean parameter (True/False) to the class')
        self._lemmatize = lemmatize
    
    @property
    def lower_case(self):
        return self._lower_case
    @lower_case.setter
    def lower_case(self, lower_case):
        if not isinstance(lower_case, bool): 
            raise Exception('lower_case must be provided as a boolean parameter (True/False) to the class')
        self._lower_case = lower_case
      
    @property
    def rem_stopwords(self):
        return self._rem_stopwords
    @rem_stopwords.setter
    def rem_stopwords(self, rem_stopwords):
        if not isinstance(rem_stopwords, bool): 
            raise Exception('rem_stopwords must be provided as a boolean parameter (True/False) to the class')
        self._rem_stopwords = rem_stopwords
        
    @property
    def rem_punctuation(self):
        return self._rem_punctuation
    @rem_punctuation.setter
    def rem_punctuation(self, rem_punctuation):
        if not isinstance(rem_punctuation, bool): 
            raise Exception('rem_punctuation must be provided as a boolean parameter (True/False) to the class')
        self._rem_punctuation = rem_punctuation
        
    @property
    def tokenize(self):
        return self._tokenize
    @tokenize.setter
    def tokenize(self, tokenize):
        if not isinstance(tokenize, bool): 
            raise Exception('tokenize must be provided as a boolean parameter (True/False) to the class')
        self._tokenize = tokenize
        
    def clean_textual_data(self, textual_columns):
        
        ### Ensure the provided textual columns exist, and if single string column name convert it into a list
        if len(textual_columns)<1:
            raise Exception('The number of textual columns to clean must be greater than 0')
        if isinstance(textual_columns, str):
            textual_columns = [textual_columns]
        missing_columns = set(textual_columns).difference(set(self.wsb_data.columns.tolist()))
        if len(missing_columns) > 0:
            raise Exception(f"The columns {missing_columns} to clean are missing from the wsb dataframe!")

        def lower_case_fn(self, col_name): 
            self.wsb_data[col_name] = self.wsb_data[col_name].str.lower()
            return self.wsb_data

        def lemmatize_fn(self, col_name):
            w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
            lemmatizer = nltk.stem.WordNetLemmatizer()
            self.wsb_data[col_name] = self.wsb_data[col_name].apply(lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])
            return self.wsb_data

        def stemming_fn(self, col_name):
            w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
            stemmer = nltk.stem.porter.PorterStemmer()
            self.wsb_data[col_name] = self.wsb_data[col_name].apply(lambda x: [stemmer.stem(w) for w in w_tokenizer.tokenize(x)])
            return self.wsb_data

        def tokenize_fn(self, col_name):
            w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
            self.wsb_data[col_name] = self.wsb_data[col_name].apply(lambda x: [w for w in w_tokenizer.tokenize(x)])
            return self.wsb_data

        def rem_punctuation_fn(self, col_name):
            self.wsb_data[col_name] = self.wsb_data[col_name].apply(lambda x: [w for w in x if w.isalnum()])
            return self.wsb_data

        def rem_stopwords_fn(self, col_name):
            "stopwords dictionary considered English, wsb is an english forum"
            remove_elements = set(nltk.corpus.stopwords.words('english'))
            self.wsb_data[col_name] = self.wsb_data[col_name].apply(lambda x: [w for w in x if not w in remove_elements])
            return self.wsb_data

        def remove_tokenization(self, col_name):
            "Necessary as final step to untokenize in case desired, tokenization required for other functions to not break"
            self.wsb_data[col_name] = self.wsb_data[col_name].apply(lambda x: ' '.join(x))
            return self.wsb_data

        for textual_col in textual_columns:

            if self.lower_case:
                lower_case_fn(self, textual_col)

            # lemmatize tokens if true, if false, stem tokens, if None then just tokenize
            if self.lemmatize:
                lemmatize_fn(self, textual_col)
            elif self.lemmatize:
                stemming_fn(self, textual_col)
            else: 
                tokenize_fn(self, textual_col)

            if self.rem_punctuation:
                rem_punctuation_fn(self, textual_col)
            if self.rem_stopwords:
                rem_stopwords_fn(self, textual_col)
            if not self.tokenize:
                remove_tokenization(self, textual_col)

        return self.wsb_data
        
        
    # to later remove: for development
    def output_data(self):
        return self.wsb_data 

In [3]:
testPreProcessing = PreProcessing(wsb_df, lemmatize=True, lower_case=True, rem_stopwords=True, rem_punctuation=True, tokenize=True)
testPreProcessing.clean_textual_data('body')
useful_columns = ['author','body','created_utc']
testPreProcessing.output_data()[useful_columns]

Unnamed: 0,author,body,created_utc
0,LazyMeal,"[retarded, claim, listen, doe, make]",1970-01-01 00:00:01.585123910
1,math_salts,[yes],1970-01-01 00:00:01.585123909
2,Legendary_Squirrel,"[market, open, 13]",1970-01-01 00:00:01.585123905
3,WSBMORONICTRADER,"[spy, fuck, around, want, long, 220, put, prin...",1970-01-01 00:00:01.585123901
4,[deleted],[],1970-01-01 00:00:01.585123897
...,...,...,...
995,Jujubewise,[haha],1970-01-01 00:00:01.607198916
996,[deleted],[],1970-01-01 00:00:01.607198913
997,The_Ron_Swansonson,[],1970-01-01 00:00:01.607198913
998,steve_pops_01,[real],1970-01-01 00:00:01.607198913
