# Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import timeit

In [2]:
# Load the title-level and comment-level datasets (the "clean" parquet files
# covering r/linux_gaming and r/SteamDeck, per the file names).
titles = pd.read_parquet('../data/linux_gaming_SteamDeck_clean_title_data.parquet')
comments = pd.read_parquet('../data/linux_gaming_SteamDeck_clean_comment_data.parquet')

In [3]:
titles.head()

Unnamed: 0,title_id,title,subreddit,title_length,title_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
0,pu1k55,epic games announce full easy anti cheat for l...,linux_gaming,74,12,0.308,0.523,0.169,-0.34
1,9pkgwj,the struggles of being a modern age linux gamer,linux_gaming,48,9,0.263,0.737,0.0,-0.3612
2,q8trki,steam has banned all games that utilise blockc...,linux_gaming,100,15,0.176,0.824,0.0,-0.4588
3,ijat04,they didn't expect linux gaming in 2007,linux_gaming,39,7,0.0,1.0,0.0,0.0
4,re13t3,ltt are planning to include linux compatibilit...,linux_gaming,74,11,0.0,1.0,0.0,0.0


In [4]:
comments.head()

Unnamed: 0,comment_id,comments,subreddit,comment_length,comment_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
0,jibfcl5,fun fact the steam deck logo is just a pokebal...,SteamDeck,190,33,0.077,0.675,0.247,0.7783
1,jib4baj,no it's not the logo you see on the memory mak...,SteamDeck,635,90,0.077,0.889,0.034,-0.5994
2,jibacht,glad this isn't getting the traction you hoped...,SteamDeck,50,9,0.0,0.556,0.444,0.6808
3,jib5m0m,can't tell if you actually thought it was,SteamDeck,41,8,0.0,1.0,0.0,0.0
4,jib5qio,this was inspired by this because it looks lik...,SteamDeck,115,23,0.0,0.769,0.231,0.6908


In [5]:
titles.shape

(6305, 9)

In [6]:
comments.shape

(485125, 9)

In [7]:
titles['subreddit'].value_counts(normalize=True)

SteamDeck       0.63632
linux_gaming    0.36368
Name: subreddit, dtype: float64

In [8]:
comments['subreddit'].value_counts(normalize=True)

SteamDeck       0.662013
linux_gaming    0.337987
Name: subreddit, dtype: float64

## Model Balancing

We have lots of data, with a class imbalance of roughly 2/3 r/SteamDeck to 1/3 r/linux_gaming. We can afford to drop some r/SteamDeck data to get an even 50-50 split between the two, using random sampling to choose which rows to keep. This will also help our models run a bit faster.

In [9]:
titles['subreddit'].value_counts()

SteamDeck       4012
linux_gaming    2293
Name: subreddit, dtype: int64

In [10]:
comments['subreddit'].value_counts()

SteamDeck       321159
linux_gaming    163966
Name: subreddit, dtype: int64

In [11]:
# Look the count up by its label. A bare integer key on a string-indexed
# Series is treated as a position, which depends on value_counts() sort order
# and is deprecated in recent pandas releases.
linux_title_count = titles['subreddit'].value_counts()['linux_gaming']

linux_title_count

2293

In [12]:
# Look the count up by its label. A bare integer key on a string-indexed
# Series is treated as a position, which depends on value_counts() sort order
# and is deprecated in recent pandas releases.
linux_comment_count = comments['subreddit'].value_counts()['linux_gaming']

linux_comment_count

163966

We have stored the number of observations in the less-represented class (r/linux_gaming) for each dataset, so let's write a function that uses that count to balance our datasets for us.

In [13]:
def balance_df(df, count=None, random_state=42):
    """Return a shuffled frame with r/SteamDeck downsampled to `count` rows.

    All rows whose 'subreddit' is 'linux_gaming' are kept; rows whose
    'subreddit' is 'SteamDeck' are randomly sampled without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'subreddit' column containing 'linux_gaming' and
        'SteamDeck' labels.
    count : int, optional
        Number of 'SteamDeck' rows to keep. Defaults to the number of
        'linux_gaming' rows in `df`, which yields a 50/50 split.
    random_state : int, default 42
        Seed used for both the downsampling and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        The kept rows in shuffled order, with their original index labels.

    Raises
    ------
    ValueError
        Raised by pandas if `count` exceeds the number of 'SteamDeck' rows.
    """
    linux = df[df['subreddit'] == 'linux_gaming']
    steam_all = df[df['subreddit'] == 'SteamDeck']

    # With no explicit count, match the size of the linux_gaming subset.
    if count is None:
        count = len(linux)

    steam = steam_all.sample(n=count, random_state=random_state)

    # Concatenate the two subsets, then shuffle so the classes are interleaved.
    balanced = pd.concat([linux, steam])
    return balanced.sample(frac=1, random_state=random_state)

In [14]:
# Downsample r/SteamDeck in each frame to the r/linux_gaming row count so the
# two subreddits are evenly represented. Note that this rebinds `titles` and
# `comments`, so the unbalanced frames are no longer available afterwards.
titles = balance_df(titles, linux_title_count)
comments = balance_df(comments, linux_comment_count)

In [15]:
titles.head()

Unnamed: 0,title_id,title,subreddit,title_length,title_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
2345,133ybku,my icons won't go to portrait i've tried unins...,SteamDeck,208,39,0.057,0.914,0.029,-0.2732
3674,u1loln,i found the perfect stand for my steam deck,SteamDeck,44,9,0.0,0.654,0.346,0.5719
1183,12srbqu,controller issues on fedora 38,linux_gaming,30,5,0.0,1.0,0.0,0.0
100,r74av0,november marked 7 months of linux rising on st...,linux_gaming,69,14,0.0,1.0,0.0,0.0
6008,12qfc00,best option for 4 controllers,SteamDeck,29,5,0.0,0.417,0.583,0.6369


In [16]:
titles.shape

(4586, 9)

In [17]:
comments.head()

Unnamed: 0,comment_id,comments,subreddit,comment_length,comment_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
126187,janpo5s,no you'd still need to remove the original scr...,SteamDeck,79,15,0.126,0.743,0.131,0.0258
480760,ga2tive,i surely hope so hopefully someone can pressur...,linux_gaming,65,11,0.119,0.27,0.611,0.8439
420963,d906goj,i do blame them for everything that's wrong wi...,linux_gaming,210,38,0.225,0.683,0.093,-0.6124
310631,ifh5csf,no one person downloading roms is too trivial ...,SteamDeck,372,65,0.158,0.725,0.117,-0.6254
391488,iqh6cip,true not all games are competitive which is wh...,linux_gaming,657,109,0.151,0.713,0.136,-0.25


In [18]:
comments.shape

(327932, 9)

Even though we cut out a fair bit of data, we're still left with more than enough of what we would want.

## We'll First do Titles

In [19]:
# Features are the comment text; the target is the subreddit each comment
# came from. (y was previously set to the comment text as well, which left
# the model with no class label to learn.)
# NOTE(review): the section heading mentions titles, but this cell uses the
# comments frame - confirm which dataset is intended here.
X = comments['comments']
y = comments['subreddit']

# Stratify so the train and test sets both keep the 50/50 class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

## Pre-Processing

In [20]:
# Shared NLTK resources for text preprocessing: the English stop-word list,
# a WordNet lemmatizer, and a Porter stemmer.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [21]:
# Stop words as a frozenset so the per-token membership test in preprocess()
# is a hash lookup rather than a linear scan of the NLTK list.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess(text):
    """Normalise one document for the vectorizer.

    The text is tokenized with NLTK's `word_tokenize`; tokens found in the
    English stop-word list are dropped; each remaining token is lemmatized
    and then stemmed; the results are joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated processed tokens.
    """
    tokens = word_tokenize(text)
    # Single pass: filter, lemmatize, then stem each surviving token.
    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in _STOP_WORD_SET
    )

## Random Forest

In [25]:
def esitmate_gridsearch_time(model, param_grid:dict, cv:int=5, processors:int=6,
                             X=None, y=None, n_runs:int=5):
    """Print a rough wall-clock estimate for an exhaustive grid search.

    The model is fitted and scored `n_runs` times on the given data and the
    mean duration of one fit-plus-score is taken as the cost of a single
    model. That cost is multiplied by the number of parameter combinations
    times `cv`, divided by `processors`.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `score(X, y)`.
    param_grid : dict
        Mapping of parameter name to a sequence of candidate values.
    cv : int, default 5
        Number of cross-validation folds assumed per combination.
    processors : int, default 6
        Number of parallel workers assumed.
    X, y : optional
        Data used for the timing runs. When omitted, the notebook-level
        `X_train` / `y_train` are used.
    n_runs : int, default 5
        Number of timed fit-plus-score repetitions to average over.

    Returns
    -------
    None
        The estimate is printed as three numbers: hours, minutes, seconds.

    Raises
    ------
    ValueError
        If `cv`, `processors` or `n_runs` is less than 1.
    """
    if cv < 1 or processors < 1 or n_runs < 1:
        raise ValueError('cv, processors and n_runs must all be >= 1')

    # Fall back to the notebook-level training split when no data is passed.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    times = []
    for _ in range(n_runs):
        start = timeit.default_timer()
        model.fit(X, y)
        model.score(X, y)
        times.append(timeit.default_timer() - start)

    single_train_time = np.array(times).mean() # seconds

    # Size of the grid = product of the number of candidates per parameter.
    combos = 1
    for vals in param_grid.values():
        combos *= len(vals)

    # Total fits spread evenly over the workers (a linear approximation).
    num_models = combos * cv / processors
    seconds = num_models * single_train_time
    minutes = seconds / 60
    hours = minutes / 60

    print(hours, minutes, seconds)

In [22]:
# Bag-of-words features (at most 1,000 terms, text normalised by `preprocess`)
# feeding a random forest. The forest's random_state is pinned, matching the
# seed used for sampling and splitting in this notebook, so results are
# reproducible from run to run.
rf_pipe = Pipeline([
    ('cvec', CountVectorizer(preprocessor=preprocess, max_features=1000)),
    ('rf', RandomForestClassifier(random_state=42))
])

In [23]:
# Hyper-parameter grid for the 'rf' step of the pipeline.
rf_params = {
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [None, 1, 2, 3, 4, 5],
    'rf__min_samples_split': [2, 3, 4],
    'rf__min_samples_leaf': [1, 2, 3],
    # max_leaf_nodes must be None (unlimited) or an int >= 2; a value of 1 is
    # rejected by scikit-learn and would make every fit using it fail.
    'rf__max_leaf_nodes': [None, 2]
}

In [None]:
# Print a rough run-time estimate (hours, minutes, seconds) for grid-searching
# rf_params over rf_pipe, assuming the work is spread across 8 processors.
esitmate_gridsearch_time(rf_pipe, rf_params, processors=8)