# Modeling

In [179]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import timeit
import pickle

In [180]:
# Load the cleaned title and comment data for both subreddits from parquet files
titles = pd.read_parquet('../data/linux_gaming_SteamDeck_clean_title_data.parquet')
comments = pd.read_parquet('../data/linux_gaming_SteamDeck_clean_comment_data.parquet')

In [181]:
titles.head()

Unnamed: 0,title_id,title,subreddit,title_length,title_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
0,pu1k55,epic games announce full easy anti cheat for l...,linux_gaming,74,12,0.308,0.523,0.169,-0.34
1,9pkgwj,the struggles of being a modern age linux gamer,linux_gaming,48,9,0.263,0.737,0.0,-0.3612
2,q8trki,steam has banned all games that utilise blockc...,linux_gaming,100,15,0.176,0.824,0.0,-0.4588
3,ijat04,they didn't expect linux gaming in 2007,linux_gaming,39,7,0.0,1.0,0.0,0.0
4,re13t3,ltt are planning to include linux compatibilit...,linux_gaming,74,11,0.0,1.0,0.0,0.0


In [182]:
comments.head()

Unnamed: 0,comment_id,comments,subreddit,comment_length,comment_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
0,jibfcl5,fun fact the steam deck logo is just a pokebal...,SteamDeck,190,33,0.077,0.675,0.247,0.7783
1,jib4baj,no it's not the logo you see on the memory mak...,SteamDeck,635,90,0.077,0.889,0.034,-0.5994
2,jibacht,glad this isn't getting the traction you hoped...,SteamDeck,50,9,0.0,0.556,0.444,0.6808
3,jib5m0m,can't tell if you actually thought it was,SteamDeck,41,8,0.0,1.0,0.0,0.0
4,jib5qio,this was inspired by this because it looks lik...,SteamDeck,115,23,0.0,0.769,0.231,0.6908


In [183]:
titles.shape

(6305, 9)

In [184]:
comments.shape

(485125, 9)

In [185]:
titles['subreddit'].value_counts(normalize=True)

SteamDeck       0.63632
linux_gaming    0.36368
Name: subreddit, dtype: float64

In [186]:
comments['subreddit'].value_counts(normalize=True)

SteamDeck       0.662013
linux_gaming    0.337987
Name: subreddit, dtype: float64

## Model Balancing

We have lots of data, but the classes are imbalanced at roughly 2/3 r/SteamDeck to 1/3 r/linux_gaming. We can afford to drop some r/SteamDeck data to get an even 50-50 split between the two, using random sampling to choose which rows to keep. This will also help our models run a bit faster.

In [187]:
titles['subreddit'].value_counts()

SteamDeck       4012
linux_gaming    2293
Name: subreddit, dtype: int64

In [188]:
comments['subreddit'].value_counts()

SteamDeck       321159
linux_gaming    163966
Name: subreddit, dtype: int64

In [189]:
# Size of the least-represented subreddit; every class is down-sampled to this.
# .min() is used instead of value_counts()[1]: an integer key on a
# label-indexed Series relies on positional fallback, which pandas deprecates.
linux_title_count = titles['subreddit'].value_counts().min()

linux_title_count

2293

In [190]:
# Decided not to use this; instead the comment data is cut down much further because there is simply too much of it.

# linux_comment_count = comments['subreddit'].value_counts()[1]
# 
# linux_comment_count

We can simply store the number of observations in the less-represented class, so let's write a function that uses that count to balance our datasets for us.

In [191]:
def balance_df(df, count, label_col='subreddit',
               labels=('linux_gaming', 'SteamDeck'), random_state=42):
    """Down-sample ``df`` to ``count`` rows per class and shuffle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the class column ``label_col``.
    count : int
        Number of rows to draw (without replacement) from every class.
    label_col : str, default 'subreddit'
        Name of the column holding the class labels.
    labels : sequence of str, default ('linux_gaming', 'SteamDeck')
        Classes to keep; they are sampled and concatenated in this order.
    random_state : int, default 42
        Seed used for every sampling step and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        ``count * len(labels)`` rows in shuffled order. The original index
        labels are kept (the index is not reset).

    Raises
    ------
    ValueError
        If any requested class has fewer than ``count`` rows.
    """
    subsets = []
    for label in labels:
        rows = df[df[label_col] == label]
        # Fail early with a message that names the offending class
        if len(rows) < count:
            raise ValueError(
                f"Cannot sample {count} rows for {label_col}={label!r}: "
                f"only {len(rows)} available"
            )
        subsets.append(rows.sample(n=count, random_state=random_state))

    # concatenate the per-class subsets into a new DataFrame
    new_df = pd.concat(subsets)

    # shuffle so the classes are interleaved rather than stacked
    new_df = new_df.sample(frac=1, random_state=random_state)

    return new_df

In [192]:
# Titles: down-sample both subreddits to the size stored in linux_title_count
titles = balance_df(titles, linux_title_count)
# Comments: keep a fixed 100000 rows per subreddit instead of the full
# minority-class count, to keep the comment dataset manageable
comments = balance_df(comments, 100000)

In [193]:
titles.head()

Unnamed: 0,title_id,title,subreddit,title_length,title_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
2345,133ybku,my icons won't go to portrait i've tried unins...,SteamDeck,208,39,0.057,0.914,0.029,-0.2732
3674,u1loln,i found the perfect stand for my steam deck,SteamDeck,44,9,0.0,0.654,0.346,0.5719
2205,120zrem,overwatch mouse capture issue after respawn,linux_gaming,43,6,0.0,1.0,0.0,0.0
1292,12oa8w0,does the battery health optimization on the lo...,linux_gaming,80,12,0.0,0.809,0.191,0.3818
6008,12qfc00,best option for 4 controllers,SteamDeck,29,5,0.0,0.417,0.583,0.6369


In [194]:
titles.shape

(4586, 9)

In [195]:
comments.head()

Unnamed: 0,comment_id,comments,subreddit,comment_length,comment_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
226220,iaoauox,who said anything about quality they were obvi...,SteamDeck,132,22,0.0,0.752,0.248,0.765
326714,jibe9l7,good suggestion pts is awesome,linux_gaming,31,5,0.0,0.3,0.7,0.7906
135215,hzwhmk1,q3 and good i ve got some money burning a hole...,SteamDeck,171,32,0.0,0.906,0.094,0.4404
382285,hltpaht,instead of actually learning what foss and con...,linux_gaming,178,30,0.082,0.838,0.08,0.2396
349926,iknxkzk,do you have any plan thoughts about switching ...,linux_gaming,109,17,0.0,1.0,0.0,0.0


In [196]:
comments.shape

(200000, 9)

Even though we cut out a fair bit of data, we're still left with more than enough of what we would want.

## We'll First do Titles

In [197]:
# Features are the title text; the target is the subreddit label
X = titles['title']
y = titles['subreddit']

# No test_size is given, so train_test_split falls back to its default split;
# random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Pre-Processing

In [198]:
# Define stopwords, lemmatizer, and stemmer
# Module-level NLTK helpers read by preprocess(): the English stop-word list,
# a WordNet lemmatizer, and a Porter stemmer
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [199]:
# Define a function to preprocess each document
def preprocess(text):
    """Normalise one document: tokenize, drop stop words, lemmatize, then stem.

    Returns the surviving tokens joined back together with single spaces.
    """
    # Single pass per token: filter against stop_words, lemmatize, then stem
    kept = (
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in word_tokenize(text)
        if token not in stop_words
    )
    return ' '.join(kept)

In [200]:
# Bag-of-words features, capped at the 1000 most frequent vocabulary terms.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` replaces the
# built-in strip_accents/lowercase stage, so case-folding relies on the text
# already being lower-cased upstream — confirm.
cvec = CountVectorizer(preprocessor=preprocess, max_features=1000)

# Learn the vocabulary on the training split only, then reuse it for the test split
X_train_cv = cvec.fit_transform(X_train)
X_test_cv = cvec.transform(X_test)

### Timer Function

This is a neat little function I got from [here](https://datascience.stackexchange.com/a/117793) that helps us estimate how long a gridsearch will take.

In [201]:
def esitmate_gridsearch_time(model, param_grid:dict, cv:int=5, processors:int=6,
                             X=None, y=None, n_runs:int=5):
    """Print a rough estimate of how long a grid search over ``param_grid`` takes.

    The model is fitted and scored ``n_runs`` times; the mean wall-clock time
    of one fit+score is multiplied by the number of fits in the search
    (candidates * folds) and divided by the number of parallel workers.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in place.
    param_grid : dict or list of dict
        Parameter grid. For a list of grids, the candidate counts of the
        individual grids are summed.
    cv : int, default 5
        Number of cross-validation folds.
    processors : int, default 6
        Number of workers the search is assumed to be spread over.
    X, y : optional
        Data to time on. When omitted, the notebook-level ``X_train_cv`` and
        ``y_train`` are used, so existing calls keep working.
    n_runs : int, default 5
        Number of timing repetitions to average over.

    Returns
    -------
    None
        The estimate is printed as ``hours minutes seconds`` — the same
        duration expressed in three units.
    """
    # Fall back to the notebook globals only when no data is passed explicitly
    if X is None:
        X = X_train_cv
    if y is None:
        y = y_train

    times = []
    for _ in range(n_runs):
        start = timeit.default_timer()
        model.fit(X, y)
        model.score(X, y)
        times.append(timeit.default_timer() - start)

    single_train_time = np.array(times).mean() # seconds

    # Accept either a single grid or a list of grids
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    combos = 0
    for grid in grids:
        grid_combos = 1
        for vals in grid.values():
            grid_combos *= len(vals)
        combos += grid_combos

    num_models = combos * cv / processors
    seconds = num_models * single_train_time
    minutes = seconds / 60
    hours = minutes / 60

    print(hours, minutes, seconds)

## Random Forest

In [202]:
# Single-step pipeline; the step name 'rf' is the prefix used by the rf__* grid keys
rf_pipe = Pipeline([
    ('rf', RandomForestClassifier())
])

In [203]:
# Hyper-parameter grid for the 'rf' step: 3 * 6 * 3 * 3 * 1 * 1 = 162 candidates
rf_params = {
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [None, 1, 2, 3, 4, 5],
    'rf__min_samples_split': [2, 3, 4],
    'rf__min_samples_leaf': [1, 2, 3],
    # NOTE(review): max_leaf_nodes=2 caps every tree at a single split, so the
    # depth/split/leaf values above likely have little or no effect — confirm
    # this cap is intended
    'rf__max_leaf_nodes': [2],
    'rf__random_state': [42]
}

In [204]:
# esitmate_gridsearch_time(rf_pipe, rf_params, processors=8)

In [205]:
# Exhaustive search over rf_params; n_jobs=-1 requests all available cores and
# cv is left at its default.
# NOTE(review): the pipeline holds only the classifier, so the CountVectorizer
# is fitted once on the whole training split rather than per CV fold.
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, verbose=1, n_jobs=-1)

In [206]:
# Run the random-forest grid search on the vectorized training data
rf_gs.fit(X_train_cv, y_train)

# Persist the fitted search. pickle.dump returns None, so its result is not
# assigned — assigning it would rebind the open file handle's name to None.
with open('../models/random_forest_gridsearch_title_model.pkl', 'wb') as pickle_out:
    pickle.dump(rf_gs, pickle_out)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [207]:
rf_gs.best_estimator_

In [208]:
rf_gs.best_score_

0.743530601536847

In [209]:
rf_gs.best_params_

{'rf__max_depth': None,
 'rf__max_leaf_nodes': 2,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 200,
 'rf__random_state': 42}

In [210]:
rf_gs.score(X_train_cv, y_train), rf_gs.score(X_test_cv, y_test)

(0.72637394591451, 0.7027027027027027)

## AdaBoost

In [211]:
# Single-step pipeline; the step name 'ada' is the prefix used by the ada__* grid keys
ada_pipe = Pipeline([
    ('ada', AdaBoostClassifier())
])

In [212]:
# Hyper-parameter grid for the 'ada' step: 3 * 4 * 1 = 12 candidates
ada_params = {
    'ada__n_estimators': [100, 150, 200],
    # NOTE(review): learning rates of 5 and 10 look unusually large for boosting — confirm they are intended
    'ada__learning_rate': [0.1, 1, 5, 10],
    'ada__random_state': [42]
}

In [213]:
# esitmate_gridsearch_time(ada_pipe, ada_params, processors=8)

In [214]:
# Exhaustive search over ada_params; n_jobs=-1 requests all available cores and
# cv is left at its default
ada_gs = GridSearchCV(ada_pipe, param_grid=ada_params, verbose=1, n_jobs=-1)

In [215]:
# Run the AdaBoost grid search on the vectorized training data
ada_gs.fit(X_train_cv, y_train)

# Persist ada_gs — the search fitted just above — so the file's contents match
# its AdaBoost name. pickle.dump returns None, so its result is not assigned.
with open('../models/ada_boost_gridsearch_title_model.pkl', 'wb') as pickle_out:
    pickle.dump(ada_gs, pickle_out)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


## Comments

In [216]:
# Switch to the comment text. This rebinds X, y and the train/test split
# variables that previously held the title data.
X = comments['comments']
y = comments['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [217]:
# Refit the same CountVectorizer on the comment training split: its previously
# learned vocabulary is replaced and X_train_cv / X_test_cv are rebound
X_train_cv = cvec.fit_transform(X_train)
X_test_cv = cvec.transform(X_test)

In [218]:
# Refit the random-forest grid search on the current (comment) training data;
# this overwrites the fitted state rf_gs held before
rf_gs.fit(X_train_cv, y_train)

# Persist the fitted search. pickle.dump returns None, so its result is not
# assigned — assigning it would rebind the open file handle's name to None.
with open('../models/random_forest_gridsearch_comment_model.pkl', 'wb') as pickle_out:
    pickle.dump(rf_gs, pickle_out)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [219]:
rf_gs.best_score_

0.6954

In [220]:
rf_gs.best_params_

{'rf__max_depth': None,
 'rf__max_leaf_nodes': 2,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 200,
 'rf__random_state': 42}

In [221]:
rf_gs.score(X_train_cv, y_train), rf_gs.score(X_test_cv, y_test)

(0.6950466666666667, 0.69132)

In [222]:
# Refit the AdaBoost grid search on the current (comment) training data
ada_gs.fit(X_train_cv, y_train)

# Persist ada_gs — the search fitted just above — so the file's contents match
# its AdaBoost name. pickle.dump returns None, so its result is not assigned.
with open('../models/ada_boost_gridsearch_comment_model.pkl', 'wb') as pickle_out:
    pickle.dump(ada_gs, pickle_out)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
