# Modeling Comments

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import timeit
import pickle

In [2]:
# Load the cleaned r/linux_gaming + r/SteamDeck comment data (path is relative to this notebook).
df = pd.read_parquet('../data/linux_gaming_SteamDeck_clean_comment_data.parquet')

In [4]:
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,comment_id,comments,subreddit,comment_length,comment_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
0,jibfcl5,fun fact the steam deck logo is just a pokebal...,SteamDeck,190,33,0.077,0.675,0.247,0.7783
1,jib4baj,no it's not the logo you see on the memory mak...,SteamDeck,635,90,0.077,0.889,0.034,-0.5994
2,jibacht,glad this isn't getting the traction you hoped...,SteamDeck,50,9,0.0,0.556,0.444,0.6808
3,jib5m0m,can't tell if you actually thought it was,SteamDeck,41,8,0.0,1.0,0.0,0.0
4,jib5qio,this was inspired by this because it looks lik...,SteamDeck,115,23,0.0,0.769,0.231,0.6908


In [5]:
# (rows, columns) of the full dataset, before any balancing.
df.shape

(485125, 9)

In [6]:
# Share of comments per subreddit, to see how imbalanced the two classes are.
df['subreddit'].value_counts(normalize=True)

SteamDeck       0.662013
linux_gaming    0.337987
Name: subreddit, dtype: float64

## Model Balancing

We have lots of data, with an imbalance of roughly 2/3 r/SteamDeck to 1/3 r/linux_gaming. We can afford to drop some r/SteamDeck comments to get an even 50-50 split between the two classes, using random sampling to choose which rows to keep. This will also help our models run a bit faster.

In [14]:
# Absolute comment counts per subreddit; the smaller class sets the per-class sample size.
df['subreddit'].value_counts()

SteamDeck       321159
linux_gaming    163966
Name: subreddit, dtype: int64

In [15]:
# Number of r/linux_gaming comments (the minority class); used as the per-class
# sample size when balancing. The count is looked up by label: indexing the
# value_counts() result with [1] depends on the sort order and on pandas'
# deprecated positional fallback for integer keys on a string-labelled Series.
linux_comment_count = df['subreddit'].value_counts()['linux_gaming']

linux_comment_count

163966

In [16]:
def balance_df(df, count, random_state=42):
    """Return a shuffled frame holding ``count`` rows from each subreddit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'subreddit'`` column containing the labels
        ``'linux_gaming'`` and ``'SteamDeck'``.
    count : int
        Number of rows to draw (without replacement) from each subreddit.
    random_state : int, default 42
        Seed used for both per-class draws and for the final shuffle. The
        default reproduces the previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame of ``2 * count`` rows in shuffled order. Original index
        labels are kept and the input frame is not modified.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``count`` is larger than
        the number of rows available for a subreddit.
    """
    per_class_samples = [
        df[df['subreddit'] == label].sample(n=count, random_state=random_state)
        for label in ('linux_gaming', 'SteamDeck')
    ]

    # Stack the two equally sized samples, then shuffle so the classes are
    # interleaved rather than stored as two contiguous runs.
    return pd.concat(per_class_samples).sample(frac=1, random_state=random_state)

In [17]:
# Replace df with the balanced, shuffled sample (equal rows per subreddit).
# NOTE: this overwrites the full dataset; re-run the load cell to get it back.
df = balance_df(df, linux_comment_count)

In [18]:
# Preview after balancing; index labels are non-sequential because rows were sampled and shuffled.
df.head()

Unnamed: 0,comment_id,comments,subreddit,comment_length,comment_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
126187,janpo5s,no you'd still need to remove the original scr...,SteamDeck,79,15,0.126,0.743,0.131,0.0258
331670,jhff7k0,add proton_log 1 command as launch option for ...,linux_gaming,368,61,0.0,0.931,0.069,0.5574
468563,gz2sv56,well iirc it doesn't say that it disables half...,linux_gaming,300,56,0.07,0.866,0.064,-0.34
310631,ifh5csf,no one person downloading roms is too trivial ...,SteamDeck,372,65,0.158,0.725,0.117,-0.6254
453337,hp2q4yp,oh yeah you're right i saw the zen 2 and i gue...,linux_gaming,230,46,0.098,0.805,0.096,-0.0258


In [19]:
# Sanity check: expect 2 * linux_comment_count rows after balancing.
df.shape

(327932, 9)

In [19]:
# Features are the comment text and the target is the subreddit label.
# The frame has no 'title' column; the text lives in 'comments'.
X = df['comments']
y = df['subreddit']

# Fixed seed keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
# Define stopwords, lemmatizer, and stemmer.
# The stop words are stored as a set so the per-token membership test in
# preprocess() is a hash lookup rather than a linear scan of a list.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [21]:
# Define a function to preprocess each document
def preprocess(text):
    """Normalise one document before it is vectorized.

    The text is split with ``word_tokenize``; tokens found in ``stop_words``
    are dropped, and every remaining token is lemmatized and then stemmed.
    The resulting tokens are returned joined by single spaces.
    """
    # Filter first so only the surviving tokens are lemmatized and stemmed.
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    normalised = (stemmer.stem(lemmatizer.lemmatize(token)) for token in kept_tokens)
    return ' '.join(normalised)

In [22]:
# Bag-of-words vectorizer limited to 2000 features; each document is first
# passed through preprocess().
# NOTE(review): per the scikit-learn docs a custom `preprocessor` replaces the
# built-in lowercase/strip_accents step — confirm the comments are already lowercased.
cvec = CountVectorizer(preprocessor=preprocess, max_features=2000)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_cv = cvec.fit_transform(X_train)
X_test_cv = cvec.transform(X_test)

In [23]:
def esitmate_gridsearch_time(model, param_grid:dict, cv:int=5, processors:int=6):
    """Print a rough wall-clock estimate for grid searching ``model``.

    The model is fitted and scored five times on the notebook-level
    ``X_train_cv`` / ``y_train`` and the mean duration of one fit+score is
    taken as the cost of a single model. That cost is multiplied by the number
    of parameter combinations in ``param_grid`` times ``cv`` and divided by
    ``processors``.

    Prints the estimate as ``hours minutes seconds`` (three alternative units
    for the same duration) and returns nothing.
    """
    def timed_fit_and_score():
        # Duration of one fit followed by one score on the training data.
        began = timeit.default_timer()
        model.fit(X_train_cv, y_train)
        model.score(X_train_cv, y_train)
        return timeit.default_timer() - began

    durations = [timed_fit_and_score() for _ in range(5)]
    single_train_time = np.mean(durations)  # seconds

    # Size of the grid: product of the number of candidates per parameter.
    n_combinations = 1
    for candidates in param_grid.values():
        n_combinations = n_combinations * len(candidates)

    # Total fits, spread evenly across the available processors.
    fits_per_processor = n_combinations * cv / processors
    seconds = fits_per_processor * single_train_time
    minutes = seconds / 60
    hours = minutes / 60

    print(hours, minutes, seconds)

In [38]:
# Features are the comment text and the target is the subreddit label.
# The balanced frame in this notebook is named `df`; no `comments` frame is
# defined, so referencing one fails on a fresh kernel.
X = df['comments']
y = df['subreddit']

# Fixed seed keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [39]:
# Re-fit the vectorizer on the current training split; the test split is only
# transformed so it shares the training vocabulary.
X_train_cv = cvec.fit_transform(X_train)
X_test_cv = cvec.transform(X_test)

In [40]:
# Run the random forest grid search on the vectorized training data.
rf_gs.fit(X_train_cv, y_train)

# Persist the fitted search object. pickle.dump() returns None, so its result
# is deliberately not assigned back to the file handle name.
with open('../models/random_forest_gridsearch_comment_model.pkl', 'wb') as pickle_out:
    pickle.dump(rf_gs, pickle_out)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


KeyboardInterrupt: 

In [None]:
# Best score found by the random forest search (presumably a GridSearchCV — rf_gs is defined elsewhere).
rf_gs.best_score_

In [None]:
# Hyperparameter combination that produced the best score.
rf_gs.best_params_

In [None]:
# (train score, test score) side by side; a large gap would point to overfitting.
rf_gs.score(X_train_cv, y_train), rf_gs.score(X_test_cv, y_test)

In [None]:
# Run the AdaBoost search on the vectorized training data.
ada_gs.fit(X_train_cv, y_train)

# Persist the fitted AdaBoost search: this file must hold ada_gs, not rf_gs.
# pickle.dump() returns None, so its result is not assigned to anything.
with open('../models/ada_boost_gridsearch_comment_model.pkl', 'wb') as pickle_out:
    pickle.dump(ada_gs, pickle_out)