# Modeling Titles

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import timeit
import pickle

In [2]:
# Load the cleaned title dataset; the path is relative to the notebook's working directory.
df = pd.read_parquet('../data/linux_gaming_SteamDeck_clean_title_data.parquet')

In [3]:
df.head()

Unnamed: 0,title_id,title,subreddit,title_length,title_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
0,pu1k55,epic games announce full easy anti cheat for l...,linux_gaming,74,12,0.308,0.523,0.169,-0.34
1,9pkgwj,the struggles of being a modern age linux gamer,linux_gaming,48,9,0.263,0.737,0.0,-0.3612
2,q8trki,steam has banned all games that utilise blockc...,linux_gaming,100,15,0.176,0.824,0.0,-0.4588
3,ijat04,they didn't expect linux gaming in 2007,linux_gaming,39,7,0.0,1.0,0.0,0.0
4,re13t3,ltt are planning to include linux compatibilit...,linux_gaming,74,11,0.0,1.0,0.0,0.0


In [4]:
df.shape

(6305, 9)

In [5]:
df['subreddit'].value_counts(normalize=True)

SteamDeck       0.63632
linux_gaming    0.36368
Name: subreddit, dtype: float64

## Model Balancing

We have plenty of data, but the classes are imbalanced: roughly 2/3 of the titles come from r/SteamDeck and 1/3 from r/linux_gaming. We can afford to drop some r/SteamDeck rows to get an even 50-50 split between the two, using random sampling to choose which rows to keep. A smaller dataset will also help our models run a bit faster. 

In [6]:
df['subreddit'].value_counts()

SteamDeck       4012
linux_gaming    2293
Name: subreddit, dtype: int64

In [7]:
# Look the count up by label rather than by position: `value_counts()[1]` relies
# on the positional fallback for integer keys on a string-labelled Series, which
# pandas has deprecated, and it would silently pick the wrong class if the
# ordering of the counts ever changed.
linux_title_count = df['subreddit'].value_counts()['linux_gaming']

linux_title_count

2293

We can store the number of observations in the less-represented class and use it as the per-class sample size, so let's write a function that balances the dataset for us.

In [8]:
def balance_df(df, count=None, random_state=42):
    """Return a shuffled frame with the same number of rows from each subreddit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``subreddit`` column holding the labels
        ``'linux_gaming'`` and ``'SteamDeck'``.
    count : int, optional
        Rows to draw from each subreddit.  When omitted, the size of the
        smaller of the two classes is used, so no row has to be repeated.
    random_state : int, optional
        Seed used for both per-class samples and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        ``2 * count`` rows in shuffled order; original index labels are kept.

    Raises
    ------
    ValueError
        If ``count`` is larger than the smaller class, since rows are sampled
        without replacement.
    """
    linux_rows = df[df['subreddit'] == 'linux_gaming']
    steam_rows = df[df['subreddit'] == 'SteamDeck']

    smallest = min(len(linux_rows), len(steam_rows))
    if count is None:
        count = smallest
    elif count > smallest:
        # Fail with an explicit message instead of pandas' generic sampling error.
        raise ValueError(
            f'count={count} exceeds the smallest class size ({smallest}); '
            'cannot sample without replacement'
        )

    linux = linux_rows.sample(n=count, random_state=random_state)
    steam = steam_rows.sample(n=count, random_state=random_state)

    # Concatenate the two subsets, then shuffle so the classes are interleaved.
    new_df = pd.concat([linux, steam])
    return new_df.sample(frac=1, random_state=random_state)

In [9]:
# Rebind `df` to the balanced sample: `linux_title_count` rows drawn from each subreddit.
df = balance_df(df, linux_title_count)

In [10]:
df.head()

Unnamed: 0,title_id,title,subreddit,title_length,title_word_count,negative_sentiment,neutral_sentiment,positive_sentiment,compund_sentiment
2345,133ybku,my icons won't go to portrait i've tried unins...,SteamDeck,208,39,0.057,0.914,0.029,-0.2732
3674,u1loln,i found the perfect stand for my steam deck,SteamDeck,44,9,0.0,0.654,0.346,0.5719
2205,120zrem,overwatch mouse capture issue after respawn,linux_gaming,43,6,0.0,1.0,0.0,0.0
1292,12oa8w0,does the battery health optimization on the lo...,linux_gaming,80,12,0.0,0.809,0.191,0.3818
6008,12qfc00,best option for 4 controllers,SteamDeck,29,5,0.0,0.417,0.583,0.6369


In [11]:
df.shape

(4586, 9)

Even though we cut out a fair bit of data, we're still left with more than enough of what we would want.

## Train Test Split

In [12]:
# Features are the title strings; the target is the subreddit label.
X = df['title']
y = df['subreddit']

# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Pre-Processing

In [13]:
# Shared text-normalisation resources read by `preprocess`:
# NLTK's English stop-word list, a WordNet lemmatizer, and a Porter stemmer.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [14]:
def preprocess(text):
    """Normalise one document before it is vectorised.

    The text is tokenised with NLTK, tokens found in ``stop_words`` are
    dropped, and each remaining token is lemmatised and then stemmed.  The
    surviving tokens are returned joined by single spaces.
    """
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    normalised = [stemmer.stem(lemmatizer.lemmatize(token)) for token in kept_tokens]
    return ' '.join(normalised)

In [15]:
# Bag-of-words vectorizer that runs `preprocess` on every title and caps the
# vocabulary at 2000 features.
# NOTE(review): supplying a custom preprocessor may bypass CountVectorizer's own
# lower-casing step — confirm the titles are already lower-case.
cvec = CountVectorizer(preprocessor=preprocess, max_features=2000)

# Learn the vocabulary from the training titles only, then reuse it for the test titles.
X_train_cv = cvec.fit_transform(X_train)
X_test_cv = cvec.transform(X_test)

### Timer Function

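This is a neat little function adapted from [this Stack Exchange answer](https://datascience.stackexchange.com/a/117793) that helps us estimate how long a grid search will take: it times a few plain fits of the model and scales that by the number of parameter combinations and cross-validation folds, divided by the number of processors.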

In [16]:
def esitmate_gridsearch_time(model, param_grid:dict, cv:int=5, processors:int=6, X=None, y=None):
    """Roughly estimate how long a grid search over ``param_grid`` will take.

    The model is fitted and scored five times with its current parameters; the
    mean wall-clock time of one fit+score is then multiplied by the number of
    parameter combinations and folds and divided by the number of processors.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``.  It is fitted in
        place by the timing runs.
    param_grid : dict
        Maps parameter names to the list of values to be searched.
    cv : int
        Number of cross-validation folds the search will use.
    processors : int
        Number of fits assumed to run in parallel.
    X, y : optional
        Data used for the timing runs.  When omitted, the notebook-level
        ``X_train_cv`` / ``y_train`` variables are used.

    Returns
    -------
    tuple
        ``(hours, minutes, seconds)`` — the same estimate in three units.
        The three numbers are also printed in that order.

    Raises
    ------
    ValueError
        If ``cv`` or ``processors`` is smaller than 1.
    """
    # Validate before the (potentially slow) timing runs.
    if cv < 1 or processors < 1:
        raise ValueError('cv and processors must both be at least 1')

    # Fall back to the notebook's vectorised training data so existing calls keep working.
    if X is None:
        X = X_train_cv
    if y is None:
        y = y_train

    times = []
    for _ in range(5):
        start = timeit.default_timer()
        model.fit(X, y)
        model.score(X, y)
        times.append(timeit.default_timer() - start)

    single_train_time = np.array(times).mean() # seconds

    # Size of the full Cartesian product of the grid (1 for an empty grid).
    combos = 1
    for vals in param_grid.values():
        combos *= len(vals)

    num_models = combos * cv / processors
    seconds = num_models * single_train_time
    minutes = seconds / 60
    hours = minutes / 60

    print(hours, minutes, seconds)
    return hours, minutes, seconds

## ADA Boosted Random Forest

In [17]:
# AdaBoost ensemble whose base learner is a default-configured RandomForestClassifier.
ada_rf = AdaBoostClassifier(estimator=RandomForestClassifier())

# List the available parameter names (base-learner ones carry an `estimator__` prefix)
# so the search grid can be written against them.
ada_rf.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': 'deprecated',
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'sqrt',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(),
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [18]:
# Search space for the boosted random forest.  Keys prefixed with `estimator__`
# target the RandomForestClassifier base learner; the remaining keys configure
# AdaBoost itself.  Both random_state entries are single-valued so that every
# candidate is fitted with the same seed.
ada_rf_params = dict(
    estimator__n_estimators=[100, 150],
    estimator__max_depth=[1, 2, 3],
    estimator__min_samples_split=[2, 3, 4],
    estimator__min_samples_leaf=[1, 2, 3],
    estimator__random_state=[42],
    learning_rate=[0.1, 1, 5],
    n_estimators=[50, 100],
    random_state=[42],
)

In [19]:
# esitmate_gridsearch_time(ada_rf, ada_rf_params, processors=8)

In [20]:
# Grid search over `ada_rf_params`.  No `cv` argument is given, so GridSearchCV's
# default cross-validation is used; n_jobs=-1 requests parallel fitting
# (scikit-learn: use all available processors).
ada_rf_gs = GridSearchCV(ada_rf, param_grid=ada_rf_params, verbose=1, n_jobs=-1)

In [21]:
# Run the search on the vectorised training titles.
ada_rf_gs.fit(X_train_cv, y_train)

In [60]:
# Persist the fitted grid search so it can be reloaded without repeating the search.
# pickle.dump returns None, so its result must not be assigned back to the file handle.
with open('../models/ada_boost_random_forest_gridsearch_title_model.pkl', 'wb') as pickle_out:
    pickle.dump(ada_rf_gs, pickle_out)

In [22]:
ada_rf_gs.best_estimator_

In [23]:
ada_rf_gs.best_score_

0.8243707897498392

In [24]:
ada_rf_gs.best_params_

{'estimator__max_depth': 3,
 'estimator__min_samples_leaf': 2,
 'estimator__min_samples_split': 2,
 'estimator__n_estimators': 150,
 'estimator__random_state': 42,
 'learning_rate': 1,
 'n_estimators': 100,
 'random_state': 42}

In [26]:
ada_rf_gs.score(X_train_cv, y_train), ada_rf_gs.score(X_test_cv, y_test)

(0.8973538819424252, 0.8160418482999128)

## ADA Boosted Logistic Regression

In [27]:
# AdaBoost ensemble whose base learner is a default-configured LogisticRegression.
ada_log = AdaBoostClassifier(estimator=LogisticRegression())

In [52]:
# Search space for the boosted logistic regression.  Keys prefixed with
# `estimator__` target the LogisticRegression base learner; the remaining keys
# configure AdaBoost itself.
#
# 'elasticnet' is deliberately not searched: it needs an explicit
# `estimator__l1_ratio`, and without one every elastic-net candidate fails to
# fit, adding only wasted fits and NaN scores to the search.  To explore it,
# add it back together with an `estimator__l1_ratio` grid.
ada_log_params = {
    'estimator__penalty': ['l1', 'l2', None],
    'estimator__solver': ['saga'],
    'estimator__C': [0.1, 1, 10],
    'estimator__random_state': [42],
    'estimator__max_iter': [10000],
    'learning_rate': [0.1, 1, 5],
    'n_estimators': [50, 100],
    'random_state': [42]
}

In [53]:
# esitmate_gridsearch_time(ada_log, ada_log_params, processors=8)

In [54]:
# Grid search over `ada_log_params`.  No `cv` argument is given, so GridSearchCV's
# default cross-validation is used; n_jobs=-1 requests parallel fitting
# (scikit-learn: use all available processors).
ada_log_gs = GridSearchCV(ada_log, param_grid=ada_log_params, verbose=1, n_jobs=-1)

In [55]:
# Run the search on the vectorised training titles.
ada_log_gs.fit(X_train_cv, y_train)

In [59]:
# Persist the fitted grid search so it can be reloaded without repeating the search.
# pickle.dump returns None, so its result must not be assigned back to the file handle.
with open('../models/ada_boost_log_gridsearch_title_model.pkl', 'wb') as pickle_out:
    pickle.dump(ada_log_gs, pickle_out)

In [56]:
ada_log_gs.best_score_

0.8252428827730949

In [57]:
ada_log_gs.best_params_

{'estimator__C': 10,
 'estimator__max_iter': 10000,
 'estimator__penalty': 'l2',
 'estimator__random_state': 42,
 'estimator__solver': 'saga',
 'learning_rate': 1,
 'n_estimators': 100,
 'random_state': 42}

In [58]:
ada_log_gs.score(X_train_cv, y_train), ada_log_gs.score(X_test_cv, y_test)

(0.902587961616749, 0.8221447253705318)