In [1]:
import pickle
import pandas as pd
import numpy as np
import re

from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

In [2]:
# Read in the test data used to evaluate models.
# print_test_score() reads the 'all_text_data' column as model input and the
# 'subreddit' column as the true label from this frame.
test_df = pd.read_csv("./data/Processed/increment_train_size/5_to_20_words_preprocessed_TEST.csv")

# Preview the first rows as a quick sanity check of the loaded columns.
test_df.head()

Unnamed: 0,created_utc,all_text_data,subreddit
0,1532973281,tron justin sun stop short reveal secret project,1
1,1550248670,reuters hsbc forex trading cost cut sharply bl...,1
2,1612224578,put expire 5th think clown brain read,0
3,1543091767,toilet french start name trone concidence,1
4,1571098829,apollo 11 jpeg yes roomba ltolgt microsoft wor...,1


# Helper Functions

In [3]:
def print_test_score(test_df, model, params):
    """Print a model's parameters and its score on the held-out test data.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain the 'all_text_data' column (passed to the model as input)
        and the 'subreddit' column (passed as the true labels).
    model : object
        Any object exposing ``score(X, y)``; whatever it returns is printed.
    params : object
        Printed as-is under the "Model params" heading (purely informational).

    Returns
    -------
    None
        The report is written to stdout.
    """
    rule = "=================================================================================================="

    features = test_df['all_text_data']
    labels = test_df['subreddit']

    print(rule)
    print(f"Model params:\n {params}\n")

    # Score only after the header has been printed so that anything the model
    # itself writes while scoring lands in the same position as before.
    test_score = model.score(features, labels)
    print(f"Test set score: {test_score}")
    print(rule + "\n")

In [4]:
# ===========================================================================================================================
# Helper function used to convert a gridsearch output into a pandas dataframe with the columns formatted the way I like them.
# ===========================================================================================================================
def gs_to_clean_df(search_results, keep_split = False, keep_std = False, keep_time = False, keep_params = False, sort_by=None, make_RMSE=False):
    """Turn a grid-search ``cv_results_`` mapping into a tidy DataFrame.

    Columns are first filtered, then renamed:

    * hyper-parameter columns (``param_<step>__<name>``) lose the ``param_<step>``
      prefix, e.g. ``param_Tfidf_Vect__ngram_range`` -> ``ngram_range``; a column
      without a ``__`` separator (``param_C``) just loses ``param_``;
    * ``_test`` is removed from metric columns, e.g. ``mean_test_score`` ->
      ``mean_score``;
    * every other kept column keeps its original name.

    Parameters
    ----------
    search_results : dict-like
        Anything ``pd.DataFrame(...)`` accepts; column labels must be strings.
    keep_split : bool, default False
        Keep the per-fold columns (names starting with ``split``).
    keep_std : bool, default False
        Keep columns starting with ``std``. Timing columns are governed by
        ``keep_time`` instead, because the ``time`` check runs first.
    keep_time : bool, default False
        Keep non-parameter columns whose name contains ``time``.
    keep_params : bool, default False
        Keep the ``params`` column (the raw parameter dicts).
    sort_by : str or None, default None
        Cleaned column name to sort by (ascending, index reset). Only names
        starting with ``rank`` or ``mean`` are honoured; anything else is
        silently ignored.
    make_RMSE : bool, default False
        Add ``mean_RMSE = sqrt(|mean_MSE|)``. Requires a ``mean_MSE`` column to
        exist after renaming.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``search_results`` is not modified.

    Raises
    ------
    KeyError
        If ``make_RMSE`` is true and there is no ``mean_MSE`` column.
    """

    def _is_param_column(column_name):
        # 'params' holds the raw dicts and is handled by its own flag.
        return column_name.startswith('param') and column_name != 'params'

    def _is_kept(column_name):
        # Hyper-parameter columns are always kept. Checking them first stops a
        # parameter whose name merely contains "time" from being discarded by
        # the timing filter below.
        if _is_param_column(column_name):
            return True
        if column_name.startswith('split'):
            return bool(keep_split)
        if 'time' in column_name:
            return bool(keep_time)
        if column_name.startswith('std'):
            return bool(keep_std)
        if column_name == 'params':
            return bool(keep_params)
        return True

    def _clean_name(column_name):
        if _is_param_column(column_name):
            components = column_name.split('__')[1:]
            if components:
                cleaned = '_'.join(components).lstrip('_')
            elif column_name.startswith('param_'):
                # Plain (non-pipeline) estimator: there is no step prefix.
                cleaned = column_name[len('param_'):]
            else:
                cleaned = column_name
            # Never hand back an empty label.
            return cleaned or column_name
        if '_test' in column_name:
            return column_name.replace('_test', '')
        # Columns matching no rule (timings, 'params', train scores, ...) keep
        # their name instead of being renamed to an empty string.
        return column_name

    gs_results_df = pd.DataFrame(search_results)

    throw_away_columns = [name for name in gs_results_df.columns if not _is_kept(name)]
    gs_results_df = gs_results_df.drop(columns=throw_away_columns)

    renaming_dict = {name: _clean_name(name) for name in gs_results_df.columns}
    gs_results_df = gs_results_df.rename(columns=renaming_dict)

    # Only rank/mean columns are accepted as sort keys.
    valid_metrics = [name for name in renaming_dict.values()
                     if name.startswith('rank') or name.startswith('mean')]

    if sort_by in valid_metrics:
        gs_results_df = gs_results_df.sort_values(by=sort_by, ignore_index=True)

    if make_RMSE:
        # abs() because the stored MSE may be negated (the original code took
        # the absolute value before the square root as well).
        gs_results_df['mean_RMSE'] = gs_results_df['mean_MSE'].abs() ** (1/2)

    return gs_results_df


In [5]:
# ===============================================================================================
# Report how many words the longest and the shortest post in the dataset contain.
# ===============================================================================================
def print_word_counts(df):
    """Print the word counts of the longest and shortest posts.

    The input frame is deep-copied, so the caller's data is left untouched.
    'all_text_data' is cast to ``str`` and a word is any whitespace-separated
    token.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'all_text_data' column.

    Returns
    -------
    pandas.DataFrame
        The copy, with 'all_text_data' cast to str and a 'word_count' column
        added.
    """
    counted = df.copy(deep=True)

    text = counted['all_text_data'].astype(str)
    counted['all_text_data'] = text
    counted['word_count'] = text.apply(lambda post: len(post.split()))

    longest = counted['word_count'].max()
    shortest = counted['word_count'].min()

    print(f"Maximum length post: {longest} words")
    print(f"Minimum length post: {shortest} words")

    return counted

In [6]:
# ===============================================================================================
# Takes a dataframe of reddit posts, prints a handful of sanity checks about it, and returns the
# text data and the target as two separate arrays.
#
# The checks are meant as a last look at the data before any model is built on it.
# ===============================================================================================
def data_check(train_df):
    """Print sanity checks for a training frame and split it into X and y.

    Printed, in order: the normalised class distribution of 'subreddit', the
    number of missing 'all_text_data' values, the number of fully duplicated
    rows, the number of rows, and the longest/shortest post lengths (via
    ``print_word_counts``).

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the 'all_text_data' and 'subreddit' columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the 'all_text_data' values and the flattened 'subreddit'
        values.
    """
    class_balance = train_df['subreddit'].value_counts(normalize=True)
    print(f"Distribution:\n {class_balance}\n")

    missing_text = train_df['all_text_data'].isna().sum()
    print(f"Missing values: {missing_text}\n")

    duplicate_rows = train_df.duplicated().sum()
    print(f"Number of duplicates in training set: {duplicate_rows}\n")

    sample_count = len(train_df.index)
    print(f"Number of samples in training set: {sample_count}\n")

    features = train_df['all_text_data'].to_numpy()
    target = train_df['subreddit'].to_numpy().ravel()

    # Called only for its printed summary; the frame it returns is discarded.
    print_word_counts(train_df)

    return features, target

## 5000-sample grid searches

The data includes only posts that are 5 to 20 words long.

The models in this section are trained on only 5,000 samples.

In [7]:
# Load the 5000-sample training split used by every grid search in this section.
train_df = pd.read_csv("./data/Processed/increment_train_size/train5000/train_5000.csv")

# Print the sanity checks and split the frame into the text inputs (X) and the labels (y).
X, y = data_check(train_df)

Distribution:
 0    0.5026
1    0.4974
Name: subreddit, dtype: float64

Missing values: 0

Number of duplicates in training set: 0

Number of samples in training set: 5000

Maximum length post: 20 words
Minimum length post: 5 words


In [8]:
# Grid search 1 (TF-IDF + LightGBM pipeline), kept for reference only: the code is wrapped in a
# string literal so it is NOT executed, and the trailing ';' hides the string from the cell output.
# When run, it pickles the fitted GridSearchCV to PATH; the next cell reloads that file.
# NOTE(review): LightGBM documents that `subsample` only takes effect when `subsample_freq` > 0,
# which this grid never sets -- that would explain rows differing only in subsample scoring
# identically. Confirm before relying on the subsample axis.
# NOTE(review): newer LightGBM releases replaced the `silent` argument with `verbose` -- confirm
# against the installed version before re-enabling this cell.
'''
tree_num_distribution = [100, 300, 500]
max_depth = [-1, 5, 9]

model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                           ("boosted_trees", LGBMClassifier(silent=False, random_state=42))])

parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                   'Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                   'Tfidf_Vect__max_features':[8761],
                   'boosted_trees__n_estimators' : tree_num_distribution, 
                   'boosted_trees__max_depth' : max_depth,
                   'boosted_trees__subsample' : [0.8, 1.0],                    # Enable random selection of training cases (rows).
                   'boosted_trees__colsample_bytree' : [0.8, 1.0],             # Percentage of features to randomly consider at each split.
                   'boosted_trees__learning_rate' : [0.003, 0.01, 0.05]}]



gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=5, verbose=3)

gs.fit(X, y)

PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs1.pkl'

with open(PATH, 'wb') as file:
    pickle.dump(gs, file)
''';

In [8]:
# Location of the pickled result of grid search 1.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs1.pkl'

# Read in the model file created by the above gridsearch.
# NOTE: unpickling runs code stored in the file, so only load pickles generated by this project.
with open(PATH, 'rb') as file:
    gs_results1 = pickle.load(file)
    
# Keep the pieces used further down: the full CV results, the best estimator,
# its best cross-validation score, and the winning parameter combination.
search_results1 = gs_results1.cv_results_
top_estimator1 = gs_results1.best_estimator_
top_score1 = gs_results1.best_score_
top_parameters1 = gs_results1.best_params_

# Display the gridsearch results
# (sorted ascending by rank_score, so the best-ranked combinations come first).
gs1_df = gs_to_clean_df(search_results1, sort_by='rank_score')
gs1_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",1.0,0.05,9,300,1.0,0.8822,1
1,word,8761,"(1, 2)",1.0,0.05,9,300,0.8,0.8822,1
2,word,8761,"(1, 2)",1.0,0.05,5,500,1.0,0.882,3
3,word,8761,"(1, 2)",1.0,0.05,5,500,0.8,0.882,3
4,word,8761,"(1, 2)",0.8,0.05,5,500,1.0,0.8814,5


In [9]:
# Show the best parameter combination (best_params_) found by grid search 1.
top_parameters1

{'Tfidf_Vect__analyzer': 'word',
 'Tfidf_Vect__max_features': 8761,
 'Tfidf_Vect__ngram_range': (1, 2),
 'boosted_trees__colsample_bytree': 1.0,
 'boosted_trees__learning_rate': 0.05,
 'boosted_trees__max_depth': 9,
 'boosted_trees__n_estimators': 300,
 'boosted_trees__subsample': 0.8}

In [10]:
# Grid search 2 (TF-IDF + LightGBM pipeline), kept for reference only: the code is wrapped in a
# string literal so it is NOT executed, and the trailing ';' hides the string from the cell output.
# When run, it pickles the fitted GridSearchCV to PATH; the next cell reloads that file.
# NOTE(review): LightGBM documents that `subsample` only takes effect when `subsample_freq` > 0,
# which this grid never sets -- that would explain rows differing only in subsample scoring
# identically. Confirm before relying on the subsample axis.
# NOTE(review): newer LightGBM releases replaced the `silent` argument with `verbose` -- confirm
# against the installed version before re-enabling this cell.
'''
tree_num_distribution = [200, 300, 500, 1000]
max_depth = [8, 9, 11]

model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                           ("boosted_trees", LGBMClassifier(silent=False, random_state=42))])

parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                   'Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                   'Tfidf_Vect__max_features':[8761],
                   'boosted_trees__n_estimators' : tree_num_distribution, 
                   'boosted_trees__max_depth' : max_depth,
                   'boosted_trees__subsample' : [0.8, 1.0],                    # Enable random selection of training cases (rows).
                   'boosted_trees__colsample_bytree' : [0.8, 1.0],             # Percentage of features to randomly consider at each split.
                   'boosted_trees__learning_rate' : [0.008, 0.03, 0.05, 0.08]}]



gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=5, verbose=3)

gs.fit(X, y)

PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs2.pkl'

with open(PATH, 'wb') as file:
    pickle.dump(gs, file)

''';

In [11]:
# Location of the pickled result of grid search 2.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs2.pkl'

# Read in the model file created by the above gridsearch.
# NOTE: unpickling runs code stored in the file, so only load pickles generated by this project.
with open(PATH, 'rb') as file:
    gs_results2 = pickle.load(file)
    
# Keep the pieces used further down: the full CV results, the best estimator,
# its best cross-validation score, and the winning parameter combination.
search_results2 = gs_results2.cv_results_
top_estimator2 = gs_results2.best_estimator_
top_score2 = gs_results2.best_score_
top_parameters2 = gs_results2.best_params_

# Display the gridsearch results
# (sorted ascending by rank_score, so the best-ranked combinations come first).
gs2_df = gs_to_clean_df(search_results2, sort_by='rank_score')
gs2_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",1.0,0.08,11,200,0.8,0.8832,1
1,word,8761,"(1, 2)",0.8,0.08,9,200,0.8,0.8832,1
2,word,8761,"(1, 2)",1.0,0.08,11,200,1.0,0.8832,1
3,word,8761,"(1, 2)",0.8,0.08,9,200,1.0,0.8832,1
4,word,8761,"(1, 2)",1.0,0.03,9,500,0.8,0.8826,5


In [12]:
# Grid search 3 (TF-IDF + LightGBM pipeline), kept for reference only: the code is wrapped in a
# string literal so it is NOT executed, and the trailing ';' hides the string from the cell output.
# When run, it pickles the fitted GridSearchCV to PATH; the next cell reloads that file.
# NOTE(review): LightGBM documents that `subsample` only takes effect when `subsample_freq` > 0,
# which this grid never sets -- that would explain rows differing only in subsample scoring
# identically. Confirm before relying on the subsample axis.
# NOTE(review): newer LightGBM releases replaced the `silent` argument with `verbose` -- confirm
# against the installed version before re-enabling this cell.
'''
tree_num_distribution = [150, 200, 250, 300, 800]
max_depth = [10, 11, 12, 13, 14]

model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                           ("boosted_trees", LGBMClassifier(silent=False, random_state=42))])

parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                   'Tfidf_Vect__ngram_range' : [(1,2), (1,3)],
                   'Tfidf_Vect__max_features':[8761],
                   'boosted_trees__n_estimators' : tree_num_distribution, 
                   'boosted_trees__max_depth' : max_depth,
                   'boosted_trees__subsample' : [0.7, 0.8, 0.9],                    # Enable random selection of training cases (rows).
                   'boosted_trees__colsample_bytree' : [0.8, 0.9, 1.0],             # Percentage of features to randomly consider at each split.
                   'boosted_trees__learning_rate' : [0.01, 0.03, 0.07, 0.08, 0.1]}]


gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=5, verbose=3)

gs.fit(X, y)

PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs3.pkl'

with open(PATH, 'wb') as file:
    pickle.dump(gs, file)
''';

In [13]:
# Location of the pickled result of grid search 3.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs3.pkl'

# Read in the model file created by the above gridsearch.
# NOTE: unpickling runs code stored in the file, so only load pickles generated by this project.
with open(PATH, 'rb') as file:
    gs_results3 = pickle.load(file)
    
# Keep the pieces used further down: the full CV results, the best estimator,
# its best cross-validation score, and the winning parameter combination.
search_results3 = gs_results3.cv_results_
top_estimator3 = gs_results3.best_estimator_
top_score3 = gs_results3.best_score_
top_parameters3 = gs_results3.best_params_

# Display the gridsearch results
# (sorted ascending by rank_score, so the best-ranked combinations come first).
gs3_df = gs_to_clean_df(search_results3, sort_by='rank_score')
gs3_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",1.0,0.1,11,150,0.9,0.8844,1
1,word,8761,"(1, 2)",1.0,0.1,11,150,0.8,0.8844,1
2,word,8761,"(1, 2)",1.0,0.1,11,150,0.7,0.8844,1
3,word,8761,"(1, 2)",1.0,0.08,11,200,0.7,0.8832,4
4,word,8761,"(1, 2)",1.0,0.08,11,200,0.8,0.8832,4


In [14]:
# Grid search 4 (TF-IDF + LightGBM pipeline), kept for reference only: the code is wrapped in a
# string literal so it is NOT executed, and the trailing ';' hides the string from the cell output.
# When run, it pickles the fitted GridSearchCV to PATH; the next cell reloads that file.
# NOTE(review): LightGBM documents that `subsample` only takes effect when `subsample_freq` > 0,
# which this grid never sets -- that would explain rows differing only in subsample scoring
# identically. Confirm before relying on the subsample axis.
# NOTE(review): newer LightGBM releases replaced the `silent` argument with `verbose` -- confirm
# against the installed version before re-enabling this cell.
'''
tree_num_distribution = [1000, 1100, 1200]
max_depth = [10, 11, 12]

model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                           ("boosted_trees", LGBMClassifier(silent=False, random_state=42))])

parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                   'Tfidf_Vect__ngram_range' : [(1,2)],
                   'Tfidf_Vect__max_features':[8761],
                   'boosted_trees__n_estimators' : tree_num_distribution, 
                   'boosted_trees__max_depth' : max_depth,
                   'boosted_trees__subsample' : [0.8, 0.9, 1.0],                    # Enable random selection of training cases (rows).
                   'boosted_trees__colsample_bytree' : [0.8, 0.9, 1.0],             # Percentage of features to randomly consider at each split.
                   'boosted_trees__learning_rate' : [0.005, 0.01, 0.05]}]


gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=5, verbose=3)

gs.fit(X, y)

PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs4.pkl'

with open(PATH, 'wb') as file:
    pickle.dump(gs, file)
''';

In [15]:
# Location of the pickled result of grid search 4.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs4.pkl'

# Read in the model file created by the above gridsearch.
# NOTE: unpickling runs code stored in the file, so only load pickles generated by this project.
with open(PATH, 'rb') as file:
    gs_results4 = pickle.load(file)
    
# Keep the pieces used further down: the full CV results, the best estimator,
# its best cross-validation score, and the winning parameter combination.
search_results4 = gs_results4.cv_results_
top_estimator4 = gs_results4.best_estimator_
top_score4 = gs_results4.best_score_
top_parameters4 = gs_results4.best_params_

# Display the gridsearch results
# (sorted ascending by rank_score, so the best-ranked combinations come first).
gs4_df = gs_to_clean_df(search_results4, sort_by='rank_score')
gs4_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",1.0,0.01,12,1200,0.9,0.8818,1
1,word,8761,"(1, 2)",1.0,0.01,12,1200,0.8,0.8818,1
2,word,8761,"(1, 2)",1.0,0.01,12,1200,1.0,0.8818,1
3,word,8761,"(1, 2)",0.8,0.01,12,1200,0.8,0.8814,4
4,word,8761,"(1, 2)",0.8,0.01,12,1200,0.9,0.8814,4


In [16]:
# Grid search 5 (TF-IDF + LightGBM pipeline), kept for reference only: the code is wrapped in a
# string literal so it is NOT executed, and the trailing ';' hides the string from the cell output.
# When run, it pickles the fitted GridSearchCV to PATH; the next cell reloads that file.
# NOTE(review): LightGBM documents that `subsample` only takes effect when `subsample_freq` > 0,
# which this grid never sets -- that would explain rows differing only in subsample scoring
# identically. Confirm before relying on the subsample axis.
# NOTE(review): newer LightGBM releases replaced the `silent` argument with `verbose` -- confirm
# against the installed version before re-enabling this cell.
'''
tree_num_distribution = [1150, 1200, 1250]
max_depth = [12, 13, 14, 15]

model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                           ("boosted_trees", LGBMClassifier(silent=False, random_state=42))])

parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                   'Tfidf_Vect__ngram_range' : [(1,2)],
                   'Tfidf_Vect__max_features':[8761],
                   'boosted_trees__n_estimators' : tree_num_distribution, 
                   'boosted_trees__max_depth' : max_depth,
                   'boosted_trees__subsample' : [0.8, 0.9, 1.0],                    # Enable random selection of training cases (rows).
                   'boosted_trees__colsample_bytree' : [0.8, 0.9, 1.0],             # Percentage of features to randomly consider at each split.
                   'boosted_trees__learning_rate' : [0.008, 0.01, 0.12]}]


gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=5, verbose=3)

gs.fit(X, y)

PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs5.pkl'

with open(PATH, 'wb') as file:
    pickle.dump(gs, file)
''';

In [9]:
# Location of the pickled result of grid search 5.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs5.pkl'

# Read in the model file created by the above gridsearch.
# NOTE: unpickling runs code stored in the file, so only load pickles generated by this project.
with open(PATH, 'rb') as file:
    gs_results5 = pickle.load(file)

# Keep the pieces used further down: the full CV results, the best estimator,
# its best cross-validation score, and the winning parameter combination.
search_results5 = gs_results5.cv_results_
top_estimator5 = gs_results5.best_estimator_
top_score5 = gs_results5.best_score_
# The test-score cell below passes top_parameters5 to print_test_score, so it
# must be bound here just like top_parameters1..4 are in their loading cells;
# without it a fresh Restart & Run All ends in a NameError.
top_parameters5 = gs_results5.best_params_

# Display the gridsearch results
# (sorted ascending by rank_score, so the best-ranked combinations come first).
gs5_df = gs_to_clean_df(search_results5, sort_by='rank_score')
gs5_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",0.9,0.01,13,1200,0.8,0.8834,1
1,word,8761,"(1, 2)",0.9,0.01,13,1200,0.9,0.8834,1
2,word,8761,"(1, 2)",0.9,0.01,13,1200,1.0,0.8834,1
3,word,8761,"(1, 2)",0.9,0.01,14,1150,1.0,0.8832,4
4,word,8761,"(1, 2)",0.9,0.01,14,1150,0.9,0.8832,4


In [18]:
# Print the best parameters and the held-out test-set score for the best estimator of each of the
# five Gradient Boosted Decision Tree grid searches.
# Each call needs the matching top_estimatorN / top_parametersN pair, so the loading cells for all
# five searches must have been run first.
print_test_score(test_df, top_estimator1, top_parameters1)
print_test_score(test_df, top_estimator2, top_parameters2)
print_test_score(test_df, top_estimator3, top_parameters3)
print_test_score(test_df, top_estimator4, top_parameters4)
print_test_score(test_df, top_estimator5, top_parameters5)

Model params:
 {'Tfidf_Vect__analyzer': 'word', 'Tfidf_Vect__max_features': 8761, 'Tfidf_Vect__ngram_range': (1, 2), 'boosted_trees__colsample_bytree': 1.0, 'boosted_trees__learning_rate': 0.05, 'boosted_trees__max_depth': 9, 'boosted_trees__n_estimators': 300, 'boosted_trees__subsample': 0.8}

Test set score: 0.88264

Model params:
 {'Tfidf_Vect__analyzer': 'word', 'Tfidf_Vect__max_features': 8761, 'Tfidf_Vect__ngram_range': (1, 2), 'boosted_trees__colsample_bytree': 0.8, 'boosted_trees__learning_rate': 0.08, 'boosted_trees__max_depth': 9, 'boosted_trees__n_estimators': 200, 'boosted_trees__subsample': 0.8}

Test set score: 0.88132

Model params:
 {'Tfidf_Vect__analyzer': 'word', 'Tfidf_Vect__max_features': 8761, 'Tfidf_Vect__ngram_range': (1, 2), 'boosted_trees__colsample_bytree': 1.0, 'boosted_trees__learning_rate': 0.1, 'boosted_trees__max_depth': 11, 'boosted_trees__n_estimators': 150, 'boosted_trees__subsample': 0.7}

Test set score: 0.88252

Model params:
 {'Tfidf_Vect__analyzer