In [1]:
import pickle
import pandas as pd
import numpy as np
import os 

from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

In [2]:
# Load the held-out test set that every trained model is scored against.
test_df = pd.read_csv(
    filepath_or_buffer="./data/Processed/increment_train_size/5_to_20_words_preprocessed_TEST.csv"
)

# Preview the first five rows.
test_df.head(n=5)

Unnamed: 0,created_utc,all_text_data,subreddit
0,1532973281,tron justin sun stop short reveal secret project,1
1,1550248670,reuters hsbc forex trading cost cut sharply bl...,1
2,1612224578,put expire 5th think clown brain read,0
3,1543091767,toilet french start name trone concidence,1
4,1571098829,apollo 11 jpeg yes roomba ltolgt microsoft wor...,1


In [3]:
# ===========================================================================================================================
# Helper function used to convert a gridsearch output into a pandas dataframe with the columns formatted the way I like them.
# ===========================================================================================================================
def gs_to_clean_df(search_results, keep_split = False, keep_std = False, keep_time = False, keep_params = False, sort_by=None, make_RMSE=False):
    """Convert grid search results into a tidy DataFrame with short column names.

    Parameters
    ----------
    search_results : mapping of column name -> values (anything ``pd.DataFrame`` accepts)
        Typically the ``cv_results_`` attribute of a fitted search object.
    keep_split : bool
        Keep the per-fold columns (names starting with ``split``).
    keep_std : bool
        Keep the standard deviation columns (names starting with ``std``).
    keep_time : bool
        Keep the timing columns (names containing ``time``).
    keep_params : bool
        Keep the ``params`` column.
    sort_by : str or None
        Name of a *renamed* column starting with ``rank`` or ``mean`` to sort by
        (ascending). Any other value is ignored and the row order is left as is.
    make_RMSE : bool
        Add a ``mean_RMSE`` column computed as ``sqrt(abs(mean_MSE))``.

    Returns
    -------
    pd.DataFrame
        The filtered, renamed and (optionally) sorted results.

    Raises
    ------
    KeyError
        If ``make_RMSE`` is set but there is no ``mean_MSE`` column after renaming.

    Notes
    -----
    Renaming rules:
      * ``param_<step>__<name>`` -> ``<name>`` (nested components joined by ``_``)
      * ``param_<name>``         -> ``<name>``
      * ``<x>_test_<metric>``    -> ``<x>_<metric>``
      * anything else keeps its original name.
    Two pipeline steps that share a parameter name will produce duplicate column names.
    """

    def _is_hyperparameter(column_name):
        # 'params' is the column of full parameter dicts, not a single hyperparameter.
        return column_name.startswith('param') and column_name != 'params'

    def _should_keep(column_name):
        # Hyperparameter columns are checked first so that a parameter whose name
        # happens to contain 'time' is not mistaken for a timing column.
        if _is_hyperparameter(column_name):
            return True
        if column_name.startswith('split'):
            return bool(keep_split)
        if 'time' in column_name:
            return bool(keep_time)
        if column_name.startswith('std'):
            return bool(keep_std)
        if column_name == 'params':
            return bool(keep_params)
        return True

    def _clean_name(column_name):
        if _is_hyperparameter(column_name):
            # Drop the "param_<step>" prefix and keep whatever follows the first '__'.
            cleaned = '_'.join(column_name.split('__')[1:]).lstrip('_')
            if not cleaned and column_name.startswith('param_'):
                # Not a pipeline parameter (no '__'): just strip the 'param_' prefix.
                cleaned = column_name[len('param_'):]
            return cleaned or column_name
        if '_test' in column_name:
            return column_name.replace('_test', '')
        # Every other column keeps its name instead of being renamed to ''.
        return column_name

    gs_results_df = pd.DataFrame(search_results)

    columns_to_keep = [name for name in gs_results_df.columns if _should_keep(name)]
    throw_away_columns = [name for name in gs_results_df.columns if not _should_keep(name)]
    renaming_dict = {name: _clean_name(name) for name in columns_to_keep}

    # Only renamed columns that look like a metric may be used for sorting.
    valid_metrics = [
        new_name for new_name in renaming_dict.values()
        if new_name.startswith('rank') or new_name.startswith('mean')
    ]

    gs_results_df = (
        gs_results_df
        .drop(columns=throw_away_columns)
        .rename(columns=renaming_dict)
    )

    if sort_by in valid_metrics:
        gs_results_df = gs_results_df.sort_values(by=sort_by, ignore_index=True)

    if make_RMSE:
        # abs() because the stored MSE may be negated (e.g. a "neg_*" scorer).
        gs_results_df['mean_RMSE'] = (abs(gs_results_df['mean_MSE'])) ** (1/2)

    return gs_results_df


In [4]:
# ===============================================================================================
# Report the word counts of the longest and shortest posts in the dataset.
# ===============================================================================================
def print_word_counts(df):
    """Print the largest and smallest per-post word counts and return an annotated copy.

    The input frame is not modified. The returned deep copy has 'all_text_data'
    cast to str and a new 'word_count' column holding the number of
    whitespace-separated tokens in each post.
    """

    def _count_words(text):
        return len(text.split())

    annotated = df.copy(deep=True)

    # Cast to str first so that non-string cells can be split as well.
    annotated['all_text_data'] = annotated['all_text_data'].astype(str)
    annotated['word_count'] = annotated['all_text_data'].apply(_count_words)

    longest = annotated['word_count'].max()
    shortest = annotated['word_count'].min()

    print(f"Maximum length post: {longest} words")
    print(f"Minimum length post: {shortest} words")

    return annotated

In [5]:
# ===============================================================================================
# Split a dataframe of reddit posts into the text data (X) and the target (y), printing a few
# sanity checks along the way so problems are visible before any model is trained on the data.
# ===============================================================================================
def data_check(train_df):
    """Print diagnostics for ``train_df`` and return its text and target as numpy arrays.

    Printed, in order: the normalized 'subreddit' value counts, the number of missing
    'all_text_data' values, the number of duplicated rows, the number of rows, and the
    max/min post word counts (via ``print_word_counts``).

    Nothing is cleaned or filtered here: missing values and duplicates are only reported.

    Returns
    -------
    (X, y) : tuple of numpy arrays
        X holds the 'all_text_data' column and y the flattened 'subreddit' column.
    """
    class_balance = train_df['subreddit'].value_counts(normalize=True)
    print(f"Distribution:\n {class_balance}\n")

    missing_text = train_df['all_text_data'].isna().sum()
    print(f"Missing values: {missing_text}\n")

    features = train_df['all_text_data'].to_numpy()
    targets = train_df['subreddit'].to_numpy().ravel()

    duplicate_rows = train_df.duplicated().sum()
    sample_count = train_df.shape[0]
    print(f"Number of duplicates in training set: {duplicate_rows}\n")
    print(f"Number of samples in training set: {sample_count}\n")

    # Called only for its printed output; the annotated copy it returns is not needed.
    print_word_counts(train_df)

    return features, targets

In [6]:
# Pickled grid search for the "small" gradient boosted decision tree models.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs1.pkl'

# pickle.load can execute arbitrary code, so only load pickle files created by this project.
with open(PATH, mode='rb') as pickle_file:
    gs_results_little_trees = pickle.load(pickle_file)

# Keep the cross-validation results, the best estimator and the best parameters.
search_results_little_trees, top_estimator_little_trees, top_parameters_little_trees = (
    gs_results_little_trees.cv_results_,
    gs_results_little_trees.best_estimator_,
    gs_results_little_trees.best_params_,
)

In [7]:
# Tidy the "small trees" cross-validation results, ordered by rank, and preview the first rows.
gs_lt_df = gs_to_clean_df(search_results=search_results_little_trees, sort_by='rank_score')
gs_lt_df.head(n=5)

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",1.0,0.05,9,300,1.0,0.8822,1
1,word,8761,"(1, 2)",1.0,0.05,9,300,0.8,0.8822,1
2,word,8761,"(1, 2)",1.0,0.05,5,500,1.0,0.882,3
3,word,8761,"(1, 2)",1.0,0.05,5,500,0.8,0.882,3
4,word,8761,"(1, 2)",0.8,0.05,5,500,1.0,0.8814,5


In [8]:
# Pickled grid search for the "large" gradient boosted decision tree models.
PATH = './data/Processed/increment_train_size/pickle/gbf_5000_gs5.pkl'

# pickle.load can execute arbitrary code, so only load pickle files created by this project.
with open(PATH, mode='rb') as pickle_file:
    gs_results_big_trees = pickle.load(pickle_file)

# Keep the cross-validation results, the best estimator and the best score.
search_results_big_trees, top_estimator_big_trees, top_score_big_trees = (
    gs_results_big_trees.cv_results_,
    gs_results_big_trees.best_estimator_,
    gs_results_big_trees.best_score_,
)

In [9]:
# Tidy the "large trees" cross-validation results, ordered by rank, and preview the first rows.
gs_bt_df = gs_to_clean_df(search_results=search_results_big_trees, sort_by='rank_score')
gs_bt_df.head(n=5)

Unnamed: 0,analyzer,max_features,ngram_range,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_score,rank_score
0,word,8761,"(1, 2)",0.9,0.01,13,1200,0.8,0.8834,1
1,word,8761,"(1, 2)",0.9,0.01,13,1200,0.9,0.8834,1
2,word,8761,"(1, 2)",0.9,0.01,13,1200,1.0,0.8834,1
3,word,8761,"(1, 2)",0.9,0.01,14,1150,1.0,0.8832,4
4,word,8761,"(1, 2)",0.9,0.01,14,1150,0.9,0.8832,4


In [10]:
# Grid search over Naive Bayes models that was saved under the "noFitPrior" file name.
# NOTE(review): presumably this is the search run with fit_prior=False - confirm against the
# notebook that produced the pickle file.
PATH_No_FitPrior = './data/Processed/increment_train_size/pickle/nb_5000_noFitPrior_gs1.pkl'

# pickle.load can execute arbitrary code, so only load pickle files created by this project.
with open(PATH_No_FitPrior, mode='rb') as pickle_file:
    gs_results_nb_np = pickle.load(pickle_file)

# Keep the best estimator and the cross-validation results.
top_estimator_nb_np = gs_results_nb_np.best_estimator_
search_results_nb_np = gs_results_nb_np.cv_results_

# Tidy the cross-validation results, ordered by rank, and preview the first rows.
gs_nb_np_df = gs_to_clean_df(search_results=search_results_nb_np, sort_by='rank_score')
gs_nb_np_df.head(n=5)

Unnamed: 0,analyzer,max_features,ngram_range,alpha,fit_prior,mean_score,rank_score
0,word,8761,"(1, 2)",1.7,False,0.9086,1
1,word,8761,"(1, 2)",1.6,False,0.9082,2
2,word,8761,"(1, 2)",1.9,False,0.9082,2
3,word,8761,"(1, 1)",1.0,False,0.9082,2
4,word,8761,"(1, 1)",1.1,False,0.908,5


In [11]:
# Grid search over Naive Bayes models saved without the "noFitPrior" suffix.
# NOTE(review): presumably this search also tried fit_prior=True - confirm against the
# notebook that produced the pickle file.
PATH_FitPrior = './data/Processed/increment_train_size/pickle/nb_5000_gs1.pkl'

# pickle.load can execute arbitrary code, so only load pickle files created by this project.
with open(PATH_FitPrior, mode='rb') as pickle_file:
    gs_results_nb_wp = pickle.load(pickle_file)

# Keep the best estimator and the cross-validation results.
top_estimator_nb_wp = gs_results_nb_wp.best_estimator_
search_results_nb_wp = gs_results_nb_wp.cv_results_

# Tidy the cross-validation results, ordered by rank, and preview the first rows.
gs_nb_wp_df = gs_to_clean_df(search_results=search_results_nb_wp, sort_by='rank_score')
gs_nb_wp_df.head(n=5)

Unnamed: 0,analyzer,max_features,ngram_range,alpha,fit_prior,mean_score,rank_score
0,word,8761,"(1, 2)",1.5,True,0.9084,1
1,word,8761,"(1, 1)",1.0,False,0.9082,2
2,word,8761,"(1, 1)",1.0,True,0.908,3
3,word,8761,"(1, 2)",1.5,False,0.9078,4
4,word,8761,"(1, 1)",1.5,False,0.907,5


In [12]:
# =========================================================================================================
# Score a trained model on the test data and append the result to that model's results dataframe.
# =========================================================================================================
def update_score_df(model, test_df, train_size, df):
    """Return ``df`` with one extra row describing ``model``'s score on ``test_df``.

    Parameters
    ----------
    model : fitted estimator exposing ``named_steps`` and ``score(X, y)``
        The trained model to evaluate.
    test_df : pd.DataFrame
        Test data with an 'all_text_data' (features) and a 'subreddit' (target) column.
    train_size : int
        Size of the training set the model was fitted on. It is only recorded, never used.
    df : pd.DataFrame
        Results collected so far for this model. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame: ``df`` followed by a row with 'Train_Size', 'Test_Accuracy' and 'Model_Params'.
    """
    # The model's steps are stored alongside the score so each row records what was evaluated.
    step_lookup = model.named_steps

    test_accuracy = model.score(test_df['all_text_data'], test_df['subreddit'])

    result_row = pd.DataFrame(
        data={
            'Train_Size': [train_size],
            'Test_Accuracy': [test_accuracy],
            'Model_Params': [step_lookup],
        }
    )

    # Concatenate the new row onto the results from the training set sizes evaluated so far.
    return pd.concat([df, result_row], ignore_index=True)

In [13]:
# =========================================================================================================
# Train one model on training sets of increasing size and record its test accuracy after each fit.
# =========================================================================================================
def evaluate_model(model, train_set_sizes, score_df, test_df, save_every=None):
    """Fit ``model`` on each training set size and track its accuracy on ``test_df``.

    Parameters
    ----------
    model : tuple
        ``(estimator, name)``: a "best estimator" found by gridsearch and the name of the
        model type. ``name`` is used for the output directory and the file names.
    train_set_sizes : iterable of int
        Training set sizes to evaluate. For each size the file
        ``./data/Processed/increment_train_size/train{size}/train_{size}.csv`` is read.
    score_df : pd.DataFrame
        Results collected so far; one row is appended per training set size.
    test_df : pd.DataFrame
        Test dataset the model is scored on after every fit.
    save_every : int or None
        Write a checkpoint csv after every ``save_every`` evaluations. ``None`` (or 0)
        disables checkpoints.

    Returns
    -------
    pd.DataFrame
        ``score_df`` extended with the model's performance for each training set size.
        The same frame is also written to a ``..._FINAL.csv`` file named after the last
        size evaluated. If ``train_set_sizes`` is empty, ``score_df`` is returned
        unchanged and no final file is written.
    """
    # Unpack the "model" tuple into the actual model and the name of this model.
    model, name = model

    model_dir = f"./data/Processed/increment_train_size/model_test_scores/{name}/"

    if os.path.isdir(model_dir):
        print(f"Directory {model_dir} already exists!")
        print("Proceeding to the model training and evaluation loop...\n")
    else:
        # makedirs also creates missing parent directories; any real failure (e.g. a
        # permission error) is raised instead of being reported as "already exists".
        os.makedirs(model_dir, exist_ok=True)

    last_train_size = None

    # Counting from 1 makes `evaluations_done` the number of completed evaluations, so a
    # checkpoint is written after exactly every `save_every` evaluations.
    for evaluations_done, train_size in enumerate(train_set_sizes, start=1):

        # Read in the training set of the appropriate size.
        train_df = pd.read_csv(f"./data/Processed/increment_train_size/train{train_size}/train_{train_size}.csv")

        # Split into features and target.
        X_train, y_train = data_check(train_df)

        # Fit the model. The same estimator object is refitted for every size.
        model.fit(X_train, y_train)

        # Test the model and update the results dataframe with its accuracy.
        score_df = update_score_df(model=model, test_df=test_df, train_size=train_size, df=score_df)
        last_train_size = train_size

        # `save_every and ...` also guards against a modulo-by-zero when save_every == 0.
        if save_every and evaluations_done % save_every == 0:
            print(f"Updating: test_{name}_trainSize_{train_size}.csv\n")
            score_df.to_csv(os.path.join(model_dir, f"test_{name}_trainSize_{train_size}.csv"), index=False)

    # Nothing was evaluated, so there is no "last size" to name the final file after.
    if last_train_size is None:
        return score_df

    score_df.to_csv(os.path.join(model_dir, f"test_{name}_trainSize_{last_train_size}_FINAL.csv"), index=False)

    return score_df

In [14]:
# =========================================================================================================
# This function calls the evaluate_model function for each model in the list provided to the models parameter.
# =========================================================================================================
def evaluate_models(models, train_set_sizes, score_dfs, test_df, save_every=10):
    """Run ``evaluate_model`` for every model and collect the resulting score dataframes.

    Parameters
    ----------
    models : iterable of (estimator, name) tuples
        The models to evaluate.
    train_set_sizes : iterable of int
        Training set sizes every model is trained with.
    score_dfs : iterable of pd.DataFrame
        One results dataframe per model, paired with ``models`` by position.
    test_df : pd.DataFrame
        Test dataset passed through to ``evaluate_model``.
    save_every : int or None
        Checkpoint frequency passed through to ``evaluate_model``.

    Returns
    -------
    list of pd.DataFrame
        The updated score dataframes, in the same order as ``models``.

    Raises
    ------
    ValueError
        If ``models`` and ``score_dfs`` do not have the same length. Pairing them with
        ``zip`` alone would silently skip the unmatched entries.
    """
    models = list(models)
    score_dfs = list(score_dfs)
    # Materialize once so a one-shot iterable is not exhausted by the first model.
    train_set_sizes = list(train_set_sizes)

    if len(models) != len(score_dfs):
        raise ValueError(
            f"Expected one score dataframe per model, got {len(models)} models "
            f"and {len(score_dfs)} score dataframes."
        )

    return [
        evaluate_model(model=model, train_set_sizes=train_set_sizes, score_df=score_df, test_df=test_df, save_every=save_every)
        for model, score_df in zip(models, score_dfs)
    ]

In [15]:
# One empty results dataframe per model. Each collects that model's performance after it has
# been trained on each different size training set.
gs_lt_test_df, gs_bt_test_df, nb_wp_test_df, nb_np_test_df = (
    pd.DataFrame({'Train_Size' : [], 'Test_Accuracy' : [], 'Model_Params' : []})
    for _ in range(4)
)

In [16]:
# List of training set sizes each model should be trained with:
#   100 to 2,000 in steps of 100, then 2,250 to 4,750 in steps of 250, then 5,000 to 500,000 in steps of 5,000.

# Note: Make sure all of these datasets have been created by the increment_train_set_size notebook before trying to run this!!
train_sizes = [100 * n for n in range(1, 21)] + [250 * n for n in range(9, 20)] + [5000 * n for n in range(1, 101)]

# Evaluate the performance of each model in the models list, using the test data in test_df, after it has been trained using training sets of
# each size listed in train_sizes.
#
# models and score_dfs are paired by position, so both lists must be in the same order
# (little trees, big trees, naive bayes without fit prior, naive bayes with fit prior).
score_dfs = evaluate_models(models=[(top_estimator_little_trees, "little_trees"),
                                    (top_estimator_big_trees, "big_trees"),
                                    (top_estimator_nb_np, "naive_bayes_NO_fit_prior"),
                                    (top_estimator_nb_wp, "naive_bayes_WITH_fit_prior")],
                            train_set_sizes = train_sizes,
                            score_dfs=[gs_lt_test_df,
                                       gs_bt_test_df,
                                       nb_np_test_df,
                                       nb_wp_test_df],
                            test_df=test_df)