# XGBoost experiments (Michael)

## Setup

In [1]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse
import re

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# currently not used and thus commented out
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility function for testing models and tracking results

In [2]:
# Results table: starts empty, with one column per field of the dict
# returned by test_model(); store_test_result() appends one row per run.
result_columns = [
    'model_name',
    'model_params',
    'data_desc',
    'data_size',
    'features_no',
    'f1',
    'acc',
    'recall',
    'prec',
    'roc_auc',
    'cf_matrix',
    'train_time',
    'notes',
]
test_results = pd.DataFrame(columns=result_columns)

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_params, data_desc, X, y, notes='')
    
    Parameters:
    -----------
    model: instance of model to test
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array 
    y: target/label array
    notes: additional notes (default: empty string)
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, stratify=y, random_state=42)
    
    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start
    train_time_str = f'{int(train_time // 60)}m {round(train_time % 60)}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'data_size': X.shape[0],
            'features_no': X.shape[1],
            'f1': round(f1_score(y_test, y_pred), 3),
            'acc': round(accuracy_score(y_test, y_pred), 3),
            'recall': round(recall_score(y_test, y_pred), 3),
            'prec': round(precision_score(y_test, y_pred), 3),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 3),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [3]:
def store_test_result(result):
    '''
    Append one result to the module-level test_results table.

    Parameters:
    -----------
    result: dict as returned by test_model()
    '''
    # The current row count is used as the label of the new row.
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data

In [4]:
# Load the new cleaned data set and show its (rows, columns) shape.
data_path = 'data/data_usampl_60_40_comments_cleaned_preproc_fasttext.csv'
df = pd.read_csv(data_path)
df.shape

(360038, 5)

## Optional: Create smaller sample from data to speed up experiments

In [5]:
# Number of rows to sample for faster experiments.
# Set to None to run on the full data set instead.
sample_size = 50_000

if sample_size is not None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 1 - tox_perc  # implied by tox_perc, kept for reference

    # Number of toxic/nontoxic rows. The nontoxic count is the remainder, so
    # both counts always add up to exactly sample_size (truncating the two
    # products independently loses a row, e.g. 50_001 -> 20_000 + 30_000).
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = sample_size - sample_size_tox

    # Draw each class separately with a fixed seed so the sample is
    # reproducible and has the requested class ratio.
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using sample (50000 rows).


## Drop rows with NaNs

In [6]:
# Remember the row count so the number of dropped rows can be reported.
rows_before = len(df)
print("rows with NaN's before dropping:", rows_before)

# Remove rows containing any NaN (in place) and renumber the index from 0.
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

rows_after = len(df)
print('rows after:', rows_after)
print('rows dropped:', rows_before - rows_after)

rows with NaN's before dropping: 50000
rows after: 50000
rows dropped: 0


In [7]:
# Column overview: dtypes and non-null counts after dropping NaN rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   comment_raw            50000 non-null  object
 1   comment_clean          50000 non-null  object
 2   comment_clean_preproc  50000 non-null  object
 3   ft_vector              50000 non-null  object
 4   toxic                  50000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.9+ MB


In [8]:
# Preview the working frame (pandas shortens long frames in the display).
df

Unnamed: 0,comment_raw,comment_clean,comment_clean_preproc,ft_vector,toxic
0,Good. Let's hope the president listens to his...,Good. Let's hope the president listens to his ...,good let hope president listen adviser instead...,[ 4.41469215e-02 -1.07265105e-02 -2.13325340e-...,1
1,Actually I'd consider people like Lindsey cons...,Actually I'd consider people like Lindsey cons...,actually consider people like Lindsey conserva...,[ 0.04503527 0.01501187 -0.03122971 0.095336...,1
2,Once again the left proves they are the scum o...,Once again the left proves they are the scum o...,left prove scum Earth win election spend time ...,[ 3.61347646e-02 6.09276891e-02 -4.58316170e-...,1
3,And Trump should be fired if his tweets are in...,And Trump should be fired if his tweets are in...,Trump fire tweet insensitive childish demente,[ 0.03957259 0.15793295 -0.02021416 0.057683...,1
4,Making decisions based on factors as nebulous ...,Making decisions based on factors as nebulous ...,make decision base factor nebulous Paris Accor...,[ 0.0471274 0.01432506 -0.06455102 0.083196...,1
...,...,...,...,...,...
49995,I couldn't disagree more\n\nThe positions are ...,I couldn't disagree more The positions are ent...,disagree position entirely consistent long und...,[ 0.07299212 0.06142109 -0.07250728 0.154581...,0
49996,"Just so RG readers know, the author of this sp...","Just so RG readers know, the author of this sp...",RG reader know author spiel guy name Michael B...,[-0.00407603 0.05170771 -0.03713477 0.124179...,0
49997,If anyone is interested. WalMart on W11th no l...,If anyone is interested. WalMart on W_number_t...,interested WalMart W_number_th long charge bag...,[-0.01897589 -0.0200948 -0.06883522 0.006006...,0
49998,Pierre Trudeau rode around Montreal on his bik...,Pierre Trudeau rode around Montreal on his bik...,Pierre Trudeau ride Montreal bike dress Nazi r...,[ 1.02514643e-02 -1.37367821e-03 -3.53123546e-...,0


## Create label/target variable and check for imbalance

In [9]:
# Label column passed as y to every experiment (0 = nontoxic, 1 = toxic).
target = df['toxic']

In [10]:
# Rows per class; the resulting Series is indexed by class label
# (0 = nontoxic, 1 = toxic).
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]
total_count = nontoxic_count + toxic_count

# Class shares in percent, rounded to one decimal.
nontoxic_perc = round((nontoxic_count / total_count) * 100, 1)
toxic_perc = round((toxic_count / total_count) * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 30000 (60.0 %)
Toxic (1): 20000 (40.0 %)


## Create various corpora

### Raw corpus

In [11]:
# Original comment text, taken unchanged from the 'comment_raw' column.
corp_raw = df['comment_raw']
corp_raw.shape

(50000,)

### Cleaned corpus

In [12]:
# Cleaned comment text from the 'comment_clean' column.
corp_clean = df['comment_clean']
corp_clean.shape

(50000,)

### Pre-processed corpus

In [13]:
# Cleaned and pre-processed comment text from the 'comment_clean_preproc'
# column.
corp_pp = df['comment_clean_preproc']
corp_pp.shape

(50000,)

### Corpus of fastText vectors

In [14]:
# Build the fastText feature matrix (one float column per vector component).
#
# If smaller sample: Convert vector string in csv file to df
# and cast all cols as float. This takes ~50 min for the full 360,000 rows.
# --> If full data: Load pickle file to save time.

if sample_size is not None:
    # 'ft_vector' stores each vector as a string such as "[ 0.04 -0.01 ...]":
    # drop the surrounding brackets, split on whitespace into columns and
    # convert the resulting strings to floats.
    corp_ft = (df['ft_vector']
               .str.strip('[]')
               .str.split(expand=True)
               .astype('float'))
    # with open('pickle/ft_vectors.pkl', mode='wb') as f:
    #     pickle.dump(corp_ft, f)

else:
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # file that was written by this project (see the commented dump above).
    # NOTE(review): the pickled vectors are not derived from the current df;
    # if rows were dropped as NaN earlier, corp_ft and target may differ in
    # length -- confirm before running on the full data.
    with open('pickle/ft_vectors.pkl', mode='rb') as f:
        corp_ft = pickle.load(f)

display(corp_ft)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.044147,-0.010727,-0.021333,0.073693,-0.021842,0.106237,0.048300,-0.033911,-0.003105,-0.029260,-0.017412,0.018268,-0.012076,0.004286,0.062145,0.006855,0.022881,-0.024971,0.006245,0.002184,-0.071574,0.051942,-0.000050,0.017572,0.010189,-0.026628,-0.092220,0.018010,0.074499,0.021300,0.075879,0.048802,-0.143237,-0.148499,-0.035969,0.028878,-0.059840,-0.137513,0.002799,-0.057196,-0.001409,0.055020,0.102742,-0.117058,0.157518,0.094997,-0.007800,0.034279,-0.063406,-0.064017,-0.016375,0.014664,-0.050131,0.099417,-0.006343,-0.066297,0.051177,0.183496,-0.063202,0.009117,-0.053415,0.044517,0.013730,0.011338,0.013443,-0.104968,0.002170,0.009175,0.074044,0.016404,0.009218,-0.003867,0.013765,0.056818,0.130176,-0.050709,-0.108570,0.132067,-0.052417,-0.002704,0.084048,0.018089,0.015692,0.109482,0.058689,0.019810,0.011112,0.046629,0.097933,0.038506,0.144735,0.017034,0.050863,0.090643,0.037196,-0.060164,-0.204394,0.036898,0.005015,-0.080517
1,0.045035,0.015012,-0.031230,0.095337,-0.005714,0.072040,0.021957,-0.104118,-0.012181,-0.034687,-0.009410,0.018831,-0.037818,-0.007835,0.026300,0.040378,0.045364,-0.009003,0.017358,0.044997,-0.033400,0.055305,-0.014680,0.035894,0.005406,-0.064536,-0.022109,0.004644,0.065064,-0.003331,0.078107,0.064094,-0.087262,-0.071299,-0.013654,0.003943,0.009838,-0.122297,0.017988,-0.051111,0.042570,0.038836,0.092849,-0.126283,0.115054,0.119809,-0.031324,-0.015771,-0.059560,-0.024641,0.019063,0.019704,-0.017091,0.086157,-0.051860,-0.049499,0.047760,0.135254,-0.011185,0.033549,-0.070093,0.069730,0.052525,0.057918,0.010601,-0.052078,-0.021647,0.028767,0.025027,0.015278,-0.004224,-0.008566,-0.041321,0.024186,0.099360,-0.025270,-0.155365,0.138787,-0.067567,-0.003745,0.004060,0.033323,0.014736,0.095166,0.054628,0.031185,0.027616,0.019930,0.076454,0.018387,0.121566,-0.009655,0.069269,0.064510,0.064865,0.012886,-0.195201,0.046845,-0.014470,-0.087090
2,0.036135,0.060928,-0.045832,-0.021712,0.031051,0.116954,0.093026,-0.027822,0.012752,-0.052951,-0.028258,0.027057,-0.029136,-0.010164,0.012568,0.058478,-0.041214,0.029549,0.005083,0.046028,-0.054956,0.033618,0.018311,0.009927,0.031874,-0.056786,-0.051601,0.000337,0.000993,0.010421,0.060127,0.073892,-0.039378,-0.096642,-0.053912,-0.005296,0.009595,-0.085247,0.063929,-0.021424,0.033577,0.063424,0.032319,-0.137784,0.185954,0.118363,-0.021805,0.015155,-0.061567,0.043021,0.038389,0.051905,0.023407,0.143960,-0.027262,-0.053986,0.063327,0.148148,-0.004690,0.076563,-0.089181,0.008221,0.018480,0.030871,0.054996,-0.073677,0.002084,0.078584,0.069686,0.011664,-0.065486,0.010194,-0.036486,0.005192,0.094966,-0.016059,-0.112567,0.141986,-0.082917,0.005868,-0.000361,0.042258,-0.049653,0.055382,0.110012,0.046045,0.104404,0.020696,0.046833,0.032274,0.042782,-0.007130,0.050428,0.082118,0.107009,-0.042840,-0.190986,-0.000122,-0.041266,-0.046468
3,0.039573,0.157933,-0.020214,0.057683,-0.008880,0.104484,-0.052335,0.006033,-0.042691,-0.040424,-0.027922,0.035889,-0.067628,-0.126732,0.004982,0.055914,0.009691,0.024989,-0.048098,-0.005371,0.031314,-0.017267,0.088873,0.019246,0.073914,0.059488,-0.038088,0.017873,0.079816,0.012914,-0.074497,0.059297,-0.089719,-0.137958,0.027465,0.073633,-0.079590,-0.122665,-0.017319,0.060372,-0.049327,0.047486,0.151064,-0.087532,0.150533,0.008041,-0.032695,-0.039066,-0.039877,-0.004224,0.004445,0.010489,0.017666,0.076992,0.009272,-0.014125,0.016549,0.132018,0.003592,0.023962,-0.074207,0.132911,0.088515,0.047677,0.006068,-0.088292,-0.004821,0.026328,0.093638,-0.012713,0.015317,0.032288,0.069446,0.030841,0.060915,-0.061268,-0.057279,0.017419,-0.017211,-0.024124,-0.023163,0.076009,0.031592,0.108365,0.053835,-0.027791,0.023800,0.077351,0.054805,-0.054671,0.158273,-0.008926,0.084062,0.036029,0.061134,-0.125965,-0.251415,0.050898,-0.054511,-0.131701
4,0.047127,0.014325,-0.064551,0.083197,-0.019630,0.084514,0.019399,-0.068587,-0.019620,-0.023196,-0.032098,0.042065,-0.044344,-0.004111,0.011791,0.042352,0.012252,-0.014673,0.057341,0.046910,-0.066897,0.064972,-0.034655,0.005188,0.012922,-0.058052,-0.024093,0.065037,0.021435,0.017271,0.063674,0.043773,-0.126437,-0.050208,-0.049604,0.006511,-0.021220,-0.092931,0.030745,-0.063225,-0.003405,0.044456,0.104145,-0.119646,0.159951,0.089902,-0.059345,-0.029951,-0.065419,0.011514,0.012206,0.014106,-0.036344,0.072171,-0.007067,-0.033601,-0.010010,0.109744,-0.005566,0.056769,-0.086494,0.063378,0.043386,0.049098,0.038811,-0.055674,-0.003815,0.037976,0.064653,0.009235,-0.031680,-0.006529,-0.045138,0.014543,0.093950,-0.083166,-0.187691,0.127421,-0.055424,0.015945,-0.037107,0.021291,-0.023526,0.065614,0.030877,0.035494,0.002108,0.014196,0.086827,-0.009053,0.086523,0.025544,0.056027,0.066404,0.042987,0.004231,-0.157346,0.032868,-0.004603,-0.046409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.072992,0.061421,-0.072507,0.154582,-0.017961,0.092963,-0.008755,-0.155729,0.010407,-0.039185,0.053642,0.069441,-0.040421,-0.027959,0.023054,0.081762,0.041737,0.031252,-0.027263,0.013952,-0.040392,0.029533,-0.005287,-0.006026,-0.019786,-0.023557,-0.109435,0.077050,0.072039,-0.008176,0.080902,0.116137,-0.140159,-0.140259,-0.027932,0.074213,-0.007709,-0.172199,0.035101,-0.029106,0.017739,0.116714,0.113392,-0.167974,0.171166,0.121920,-0.023881,0.019048,-0.070621,-0.017566,-0.046173,0.009177,0.009068,0.062127,-0.016557,-0.102086,-0.003490,0.070844,-0.050503,0.026266,-0.102985,0.097275,0.027578,0.043907,0.036535,-0.085531,-0.041771,0.021548,0.022069,0.015978,-0.063947,0.002273,-0.044897,0.012826,0.096724,0.010717,-0.154603,0.111576,-0.078541,0.019299,0.019054,0.027277,0.004200,0.019383,0.048016,0.032031,0.007084,0.023381,0.057421,0.028810,0.067925,0.049037,0.054886,0.022692,0.003686,-0.001977,-0.181355,0.037239,0.069705,-0.079338
49996,-0.004076,0.051708,-0.037135,0.124179,-0.048005,0.099867,0.029332,-0.094990,-0.027968,-0.030221,0.015495,0.011983,0.001935,-0.068048,0.029728,0.051330,0.033008,-0.001520,0.026196,0.053764,-0.020949,0.046885,-0.009603,0.000946,-0.014247,-0.035876,-0.032827,-0.021135,0.034281,0.036729,0.070477,0.072261,-0.121016,-0.068811,0.002590,0.020815,-0.004962,-0.108317,0.009416,-0.046612,0.018243,0.088364,0.084051,-0.111729,0.146643,0.087691,0.004862,0.021474,-0.054248,0.028745,0.007933,0.035826,-0.007495,0.077300,0.029120,-0.022797,0.041555,0.074839,-0.026332,0.026082,-0.074661,0.090992,0.047141,0.027461,-0.018288,-0.042035,-0.024912,0.023380,0.079701,-0.005610,-0.014284,0.030799,-0.040701,0.012929,0.077573,-0.039890,-0.129100,0.084841,-0.075460,-0.006274,-0.001608,0.017525,0.028175,0.078099,0.035662,0.028442,0.036223,0.021307,0.051086,0.043294,0.105491,0.015424,0.056720,0.019504,0.037504,-0.057102,-0.131954,-0.006964,-0.018257,-0.086776
49997,-0.018976,-0.020095,-0.068835,0.006007,-0.061755,0.080899,0.010507,-0.131596,0.052080,-0.026943,0.023422,0.000984,-0.045519,0.001027,0.000779,0.005445,0.025026,-0.089514,0.071286,0.044633,-0.055256,0.050054,-0.041321,0.037747,0.022783,-0.101310,0.021628,0.024129,0.060667,0.058565,0.132234,0.047025,-0.137007,-0.083749,0.006757,0.055179,-0.004582,-0.144903,0.012800,-0.072154,0.022564,0.028652,0.091591,-0.110621,0.112952,0.049028,-0.001911,-0.013599,-0.097929,0.013328,-0.004766,0.061360,-0.045369,0.050810,0.022519,-0.033835,0.014947,0.111027,-0.043359,0.100139,-0.034196,0.125882,0.048581,0.060136,0.015841,-0.063894,-0.004671,-0.058153,-0.012682,0.036304,-0.026377,-0.019847,-0.035826,0.024743,0.056156,-0.033316,-0.120117,0.132370,-0.067233,0.010890,-0.020816,0.029969,-0.021405,0.127907,0.042843,0.108523,0.059084,0.056659,0.064342,0.064153,0.060485,-0.024174,0.044688,0.093558,0.074229,-0.028566,-0.178525,0.008641,-0.086734,-0.044515
49998,0.010251,-0.001374,-0.035312,0.125206,0.022589,0.069817,0.030666,-0.077698,-0.051827,-0.032351,0.023295,0.015860,-0.040329,0.003414,0.035478,0.062868,0.050423,-0.055791,0.011508,0.063100,-0.026569,0.129161,-0.012539,-0.000037,0.021955,-0.062541,-0.031090,-0.008639,0.055509,0.112502,0.051428,0.043963,-0.046321,-0.115241,-0.027452,0.035859,0.052409,-0.081510,-0.056058,-0.015903,0.015892,0.076639,0.029948,-0.058817,0.078486,0.001241,0.009439,0.075545,-0.054919,-0.004066,-0.012697,0.007541,-0.003380,0.014730,0.032084,-0.021433,0.015778,0.045998,0.026132,0.070968,-0.084706,0.034821,0.025479,0.027231,0.000708,0.059313,-0.014666,-0.013105,0.052927,-0.028899,-0.040568,-0.005057,-0.082594,-0.004659,0.007405,-0.042611,-0.140220,0.092320,-0.019836,-0.017846,-0.077873,0.064523,-0.000938,0.065888,0.053105,0.084587,0.051709,0.134480,0.095011,0.017046,0.116170,0.008665,0.020098,0.124400,0.108705,-0.079640,-0.163437,-0.003368,-0.033252,-0.055717


### Bag of words on raw comments

In [15]:
# Bag-of-words (token count) matrix over the raw comments, using the
# CountVectorizer defaults.
# NOTE(review): the vectorizer is fitted on all rows before test_model()
# makes its train/test split, so the vocabulary also covers test-set
# comments -- confirm this is acceptable for the comparison.
vect_bow = CountVectorizer()
corp_raw_bow = vect_bow.fit_transform(corp_raw)
corp_raw_bow

<50000x56064 sparse matrix of type '<class 'numpy.int64'>'
	with 1911230 stored elements in Compressed Sparse Row format>

### Bag of words on cleaned comments

In [16]:
# Re-fits the existing vect_bow instance on the cleaned comments; after this
# cell its learned vocabulary reflects corp_clean, no longer corp_raw.
corp_bow = vect_bow.fit_transform(corp_clean)
corp_bow

<50000x52776 sparse matrix of type '<class 'numpy.int64'>'
	with 1895064 stored elements in Compressed Sparse Row format>

### Bag of words on preprocessed comments

In [17]:
# Re-fits the existing vect_bow instance on the pre-processed comments; after
# this cell its learned vocabulary reflects corp_pp.
corp_pp_bow = vect_bow.fit_transform(corp_pp)
corp_pp_bow

<50000x43778 sparse matrix of type '<class 'numpy.int64'>'
	with 1022041 stored elements in Compressed Sparse Row format>

### Bag of 1/2-grams on preprocessed comments

In [18]:
# Count matrix of unigrams and bigrams (ngram_range=(1,2)) over the
# pre-processed comments. The recorded output shows this is by far the
# widest feature matrix in the notebook.
vect_bo12grams = CountVectorizer(ngram_range=(1,2))
corp_pp_bo12grams = vect_bo12grams.fit_transform(corp_pp)
corp_pp_bo12grams

<50000x807794 sparse matrix of type '<class 'numpy.int64'>'
	with 2116644 stored elements in Compressed Sparse Row format>

### TF-IDF on cleaned comments

In [19]:
# TF-IDF weighted matrix over the cleaned comments, using the
# TfidfVectorizer defaults.
vect_tfidf = TfidfVectorizer()
corp_tfidf = vect_tfidf.fit_transform(corp_clean)
corp_tfidf

<50000x52776 sparse matrix of type '<class 'numpy.float64'>'
	with 1895064 stored elements in Compressed Sparse Row format>

### TF-IDF on preprocessed comments

In [20]:
# TF-IDF weighted matrix over the pre-processed comments. Rebinds vect_tfidf
# to a new vectorizer, so the name now refers to the one fitted on corp_pp.
vect_tfidf = TfidfVectorizer()
corp_pp_tfidf = vect_tfidf.fit_transform(corp_pp)
corp_pp_tfidf

<50000x43778 sparse matrix of type '<class 'numpy.float64'>'
	with 1022041 stored elements in Compressed Sparse Row format>

## Baseline model (logistic regression)

In [21]:
# Baseline: logistic regression on the raw bag-of-words matrix.
# max_iter is raised to 2000 via the params dict.
params = dict(max_iter=2_000)
lr = LogisticRegression(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    lr, 'BASELINE (logistic regression)', params,
    'bag of words on raw comments', corp_raw_bow, target)
store_test_result(test_result)

## XGBoost experiments

In [22]:
# Experiment: XGBoost (only seed and n_jobs set) on the bag-of-words matrix
# of the cleaned comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'bag of words (cleaned)', corp_bow, target)
store_test_result(test_result)

In [23]:
# Experiment: XGBoost (only seed and n_jobs set) on the bag-of-words matrix
# of the pre-processed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'bag of words (preprocessed)',
    corp_pp_bow, target)
store_test_result(test_result)

In [24]:
# Experiment: XGBoost (only seed and n_jobs set) on the 1/2-gram count
# matrix of the pre-processed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'bag of 1/2-grams (preprocessed)',
    corp_pp_bo12grams, target)
store_test_result(test_result)

In [25]:
# Experiment: XGBoost (only seed and n_jobs set) on the TF-IDF matrix of the
# cleaned comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'tf_idf', corp_tfidf, target)
store_test_result(test_result)

In [26]:
# Experiment: XGBoost (only seed and n_jobs set) on the TF-IDF matrix of the
# pre-processed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'tf_idf (preprocessed)', corp_pp_tfidf, target)
store_test_result(test_result)

In [28]:
# Experiment: same TF-IDF (pre-processed) features as the previous XGBoost
# run, but with n_estimators set to 1000.
params = dict(random_state=42, n_jobs=-1, n_estimators=1000)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'tf_idf (preprocessed)', corp_pp_tfidf, target)
store_test_result(test_result)

In [29]:
# Experiment: XGBoost (only seed and n_jobs set) on the fastText vector
# columns.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'fastText vectors', corp_ft, target)
store_test_result(test_result)

In [30]:
# Experiment: XGBoost on the fastText vector columns with n_estimators set
# to 1000.
params = dict(random_state=42, n_jobs=-1, n_estimators=1000)
xgb = XGBClassifier(**params)

# Evaluate and record the run in the results table.
test_result = test_model(
    xgb, 'XGBoost', params, 'fastText vectors', corp_ft, target)
store_test_result(test_result)

## Show test results + total exec time

In [31]:
# Show the accumulated results table (one row per stored experiment).
test_results

Unnamed: 0,model_name,model_params,data_desc,data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},bag of words on raw comments,50000,56064,0.795,0.846,0.749,0.848,0.901,"[[6827, 673], [1255, 3745]]",0m 4s,
1,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (cleaned),50000,52776,0.766,0.834,0.679,0.879,0.9,"[[7034, 466], [1605, 3395]]",0m 2s,
2,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (preprocessed),50000,43778,0.766,0.833,0.683,0.871,0.906,"[[6996, 504], [1587, 3413]]",0m 1s,
3,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2-grams (preprocessed),50000,807794,0.765,0.832,0.68,0.873,0.905,"[[7003, 497], [1598, 3402]]",0m 13s,
4,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf,50000,52776,0.771,0.837,0.684,0.883,0.9,"[[7046, 454], [1578, 3422]]",0m 9s,
5,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf (preprocessed),50000,43778,0.774,0.837,0.698,0.869,0.904,"[[6976, 524], [1510, 3490]]",0m 7s,
6,XGBoost,"{'random_state': 42, 'n_jobs': -1, 'n_estimato...",tf_idf (preprocessed),50000,43778,0.809,0.855,0.768,0.854,0.922,"[[6844, 656], [1159, 3841]]",0m 56s,
7,XGBoost,"{'random_state': 42, 'n_jobs': -1}",fastText vectors,50000,100,0.705,0.773,0.676,0.735,0.846,"[[6283, 1217], [1618, 3382]]",0m 1s,
8,XGBoost,"{'random_state': 42, 'n_jobs': -1, 'n_estimato...",fastText vectors,50000,100,0.716,0.78,0.694,0.739,0.854,"[[6274, 1226], [1530, 3470]]",0m 11s,


In [32]:
# Wall-clock time elapsed since full_run_time_start was taken in the first
# cell.
full_run_time = time.time() - full_run_time_start

# Round to whole seconds *before* splitting into minutes and seconds.
# Rounding only the remainder could print e.g. '1m 60s' for 119.6 s.
run_minutes, run_seconds = divmod(round(full_run_time), 60)
print(f'Full run time: {run_minutes}m {run_seconds}s')

Full run time: 2m 34s


## Other stuff

### Calculate average comment length

In [33]:
# characters: string length of every cleaned comment, then the mean
comm_len_chars = df['comment_clean'].str.len()
avg_comm_len_chars = comm_len_chars.mean()

# words (rough count): every run of non-whitespace characters is one word
comm_len_words = df['comment_clean'].str.count(r'\S+')
avg_comm_len_words = comm_len_words.mean()

print('Average comment length:')
print(round(avg_comm_len_chars), 'characters')
print(round(avg_comm_len_words), 'words')

Average comment length:
291 characters
51 words


In [None]:
# Count missing values per column.
df.isna().sum()

### Calculate vocabulary size

In [None]:
# TODO: placeholder -- the vocabulary-size calculation announced by the
# heading above is not implemented yet.
pass