# XGBoost experiments (Michael)

## Setup

In [1]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse
import re
import os

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# fastText
import fasttext

# currently not used and thus commented out
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility functions for testing models and tracking results

In [2]:
# Results table, initially empty: one row per experiment, with the columns
# listed below (they mirror the keys of the dict returned by test_model).
test_results = pd.DataFrame(
    columns=[
        # what was run
        'model_name', 'model_params', 'data_desc', 'data_size', 'features_no',
        # scores on the held-out test split
        'f1', 'acc', 'recall', 'prec', 'roc_auc', 'cf_matrix',
        # bookkeeping
        'train_time', 'notes',
    ]
)

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_params, data_desc, X, y, notes='')
    
    Parameters:
    -----------
    model: instance of model to test
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array 
    y: target/label array
    notes: additional notes (default: empty string)
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, stratify=y, random_state=42)
    
    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start
    train_time_str = f'{int(train_time // 60)}m {round(train_time % 60)}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'data_size': X.shape[0],
            'features_no': X.shape[1],
            'f1': round(f1_score(y_test, y_pred), 5),
            'acc': round(accuracy_score(y_test, y_pred), 5),
            'recall': round(recall_score(y_test, y_pred), 5),
            'prec': round(precision_score(y_test, y_pred), 5),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 5),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [3]:
def store_test_result(result):
    '''Add `result` to the module-level `test_results` frame as a new row.'''
    # The new row label is the current row count, i.e. the next free
    # position when the index is the default 0..n-1.
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data

In [4]:
# new cleaned data
# (relative path: resolved against the kernel's working directory)
df = pd.read_csv('data/data_usampl_60_40_comments_cleaned_preproc_fasttext.csv')
# show (rows, columns) as a quick sanity check
df.shape

(360038, 5)

## Optional: Create smaller sample from data to speed up experiments

In [5]:
sample_size = None

# uncomment to create sample of desired size
#sample_size = 50_000

if sample_size is not None:
    # share of toxic rows in the sample; the remainder is nontoxic
    tox_perc = 0.4

    # Number of toxic/nontoxic rows. The nontoxic count is the remainder so
    # both parts always add up to exactly sample_size; truncating two
    # separate float products with int() could lose a row.
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = sample_size - sample_size_tox

    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    # NOTE: df is overwritten here, so re-running this cell without
    # reloading the data would sample from the previous sample.
    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using full data (360038 rows).


## Drop rows with NaNs

In [6]:
# Remember the row count before cleaning so the number of removed rows
# can be reported afterwards.
rows_before = len(df)
print("rows with NaN's before dropping:", rows_before)

# Remove every row that contains a NaN and renumber the index from 0.
df = df.dropna().reset_index(drop=True)

print('rows after:', len(df))
print('rows dropped:', rows_before - len(df))

rows with NaN's before dropping: 360038
rows after: 360038
rows dropped: 0


In [7]:
# column overview: dtypes and non-null counts per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360038 entries, 0 to 360037
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   comment_raw            360038 non-null  object
 1   comment_clean          360038 non-null  object
 2   comment_clean_preproc  360038 non-null  object
 3   ft_vector              360038 non-null  object
 4   toxic                  360038 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 13.7+ MB


In [8]:
# display the frame (pandas abbreviates the middle rows with '...')
df

Unnamed: 0,comment_raw,comment_clean,comment_clean_preproc,ft_vector,toxic
0,"Well, what are the chances he will turn out to...","Well, what are the chances he will turn out to...",chance turn active proponent slavery,[ 4.59616072e-02 6.82545453e-03 -5.34088910e-...,0
1,The moment of critical mass is approaching whe...,The moment of critical mass is approaching whe...,moment critical mass approach deed Gupta Co li...,[ 2.6372470e-02 1.1377232e-02 -3.8242862e-02 ...,0
2,"""Hey listen to me,"" he said. ""I'm not going to...","""Hey listen to me,"" he said. ""I'm not going to...",hey listen say go crap prove reporter say uh a...,[ 0.01168261 0.02205912 0.04105999 0.041691...,1
3,We are already owed $488 M plus interest($2Bil...,We are already owed $_number_ M plus interest(...,owe $ number M plus interest($_number_Billion ...,[-5.34912646e-02 -3.57165299e-02 -3.62071767e-...,0
4,There is a reason there are no teeth to the la...,There is a reason there are no teeth to the la...,reason tooth law unlawful law way force free e...,[-0.00287034 -0.01856179 -0.04622955 0.085087...,0
...,...,...,...,...,...
360033,Do you still beat your wife? Simple question.,Do you still beat your wife? Simple question.,beat wife simple question,[-0.04303486 -0.00332607 0.01728014 0.042077...,0
360034,The fascist dictator continues the insanity ag...,The fascist dictator continues the insanity ag...,fascist dictator continue insanity human civil...,[ 0.03354586 0.00128701 -0.02047029 0.151842...,1
360035,Sean Hannity is a lightweight foolish commenta...,Sean Hannity is a lightweight foolish commenta...,Sean Hannity lightweight foolish commentator F...,[ 0.05396431 0.01479254 0.04120773 0.066942...,0
360036,There are a number of countries which make it ...,There are a number of countries which make it ...,number country impossible national citizenship...,[ 0.08455141 -0.03090713 -0.03692765 0.154166...,0


## Create label/target variable and check for imbalance

In [9]:
# label column used as y in all experiments (1 = toxic, 0 = nontoxic)
target = df['toxic']

In [10]:
# Class frequencies, looked up by label (0 = nontoxic, 1 = toxic)
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]

# Percentage share of each class, rounded to one decimal place
nontoxic_perc = round(
    (nontoxic_count / (nontoxic_count + toxic_count)) * 100, 1)
toxic_perc = round(
    (toxic_count / (nontoxic_count + toxic_count)) * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 215704 (59.9 %)
Toxic (1): 144334 (40.1 %)


## Create various corpora + fastText vectors

### Raw corpus

In [11]:
# the `comment_raw` column as a Series; shape shows one entry per row of df
corp_raw = df['comment_raw']
corp_raw.shape

(360038,)

### Cleaned corpus

In [12]:
# the `comment_clean` column as a Series; shape shows one entry per row of df
corp_clean = df['comment_clean']
corp_clean.shape

(360038,)

### Pre-processed corpus

In [13]:
# the `comment_clean_preproc` column as a Series; one entry per row of df
corp_pp = df['comment_clean_preproc']
corp_pp.shape

(360038,)

### Corpus of fastText vectors

In [14]:
# # If smaller sample: Convert vector string in csv file to df
# # and cast all cols as float. This takes ~50 min for the full 360,000 rows.
# # --> If full data: Load pickle file to save time.

# if sample_size != None:
#     corp_ft = df['ft_vector'].str.strip('[]').str.split(expand=True)
#     corp_ft = corp_ft.astype('float')
#     display(corp_ft)
#     # with open('pickle/ft_vectors.pkl', mode='wb') as f:
#     #     pickle.dump(corp_ft, f)

# else:
#     with open('pickle/ft_vectors.pkl', mode='rb') as f:
#         corp_ft = pickle.load(f)
#     display(corp_ft)

### Bag of words on raw comments

In [15]:
# CountVectorizer with default settings; fit_transform learns the vocabulary
# from corp_raw and returns the sparse document-term count matrix
vect_bow = CountVectorizer()
corp_raw_bow = vect_bow.fit_transform(corp_raw)
corp_raw_bow

<360038x136664 sparse matrix of type '<class 'numpy.int64'>'
	with 13692157 stored elements in Compressed Sparse Row format>

### BOW on cleaned comments

In [16]:
# NOTE: this re-fits the existing `vect_bow` object on corp_clean, replacing
# whatever vocabulary it had learned before; only the returned matrix
# belongs to the cleaned corpus.
corp_bow = vect_bow.fit_transform(corp_clean)
corp_bow

<360038x123862 sparse matrix of type '<class 'numpy.int64'>'
	with 13580297 stored elements in Compressed Sparse Row format>

### BOW on preprocessed comments

In [17]:
# NOTE: `vect_bow` is re-fitted once more, so after this cell its vocabulary
# corresponds to corp_pp.
corp_pp_bow = vect_bow.fit_transform(corp_pp)
corp_pp_bow

<360038x110371 sparse matrix of type '<class 'numpy.int64'>'
	with 7323277 stored elements in Compressed Sparse Row format>

### Bag of 1/2-grams on preprocessed comments

In [18]:
# ngram_range=(1,2): count single tokens and pairs of adjacent tokens;
# the feature count grows accordingly (see the matrix shape below)
vect_bo12grams = CountVectorizer(ngram_range=(1,2))
corp_pp_bo12grams = vect_bo12grams.fit_transform(corp_pp)
corp_pp_bo12grams

<360038x3794074 sparse matrix of type '<class 'numpy.int64'>'
	with 15164277 stored elements in Compressed Sparse Row format>

### TF-IDF on cleaned comments

In [19]:
# TfidfVectorizer with default settings, fitted on the cleaned comments
vect_tfidf = TfidfVectorizer()
corp_tfidf = vect_tfidf.fit_transform(corp_clean)
corp_tfidf

<360038x123862 sparse matrix of type '<class 'numpy.float64'>'
	with 13580297 stored elements in Compressed Sparse Row format>

### TF-IDF on preprocessed comments

In [20]:
# NOTE: rebinds `vect_tfidf` to a fresh vectorizer fitted on corp_pp; any
# vectorizer previously stored under this name is no longer reachable.
vect_tfidf = TfidfVectorizer()
corp_pp_tfidf = vect_tfidf.fit_transform(corp_pp)
corp_pp_tfidf

<360038x110371 sparse matrix of type '<class 'numpy.float64'>'
	with 7323277 stored elements in Compressed Sparse Row format>

### fastText vectors

In [21]:
# create temp file for fastText (train_unsupervised reads from a file path)
ft_tmp_path = 'data/fasttext_training_data_tmp.csv'
corp_clean.to_csv(ft_tmp_path, index=False, header=False)

try:
    # run unsupervised learning to get embeddings
    ft = fasttext.train_unsupervised(ft_tmp_path)
finally:
    # delete temp file even when training raises, so no stale copy of the
    # corpus is left behind in data/
    os.remove(ft_tmp_path)

Read 18M words
Number of words:  102420
Number of labels: 0
Progress: 100.0% words/sec/thread:   78691 lr:  0.000000 avg.loss:  1.736292 ETA:   0h 0m 0s


In [22]:
# one fastText sentence vector (1-D array) per cleaned comment
corp_clean_ft = corp_clean.map(ft.get_sentence_vector)

# Stack the per-comment vectors into a (comments x dimensions) frame that
# keeps the comment index. Stacking the arrays directly avoids building a
# frame with one column per comment and transposing it afterwards.
corp_clean_ft = pd.DataFrame(np.vstack(corp_clean_ft.tolist()),
                             index=corp_clean_ft.index)

corp_clean_ft

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.083202,-0.005201,-0.051713,0.006153,0.000078,0.079970,-0.041399,0.011427,0.040270,-0.022525,0.084457,0.089721,-0.016865,-0.010213,0.037416,0.024682,0.038822,0.054758,-0.026101,0.070488,-0.092043,0.001078,0.002308,-0.080840,0.007465,-0.022049,0.023298,0.089430,-0.012651,-0.097851,-0.003340,0.014615,0.015570,0.045052,-0.005077,0.047528,0.077082,-0.051437,0.061714,-0.060680,0.014307,0.016460,0.038675,-0.077703,0.044979,-0.010690,-0.038357,0.004304,0.017187,0.010453,-0.066871,0.102511,-0.117991,0.047338,-0.062362,0.145812,-0.013677,-0.101109,-0.112485,-0.153507,0.118857,0.104961,0.064911,-0.064177,0.008293,-0.002243,-0.077147,0.012315,-0.051776,-0.068964,-0.035597,0.018380,0.043746,0.040413,-0.059256,-0.025017,0.061146,-0.092445,-0.037445,-0.025365,0.159053,0.069898,0.042395,0.059438,-0.009883,-0.086683,0.126343,0.018057,-0.024816,-0.032165,0.020053,-0.062719,0.071218,0.050363,-0.133128,-0.065167,0.008821,0.050484,0.018988,-0.048591
1,0.053102,0.012319,-0.051832,-0.038495,-0.008992,0.084995,-0.029874,-0.023160,0.047822,0.005237,0.069121,0.118699,-0.012345,-0.034666,0.097277,0.029028,0.019159,0.072306,-0.031083,0.065965,-0.120953,0.006385,-0.010826,-0.060029,0.010435,-0.003612,-0.001583,0.070615,-0.048847,-0.078878,0.015428,0.041450,-0.023084,0.016647,-0.038719,0.073527,0.036474,-0.023914,0.032681,-0.051112,0.009641,-0.034146,0.043814,-0.077696,0.064140,-0.035666,-0.002674,0.019759,0.017303,-0.003499,-0.054480,0.055100,-0.140397,0.053358,-0.092014,0.117245,-0.036001,-0.095270,-0.112199,-0.104288,0.079268,0.077162,0.061409,-0.059238,-0.022129,-0.002426,-0.092081,0.020220,-0.074941,-0.077928,-0.048170,-0.014433,0.032600,0.077923,-0.069938,-0.007420,0.081750,-0.079845,-0.031611,-0.007988,0.092075,0.029710,0.053753,0.051925,-0.012925,-0.069880,0.054354,0.027516,0.002580,0.006332,0.031405,-0.109601,0.065893,0.049416,-0.115510,-0.081005,-0.024884,0.068638,-0.002618,0.020960
2,0.104128,-0.012421,-0.077930,-0.012344,-0.024223,0.054467,-0.040239,0.022507,0.095994,-0.000493,0.082927,0.080321,0.012016,-0.033098,0.039933,0.016107,-0.033966,0.084462,0.001099,0.052757,-0.125216,0.003641,0.019055,-0.033602,0.032815,-0.069327,0.041096,0.100424,-0.009382,-0.098583,-0.023857,0.053690,-0.005711,0.072066,-0.037703,0.038918,0.090917,-0.059864,0.057834,-0.028079,-0.004614,0.029118,0.033592,-0.067889,0.058457,-0.031744,-0.072802,0.025773,-0.022394,0.024422,-0.071256,0.059518,-0.105010,0.047130,-0.054758,0.098941,-0.105822,-0.071341,-0.161005,-0.134298,0.139845,0.144496,0.078291,-0.032247,0.018734,0.007270,-0.056012,0.029946,-0.039600,-0.056312,-0.042419,0.033311,0.063426,-0.025051,-0.033417,-0.006827,0.080186,-0.040309,-0.034874,-0.009030,0.122564,0.086479,0.062833,0.054068,-0.012943,-0.041871,0.097711,0.005183,-0.018188,-0.000337,0.001816,-0.004593,0.046642,0.034176,-0.083670,-0.061856,-0.036904,0.032579,-0.011701,-0.086980
3,0.037731,-0.034530,-0.075974,-0.037123,0.002940,0.102008,-0.043351,0.015969,0.042302,0.020489,0.041073,0.099202,-0.014576,-0.047065,0.127924,0.049131,0.046057,0.100627,-0.010978,0.098664,-0.088020,-0.008669,-0.001478,-0.051331,0.019168,-0.015570,-0.006856,0.048896,0.019093,-0.057744,-0.055913,0.029538,0.015527,0.052798,-0.021703,0.037257,0.032964,-0.019088,0.023130,-0.015703,0.039143,-0.040474,0.033402,-0.108725,0.043577,-0.062751,-0.013638,0.027323,0.013768,0.002364,-0.049083,0.075292,-0.093460,0.061991,-0.042798,0.055168,-0.069576,-0.108306,-0.123684,-0.115329,0.121963,0.119006,0.084960,-0.051673,-0.015786,-0.006749,-0.037367,-0.010039,-0.022203,-0.068922,-0.023228,-0.032396,-0.024443,0.048942,-0.027917,0.025932,0.043702,-0.062468,-0.020479,0.019087,0.167778,0.041165,0.072809,0.053825,-0.004265,-0.095071,0.070861,0.022013,-0.018728,-0.019261,0.029231,-0.082256,0.102700,0.055675,-0.123160,-0.065462,-0.016375,0.056967,0.027287,-0.002505
4,0.084314,0.008045,-0.059784,-0.023783,0.042135,0.097386,-0.067314,0.005421,0.049479,0.020861,0.117494,0.093519,-0.006785,-0.008236,0.071363,0.028648,0.015459,0.085883,-0.023081,0.111787,-0.108945,0.014471,0.020279,-0.050475,0.001908,-0.053114,0.019329,0.121922,-0.024901,-0.077498,-0.015457,0.057843,0.013211,0.038213,-0.032844,0.045617,0.047635,-0.032450,0.040624,-0.022483,0.054156,0.014466,0.023429,-0.081320,0.056779,-0.044134,-0.055715,0.041352,0.045690,0.005639,-0.074433,0.066608,-0.107198,0.010690,-0.061616,0.115163,-0.073024,-0.088545,-0.153160,-0.131318,0.116824,0.099421,0.072777,-0.025769,-0.012478,-0.012109,-0.067874,-0.008913,-0.071023,-0.060791,-0.042368,0.031736,0.035543,0.018550,-0.040747,-0.022087,0.038958,-0.066330,-0.067927,-0.002821,0.112678,0.049009,0.072658,0.087922,0.011663,-0.066166,0.069626,0.034667,-0.008098,-0.003714,0.019379,-0.038464,0.087775,0.033400,-0.093653,-0.030516,-0.016364,0.032484,0.019146,-0.002262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360033,0.173052,-0.015329,-0.052668,0.053407,-0.003576,0.126186,-0.066171,0.007807,0.119883,0.050172,0.043530,0.046850,0.049900,-0.003743,0.061169,0.014926,-0.016637,0.050152,0.039375,0.029072,-0.087316,0.001273,0.121648,-0.050722,0.007789,-0.062760,0.009473,0.078654,-0.062882,-0.114776,-0.023408,0.046707,-0.060382,0.034102,-0.036083,-0.065043,0.016374,-0.050555,-0.020147,-0.009126,0.010561,0.001750,-0.011071,-0.076066,0.104734,-0.011636,-0.073274,0.017222,-0.065366,0.006856,-0.091451,0.076849,-0.123687,0.050776,-0.031975,0.044255,-0.103599,-0.103348,-0.150557,-0.123896,0.133789,0.125102,0.058920,-0.057523,-0.045916,-0.030772,-0.116472,0.066094,-0.001119,-0.052825,0.022982,0.096684,0.038671,-0.028185,-0.045104,-0.042779,0.044048,-0.026447,-0.025297,0.000813,0.132366,0.035898,0.002328,0.061374,-0.006222,0.016553,0.074280,-0.013850,0.024040,0.012312,0.017815,0.028677,0.046255,0.019131,-0.086118,-0.120816,0.011406,0.023889,-0.027355,-0.075936
360034,0.105307,0.034959,-0.047209,-0.033156,0.030558,0.097833,-0.016263,0.009328,0.090392,0.023466,0.041508,0.092432,-0.061955,-0.060828,0.067516,0.047988,0.029216,0.087723,-0.039454,0.054184,-0.179318,0.027261,-0.003220,-0.080725,0.063984,-0.017694,-0.042354,0.097705,-0.043922,-0.065904,-0.007299,0.024742,-0.051552,0.028059,-0.015772,0.020474,0.014894,0.021831,0.035592,-0.069189,0.006137,-0.003605,-0.017622,-0.056612,0.058397,0.014319,-0.029586,-0.015357,0.007279,-0.010232,-0.023088,0.045590,-0.079648,0.086342,-0.054069,0.047401,-0.003228,-0.050387,-0.030273,-0.088795,0.056809,0.055350,0.016138,-0.048127,0.044105,0.034019,-0.031548,0.022854,-0.049754,-0.051335,-0.087892,-0.013776,0.026097,0.070420,-0.033527,0.011923,0.063898,-0.086537,-0.044937,-0.009319,0.160213,0.077029,0.112011,0.064123,0.021129,-0.087406,0.104488,0.046123,-0.019200,0.019404,0.063106,-0.110643,0.061767,0.104749,-0.082534,-0.067644,-0.012578,0.035838,-0.009081,-0.040426
360035,0.080155,-0.021127,-0.027745,-0.011018,0.009617,0.056855,-0.015209,-0.024446,0.084388,-0.003865,0.081248,0.077297,-0.017943,-0.067737,0.077523,0.015142,0.000516,0.088794,-0.000439,0.072673,-0.141035,-0.017050,0.008835,-0.066915,0.031467,-0.030445,0.042356,0.089509,-0.055122,-0.089548,-0.024664,0.035145,-0.065681,0.060111,-0.003226,0.071479,0.041343,-0.040205,0.047826,-0.033139,-0.019943,-0.035488,0.043944,-0.041394,0.097888,-0.061163,-0.068095,0.036011,-0.012948,0.019397,-0.079383,0.034203,-0.135181,0.071204,-0.046466,0.128034,-0.059511,-0.080046,-0.108623,-0.145713,0.064974,0.111029,0.049248,-0.043996,0.033387,0.004183,-0.082300,-0.004027,-0.054746,-0.082872,-0.048252,-0.020187,0.007486,0.069371,-0.038435,0.027344,0.069381,-0.089804,-0.071058,0.033135,0.125987,0.065568,0.031233,0.024794,-0.018881,-0.070463,0.082542,-0.001393,-0.063814,0.045269,0.000141,-0.036666,0.061268,0.049894,-0.044833,-0.075222,-0.028748,0.012993,-0.023951,-0.051810
360036,0.032178,-0.000325,-0.069736,-0.019694,0.029540,0.066460,-0.057857,0.036444,0.071078,0.022606,0.076120,0.096371,0.008606,-0.011928,0.061428,0.039525,0.004820,0.073819,-0.035359,0.078704,-0.102599,0.000292,0.035441,-0.034911,0.017612,-0.034191,0.012164,0.096911,-0.018811,-0.083534,0.003212,0.064681,-0.010106,0.070017,-0.022433,0.026067,0.063787,-0.031939,0.060093,-0.049516,0.042572,-0.014154,0.023886,-0.088346,0.020744,-0.024800,-0.041222,0.090231,0.008597,0.019820,-0.077288,0.036485,-0.106994,0.023453,-0.080356,0.132167,-0.075941,-0.085017,-0.142421,-0.140958,0.109822,0.083316,0.101136,-0.000789,-0.017585,-0.031227,-0.057177,0.009468,-0.013037,-0.056513,-0.027192,-0.004815,-0.003708,0.064943,-0.022716,-0.012443,0.049447,-0.041728,-0.026038,-0.013003,0.179959,0.052704,0.070612,0.079402,0.006432,-0.089764,0.116761,0.043480,-0.011542,-0.019180,0.039030,-0.071974,0.105385,0.058694,-0.155209,-0.045884,-0.007210,0.067340,-0.001060,-0.006787


In [23]:
# Same procedure as for the cleaned comments, here on the preprocessed text:
# dump the corpus to a temp file, train fastText on it, remove the file.
ft_tmp_path = 'data/fasttext_training_data_tmp.csv'
corp_pp.to_csv(ft_tmp_path, index=False, header=False)
try:
    ft = fasttext.train_unsupervised(ft_tmp_path)
finally:
    # remove the temp file even when training raises
    os.remove(ft_tmp_path)

# one sentence vector per preprocessed comment ...
corp_clean_preproc_ft = corp_pp.map(ft.get_sentence_vector)
# ... stacked into a (comments x dimensions) frame with the original index;
# avoids the one-column-per-comment frame plus transpose
corp_clean_preproc_ft = pd.DataFrame(
    np.vstack(corp_clean_preproc_ft.tolist()),
    index=corp_clean_preproc_ft.index)

Read 8M words
Number of words:  42082
Number of labels: 0
Progress: 100.0% words/sec/thread:   77231 lr:  0.000000 avg.loss:  2.163435 ETA:   0h 0m 0s


In [24]:
# remove chars that prevent fastText from training
# (line breaks are replaced by a single space; note that corp_raw itself is
# overwritten with the result)
regex = r'[\n\r]'
corp_raw = corp_raw.str.replace(regex, ' ', regex=True, case=False)

# dump the corpus to a temp file, train fastText on it, remove the file
ft_tmp_path = 'data/fasttext_training_data_tmp.csv'
corp_raw.to_csv(ft_tmp_path, index=False, header=False)
try:
    ft = fasttext.train_unsupervised(ft_tmp_path)
finally:
    # remove the temp file even when training raises
    os.remove(ft_tmp_path)

# one sentence vector per raw comment ...
corp_raw_ft = corp_raw.map(ft.get_sentence_vector)
# ... stacked into a (comments x dimensions) frame with the original index;
# avoids the one-column-per-comment frame plus transpose
corp_raw_ft = pd.DataFrame(np.vstack(corp_raw_ft.tolist()),
                           index=corp_raw_ft.index)

Read 18M words
Number of words:  104206
Number of labels: 0
Progress: 100.0% words/sec/thread:   82801 lr:  0.000000 avg.loss:  1.676684 ETA:   0h 0m 0s


## Baseline model (logistic regression) on raw comments

In [25]:
# Baseline hyper-parameters: only the iteration cap is set explicitly
params = dict(max_iter=2_000)
lr = LogisticRegression(**params)

# Fit/evaluate on the raw bag-of-words matrix and log the result row
test_result = test_model(
    model=lr,
    model_name='BASELINE (logistic regression)',
    model_params=params,
    data_desc='bag of words on raw comments',
    X=corp_raw_bow,
    y=target,
)
store_test_result(test_result)

## XGBoost experiments

In [26]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the cleaned bag-of-words matrix and log the result row
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='bag of words (cleaned)',
    X=corp_bow,
    y=target,
)
store_test_result(test_result)

In [27]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the preprocessed bag-of-words matrix and log the result row
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='bag of words (preprocessed)',
    X=corp_pp_bow,
    y=target,
)
store_test_result(test_result)

In [28]:
# # parameters for model
# params = {'random_state': 42,
#           'n_jobs': -1}

# # load model with parameters
# xgb = XGBClassifier(**params)

# test_result = test_model(xgb, 'XGBoost', params,
#                          'bag of 1/2-grams (preprocessed)',
#                          corp_pp_bo12grams, target)
# store_test_result(test_result)

In [29]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the TF-IDF matrix of the cleaned comments and log the row
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='tf_idf (cleaned)',
    X=corp_tfidf,
    y=target,
)
store_test_result(test_result)

In [30]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the TF-IDF matrix of the preprocessed comments and log it
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='tf_idf (preprocessed)',
    X=corp_pp_tfidf,
    y=target,
)
store_test_result(test_result)

In [31]:
# # parameters for model
# params = {'random_state': 42,
#           'n_jobs': -1,
#           'n_estimators': 1000}

# # load model with parameters
# xgb = XGBClassifier(**params)

# test_result = test_model(xgb, 'XGBoost', params, 'tf_idf (preprocessed)',
#                          corp_pp_tfidf, target)
# store_test_result(test_result)

In [32]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the fastText vectors of the cleaned comments and log it
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='fastText vectors (cleaned)',
    X=corp_clean_ft,
    y=target,
)
store_test_result(test_result)

In [33]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the fastText vectors of the raw comments and log it
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='fastText vectors (raw)',
    X=corp_raw_ft,
    y=target,
)
store_test_result(test_result)

In [34]:
# XGBoost with library defaults apart from a fixed seed and n_jobs
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Fit/evaluate on the fastText vectors of the preprocessed comments and log it
test_result = test_model(
    model=xgb,
    model_name='XGBoost',
    model_params=params,
    data_desc='fastText vectors (preprocessed)',
    X=corp_clean_preproc_ft,
    y=target,
)
store_test_result(test_result)

## Show test results + total exec time

In [35]:
# one row per experiment stored via store_test_result, in execution order
test_results

Unnamed: 0,model_name,model_params,data_desc,data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},bag of words on raw comments,360038,136664,0.82547,0.86655,0.78722,0.86762,0.92668,"[[49592, 4334], [7678, 28406]]",0m 41s,
1,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (cleaned),360038,123862,0.77778,0.84186,0.69033,0.8906,0.91189,"[[50866, 3060], [11174, 24910]]",0m 6s,
2,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (preprocessed),360038,110371,0.77338,0.8387,0.68659,0.8853,0.91343,"[[50716, 3210], [11309, 24775]]",0m 5s,
3,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf (cleaned),360038,123862,0.77889,0.84255,0.69177,0.89112,0.91168,"[[50876, 3050], [11122, 24962]]",1m 26s,
4,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf (preprocessed),360038,110371,0.78756,0.84674,0.70862,0.88628,0.91686,"[[50645, 3281], [10514, 25570]]",0m 53s,
5,XGBoost,"{'random_state': 42, 'n_jobs': -1}",fastText vectors (cleaned),360038,100,0.70315,0.77233,0.67262,0.73658,0.84523,"[[45246, 8680], [11813, 24271]]",0m 4s,
6,XGBoost,"{'random_state': 42, 'n_jobs': -1}",fastText vectors (raw),360038,100,0.69646,0.7683,0.66306,0.73341,0.84318,"[[45229, 8697], [12158, 23926]]",0m 4s,
7,XGBoost,"{'random_state': 42, 'n_jobs': -1}",fastText vectors (preprocessed),360038,100,0.72267,0.78571,0.69643,0.75096,0.86165,"[[45592, 8334], [10954, 25130]]",0m 4s,


In [36]:
# wall-clock time since the very first statement of the notebook
full_run_time = time.time() - full_run_time_start

# Round to whole seconds first, then split into minutes and seconds.
# Rounding only the remainder could print '... 60s' instead of rolling over
# to the next minute.
full_run_min, full_run_sec = divmod(round(full_run_time), 60)
print(f'Full run time: {full_run_min}m {full_run_sec}s')

Full run time: 12m 13s


## Other stuff

### Calculate average comment length

In [37]:
# characters per comment (vectorised string length instead of a per-row lambda)
comm_len_chars = df['comment_clean'].str.len()
avg_comm_len_chars = comm_len_chars.mean()

# words (rough count): number of runs of non-whitespace characters
comm_len_words = df['comment_clean'].str.count(r'\S+')
avg_comm_len_words = comm_len_words.mean()

print('Average comment length:')
print(round(avg_comm_len_chars), 'characters')
print(round(avg_comm_len_words), 'words')

Average comment length:
289 characters
50 words


### Calculate vocabulary size

In [38]:
pass