# XGBoost experiments

## Setup

In [15]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# currently not used and thus commented out
#import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility function for testing models and tracking results

In [16]:
# empty df for storing results
test_results = pd.DataFrame(columns=['model_name',
                                'model_params',
                                'data_desc',
                                'train_data_size',
                                'features_no',
                                'f1',
                                'acc',
                                'recall',
                                'prec',
                                'roc_auc',
                                'cf_matrix',
                                'train_time',
                                'notes'])

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_params, data_desc, X, y, notes='')
    
    Parameters:
    -----------
    model: instance of model to test
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array
    y: target/label array
    notes: additional notes (default: empty string)
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, stratify=y, random_state=42)
    
    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start
    train_time_str = f'{int(train_time // 60)}m {round(train_time % 60)}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'train_data_size': X_train.shape[0],
            'features_no': X_train.shape[1],
            'f1': round(f1_score(y_test, y_pred), 3),
            'acc': round(accuracy_score(y_test, y_pred), 3),
            'recall': round(recall_score(y_test, y_pred), 3),
            'prec': round(precision_score(y_test, y_pred), 3),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 3),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [17]:
def store_test_result(result):
    test_results.loc[len(test_results)] = result

## Load data

In [18]:
df = pd.read_csv('data/lexiguard_data.csv')
df.shape

(9964, 6)

## Optional: Create smaller sample from data to speed up experiments

In [21]:
sample_size = None

# uncomment to create sample of desired size
#sample_size = 25_000

if sample_size != None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 0.6

    # number of toxic/nontoxic rows
    sample_size_tox = int(sample_size * tox_perc)
    sample_size_nontox = int(sample_size * nontox_perc)

    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using full data (9964 rows).


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9964 entries, 0 to 9963
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   raw                  9964 non-null   object
 1   clean                9964 non-null   object
 2   clean_pp             9964 non-null   object
 3   clean_pp_lemma       9964 non-null   object
 4   clean_pp_lemma_stop  9964 non-null   object
 5   toxic                9964 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 467.2+ KB


In [23]:
df.head()

Unnamed: 0,raw,clean,clean_pp,clean_pp_lemma,clean_pp_lemma_stop,toxic
0,Naturally you can feel it in your urine. \nTha...,Naturally you can feel it in your urine. That'...,naturally you can feel it in your urine that '...,naturally you can feel it in your urine that b...,naturally feel urine common expression germani...,0
1,Yum! What's not to love: water+maple syrup - ...,Yum! What's not to love: water+maple syrup - t...,yum what 's not to love water+maple syrup toge...,yum what be not to love water+maple syrup toge...,yum love water+maple syrup,0
2,Catou I will wager that mutual fund sales will...,Catou I will wager that mutual fund sales will...,catou i will wager that mutual fund sales will...,catou i will wager that mutual fund sale will ...,catou wager mutual fund sale strong rrsp seaso...,0
3,"""The shortage of priests is not a shortage of ...","""The shortage of priests is not a shortage of ...",the shortage of priests is not a shortage of v...,the shortage of priest be not a shortage of vo...,shortage priest shortage vocation shortness si...,0
4,I dont disagree with that. It takes money to d...,I dont disagree with that. It takes money to d...,i do nt disagree with that it takes money to d...,i do not disagree with that it take money to d...,not disagree take money deal drug street level...,0


## Create label/target variable and check for imbalance

In [24]:
target = df['toxic']

In [25]:
value_counts = target.value_counts()
nontoxic_count = value_counts[0]
toxic_count = value_counts[1]
nontoxic_perc =\
    round((nontoxic_count / (nontoxic_count + toxic_count)) * 100, 1)
toxic_perc =\
    round((toxic_count / (nontoxic_count + toxic_count)) * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 9123 (91.6 %)
Toxic (1): 841 (8.4 %)


## Create various corpora

### Bag of words (default)

In [28]:
vect_bow = CountVectorizer()
corpus_bow = vect_bow.fit_transform(df['clean_pp_lemma_stop'])
corpus_bow

<9964x20742 sparse matrix of type '<class 'numpy.int64'>'
	with 201189 stored elements in Compressed Sparse Row format>

In [29]:
# output just a small number of features, or else kernel crashes while converting
# sparse matrix to array
n_words = 100
pd.DataFrame(data=corpus_bow[:, 10000:10000+n_words].toarray(),
             columns=vect_bow.get_feature_names_out()[10000:10000+n_words])

Unnamed: 0,kaua,kauai,kaui,kaulukou,kavilina,kawamoto,kbay,kc,kcc,kd,kea,keala,kealoha,kealohas,kearny,keating,keaukaha,keds,keelikolani,keen,keep,keeper,keeping,keg,keiki,keillor,keisha,keith,keller,kelley,kelli,kellie,kelly,kellyanne,kelowna,kelp,ken,kenai,kendro,kendzior,kenmore,kennedy,kenneth,kenney,kenny,kensington,kent,kentucky,kenya,kenyan,kenyans,kermit,kerouac,kerry,kershaw,kesey,kessel,ketchikan,kevin,kevlar,key,keyboard,keystone,keyword,kfc,kfile,kgb,khadr,khan,khanya,kia,kick,kickback,kicker,kickin,kickoff,kid,kidding,kiddo,kiddy,kidnap,kidnaper,kidney,kids,kiellor,kieran,kiev,kiewett,kiko,kilborn,kile,kill,killer,killers,killing,kiloton,kilt,kim,kimber,kimberly
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9959,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Bag of words (binary)

In [31]:
vect_bow_bin = CountVectorizer(binary=True)
corpus_bow_bin = vect_bow_bin.fit_transform(df['clean_pp_lemma_stop'])
corpus_bow_bin

<9964x20742 sparse matrix of type '<class 'numpy.int64'>'
	with 201189 stored elements in Compressed Sparse Row format>

### Bag of words (default) on preprocessed comments (lemmatization, stopword and punctuation removal)

In [17]:
vect_bow_pp = CountVectorizer()
corpus_bow_pp = vect_bow_pp.fit_transform(df['stopwords_punct_lemma'])
corpus_bow_pp

<360301x123598 sparse matrix of type '<class 'numpy.int64'>'
	with 7419284 stored elements in Compressed Sparse Row format>

### Bag of 1/2-grams (default) on preprocessed comments

In [18]:
vect_bo12grams = CountVectorizer(ngram_range=(1,2))
corpus_bo12grams = vect_bo12grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo12grams

<360301x3943310 sparse matrix of type '<class 'numpy.int64'>'
	with 15332522 stored elements in Compressed Sparse Row format>

### Bag of 1/2/3-grams (default) on preprocessed comments

In [19]:
vect_bo123grams = CountVectorizer(ngram_range=(1,3))
corpus_bo123grams = vect_bo123grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo123grams

<360301x11022914 sparse matrix of type '<class 'numpy.int64'>'
	with 22998333 stored elements in Compressed Sparse Row format>

### Bag of 2-grams (default) on preprocessed comments

In [20]:
vect_bo2grams = CountVectorizer(ngram_range=(2,2))
corpus_bo2grams = vect_bo2grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo2grams

<360301x3819712 sparse matrix of type '<class 'numpy.int64'>'
	with 7913238 stored elements in Compressed Sparse Row format>

### Tf_idf

In [21]:
vect_tfidf = TfidfVectorizer()
corpus_tfidf = vect_tfidf.fit_transform(df['comment_text'])
corpus_tfidf

<360301x136896 sparse matrix of type '<class 'numpy.float64'>'
	with 13695102 stored elements in Compressed Sparse Row format>

In [22]:
# output just a small number of features, else kernel crashes
n_words = 100
pd.DataFrame(data=corpus_tfidf[:, 10000:10000+n_words].toarray(),
             columns=vect_tfidf.get_feature_names_out()[10000:10000+n_words])

Unnamed: 0,alternative,alternativecheckmate,alternativefacts,alternatively,alternatives,alternator,alternave,alternet,alters,althea,althealthworks,althewhile,althletes,altho,althogh,although,althought,altima,altimetry,altin,altitude,altitudes,altleft,altletnative,altman,altmed,alto,altogether,altogther,alton,altonbakerpark,altoona,altough,altra,altrettanto,altria,altright,altrightpub,altrightpubs,altrite,altruism,altruist,altruistic,altruistically,altruists,alts,altshiler,altuve,altzheimers,alu,alum,aluminium,aluminum,alumn,alumna,alumnae,alumni,alumnists,alumnus,alums,aluta,alutiiq,alva,alvarado,alvarez,alvaro,alveda,alvey,alvin,alvord,alvq8dek8ms,alwaleed,alwasy,alwat,alway,always,always_skeptical,alwaysconstitution,alwaysnowhere,alwayspuzzled,alwaysthere,alwaze,alwyas,aly,alyeska,alyeskaresort,alyosha,alyssa,alzheimer,alzheimers,alzner,alzres,am,am0,am9ng,ama,amab,amabantu,amabhungane,amabungane
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Tf_idf on preprocessed comments (lemmatization, stopword and punctuation removal)

In [23]:
vect_tfidf_pp = TfidfVectorizer()
corpus_tfidf_pp = vect_tfidf_pp.fit_transform(df['stopwords_punct_lemma'])
corpus_tfidf_pp

<360301x123598 sparse matrix of type '<class 'numpy.float64'>'
	with 7419284 stored elements in Compressed Sparse Row format>

## Baseline model (logistic regression)

In [24]:
# parameters for model
params = {'max_iter': 2_000}

# load model with parameters
lr = LogisticRegression(**params)

test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words', corpus_bow, target)
store_test_result(test_result)

## XGBoost experiments

In [25]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params, 'bag of words',
                         corpus_bow, target)
store_test_result(test_result)

In [26]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params, 'bag of words (binary)',
                         corpus_bow_bin, target)
store_test_result(test_result)

In [28]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params, 'bag of words (preprocessed)',
                         corpus_bow_pp, target)
store_test_result(test_result)

In [29]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params,
                         'bag of 1/2-grams (preprocessed)',
                         corpus_bo12grams, target)
store_test_result(test_result)

In [30]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params,
                         'bag of 1/2/3-grams (preprocessed)',
                         corpus_bo123grams, target)
store_test_result(test_result)

In [31]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params,
                         'bag of 2-grams (preprocessed)',
                         corpus_bo2grams, target)
store_test_result(test_result)

In [32]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params, 'tf_idf',
                         corpus_tfidf, target)
store_test_result(test_result)

In [33]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params, 'tf_idf (preprocessed)',
                         corpus_tfidf_pp, target)
store_test_result(test_result)

In [34]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1,
          'n_estimators': 1000}

# load model with parameters
xgb = XGBClassifier(**params)

test_result = test_model(xgb, 'XGBoost', params, 'tf_idf (preprocessed)',
                         corpus_tfidf_pp, target)
store_test_result(test_result)

## Show test results + total exec time

In [37]:
test_results

Unnamed: 0,model_name,model_params,data_desc,train_data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},bag of words,270225,136896,0.827,0.868,0.785,0.872,0.928,"[[49847, 4145], [7750, 28334]]",0m 42s,
1,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words,270225,136896,0.775,0.841,0.686,0.891,0.91,"[[50962, 3030], [11313, 24771]]",0m 6s,
2,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (binary),270225,136896,0.774,0.84,0.684,0.891,0.91,"[[50985, 3007], [11416, 24668]]",0m 5s,
3,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (mixed case),270225,180483,0.755,0.83,0.655,0.892,0.899,"[[51138, 2854], [12453, 23631]]",0m 7s,
4,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (preprocessed),270225,123598,0.77,0.837,0.682,0.885,0.912,"[[50802, 3190], [11485, 24599]]",0m 5s,
5,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2-grams (preprocessed),270225,3943310,0.771,0.838,0.683,0.885,0.912,"[[50802, 3190], [11428, 24656]]",2m 33s,
6,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2/3-grams (preprocessed),270225,11022914,0.77,0.837,0.681,0.886,0.912,"[[50834, 3158], [11512, 24572]]",40m 27s,
7,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 2-grams (preprocessed),270225,3819712,0.313,0.664,0.192,0.862,0.623,"[[52884, 1108], [29172, 6912]]",17m 11s,
8,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf,270225,136896,0.779,0.843,0.691,0.893,0.912,"[[51012, 2980], [11152, 24932]]",0m 47s,
9,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf (preprocessed),270225,123598,0.785,0.846,0.704,0.888,0.916,"[[50779, 3213], [10671, 25413]]",0m 35s,


In [38]:
full_run_time = time.time() - full_run_time_start
print(f'Full run time: {int(full_run_time // 60)}m {round(full_run_time % 60)}s')

Full run time: 193m 26s
