# Custom predictions using fastText + XGBoost (Michael)

## Setup

In [1]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse
import re
import os

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# fastText
import fasttext

# spaCy
import spacy

# currently not used and thus commented out
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Load data

In [2]:
df = pd.read_csv('data/data_usampl_60_40_comments_cleaned_preproc_fasttext.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360835 entries, 0 to 360834
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   comment_raw            360835 non-null  object
 1   comment_clean          360603 non-null  object
 2   comment_clean_preproc  360038 non-null  object
 3   ft_vector              360835 non-null  object
 4   toxic                  360835 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 13.8+ MB


## Create label/target variable + basic corpus variables

In [4]:
target = df['toxic']

corp_raw = df['comment_raw']
corp_clean = df['comment_clean']
corp_pp = df['comment_clean_preproc']

## Train fastText

In [5]:
# choose from corp_raw, corp_clean and corp_pp for different fastText
# training sets
corp2ft = corp_pp

In [7]:
# remove chars that prevent fastText from training
#if corp2ft is corp_raw:
regex = r'[\n\r]'
corp_raw = corp_raw.str.replace(regex, ' ', regex=True, case=False)

# create temp file for fastText
corp2ft.to_csv('data/fasttext_training_data_tmp.csv',
               index=False, header=False)

# run unsupervised learning to get embeddings
ft = fasttext.train_unsupervised('data/fasttext_training_data_tmp.csv')

# delete temp file
os.remove('data/fasttext_training_data_tmp.csv')

# get fastText vectors
corp_ft = corp2ft.map(ft.get_sentence_vector)

# convert series of lists to df
corp_ft = pd.DataFrame\
    .from_dict(dict(zip(corp_ft.index, corp_ft.values))).T

corp_ft

Read 8M words
Number of words:  42083
Number of labels: 0
Progress: 100.0% words/sec/thread:   63546 lr:  0.000000 avg.loss:  2.163434 ETA:   0h 0m 0s


AttributeError: 'float' object has no attribute 'find'

## Train baseline model on raw BOW

In [None]:
vect_bow = CountVectorizer()
corp_raw_bow = vect_bow.fit_transform(corp_raw)

lr = LogisticRegression(max_iter=2000)
lr.fit(corp_raw_bow, target)

## Define custom strings to do tox detection on

In [None]:
custom_strings = pd.Series(np.array([
    'I hate gay people!',
    'Lesbians suck.',
    'You\'re a pussy and you know it.',
    'Fuck you!',
    'Up your ass.',
    'I really like people.',
    'Hello world!',
    'Gay men are great!'
]))

## Predict: BOW (raw) + baseline model

In [None]:
# vectorize strings
custom_strings_bow = vect_bow.transform(custom_strings)

# predict strings
custom_strings_predict = lr.predict(custom_strings_bow)

for i, str in enumerate(custom_strings):
    print(i, str, '-->', custom_strings_predict[i])

## Predict: fastText (preprocessed) + XGBoost

    clean custom_strings
    preprocess custom_strings
    vectorize custom_strings with fastText
    predict custom_strings with XGBoost
    output results

In [None]:
def clean(s):

    # remove HTML anchor tags
    regex = r'<a .*?>|</a>' # *? for non-greedy repetition
    s = re.sub(regex, '', s, flags=re.IGNORECASE)

    # remove URLs
    regex = r'https?://\S+'
    s = re.sub(regex, '', s, flags=re.IGNORECASE)

    # remove newlines (\n), carriage returns (\r), unicode line separators (U+2028)
    regex = r'[\n\r\u2028]'
    s = re.sub(regex, ' ', s, flags=re.IGNORECASE)

    # remove numbers and replace with _number_
    regex = r'\d+'
    s = re.sub(regex, '_number_', s, flags=re.IGNORECASE)

    # "unmask" morst frequent swearwords, insults etc. (e.g. f*ck, cr@p)
    match_list = '(?i)f*ck, (?i)sh*t, (?i)s**t, (?i)f***, (?i)p***y, (?i)b*tch, (?i)f**k, (?i)p*ssy, (?i)p****, (?i)s***, (?i)a**, (?i)h*ll, (?i)h***, (?i)sh*t, (?i)pu**y, (?i)sh**, (?i)cr*p, (?i)@ss, (?i)cr@p, (?i)b@lls, (?i)f@ck, (?i)waaay, (?i)waaaay, (?i)riiiight, (?i)soo+, (?i)stooooopid, (?i)huu+ge, (?i)yuu+ge, (?i)suu+re'\
        .replace('*', r'\*').split(', ')
    replace_list = 'fuck, shit, shit, fuck, pussy, bitch, fuck, pussy, pussy, shit, ass, hell, hell, shit, pussy, shit, crap, ass, crap, balls, fuck, way, way, right, so, stupid, huge, huge, sure'\
        .split(', ')
    for match, repl in zip(match_list, replace_list):
        re.sub(match, repl, s, flags=re.IGNORECASE)

    ### Remove multiple spaces
    regex = r' {2,}'
    s = re.sub(regex, ' ', s, flags=re.IGNORECASE)

    return s

In [None]:
custom_strings_clean = custom_strings.map(clean)

In [None]:
# function that returns list of lemmatized tokens with stop words and
# punctuation marks removed
def preprocess(s):
    doc = nlp(s) # tokenize

    final_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
           continue # skip punctuation marks and stop words
        final_tokens.append(token.lemma_) # lemmatize token

    return " ".join(final_tokens) # convert list to space-separated string

In [None]:
nlp = spacy.load('en_core_web_sm')
custom_strings_pp = custom_strings_clean.map(preprocess)
print(custom_strings, custom_strings_clean, custom_strings_pp)

In [None]:
# parameters for model
params = {'random_state': 42,
          'n_jobs': -1}

# load model with parameters
xgb = XGBClassifier(**params)
xgb.fit(corp_ft, target)


