In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from gensim.models import fasttext, word2vec

from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

from IPython.display import display
from sklearn.metrics import precision_score, recall_score, classification_report, f1_score, roc_auc_score

import seaborn as sns
from gensim.models import Word2Vec
from gensim.corpora.dictionary import Dictionary

import pickle

import gc
from typing import List

In [2]:
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [3]:
train = pd.read_csv('data/jigsaw-unintended-bias-train.csv', error_bad_lines=False)
train

Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.0,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,Is this something I'll be able to install on m...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.021277,0.872340,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902189,7194635,He should lose his job for promoting mis-infor...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,,,...,333226,approved,0,0,0,0,0,0.0,0,4
1902190,7194636,"""Thinning project is meant to lower fire dange...",0.166667,0.000000,0.0,0.166667,0.166667,0.0,,,...,380644,approved,0,0,0,1,0,0.0,0,6
1902191,7194637,I hope you millennials are happy that you put ...,0.400000,0.000000,0.0,0.100000,0.400000,0.0,,,...,163903,rejected,0,0,0,0,0,0.0,0,10
1902192,7194638,I'm thinking Kellyanne Conway (a.k.a. The Trum...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,,,...,159423,approved,0,0,0,2,0,0.0,0,5


In [4]:
train['target'] = 0
train.loc[train['toxic'] > 0.5, 'target'] = 1

In [5]:
train['target'].value_counts()

0    1789968
1     112226
Name: target, dtype: int64

In [6]:
target_col = 'target'

# Preprocess

In [7]:
punct_chars =  '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

def add_punc_tokens(series):

    r=punct_chars
    #adding an escape character to them
    to_replace=[re.escape(i) for i in r]

    #adding a space between and after them
    replace_with=[f' {i} ' for i in r]
    # We're converting the sentence to a dataframe so we can easily replace all
    #punctuation marks with the function "replace" of pandas
    return series.replace(to_replace,replace_with,regex=True)

In [8]:
def display_random(count=10):
    for text_id in np.random.choice(train.index, count):
        print(train.loc[text_id, 'comment_text'])

In [9]:
def n_upper_chars(string):
    return sum(list(map(str.isupper, string)))


In [10]:
train['comment_text'] = train['comment_text'].str.replace('\n', '')
train['comment_text'] = add_punc_tokens(train['comment_text'])


tokenizer = nltk.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def text_preprocess(sentence: str) -> List[str]:
    sentence.replace('\n', ' ')
    
    sentence = re.sub(r'\W', ' ', str(sentence))

    # remove all single characters
    sentence = re.sub(r'\s+[a-zA-Z]\s+', ' ', sentence)

    # Remove single characters from the start
    sentence = re.sub(r'\^[a-zA-Z]\s+', ' ', sentence)

    
    tokens = tokenizer.tokenize(sentence)
    lemmatized = list(map(lemmatizer.lemmatize, tokens))
    
    
    
    return lemmatized

# features = pd.DataFrame()
# features['count_upper_letters'] = train['comment_text'].apply(n_upper_chars)

# for punct in punct_chars:
#     features[f'count_"{punct}"'] = train['comment_text'].apply(lambda string: string.count(punct))


# train_tokens = train['comment_text'].str.lower().apply(text_preprocess)

# Word2Vec

In [11]:
from sklearn.metrics import roc_auc_score

In [12]:
y_data = train[target_col]

In [13]:
def embed_sentence(sentence: List[str], model, dim=100):
    embeds = []
    
    for token in sentence:
        if token in model.vocab:
            embeds.append(model.get_vector(token))
    if len(embeds) > 0:
        return np.mean(embeds, axis=0)
    else:
        return np.zeros(dim)
    


def report(y_true, y_pred, y_pred_proba, average='micro'):
    precs = precision_score(y_true, y_pred)
    recalls = recall_score(y_true, y_pred)
    f_scores = f1_score(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred_proba)
    
    print(f'Mean precision score: {precs:.2}')
    print(f'Mean recall score: {recalls:.2}')
    print(f'Mean f-score: {f_scores:.2}')
    print(f'ROC {roc_score}')
    return precs, recalls, f_scores


def train_report(features, y_data):
    x_train, x_val, y_train, y_val \
        = train_test_split(features, y_data, train_size=0.8)
    
    base_model = LogisticRegressionCV(class_weight='balanced', n_jobs=-1)

    base_model.fit(x_train, y_train)
    
    y_pred = base_model.predict(x_val)
    y_pred_proba = base_model.predict_proba(x_val)
    report(y_val.values, y_pred, y_pred_proba[:, 1])
    return base_model, y_val, y_pred, y_pred_proba

In [14]:
model = Word2Vec()
model.build_vocab(sentences=train_tokens)
model.train(sentences=train_tokens,
            total_examples=model.corpus_count, epochs=5)


In [16]:
w2v_embed = train_tokens.apply(
    lambda sentence: embed_sentence(sentence, model.wv)
)

w2v_features = np.hstack([
    features.values, np.vstack(w2v_embed.values)
])


In [20]:
logreg_model, test_labels, predicted_labels, y_pred_proba = train_report(w2v_features, y_data)



Mean precision score: 0.18
Mean recall score: 0.75
Mean f-score: 0.29
ROC 0.8436522044869069


# Glove

In [17]:
import os

In [18]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'glove_twitter_27B_100d.txt'
tmp_file = "w2v_from_glove.txt"

_ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(glove_file)


glove_embed = train_tokens.apply(
    lambda sentence: embed_sentence(sentence, glove_model)
)

glove_features = np.hstack([
    features.values, np.vstack(glove_embed.values)
])

In [24]:
logreg_model, test_labels, predicted_labels, y_pred_proba  = train_report(glove_features, y_data)



Mean precision score: 0.18
Mean recall score: 0.75
Mean f-score: 0.3
ROC 0.8495674062914913


# Fasttext


In [19]:
from gensim.models.fasttext import FastText

In [20]:
embedding_size = 60
window_size = 20
min_word = 5
down_sampling = 1e-2


In [23]:
%%time
ft_model = FastText(train_tokens,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=5)

CPU times: user 2h 45min 11s, sys: 2.49 s, total: 2h 45min 13s
Wall time: 55min 21s


In [24]:
with open('ft_model.pkl', mode='wb') as file:
    pickle.dump( ft_model, file)

In [25]:
with open('ft_model.pkl', mode='rb') as file:
    pickle.load( file)

In [26]:
ft_embed = train_tokens.apply(
    lambda sentence: embed_sentence(sentence, ft_model.wv, 60
                                   )
)

ft_features = np.hstack([
    features.values, np.vstack(ft_embed.values)
])

In [27]:
logreg_model, test_labels, predicted_labels, y_pred_proba  = train_report(ft_features, y_data)



Mean precision score: 0.18
Mean recall score: 0.76
Mean f-score: 0.29
ROC 0.8472855757477639


# BERT


In [3]:
import torch
from torch.nn.utils.rnn import pack_padded_sequence
from torch import nn
from torch.utils.data import DataLoader

import sys
import numpy as np
from tqdm import tqdm
from torch.utils.data import TensorDataset

import pickle

import warnings
warnings.filterwarnings("ignore")

In [25]:

bert_tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'bert-base-multilingual-uncased'
)
seqlen = 20


class MyIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, input_df, dataset_size=None, ):
        super(MyIterableDataset).__init__()
        self.index = 0
        if dataset_size is None:
            self.dataset_size = input_df.shape[0]
        else:
            self.dataset_size = dataset_size
        
        self.input_df=input_df
        
    def __iter__(self):
        
        while self.index < self.dataset_size:
            row = self.input_df['comment_text'].iloc[self.index]
            
            tokenized_dict = bert_tokenizer(
                [row],
                truncation=True,
                padding='max_length',
                max_length=seqlen,
                return_tensors='pt'
            )
            tokenized_dict['label'] = self.input_df['target'].iloc[self.index]
            tokenized_dict['sample_id'] = self.input_df.index[self.index]
            
            self.index += 1
            yield tokenized_dict
            
        
        
        
    



Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


In [26]:
train_text, val_text= train_test_split(
    train, train_size=0.8
)

In [27]:
bert = torch.hub.load(
            'huggingface/pytorch-transformers', 
            'modelForSequenceClassification', 
            'bert-base-multilingual-uncased', num_labels = 2,
            output_attentions = False,
            output_hidden_states = False, #
        )

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

In [29]:
learing_rate = 0.05
n_epochs = 1
batch_size = 64


train_text_ds = MyIterableDataset(train_text)
val_text_ds = MyIterableDataset(val_text)


train_data_loader = DataLoader(train_text_ds, batch_size=batch_size)
val_data_loader = DataLoader(val_text_ds, batch_size=batch_size)



In [32]:


def model_train(model, dataloader):
    pos_weight = train_text.shape[0] / train_text['target'].sum()
    neg_weight = train_text.shape[0] / (train_text['target'] == 0).sum()
    loss_function = torch.nn.CrossEntropyLoss(weight=torch.tensor( [neg_weight, pos_weight], dtype=torch.float))

    optimizer = torch.optim.SGD(model.parameters(), lr=learing_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    
    
    model.train()
    total_acc, total_count = 0, 0
    


    for idx, sample in tqdm(enumerate(dataloader)):
        text = sample['input_ids']
        label = sample['label']
        attention = sample['attention_mask']
        
        
        optimizer.zero_grad()
        predited_label = model(
            text.reshape(batch_size, -1), 
            token_type_ids=None, 
            attention_mask=attention, 
            labels=label
          ).logits
        
        loss = loss_function(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()


        if idx % 500 == 0:
            print(f'Dumping model at batch {idx}')
            with open(f'bert_{idx}.pkl', 'wb') as file:
                pickle.dump(model, file)


In [1]:
trained = False

if not trained:
    for epoch_id in tqdm(range(n_epochs)):
        train_text_ds = MyIterableDataset(train_text)
        val_text_ds = MyIterableDataset(val_text)


        train_data_loader = DataLoader(train_text_ds, batch_size=batch_size)
        model_train(bert, train_data_loader)
else:
    with open('bert_23500.pkl', 'rb') as file:
        bert = pickle.load(file)

In [2]:
validation_predictions = []
val_text_ds = MyIterableDataset(val_text)

val_data_loader = DataLoader(val_text_ds, batch_size=batch_size)

with torch.no_grad():
    for batch in tqdm(val_data_loader):
        toxic_proba = bert(
            batch['input_ids'].reshape(-1, seqlen)
        ).logits.softmax(dim=1)[:,1]

        validation_predictions.append(toxic_proba.numpy())
        

In [52]:
roc_auc_score(
    val_text['target'].values,
    np.concatenate(validation_predictions)
)

0.8460933960657369

# BPE tokenizing

In [15]:
from bpemb import BPEmb
multibpemb = BPEmb(lang="multi", vs=1000000)
def bpe_embed(x):
    return embed_sentence(multibpemb.encode(x), multibpemb.emb.wv)



Setting dim=300 for multilingual BPEmb


In [16]:
train_bpe, val_bpe, train_features, val_features, y_train, y_val = train_test_split(
    train, features, y_data, train_size=0.7
)

In [17]:
%%time

bpe_encoded = train_bpe['comment_text'].apply(lambda x: bpe_embed(x))

  after removing the cwd from sys.path.


CPU times: user 7min 42s, sys: 292 ms, total: 7min 42s
Wall time: 7min 42s


In [18]:
logreg_model, test_labels, predicted_labels, y_pred_proba = train_report(
    np.hstack([
        train_features.values, np.vstack(bpe_encoded.values)
    ]),
    y_train
)



Mean precision score: 0.16
Mean recall score: 0.72
Mean f-score: 0.26
ROC 0.8183686626918516
