In [2]:
import sys
!{sys.executable} -m pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp36-cp36m-manylinux2010_x86_64.whl (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 70 kB/s  eta 0:00:01
[?25hCollecting scipy>=0.19.1
  Downloading scipy-1.5.4-cp36-cp36m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 59.7 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1309 sha256=73a2fa544759c4a0e8920f69d39b2f4fa73d2a53b34f762163e4f182b9f8f326
  Stored in directory: /home/ml-05/.cache/pip/wheels/23/9d/42/5ec745cbbb17517000a53cecc49d6a865450d1f5cb16dc8a9c
Successfully built sklearn
Installing collected packages: threadpoolctl, scipy, scikit-learn, sklearn
Successfully in

In [7]:
import re                                  
import string  
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import BertModel, BertConfig


from utils import clean_text, preprocessing_for_bert, initialize_model, set_seed, train

In [6]:
# Lift the column-width limit so long comment texts are displayed in full.
pd.options.display.max_colwidth = None

In [4]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when CUDA is available: get_device_name(0) raises on
# a CPU-only host, which would break the cell despite the CPU fallback above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla P100-PCIE-16GB'

# Get Data

In [4]:
# Load the training table and report its (rows, columns) shape.
train_df = pd.read_csv(filepath_or_buffer="Data/train.csv")
print(f'Shape: {train_df.shape}')

Shape: (159571, 8)


## EDA

### Example of comments and labels

In [8]:
# First five rows (head() defaults to n=5).
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


### Example of toxic comments

In [8]:
# First five rows whose `toxic` label equals 1.
train_df.loc[train_df['toxic'].eq(1)].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,"Hey... what is it..\n@ | talk .\nWhat is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?\n\nAsk Sityush to clean up his behavior than issue me nonsensical warnings...",1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming back! Tosser.",1,0,0,0,0,0
42,001810bf8c45bf5f,"You are gay or antisemmitian? \n\nArchangel WHite Tiger\n\nMeow! Greetingshhh!\n\nUh, there are two ways, why you do erased my comment about WW2, that holocaust was brutally slaying of Jews and not gays/Gypsys/Slavs/anyone...\n\n1 - If you are anti-semitian, than shave your head bald and go to the skinhead meetings!\n\n2 - If you doubt words of the Bible, that homosexuality is a deadly sin, make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals!\n\n3 - First and last warning, you fucking gay - I won't appreciate if any more nazi shwain would write in my page! I don't wish to talk to you anymore!\n\nBeware of the Dark Side!",1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


### Labels' proportions

In [9]:
# Per-column share of each label value, for every column from `toxic` onward.
train_df.loc[:, 'toxic':].apply(pd.Series.value_counts, normalize=True)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.904156,0.990004,0.947052,0.997004,0.950636,0.991195
1,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805


### Data cleaning

In [9]:
# Run the project's clean_text helper (imported from utils) over every comment.
train_df['comment_text'] = train_df['comment_text'].apply(clean_text)

In [10]:
# Same toxic-row preview as before, now showing the cleaned text.
train_df.loc[train_df['toxic'].eq(1)].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0
12,0005c987bdfc9d4b,hey what is it talk what is it an exclusive group of some wp talibanswho are good at destroying selfappointed purist who gang up any one who asks them questions abt their antisocial and destructive noncontribution at wp ask sityush to clean up his behavior than issue me nonsensical warnings,1,0,0,0,0,0
16,0007e25b2121310b,bye do not look come or think of comming back tosser,1,0,0,0,0,0
42,001810bf8c45bf5f,you are gay or antisemmitian archangel white tiger meow greetingshhh uh there are two ways why you do erased my comment about that holocaust was brutally slaying of jews and not gaysgypsysslavsanyone if you are antisemitian than shave your head bald and go to the skinhead meetings if you doubt words of the bible that homosexuality is a deadly sin make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals first and last warning you fucking gay i wo not appreciate if any more nazi shwain would write in my page i do not wish to talk to you anymore beware of the dark side,1,0,1,0,1,1
43,00190820581d90ce,fuck your filthy mother in the ass dry,1,0,1,0,1,0


### Keep only comments with fewer than 150 words

In [13]:
# Number of whitespace-separated tokens in each comment.
text_word_counts = train_df['comment_text'].str.split().str.len()

In [14]:
n_max_words = 150

# Fraction of comments whose word count is strictly below the limit.
share_kept = (text_word_counts < n_max_words).mean()
print(f'% of observations left: {share_kept}')

% of observations left: 0.899981826271691


In [15]:
# Keep only the rows whose comment has fewer than n_max_words words.
train_df = train_df.loc[text_word_counts < n_max_words]

In [16]:
# Label shares again, computed on the length-filtered frame.
train_df.loc[:, 'toxic':].apply(pd.Series.value_counts, normalize=True)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.89982,0.989973,0.944454,0.996825,0.94795,0.990815
1,0.10018,0.010027,0.055546,0.003175,0.05205,0.009185


# Modeling

## Split data

In [16]:
# Inputs are the comment texts; targets are every column from `toxic` onward.
X = train_df.loc[:, 'comment_text']
y = train_df.loc[:, 'toxic':]

# Shuffled 90/10 train/validation split with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [17]:
# Label shares in the training targets.
y_train.apply(pd.Series.value_counts, normalize=True)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.899883,0.989926,0.944425,0.996851,0.947899,0.990777
1,0.100117,0.010074,0.055575,0.003149,0.052101,0.009223


In [18]:
# Label shares in the validation targets.
y_val.apply(pd.Series.value_counts, normalize=True)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.899248,0.990391,0.944715,0.996588,0.948406,0.991157
1,0.100752,0.009609,0.055285,0.003412,0.051594,0.008843


In [19]:
# Swap the label frames for their underlying NumPy arrays.
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

## Tokenize data

In [20]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
# Encode both splits with the same tokenizer; each call's result is unpacked
# into an (inputs, masks) pair.
train_encoded = preprocessing_for_bert(X_train, tokenizer)
val_encoded = preprocessing_for_bert(X_val, tokenizer)

train_inputs, train_masks = train_encoded
val_inputs, val_masks = val_encoded

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


  0%|          | 0/129249 [00:00<?, ?it/s]

  0%|          | 0/14362 [00:00<?, ?it/s]

## Use DataLoader

In [22]:
batch_size = 16

# Training split: batches are drawn through a RandomSampler.
train_labels = torch.tensor(y_train)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation split: batches are visited through a SequentialSampler.
# NOTE(review): the original cell carried a "Change to random" reminder on the
# sampler line; the sequential sampler is what the code actually uses.
val_labels = torch.tensor(y_val)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

In [11]:
# Number of batches the training DataLoader yields per pass; the training cell
# passes this value to initialize_model.
n_batches = len(train_dataloader)
print(f'Number of batches in train data: {n_batches}')

Number of batches in train data: 8079


## Model Training

In [28]:
set_seed(42)

# One epoch count shared by both calls below, so the value handed to
# initialize_model and the value handed to train cannot drift apart.
n_epochs = 1

# `nn` is never imported in this notebook, so the bare name fails on a fresh
# kernel; reach the loss through the already-imported `torch` package instead.
loss_fn = torch.nn.BCEWithLogitsLoss()

bert_classifier, optimizer, scheduler = initialize_model(device=device, n_batches=n_batches, epochs=n_epochs)

train(bert_classifier, train_dataloader, optimizer=optimizer, scheduler=scheduler, loss_fn=loss_fn, device=device,
      val_dataloader=val_dataloader, epochs=n_epochs, evaluation=True)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Start training...



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8079 [00:00<?, ?it/s]

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  | Mean ROCAUC |  Elapsed 
----------------------------------------------------------------------
   1    |   10    |   0.455363   |     -      |     -     |     -     |   10.57  
   1    |   20    |   0.245394   |     -      |     -     |     -     |   8.64   
   1    |   30    |   0.210780   |     -      |     -     |     -     |   8.63   
   1    |   40    |   0.201738   |     -      |     -     |     -     |   8.68   
   1    |   50    |   0.142386   |     -      |     -     |     -     |   8.67   
   1    |   60    |   0.185346   |     -      |     -     |     -     |   8.63   
   1    |   70    |   0.178938   |     -      |     -     |     -     |   8.63   
   1    |   80    |   0.125901   |     -      |     -     |     -     |   8.64   
   1    |   90    |   0.144103   |     -      |     -     |     -     |   8.65   
   1    |   100   |   0.111561   |     -      |     -     |     -     |   8.65   
   1    |   110   |   0.1

## Save model

In [29]:
# Move the trained model to the CPU before its weights are written out.
bert_classifier = bert_classifier.cpu()

In [30]:
# Persist only the model's state dict (not the whole module object).
torch.save(obj=bert_classifier.state_dict(), f='bert_1_epoch.pth')