# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Ridge

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

from scipy.stats import rankdata
from bs4 import BeautifulSoup
from transformers import PreTrainedTokenizerFast
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

from datasets import Dataset


# Import dataset

In [2]:
comments_train = pd.read_csv('train.csv')

comments_to_score = pd.read_csv('comments_to_score.csv')

validation_data = pd.read_csv('validation_data.csv')
validation_data.head(5)

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


# Test Preprocessing

In [3]:
%%time
stopWords = stopwords.words('english')

def text_process(text):
    text = re.compile(r'https?://\S+|www\.\S+').sub(r'', text) # 1. Removes website links
    
    soup = BeautifulSoup(text, 'lxml') # 2. Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    text = re.compile("["
                               u"\U0001F600-\U0001F64F"  # 3. emoticons
                               u"\U0001F300-\U0001F5FF"  # 4. symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # 5. transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # 6. flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE).sub(r'', text)

    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text

comments_train['cleaned text'] = comments_train['comment_text'].apply(text_process)
validation_data['cleaned less toxic'] = validation_data['less_toxic'].apply(text_process)
validation_data['cleaned more toxic'] = validation_data['more_toxic'].apply(text_process)




Wall time: 1min 46s


# EDA

# define a function to create a wordcloud
def word_cloud(data, title, ax, fig):
    wordcloud = WordCloud(background_color = 'white', stopwords = set(stopWords), scale = 5).generate(' '.join(data))
    ax.axis('off')
    ax.set_title(title, fontsize = 20)
    fig.subplots_adjust(bottom=0.1,top=0.9, wspace=0.1, hspace=0.2)
    ax.imshow(wordcloud)

fig, ax = plt.subplots(3,2, figsize=(20,15))

word_cloud(comments_train[comments_train['toxic']==1]['cleaned text'], 'Word Cloud of toxic comments', ax[0,0], fig)
word_cloud(comments_train[comments_train['severe_toxic']==1]['cleaned text'], 'Word Cloud of severe_toxic comments', ax[0,1], fig)
word_cloud(comments_train[comments_train['obscene']==1]['cleaned text'], 'Word Cloud of obscene comments', ax[1,0], fig)
word_cloud(comments_train[comments_train['threat']==1]['cleaned text'], 'Word Cloud of threat comments', ax[1,1], fig)
word_cloud(comments_train[comments_train['insult']==1]['cleaned text'], 'Word Cloud of insult comments', ax[2,0], fig)
word_cloud(comments_train[comments_train['identity_hate']==1]['cleaned text'], 'Word Cloud of identity_hate comments', ax[2,1], fig)


# Train Test split and Parameter selection

In [4]:
scales = {'obscene': 0.16, 'toxic': 0.32, 'insult': 0.64, 'threat': 1.5, 'severe_toxic': 1.5, 'identity_hate': 1.5}

for category in scales:
    comments_train[category] = comments_train[category] * scales[category]

comments_train['mean_score'] = comments_train.loc[:, 'toxic':'identity_hate'].mean(axis=1)

comments_train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned text,mean_score
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.0,0.0,0.0,0.0,Explanation Why the edits made under my userna...,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.0,0.0,0.0,0.0,D aww He matches this background colour I m se...,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.0,0.0,0.0,0.0,Hey man I m really not trying to edit war It s...,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.0,0.0,0.0,0.0,More I can t make any real suggestions on impr...,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.0,0.0,0.0,0.0,You sir are my hero Any chance you remember wh...,0.0


In [17]:
raw_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
raw_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

In [20]:
trainer.vocab_size


25000

In [6]:
dataset = Dataset.from_pandas(comments_train[['comment_text']])
dataset

Dataset({
    features: ['comment_text'],
    num_rows: 159571
})

In [26]:
def get_training_corpus():
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["comment_text"]



In [24]:
get_training_corpus()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [27]:
raw_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(tokenizer_object=raw_tokenizer, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]", sep_token="[SEP]", mask_token="[MASK]")


In [28]:
tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=25000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [34]:
tokenizer.get_vocab()

{'needing': 11134,
 'extinction': 18752,
 'sod': 21737,
 'lh': 22265,
 'ent': 3127,
 '##miss': 4055,
 'dictionary': 6639,
 'hallow': 15090,
 'fam': 3560,
 '##rine': 9599,
 'sketch': 15409,
 '##∧': 1933,
 'alfred': 17160,
 'framed': 20936,
 'lately': 10051,
 'mediter': 14744,
 'engineers': 15162,
 '##sternet': 17446,
 '##nx': 18693,
 'inher': 8083,
 'poisoning': 19167,
 'amnesty': 23969,
 'restart': 17805,
 'fuckin': 7670,
 'bizarre': 10085,
 'little': 3311,
 'min': 3339,
 'challenging': 13982,
 'southeast': 16107,
 '134': 9996,
 '##さ': 2004,
 'urgent': 15149,
 '1911': 15884,
 'obsolete': 15888,
 'boards': 12700,
 'substantiated': 18459,
 '期': 1321,
 'lobby': 11242,
 '##ܘ': 2429,
 '##த': 2185,
 'synthesis': 9393,
 'captions': 13333,
 'derivative': 15639,
 'secretary': 11690,
 'globally': 20600,
 'nil': 20628,
 'delusion': 21264,
 'myriad': 23588,
 'crappy': 15336,
 'refered': 15988,
 'epip': 23388,
 'aeros': 24294,
 'andy': 8834,
 '##kkk': 13442,
 'occasional': 14290,
 'unauthorized': 2

In [35]:
labels = comments_train['mean_score']
comments = comments_train['comment_text']
tokenized_comments = tokenizer(comments.to_list())['input_ids']


In [36]:
vectorizer = TfidfVectorizer(analyzer = 'word', tokenizer = lambda doc: doc, preprocessor = lambda doc: doc, token_pattern = None)
comments_tr = vectorizer.fit_transform(tokenized_comments)
comments_tr


<159571x24341 sparse matrix of type '<class 'numpy.float64'>'
	with 8517113 stored elements in Compressed Sparse Row format>

In [37]:
regressor = Ridge(random_state=42, alpha=0.8)
regressor.fit(comments_tr, labels)


Ridge(alpha=0.8, random_state=42)

In [38]:
less_toxic_comments = tokenizer(validation_data['less_toxic'].to_list())['input_ids']
more_toxic_comments = tokenizer(validation_data['more_toxic'].to_list())['input_ids']

less_toxic = vectorizer.transform(less_toxic_comments)
more_toxic = vectorizer.transform(more_toxic_comments)

# make predictions
validation_data['less_toxic score'] = regressor.predict(less_toxic)
validation_data['more_toxic score'] = regressor.predict(more_toxic)

In [39]:
texts = tokenizer(comments_to_score['text'].to_list())['input_ids']
texts = vectorizer.transform(texts)
comments_to_score['score1'] = regressor.predict(texts)


In [40]:
comments_to_score["score"] = rankdata(comments_to_score["score1"], method='ordinal')


In [41]:
comments_to_score.head(5)

Unnamed: 0,comment_id,text,score1,score
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",0.001075,874
1,732895,"Looks like be have an abuser , can you please ...",0.042142,3789
2,1139051,I confess to having complete (and apparently b...,0.023576,2696
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",0.002531,994
4,2084821,It is not just you. This is a laundry list of ...,0.04563,3961


In [42]:
comments_to_score[['comment_id', 'score']].to_csv('submissionBERT.csv', index=False)

# DONE!