# Import libraries

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
from collections import defaultdict

import scipy
from scipy import sparse
import scipy.optimize as optimize
from scipy.stats import rankdata

from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

from IPython.display import display
from pprint import pprint
import nltk
from nltk.tokenize import word_tokenize
import time
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

#pd.options.display.max_colwidth=300
#pd.options.display.max_columns = 100

In [2]:
from transformers import PreTrainedTokenizerFast, BertTokenizer, BertModel

In [3]:
import torch

In [4]:
from torch import nn

In [21]:
import torch.optim as optim


# Import Dataset

In [5]:
# Read the three CSV inputs from the working directory: the labelled training
# comments, the validation pairs, and the comments that have to be scored.
training_data, validation_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "validation_data.csv", "comments_to_score.csv")
)


In [6]:
# Per-label weights. Every label column is multiplied by its weight and the
# weighted labels are then added up row-wise into one target, `sum_score`.
scales = {
    'obscene': 0.16,
    'toxic': 0.32,
    'insult': 0.64,
    'threat': 1.5,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

# NOTE: the label columns are overwritten with their weighted values, so
# running this cell a second time applies the weights twice.
for category, weight in scales.items():
    training_data[category] = training_data[category].mul(weight)

# The label slice 'toxic' .. 'identity_hate' relies on the label columns being
# adjacent in the frame.
training_data['sum_score'] = training_data.loc[:, 'toxic':'identity_hate'].sum(axis=1)

training_data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum_score
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Give the comments-to-score frame the same text column name as the training
# frame ('text' -> 'comment_text'); rename() returns a new frame by default.
testing_data = testing_data.rename(columns={'text': 'comment_text'})


In [8]:
# Pretrained 'bert-base-cased' tokenizer. Dataset.__init__ reads this
# module-level name when it encodes the comments.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


In [34]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of (tokenized comment, regression target) pairs.

    Every comment in ``df['comment_text']`` is tokenized once, up front, and
    the matching ``df['sum_score']`` value is kept as its target.
    """

    def __init__(self, df, tokenizer=None, max_length=512):
        """Tokenize all comments of ``df`` and store their scores.

        Args:
            df: DataFrame with a ``comment_text`` and a ``sum_score`` column.
            tokenizer: callable invoked as ``tokenizer(text, padding=...,
                max_length=..., truncation=True, return_tensors="pt")``.
                When omitted, the module-level ``tokenizer`` is used, which is
                what the original implementation always did.
            max_length: length every comment is padded / truncated to
                (512 by default, as before).

        Raises:
            NameError: if no tokenizer is passed and no module-level
                ``tokenizer`` has been defined.
        """
        if tokenizer is None:
            # The parameter shadows the global of the same name, so look the
            # notebook-level tokenizer up explicitly.
            try:
                tokenizer = globals()["tokenizer"]
            except KeyError:
                raise NameError("name 'tokenizer' is not defined") from None

        self.score = list(df['sum_score'])
        self.texts = [
            tokenizer(text,
                      padding='max_length', max_length=max_length, truncation=True,
                      return_tensors="pt")
            for text in df['comment_text']
        ]

    def classes(self):
        """Return the list of all target scores."""
        return self.score

    def __len__(self):
        """Number of samples."""
        return len(self.score)

    def get_score(self, idx):
        """Return the target of sample ``idx`` as a 0-d NumPy array."""
        return np.array(self.score[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored for sample ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, score)`` for sample ``idx``."""
        return self.get_batch_texts(idx), self.get_score(idx)

In [35]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a single-output head.

    Despite the name, the head is a regressor: dropout, a linear layer with
    one output, then ReLU, so each prediction is a single non-negative value.
    """

    def __init__(self, dropout=0.5, pretrained_name='bert-base-cased'):
        """Build the encoder and the head.

        Args:
            dropout: dropout probability applied to BERT's pooled output.
            pretrained_name: checkpoint passed to ``BertModel.from_pretrained``
                (defaults to the previously hard-coded 'bert-base-cased').
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than a literal
        # 768, so other checkpoints work too (bert-base-cased reports 768).
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.relu = nn.ReLU()
        # Casts every parameter to float64.
        # NOTE(review): presumably done so predictions match the float64
        # targets produced by Dataset; float64 BERT is costly — confirm.
        self.double()

    def forward(self, input_id, mask):
        """Return one prediction per input row.

        Args:
            input_id: token ids, passed to BERT as ``input_ids``.
            mask: attention mask, passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only the second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        head_input = self.dropout(pooled_output)
        return self.relu(self.linear(head_input))

In [11]:
np.random.seed(112)

# Shuffle once with a fixed random_state, then cut 80% / 10% / 10% by position.
# Positional slicing replaces np.split(DataFrame, ...), which relies on
# DataFrame.swapaxes (deprecated in recent pandas) and only emitted a warning
# that this notebook silences.
model_frame = training_data[['comment_text', 'sum_score']].sample(frac=1, random_state=42)
train_end = int(.8 * len(model_frame))
val_end = int(.9 * len(model_frame))

df_train = model_frame.iloc[:train_end]
df_val = model_frame.iloc[train_end:val_end]
df_test = model_frame.iloc[val_end:]

print(len(df_train), len(df_val), len(df_test))

127656 15957 15958


In [36]:
# Instantiate the BERT-based regressor with its default arguments; this loads
# the pretrained encoder weights (see the log message recorded below).
model = BertClassifier()


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
def _forward_batch(model, batch_input, batch_score, device):
    """Run one batch through ``model`` and return flat (predictions, targets)."""
    # Dataset stores each comment as returned by the tokenizer with
    # return_tensors="pt"; the original code squeezed dim 1 of input_ids only.
    # Squeezing the mask the same way hands BERT a matching 2-D mask
    # (squeeze(1) is a no-op if that dimension is not of size 1).
    mask = batch_input['attention_mask'].squeeze(1).to(device)
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    # Flatten both sides: comparing an (N, 1) output with an (N,) target would
    # silently broadcast to (N, N) inside the loss.
    predictions = model(input_id, mask).reshape(-1)
    targets = batch_score.to(device=device, dtype=predictions.dtype).reshape(-1)
    return predictions, targets


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` with MSE loss and print per-epoch metrics.

    Args:
        model: module called as ``model(input_id, mask)`` that returns one
            prediction per sample.
        train_data: DataFrame accepted by ``Dataset`` (training rows).
        val_data: DataFrame accepted by ``Dataset`` (validation rows).
        learning_rate: learning rate passed to Adam.
        epochs: number of passes over ``train_data``.

    Returns:
        None. One line per epoch is printed with the per-sample mean squared
        error ("Loss") and mean absolute error ("MAE") on both splits. MAE
        replaces the former "accuracy", which took an argmax over a single
        output column and therefore carried no information.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Guard the averages against empty splits.
    n_train = max(len(train_set), 1)
    n_val = max(len(val_set), 1)

    for epoch_num in range(epochs):

        # Enable dropout for the optimisation pass.
        model.train()
        train_sq_err = 0.0
        train_abs_err = 0.0

        for train_input, train_score in tqdm(train_dataloader):
            predictions, targets = _forward_batch(model, train_input, train_score, device)
            batch_loss = criterion(predictions, targets)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # the epoch total can be averaged over samples.
            train_sq_err += batch_loss.item() * targets.numel()
            train_abs_err += (predictions.detach() - targets).abs().sum().item()

        # Disable dropout so validation numbers are deterministic.
        model.eval()
        val_sq_err = 0.0
        val_abs_err = 0.0

        with torch.no_grad():
            for val_input, val_score in val_dataloader:
                predictions, targets = _forward_batch(model, val_input, val_score, device)
                val_sq_err += criterion(predictions, targets).item() * targets.numel()
                val_abs_err += (predictions - targets).abs().sum().item()

        print(
            f'Epochs: {epoch_num + 1} '
            f'| Train Loss: {train_sq_err / n_train: .3f} '
            f'| Train MAE: {train_abs_err / n_train: .3f} '
            f'| Val Loss: {val_sq_err / n_val: .3f} '
            f'| Val MAE: {val_abs_err / n_val: .3f}')


In [14]:
# Hyper-parameters handed to train(): number of passes over the training
# split, and the learning rate given to the Adam optimizer.
EPOCHS = 5
LR = 1e-6


In [38]:
%%time
# Full fine-tuning run. The recorded output shows it was stopped by hand
# (KeyboardInterrupt) after 336 of 63828 batches, at roughly 13.7 s per batch.
train(model, df_train, df_val, LR, EPOCHS)

  1%|▍                                                                       | 336/63828 [1:16:55<242:17:09, 13.74s/it]


KeyboardInterrupt: 

In [None]:
def evaluate(model, test_data):
    """Print the regression error of ``model`` on ``test_data``.

    Args:
        model: module called as ``model(input_id, mask)`` that returns one
            prediction per sample. It is moved to the available device and
            left in eval mode.
        test_data: DataFrame accepted by ``Dataset``.

    Returns:
        None. Prints the per-sample mean squared and mean absolute error.
        These replace the former "accuracy", which took an argmax over a
        single output column and compared it with a tensor on another device.
    """
    test_set = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Disable dropout so the reported numbers are deterministic.
    model.eval()

    total_sq_err = 0.0
    total_abs_err = 0.0
    with torch.no_grad():
        for test_input, test_score in test_dataloader:
            # squeeze(1) drops the singleton dimension the original code
            # removed from input_ids only; it is a no-op if absent.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            # Flatten both sides and put the target on the model's device and
            # dtype before comparing.
            predictions = model(input_id, mask).reshape(-1)
            targets = test_score.to(device=device, dtype=predictions.dtype).reshape(-1)

            errors = predictions - targets
            total_sq_err += errors.pow(2).sum().item()
            total_abs_err += errors.abs().sum().item()

    # Guard the averages against an empty frame.
    n_samples = max(len(test_set), 1)
    print(f'Test MSE: {total_sq_err / n_samples: .3f} | Test MAE: {total_abs_err / n_samples: .3f}')
    
# Evaluate on df_test, the last slice produced by the shuffle/split cell.
evaluate(model, df_test)

## A different approach: character n-gram TF-IDF features with Ridge regression

In [None]:
# NOTE(review): TfidfVectorizer is not imported in any visible cell —
# presumably sklearn.feature_extraction.text.TfidfVectorizer; confirm and add
# it to the import cell.
# Character n-grams (3 to 5 characters, 'char_wb' analyzer), keeping n-grams
# that occur in at least 3 documents and in at most half of them.
tfidf_vec = TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3, 5))

# The training frame stores the comments under 'comment_text' (the column the
# BERT split uses); it has no 'text' column, so the previous lookup raised
# KeyError.
# NOTE(review): validation and test text go through text_cleaning before
# transform(), whereas the vectorizer is fitted on raw text here — confirm
# this asymmetry is intended.
X = tfidf_vec.fit_transform(training_data['comment_text'])


In [None]:
# Three ridge regressors fitted on the same TF-IDF matrix and target; only the
# regularisation strength differs (0.5, 1.0 and 2.0).
# Note: `model` is rebound here and no longer refers to the BERT network.
# NOTE(review): Ridge is not imported in any visible cell — presumably
# sklearn.linear_model.Ridge; confirm.
model, l_model, s_model = [
    Ridge(alpha=strength).fit(X, training_data['sum_score'])
    for strength in (0.5, 1., 2.)
]

In [None]:
# Clean both sides of every validation pair.
# NOTE(review): text_cleaning is not defined in any visible cell — confirm
# where it comes from.
cleaned_columns = {
    'less_toxic': 'less_toxic cleaning',
    'more_toxic': 'more_toxic cleaning',
}
for raw_column, cleaned_column in cleaned_columns.items():
    validation_data[cleaned_column] = validation_data[raw_column].apply(text_cleaning)

# Vectorise with the already-fitted TF-IDF vocabulary.
X_less_toxic = tfidf_vec.transform(validation_data['less_toxic cleaning'])
X_more_toxic = tfidf_vec.transform(validation_data['more_toxic cleaning'])

# Score each side with the alpha=0.5 ridge model.
score_columns = (
    ('less_toxic score cleaning', X_less_toxic),
    ('more_toxic score cleaning', X_more_toxic),
)
for score_column, features in score_columns:
    validation_data[score_column] = model.predict(features)

In [None]:
# An earlier cell renames the test frame's 'text' column to 'comment_text', so
# looking up 'text' here raised KeyError. Accept either name so this cell
# works whether or not the rename has been run.
test_text_column = 'comment_text' if 'comment_text' in testing_data.columns else 'text'

# NOTE(review): text_cleaning is not defined in any visible cell — confirm
# where it comes from.
testing_data['cleaning text'] = testing_data[test_text_column].apply(text_cleaning)
text_vector = tfidf_vec.transform(testing_data['cleaning text'])

# One prediction column per ridge model. The '_15' suffix does not match
# s_model's alpha (2.0); the column name is kept because later cells read it.
testing_data['score_05'] = model.predict(text_vector)
testing_data['score_10'] = l_model.predict(text_vector)
testing_data['score_15'] = s_model.predict(text_vector)


In [None]:
# Validation accuracy: share of pairs in which the 'less toxic' comment gets
# the lower score.
pair_in_order = (
    validation_data["less_toxic score cleaning"]
    < validation_data["more_toxic score cleaning"]
)
print(f'val : {pair_in_order.mean()}')

# Plain average of the three ridge predictions.
ridge_total = testing_data['score_05'] + testing_data['score_10'] + testing_data['score_15']
testing_data['score2'] = ridge_total / 3.
#testing_data[['comment_id', 'avg score']].to_csv("submission3.csv", index=False)

In [None]:
# Weighted blend of two score columns into the final 'score'.
# NOTE(review): no visible cell creates testing_data["score1"] — presumably it
# was meant to hold the BERT predictions; as written this raises KeyError.
# NOTE(review): the weights add up to 1.10, not 1 — confirm they are intended.
testing_data["score"] = .66*testing_data["score1"] + .44*testing_data["score2"] 


In [None]:
# Replace the blended score by its ordinal rank (every row gets a distinct
# rank). This overwrites 'score', so re-running the cell ranks the ranks.
score_ranks = rankdata(testing_data["score"], method='ordinal')
testing_data["score"] = score_ranks
testing_data.head()

In [None]:
# Hand-tuned multipliers applied to the ranked score on positional row ranges.
# Each entry is (start, stop, factor), with `stop` exclusive as in range().
rank_adjustments = [
    (0, 500, 1.35),
    (801, 1200, 1.45),
    (1701, 2300, 0.81),
    (2501, 2980, 0.85),
    (3001, 4000, 1.42),
    (4001, 4500, 1.45),
    (4501, 4940, 0.86),
    (5501, 5980, 0.83),
    (6001, 6500, 1.45),
    (7001, 7536, 1.42),
]

# Rows outside every range keep 0, exactly as before.
# NOTE(review): that includes the gaps between ranges (e.g. positions 500-800,
# and single rows such as 4000 and 4500), so those comments all end up at the
# bottom of the final ranking — presumably 'finalscore' was meant to start as
# a copy of 'score'; confirm.
testing_data['finalscore'] = 0.0

# One positional slice assignment per range replaces the per-row
# `testing_data['finalscore'].iloc[i] = ...` loops. That chained assignment
# only worked through pandas' copy/view caching and writes nothing under
# Copy-on-Write. Slices are clipped to the frame length, so a shorter frame
# no longer raises IndexError.
score_pos = testing_data.columns.get_loc('score')
final_pos = testing_data.columns.get_loc('finalscore')
for start, stop, factor in rank_adjustments:
    testing_data.iloc[start:stop, final_pos] = (
        testing_data.iloc[start:stop, score_pos].to_numpy() * factor
    )

In [None]:
# Replace the adjusted score by its ordinal rank (every row gets a distinct
# rank). This overwrites 'finalscore' in place.
testing_data["finalscore"] = rankdata(testing_data["finalscore"], method='ordinal')


In [None]:
# Display the final ranks in ascending order (inspection only; the result is
# not stored).
testing_data['finalscore'].sort_values()

In [None]:
# Display the two submission columns ordered by final rank (inspection only;
# the result is not stored).
testing_data[['comment_id', 'finalscore']].sort_values(by = 'finalscore')

In [None]:
# Write the submission file: comment id and final rank, without the index.
testing_data[['comment_id', 'finalscore']].to_csv('submissionfinal.csv', index=False)