# Imports

In [2]:
# Aesthetics
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# General
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import numpy as np
import os
import re
import random
import gc
import glob
pd.set_option('display.max_columns', None)
np.seterr(divide='ignore', invalid='ignore')
gc.enable()

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR
# NLP
from transformers import AutoTokenizer, AutoModel

# Random Seed Initialize
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    '''
    Seeds every random number generator this notebook relies on so that
    repeated runs produce the same numbers.

    seed - Integer seed applied to Python's `random`, NumPy and PyTorch
           (CPU and all CUDA devices). Defaults to RANDOM_SEED.
    '''
    # Exported for any child processes; the running interpreter has already
    # fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one. Silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels at run time, which
    # can differ between runs; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything()

# Device Optimization
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

Using device: cuda


In [3]:
# Competition data and the directory holding the fine-tuned checkpoints.
data_dir = '../input/jigsaw-toxic-severity-rating'
models_dir = '../input/roberta-test'
# This notebook only scores comments, so the single CSV it reads is the test set.
test_file_path = os.path.join(data_dir, 'comments_to_score.csv')
print(f'Test file: {test_file_path}')

Train file: ../input/jigsaw-toxic-severity-rating/comments_to_score.csv


In [4]:
# Load the comments that have to be scored.
test_df = pd.read_csv(filepath_or_buffer=test_file_path)

# Text Cleaning

In [5]:
def text_cleaning(text):
    '''
    Reduces a raw comment to plain space-separated alphanumeric text.

    The steps run in this order:
    1. Strip embedded links (http://, https:// and www. forms)
    2. Parse with BeautifulSoup (lxml) and keep only the text, dropping HTML tags
    3. Delete emojis and other symbols in the listed Unicode ranges
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces and trim both ends

    text - Text piece to be cleaned.

    Returns the cleaned string.
    '''
    # 1. Links
    url_re = re.compile(r'https?://\S+|www\.\S+')
    without_links = url_re.sub(r'', text)

    # 2. HTML markup
    without_html = BeautifulSoup(without_links, 'lxml').get_text()

    # 3. Emojis: matches are deleted outright, not replaced by a space
    emoji_re = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
    without_emoji = emoji_re.sub(r'', without_html)

    # 4. Anything that is not a letter or digit becomes a single space
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # 5. Squeeze repeated spaces, then drop leading/trailing whitespace
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

In [6]:
# Register progress_apply on pandas objects so the cleaning pass shows a progress bar.
tqdm.pandas()
cleaned_text = test_df['text'].progress_apply(text_cleaning)
test_df['text'] = cleaned_text

  0%|          | 0/7537 [00:00<?, ?it/s]

In [7]:
# Eyeball a handful of cleaned comments.
test_df.sample(n=10)

Unnamed: 0,comment_id,text
601,40508854,silly little boy my ass
2031,108591632,Is that so Than why so many people questiong y...
1928,101910871,Wow Thanks You are SO amazing I am in awe of y...
468,35445625,Send this to User Bumpusmills1 Pass this infor...
6351,390340674,that is your opinion And what is your proof th...
748,46347755,man its all about captain morgans spiced rum s...
5296,308431548,Important exceptions include declined unblock ...
1393,72839951,Like Sheen said himself on Jimmy Kimmel s show...
251,23525772,by the vandal Pavel Vozenilek
5519,323602656,Hello Nawlins Why do you like to deflower prep...


# CFG

In [8]:
# Single place for everything a reader might want to tune.
params = dict(
    device=device,                       # torch.device chosen in the setup cell
    debug=False,                         # True -> run on a small subsample
    checkpoint='../input/roberta-base',  # backbone + tokenizer directory
    output_logits=768,                   # width fed into LayerNorm / first Linear
    max_len=32,                          # tokens per comment after padding/truncation
    batch_size=16,
    dropout=0.2,
    num_workers=2,                       # DataLoader worker processes
)

In [9]:
if params['debug']:
    # Only test_df exists in this inference notebook, so the debug subsample
    # is taken from it; the index is reset to keep rows positionally aligned.
    test_df = test_df.sample(frac=0.01).reset_index(drop=True)
    print('Reduced test Data Size for Debugging purposes')

# Dataset

In [10]:
class BERTDataset(Dataset):
    '''
    Map-style dataset that tokenizes one comment per item.

    text       - Indexable collection of raw strings (the notebook passes a
                 NumPy array); item i is read as text[i].
    max_len    - Length every item is padded or truncated to.
    checkpoint - Name or path handed to AutoTokenizer.from_pretrained.
    '''

    def __init__(self, text, max_len=params['max_len'], checkpoint=params['checkpoint']):
        self.text = text
        self.max_len = max_len
        self.checkpoint = checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.num_examples = len(self.text)

    def __len__(self):
        '''Number of comments in the dataset.'''
        return self.num_examples

    def __getitem__(self, idx):
        '''
        Tokenizes comment `idx` and returns a dict of three long tensors:
        'ids', 'mask' and 'token_type_ids', each max_len entries long.
        '''
        # str() guards against non-string entries (e.g. NaN) reaching the tokenizer.
        text = str(self.text[idx])

        tokenized_text = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        ids = tokenized_text['input_ids']
        mask = tokenized_text['attention_mask']
        token_type_ids = tokenized_text['token_type_ids']

        return {'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)}

# NLP Model

In [11]:
class ToxicityModel(nn.Module):
    '''
    Transformer encoder followed by a small regression head that emits one
    score per comment.

    checkpoint - Name or path handed to AutoModel.from_pretrained.
    params     - Config dict; reads 'output_logits' (encoder output width)
                 and 'dropout'.
    '''

    def __init__(self, checkpoint=params['checkpoint'], params=params):
        super().__init__()
        hidden_size = params['output_logits']
        drop_prob = params['dropout']

        self.checkpoint = checkpoint
        # return_dict=False makes the encoder return a plain tuple.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(drop_prob)
        # Attribute names and layer order define the state_dict keys that the
        # saved checkpoints are matched against.
        self.dense = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Dropout(drop_prob),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, token_type_ids, attention_mask):
        '''
        Runs the encoder and the head; the final Linear layer has a single
        output unit, so the result has one value per input row.
        '''
        # Only the second element of the encoder's output tuple is used.
        _, pooled = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        normalised = self.layer_norm(pooled)
        regularised = self.dropout(normalised)
        return self.dense(regularised)

# Prediction

In [12]:
# Resolve the checkpoint list once, in a fixed order, so the loop and the
# final average always agree on how many models were used.
model_paths = sorted(glob.glob(models_dir + '/*.pth'))
if not model_paths:
    raise FileNotFoundError(f'No .pth checkpoints found in {models_dir}')

# The dataset and loader do not depend on the model, so build them (and load
# the tokenizer) a single time instead of once per checkpoint.
test_dataset = BERTDataset(
    text = test_df['text'].values
)
test_loader = DataLoader(
    test_dataset, batch_size=params['batch_size'],
    shuffle=False, num_workers=params['num_workers'],
    pin_memory=True
)

predictions_nn = None
for model_name in model_paths:
    model = ToxicityModel()
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    # NOTE: torch.load may unpickle the file; only load trusted checkpoints.
    state_dict = torch.load(model_name, map_location=params['device'])
    model.load_state_dict(state_dict)
    model = model.to(params['device'])
    model.eval()

    # Gather per-batch outputs in a list and stack once; stacking inside the
    # loop would re-copy everything accumulated so far on every batch.
    batch_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting. '):
            ids = batch['ids'].to(params['device'])
            mask = batch['mask'].to(params['device'])
            token_type_ids = batch['token_type_ids'].to(params['device'])
            predictions = model(ids, token_type_ids, mask).to('cpu').numpy()
            batch_preds.append(predictions)
    temp_preds = np.vstack(batch_preds)

    # Running sum across models; divided by the model count after the loop.
    if predictions_nn is None:
        predictions_nn = temp_preds
    else:
        predictions_nn += temp_preds

    # Release this model before the next one is instantiated.
    del model, state_dict
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

predictions_nn /= len(model_paths)


Predicting. :   0%|          | 0/472 [00:00<?, ?it/s]

Predicting. :   0%|          | 0/472 [00:00<?, ?it/s]

# Submission

In [13]:
# The averaged predictions are stacked (n_rows, 1) model outputs; flatten them
# explicitly rather than relying on pandas to accept a 2-D array as a column.
scores = np.asarray(predictions_nn).reshape(-1)
assert len(scores) == len(test_df), 'expected exactly one prediction per comment'

sub_df = pd.DataFrame()
sub_df['comment_id'] = test_df['comment_id']
sub_df['score'] = scores
# Convert raw scores to ranks; method='first' breaks ties by row order so
# every comment gets a distinct rank.
sub_df['score'] = sub_df['score'].rank(method='first')

In [14]:
# Quick look at the first rows of the submission.
sub_df.head(n=5)

Unnamed: 0,comment_id,score
0,114890,990.0
1,732895,282.0
2,1139051,1942.0
3,1434512,837.0
4,2084821,5078.0


In [15]:
# Write the submission without the DataFrame index column.
sub_df.to_csv(path_or_buf='submission.csv', index=False)