In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import re
import spacy
from transformers import pipeline
import seaborn as sns
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
from torch import cuda
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import BCEWithLogitsLoss
device = 'cuda' if cuda.is_available() else 'cpu'
import nltk
from transformers import Trainer
nltk.download('words')
from nltk.corpus import words
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments
import os

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

In [3]:
english_words = set(words.words())
# romanian_words = set()

In [4]:
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [50]:
# preprocess the data for the model
def replace_unwanted_characters(df):
    df.loc[df['content'].isnull(), 'content'] = ''
    df.loc[df['title'].isnull(), 'title'] = ''
    df['title'] = df['title'].apply(lambda x: x.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș"))
    df['content'] = df['content'].apply(lambda x: x.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș"))
    return df

repeated_characters = [x * 3 for x in "AĂÂBCDEFGHÎJKLMNOPQRSȘTȚUVWXYZaăâbcdefghîjklmnopqrsștțuvwxyz,.-/';[]-!@#$%^&*()?+"]
repeated_characters.append('iiii')

profanity_list = [
    "muie",
    "laba",
    "labă",
    "pula",
    "pulă",
    "pizda",
    "pizdă",
    "ce pula",
    "ce pulă",
    "ce pizda",
    "ce pizdă",
    "caca",
    "cacat",
    "căcat",
    "pipi",
    "pisat",
    "pișat",
    "pishat",
    "rahat",
    "kkt",
    "kk",
    "plm",
    "ma fut",
    "mă fut",
    "ma cac",
    "mă cac",
    "ma pis",
    "mă pis",
    "ma pish",
    "mă pish",
    "pwla",
    "pwlă",
    "p.u.l.a.",
    "poola",
    "naiba",
    "dracu",
    "draq",
    "drecu",
    "naibii",
    "dracului",
    "drecului",
    "drqlui",
    "coaie",
    "coae",
    "sloboz",
    "lindic",
    "gaoz",
    "ochiul maro",
    "floci",
    "cur",
    "futai",
    "futare",
    "futere",
    "popou",
    "nanau",
    "pulii",
    "pulii mele",
    "coaiele",
    "coaiele mele",
    "pulile",
    "măta",
    "mata",
    "mă-tii",
    "mă-ta",
    "mă-ti",
]

english_words = set(words.words())

def normalize_text(df):

    # replace special characters
    df['content'] = df['content'].apply(lambda x: x.replace('ă', 'a'))
    df['content'] = df['content'].apply(lambda x: x.replace('â', 'a'))
    df['content'] = df['content'].apply(lambda x: x.replace('î', 'i'))
    df['content'] = df['content'].apply(lambda x: x.replace('ș', 's'))
    df['content'] = df['content'].apply(lambda x: x.replace('ț', 't'))
    df['title'] = df['title'].apply(lambda x: x.replace('ă', 'a'))
    df['title'] = df['title'].apply(lambda x: x.replace('â', 'a'))
    df['title'] = df['title'].apply(lambda x: x.replace('î', 'i'))
    df['title'] = df['title'].apply(lambda x: x.replace('ș', 's'))
    df['title'] = df['title'].apply(lambda x: x.replace('ț', 't'))
    # again with capital letters
    df['content'] = df['content'].apply(lambda x: x.replace('Ă', 'A'))
    df['content'] = df['content'].apply(lambda x: x.replace('Â', 'A'))
    df['content'] = df['content'].apply(lambda x: x.replace('Î', 'I'))
    df['content'] = df['content'].apply(lambda x: x.replace('Ș', 'S'))
    df['content'] = df['content'].apply(lambda x: x.replace('Ț', 'T'))
    df['title'] = df['title'].apply(lambda x: x.replace('Ă', 'A'))
    df['title'] = df['title'].apply(lambda x: x.replace('Â', 'A'))
    df['title'] = df['title'].apply(lambda x: x.replace('Î', 'I'))
    df['title'] = df['title'].apply(lambda x: x.replace('Ș', 'S'))
    df['title'] = df['title'].apply(lambda x: x.replace('Ț', 'T'))

    df['title'] = df['title'].apply(lambda x : re.sub(r"<.*?>", '', x ))
    df['content'] = df['content'].apply(lambda x:re.sub(r"<.*?>", '', x))

    # df['title'] = df['title'].apply(lambda x : re.sub(r"\.\.\.", '', x ))
    # df['content'] = df['content'].apply(lambda x:re.sub(r"\.\.\.", '', x))

    df['title'] = df['title'].apply(lambda x : re.sub(r"#[\w+-]+", '[HTAG]', x ))
    df['content'] = df['content'].apply(lambda x:re.sub(r"#[\w+-]+", '[HTAG]', x))

    df['title'] = df['title'].apply(lambda x : ' '.join(["[STAR]" if "*" in word else word for word in x.split()]))
    df['content'] = df['content'].apply(lambda x:' '.join(["[STAR]" if "*" in word else word for word in x.split()]))

    # df['title'] = df['title'].apply(lambda x : ' '.join(["[REP]" if any([rep in word for rep in repeated_characters]) else word for word in x.split()]))
    # df['content'] = df['content'].apply(lambda x : ' '.join(["[REP]" if any([rep in word for rep in repeated_characters]) else word for word in x.split()]))

    pattern = r'^[!?"\',“”-]*(.*?)[!?"\',“”-]*$'
    df['title'] = df['title'].apply(lambda x : ' '.join(["[PROF]" if re.sub(pattern, r'\1', word) in profanity_list else word for word in x.split()]))
    df['content'] = df['content'].apply(lambda x : ' '.join(["[PROF]" if re.sub(pattern, r'\1', word) in profanity_list else word for word in x.split()]))


    emoji_pattern = r"""
                    (?:
                      [<>]?
                      [:;=8]                     # eyes
                      [\-o\*\']?                 # optional nose
                      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
                      |
                      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
                      [\-o\*\']?                 # optional nose
                      [:;=8]                     # eyes
                      [<>]?
                      |
                      </?3                       # heart
                    )"""
    df['title'] = df['title'].apply(lambda x : re.sub(emoji_pattern, '[EMOJI]', x ))
    df['content'] = df['content'].apply(lambda x : re.sub(emoji_pattern, '[EMOJI]', x ))


    return df


def elim_cuv_scurte(df):
    df['content'] = df['content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>=2]))
    df['title'] = df['title'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>=2]))
    return df



train_df = replace_unwanted_characters(df)
train_df = normalize_text(train_df)
train_df = elim_cuv_scurte(train_df)

test_df = replace_unwanted_characters(df_test)
test_df = normalize_text(test_df)
test_df = elim_cuv_scurte(test_df)

train_df.loc[train_df['content'].isnull(), 'content'] = ''
train_df.loc[train_df['title'].isnull(), 'title'] = ''
test_df.loc[test_df['content'].isnull(), 'content'] = ''
test_df.loc[test_df['title'].isnull(), 'title'] = ''

In [129]:
train_df.to_csv('train_normalized.csv', index=False)
test_df.to_csv('test_normalized.csv', index=False)

In [16]:
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [51]:
df_shuffled = train_df.sample(frac=0.3, random_state=1000)
train_ratio = 0.8
train_size = int(train_ratio * len(df_shuffled))
df_train = df_shuffled[:train_size]
df_val = df_shuffled[train_size:]

In [52]:
df_train.loc[train_df['content'].isnull(), 'content'] = ''
df_train.loc[train_df['title'].isnull(), 'title'] = ''
df_val.loc[train_df['content'].isnull(), 'content'] = ''
df_val.loc[train_df['title'].isnull(), 'title'] = ''

In [53]:
# fill the missing values
df_train.loc[df_train['content'].isnull(), 'content'] = ''
df_train.loc[df_train['title'].isnull(), 'title'] = ''

In [59]:
df_val.loc[train_df['content'].isnull(), 'content'] = ''
df_val.loc[train_df['title'].isnull(), 'title'] = ''

In [64]:
# find where the missing values are
df_val.loc[df_val['content'].isnull(), 'content']

Series([], Name: content, dtype: object)

In [66]:
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
added_toks = tokenizer.add_special_tokens({'additional_special_tokens': ['[STAR]', '[HTAG]', '[REP]', '[PROF]', '[EMOJI]']})

In [67]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.dataframe = dataframe.reset_index()
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):

        title = self.dataframe.loc[index]['title']
        content = self.dataframe.loc[index]['content']
        label = self.dataframe.loc[index]['class']

        title = '[CLS] ' + title + ' [SEP] ' + content

        title_tensor = self.tokenizer(title, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
        return title_tensor, label

In [68]:
class CustomRoBert(nn.Module):
    def __init__(self,):
        super(CustomRoBert, self).__init__()
        self.transformer = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
        self.transformer.resize_token_embeddings(len(tokenizer))
        # self.pooling = None # add pooling method
        self.classifier = nn.Linear(768, 1)
        
    def forward(self, batch):
        outputs  = self.transformer(input_ids = batch['input_ids'].squeeze(1), attention_mask=batch['attention_mask'].squeeze(1))
        # outputs = self.pooling(outputs)
        outputs = self.classifier(outputs[1])
        return outputs

In [69]:
training_dataset = CustomDataset(df_train, tokenizer)
validation_dataset = CustomDataset(df_val, tokenizer)

train_dataloader = DataLoader(training_dataset, sampler=RandomSampler(training_dataset),batch_size=32)
val_dataloader = DataLoader(validation_dataset, sampler=SequentialSampler(validation_dataset), batch_size=32)

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [46]:
model = CustomRoBert().to(device)

for p in model.transformer.parameters():
    p.requires_grad = False
trainable_params_transformer = [p for (n, p) in model.transformer.named_parameters() if "bias" in n]
for p in trainable_params_transformer:
    p.requires_grad = True
trainable_params = list(model.classifier.parameters())
trainable_params.extend(list(trainable_params_transformer))

print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters')

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(trainable_params, lr= 1e-5)
EPOCHS = 30

The model has 103,681 trainable parameters


In [51]:
model.load_state_dict(torch.load('model.pth'))

RuntimeError: Error(s) in loading state_dict for CustomRoBert:
	size mismatch for transformer.embeddings.word_embeddings.weight: copying a param with shape torch.Size([50006, 768]) from checkpoint, the shape in current model is torch.Size([50005, 768]).

In [43]:
from tqdm import tqdm

model.train()

for epoch in range(EPOCHS):
    running_loss = 0
    running_acc = 0
    
    for X, y in tqdm(train_dataloader):
        y = y.to(device, dtype=torch.long)
        X = {key:val.to(device) for key,val in X.items()}
        
        output = model(X)

        output.squeeze(1)
        loss = criterion(output.squeeze(1), y.float())
        running_loss += loss.item()
        
        model.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch: {epoch + 1} | Train LossL {running_loss / len(training_dataset)}")

100%|██████████| 530/530 [05:19<00:00,  1.66it/s]


Epoch: 1 | Train LossL 0.018212180059648864


100%|██████████| 530/530 [05:18<00:00,  1.66it/s]


Epoch: 2 | Train LossL 0.013586576584837206


100%|██████████| 530/530 [05:14<00:00,  1.68it/s]


Epoch: 3 | Train LossL 0.009156901599412516


100%|██████████| 530/530 [05:15<00:00,  1.68it/s]


Epoch: 4 | Train LossL 0.006248278562818641


100%|██████████| 530/530 [05:15<00:00,  1.68it/s]


Epoch: 5 | Train LossL 0.004728778695033631


100%|██████████| 530/530 [05:14<00:00,  1.68it/s]


Epoch: 6 | Train LossL 0.003954299368998472


100%|██████████| 530/530 [05:32<00:00,  1.59it/s]


Epoch: 7 | Train LossL 0.0035107603499321305


100%|██████████| 530/530 [07:47<00:00,  1.13it/s]


Epoch: 8 | Train LossL 0.003230769312638382


100%|██████████| 530/530 [07:53<00:00,  1.12it/s]


Epoch: 9 | Train LossL 0.003039705314284557


100%|██████████| 530/530 [11:08<00:00,  1.26s/it]


Epoch: 10 | Train LossL 0.0029050588830817213


100%|██████████| 530/530 [11:52<00:00,  1.34s/it]


Epoch: 11 | Train LossL 0.0028343093133624824


100%|██████████| 530/530 [09:18<00:00,  1.05s/it]


Epoch: 12 | Train LossL 0.00268412483978704


100%|██████████| 530/530 [11:12<00:00,  1.27s/it]


Epoch: 13 | Train LossL 0.002619573451374504


100%|██████████| 530/530 [07:48<00:00,  1.13it/s]


Epoch: 14 | Train LossL 0.0025488297243476048


100%|██████████| 530/530 [06:06<00:00,  1.45it/s]


Epoch: 15 | Train LossL 0.0024851629628854896


100%|██████████| 530/530 [06:14<00:00,  1.42it/s]


Epoch: 16 | Train LossL 0.0024565271420929225


100%|██████████| 530/530 [06:09<00:00,  1.43it/s]


Epoch: 17 | Train LossL 0.0023859358409866922


100%|██████████| 530/530 [05:58<00:00,  1.48it/s]


Epoch: 18 | Train LossL 0.002310341060844095


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Epoch: 19 | Train LossL 0.002262280443831183


100%|██████████| 530/530 [05:38<00:00,  1.57it/s]


Epoch: 20 | Train LossL 0.002232380019364593


100%|██████████| 530/530 [05:42<00:00,  1.55it/s]


Epoch: 21 | Train LossL 0.0021980923473253885


100%|██████████| 530/530 [06:19<00:00,  1.40it/s]


Epoch: 22 | Train LossL 0.0021420298090333596


100%|██████████| 530/530 [05:57<00:00,  1.48it/s]


Epoch: 23 | Train LossL 0.002149243449141544


100%|██████████| 530/530 [06:20<00:00,  1.39it/s]


Epoch: 24 | Train LossL 0.0020698368825219


100%|██████████| 530/530 [06:43<00:00,  1.31it/s]


Epoch: 25 | Train LossL 0.0020635582602004897


100%|██████████| 530/530 [06:43<00:00,  1.31it/s]


Epoch: 26 | Train LossL 0.0020180448989589586


100%|██████████| 530/530 [06:08<00:00,  1.44it/s]


Epoch: 27 | Train LossL 0.0019579526995024245


100%|██████████| 530/530 [05:51<00:00,  1.51it/s]


Epoch: 28 | Train LossL 0.0019459201032412947


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Epoch: 29 | Train LossL 0.0019681383459456254


100%|██████████| 530/530 [05:21<00:00,  1.65it/s]

Epoch: 30 | Train LossL 0.0019175230220308613





In [44]:
torch.save(model.state_dict(), 'model_mini.pth')

In [45]:
import gc
del model
gc.collect()
torch.cuda.empty_cache()

In [47]:
model.load_state_dict(torch.load('model_mini.pth'))

<All keys matched successfully>

In [70]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def validate(model, val_loader):
    model.eval()
    val_loss = 0

    pred_labels = []
    true_labels = []

    for X, y in tqdm(val_loader):
        y = y.to(device, dtype=torch.long)
        X = {key:val.to(device) for key,val in X.items()}

        output = model(X)

        output.squeeze(1)
        loss = criterion(output.squeeze(1), y.float())
        val_loss += loss.item()

        pred = (torch.sigmoid(output.squeeze(1)) > 0.5)
        pred_labels.extend(pred.cpu().numpy().tolist())
        true_labels.extend(y.cpu().numpy().tolist())
        
    val_loss /= len(val_loader)
    acc = accuracy_score(true_labels, pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')
    print(f"Val Loss: {val_loss} | Val Acc: {acc} | Val Precision: {precision} | Val Recall: {recall} | Val F1: {f1}")


In [71]:
validate(model, val_dataloader)

100%|██████████| 133/133 [00:39<00:00,  3.38it/s]

Val Loss: 0.06525788049710761 | Val Acc: 0.9787485242030697 | Val Precision: 0.9881305637982196 | Val Recall: 0.9786921381337252 | Val F1: 0.9833887043189369





In [72]:
def predict(model, test_loader):
    model.eval()
    pred_labels = []

    for X in tqdm(test_loader):
        X = {key:val.to(device) for key,val in X.items()}

        output = model(X)

        output.squeeze(1)
        pred = (torch.sigmoid(output.squeeze(1)) > 0.5)
        pred_labels.extend(pred.cpu().numpy().tolist())
        
    return pred_labels

In [73]:
class CustomDatasetTest(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.dataframe = dataframe.reset_index()
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):

        title = self.dataframe.loc[index]['title']

        title = title.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")

        title_tensor = self.tokenizer(title, max_length=512, padding='max_length', return_tensors='pt', truncation=True)
        return title_tensor

In [74]:
test_dataset = CustomDatasetTest(df_test, tokenizer)

In [75]:
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=32)

In [76]:
preds = predict(model, test_dataloader)

100%|██████████| 1146/1146 [05:09<00:00,  3.70it/s]


In [77]:
pred_df = pd.DataFrame({'id': df_test['id'], 'class': preds})

In [78]:
pred_df.to_csv('submission_robert_30E_full.csv', index=False)