In [None]:
! pip install keras

In [None]:
! pip install tensorflow

In [None]:
!pip install torchsummary

In [None]:
!pip install bert-serving-server

In [5]:
from collections import defaultdict
import re
import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
from torch import nn
from torch.utils.data import (
    TensorDataset, DataLoader, RandomSampler, SequentialSampler
)
from keras.preprocessing.sequence import pad_sequences
from torchsummary import summary
from transformers import BertTokenizer, BertConfig, BertTokenizer
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, BertModel, BertForSequenceClassification
from autocorrect import Speller
from collections import Counter


In [6]:
def open_train():
    """Load the labelled training messages.

    Reads the pipe-separated file ``raw_data/labels_racism.csv`` (resolved
    relative to the current working directory) and returns it as a
    ``pandas.DataFrame``.
    """
    train_path = 'raw_data/labels_racism.csv'
    return pd.read_csv(train_path, sep='|')

def open_test(test_data_type='sample'):
    """Load one of the evaluation sets.

    Args:
        test_data_type: Suffix inserted into the file name; the file read is
            ``raw_data/evaluation_<test_data_type>.csv`` (pipe-separated,
            relative to the current working directory).

    Returns:
        The file contents as a ``pandas.DataFrame``.
    """
    eval_path = 'raw_data/evaluation_{}.csv'.format(test_data_type)
    return pd.read_csv(eval_path, sep='|')

def cleaned_df(df):
    """Return a copy of ``df`` whose ``message`` column has been normalised.

    Steps, in order:

    1. lower-case the text and strip leading/trailing tab characters;
    2. drop every character that cannot be encoded as ASCII;
    3. remove newline characters;
    4. remove every token that contains a digit;
    5. replace each remaining character that is not an ASCII letter or a
       parenthesis with a single space.

    The non-letter scrub runs last on purpose: it turns digits, newlines and
    non-ASCII characters into spaces, so running it earlier would leave
    steps 2-4 with nothing to act on.

    Args:
        df: DataFrame with a string column named ``message``.

    Returns:
        A new DataFrame; the input is not modified.
    """
    field = 'message'
    df = df.copy()

    text = df[field].str.lower().str.strip("\t")
    text = text.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
    text = text.str.replace('\n', '', regex=False)
    # `regex=True` is spelled out because the pandas default for
    # `Series.str.replace` changed to literal matching in pandas 2.0.
    text = text.str.replace(r'\w*\d\w*', '', regex=True)
    # The raw-string prefix belongs outside the quotes: written as
    # "r[^A-Za-z()]" the pattern only matched an "r" followed by a non-letter.
    text = text.str.replace(r"[^A-Za-z()]", " ", regex=True)

    df[field] = text
    return df


def remove_short_words(df, short_len=3):
    """Drop every whitespace-separated word shorter than ``short_len``.

    A word is removed from a message exactly when ``len(word) < short_len``.
    The surviving words of each message are re-joined with single spaces, so
    runs of whitespace are collapsed as a side effect.

    Args:
        df: DataFrame with a string column named ``message``.
        short_len: Minimum number of characters a word needs to be kept.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()
    # Filtering by length directly gives the same result as first collecting
    # the set of short words over the whole corpus, without re-sorting the
    # vocabulary once per distinct word.
    df["message"] = df["message"].apply(
        lambda row: ' '.join(word for word in row.split() if len(word) >= short_len)
    )
    return df
 

def prepare_for_bert(df):
    """Wrap every message in BERT's sentence markers.

    Each value of the ``message`` column is prefixed with ``"[CLS] "`` and
    suffixed with ``" [SEP]"``.

    Returns:
        A list of strings, one per row of ``df``, in row order.
    """
    wrapped = []
    for text in df["message"].values:
        wrapped.append("[CLS] " + text + " [SEP]")
    return wrapped

def prepare_labels(dataset, is_train):
    """Convert textual labels into numeric scores.

    Label scores: ``'racist'`` -> 1, ``'non-racist'`` -> 0, ``'unknown'`` -> 0.5.

    Args:
        dataset: DataFrame with ``message`` and ``label`` columns.
        is_train: Selects the aggregation mode described below.

    Returns:
        * ``is_train`` truthy: a new DataFrame with one row per distinct
          message and an ``int64`` ``label`` column holding the rounded mean
          score of all votes for that message. Messages whose mean score is
          exactly 0.5 (a tie) are dropped.
        * otherwise: a copy of ``dataset`` whose ``label`` column is replaced
          by the per-row score; labels not in the table score 0.

    Raises:
        KeyError: in training mode, if a label is not in the score table.
    """
    column_name = 'message'
    scores = {
        'racist': 1,
        'non-racist': 0,
        'unknown': 0.5
    }

    if is_train:
        per_message = dataset.groupby(column_name).label.apply(list).reset_index()
        mean_score = per_message.label.apply(
            lambda votes: np.mean([scores[vote] for vote in votes])
        ).astype(float)
        # Ties must be filtered on the un-rounded mean: np.round(0.5) is 0
        # (round-half-to-even), so filtering after rounding never removes
        # anything and silently labels tied messages as non-racist.
        decided = mean_score != 0.5
        per_message = per_message[decided].copy()
        per_message['label'] = mean_score[decided].round().astype("int64")
        return per_message

    dataset = dataset.copy()
    dataset['label'] = dataset.label.apply(lambda label: scores.get(label, 0))
    return dataset


def tokenized_to_trf_inputs(tokenizer, tokenized_sentences, max_len=258):
    """Turn tokenised sentences into padded id arrays plus attention masks.

    Args:
        tokenizer: Object exposing ``convert_tokens_to_ids(tokens)``.
        tokenized_sentences: Iterable of token lists, one per sentence.
        max_len: Length every sequence is padded or truncated to. Both
            padding and truncation happen at the end of the sequence.

    Returns:
        ``(input_ids, attention_masks)`` where ``input_ids`` is the array
        returned by ``pad_sequences`` and ``attention_masks`` is a list of
        lists of floats with ``1.0`` where the id is greater than 0 and
        ``0.0`` elsewhere (so id 0 is treated as padding).
    """
    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_sentences],
        maxlen=max_len,
        dtype="long",
        truncating="post",
        padding="post",
    )
    # Vectorised form of `float(i > 0)` per element; `.tolist()` keeps the
    # list-of-lists-of-floats return type.
    attention_masks = (input_ids > 0).astype(float).tolist()
    return input_ids, attention_masks


def split_train_test(input_ids, attention_masks, labels, test_size=0.3, random_state=None):
    """Split ids, masks and labels into aligned train and test parts.

    Args:
        input_ids: Per-example token ids.
        attention_masks: Per-example attention masks, aligned with ``input_ids``.
        labels: Per-example labels, aligned with ``input_ids``.
        test_size: Passed to ``train_test_split``; defaults to 0.3.
        random_state: Passed to ``train_test_split``. Leave ``None`` for the
            previous behaviour, or pass an int to make the split reproducible.

    Returns:
        ``(X_train, X_test, mask_train, mask_test, y_train, y_test)``.
    """
    return tuple(
        train_test_split(
            input_ids,
            attention_masks,
            labels,
            test_size=test_size,
            random_state=random_state,
        )
    )


def split_and_tensorize(input_ids, attention_masks, labels, batch_size=64, val_size=0.1):
    """Split the data into train/validation/test sets and wrap them for PyTorch.

    The data is first split into train and test parts with
    ``split_train_test``; the validation set is then carved out of the
    training part.

    Args:
        input_ids: Per-example token ids.
        attention_masks: Per-example attention masks, aligned with ``input_ids``.
        labels: Per-example labels, aligned with ``input_ids``.
        batch_size: Batch size used by all three data loaders.
        val_size: Share of the training part held out for validation
            (passed as ``test_size`` to ``train_test_split``).

    Returns:
        ``(train_data, train_dataloader, val_data, val_dataloader,
        test_data, test_dataloader)``. Only the training loader shuffles.
    """
    X_train, X_test, mask_train, mask_test, y_train, y_test = split_train_test(
        input_ids, attention_masks, labels
    )
    # Without this second split the validation arrays do not exist and the
    # function fails with a NameError.
    X_train, X_val, mask_train, mask_val, y_train, y_val = train_test_split(
        X_train, mask_train, y_train, test_size=val_size
    )

    def _as_dataset(ids, masks, targets):
        """Bundle one split into a TensorDataset."""
        return TensorDataset(
            torch.tensor(ids),
            torch.tensor(masks),
            # np.asarray lets the labels be a list, an array or a pandas Series.
            torch.tensor(np.asarray(targets)),
        )

    train_data = _as_dataset(X_train, mask_train, y_train)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    val_data = _as_dataset(X_val, mask_val, y_val)
    val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

    test_data = _as_dataset(X_test, mask_test, y_test)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    return train_data, train_dataloader, val_data, val_dataloader, test_data, test_dataloader


def train(
    model,
    optimizer, 
    train_batch_gen,
    val_batch_gen,
    num_epochs
):
    """Fine-tune ``model`` and record per-epoch metrics.

    Args:
        model: Module called as ``model(X, mask, labels=y)`` whose output
            exposes ``.loss`` and ``.logits``.
        optimizer: Optimizer over the model's parameters.
        train_batch_gen: Iterable of ``(X_batch, mask_batch, y_batch)`` used
            for the gradient steps.
        val_batch_gen: Iterable of the same triples, evaluated without
            gradients after each training pass.
        num_epochs: Number of passes over ``train_batch_gen``.

    Returns:
        ``(model, history)``. ``history`` maps ``'train_loss'``,
        ``'train_acc'``, ``'val_loss'``, ``'val_acc'`` and ``'epoch_time'``
        to lists with one entry per epoch. Losses are example-weighted means
        of the batch losses; accuracies are fractions in ``[0, 1]``.
    """
    history = defaultdict(list)

    for epoch in range(num_epochs):
        train_loss = 0.0
        train_correct = 0
        train_seen = 0
        val_loss = 0.0
        val_correct = 0
        val_seen = 0

        start_time = time.time()

        model.train(True)

        for X_batch, mask_batch, y_batch in train_batch_gen:

            model_output = model(X_batch, mask_batch, labels=y_batch)
            loss = model_output.loss
            logits = model_output.logits

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            batch_len = len(y_batch)
            # Weight by batch size so a short final batch does not skew the mean.
            train_loss += loss.item() * batch_len
            train_correct += (logits.argmax(dim=1) == y_batch).sum().item()
            train_seen += batch_len

        model.train(False)

        for X_batch, mask_batch, y_batch in val_batch_gen:

            with torch.no_grad():
                model_output = model(X_batch, mask_batch, labels=y_batch)
                loss = model_output.loss
                logits = model_output.logits

            batch_len = len(y_batch)
            val_loss += loss.item() * batch_len
            val_correct += (logits.argmax(dim=1) == y_batch).sum().item()
            val_seen += batch_len

        # max(..., 1) keeps an empty loader from dividing by zero.
        history['train_loss'].append(train_loss / max(train_seen, 1))
        history['train_acc'].append(train_correct / max(train_seen, 1))
        history['val_loss'].append(val_loss / max(val_seen, 1))
        history['val_acc'].append(val_correct / max(val_seen, 1))
        history['epoch_time'].append(time.time() - start_time)

        print(
            'Epoch {} of {} took {:.1f}s | '
            'train loss {:.4f}, acc {:.4f} | val loss {:.4f}, acc {:.4f}'.format(
                epoch + 1,
                num_epochs,
                history['epoch_time'][-1],
                history['train_loss'][-1],
                history['train_acc'][-1],
                history['val_loss'][-1],
                history['val_acc'][-1],
            )
        )

    return model, history


def evaluate_model(model, test_dataloader):
    """Predict on every batch of ``test_dataloader`` and print summary metrics.

    Args:
        model: Module called as ``model(X, mask)`` whose output exposes
            ``.logits``.
        test_dataloader: Iterable of ``(X_batch, mask_batch, y_batch)``.

    Returns:
        ``(test_preds, test_labels)``: two lists covering all batches, with
        the arg-max class per example and the matching true labels.

    Side effects:
        Prints F1, accuracy, precision and recall as percentages, computed
        once over the whole set.
    """
    model.eval()
    test_preds, test_labels = [], []

    for X_batch, mask_batch, y_batch in test_dataloader:

        with torch.no_grad():
            logits = model(X_batch, mask_batch).logits

        # Tensors have `.cpu()`, not `.gpu()`; move to host memory before
        # converting to numpy.
        y_pred = logits.argmax(dim=1).detach().cpu().numpy()
        test_preds.extend(y_pred)
        test_labels.extend(y_batch.cpu().numpy())

    # Report and return only after the loop, so every batch is evaluated.
    # The sklearn scores are fractions; scale them to match the "%" in the
    # message.
    print('F1_score: {0:.2f}%, Accuracy: {1:.2f}%, Precision: {2:.2f}%, Recall: {3:.2f}%'.format(
        100 * f1_score(test_labels, test_preds),
        100 * accuracy_score(test_labels, test_preds),
        100 * precision_score(test_labels, test_preds),
        100 * recall_score(test_labels, test_preds)
    ))

    return test_preds, test_labels
    


In [None]:
transformer_type = "dccuchile/bert-base-spanish-wwm-cased"
batch_size = 8
num_epochs = 1
seed = 42

# Seed NumPy (used by sklearn's train_test_split when no random_state is
# given) and PyTorch so a fresh "Restart & Run All" repeats the same split,
# shuffling and weight initialisation.
np.random.seed(seed)
torch.manual_seed(seed)

data = open_train()
clean_data = (
    data
    .pipe(cleaned_df)
    .pipe(remove_short_words)
)
df = clean_data  # alias kept in case other cells refer to `df`

# prepare_labels(is_train=True) returns one row per distinct message, so the
# sentences and the labels must both come from that frame to stay aligned.
labeled = prepare_labels(clean_data, is_train=True)
sentences = prepare_for_bert(labeled)
labels = labeled['label'].to_numpy()

tokenizer = AutoTokenizer.from_pretrained(transformer_type)
# No `tokenize` helper is defined in this notebook; use the tokenizer's own method.
tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in sentences]
input_ids, attention_masks = tokenized_to_trf_inputs(tokenizer, tokenized_sentences)
train_data, train_dataloader, val_data, val_dataloader, test_data, test_dataloader = \
    split_and_tensorize(input_ids, attention_masks, labels, batch_size)

model = BertForSequenceClassification.from_pretrained(
    transformer_type, num_labels=2
)
optimizer = AdamW(model.parameters(), lr=5e-5)
model, history = train(
    model,
    optimizer, 
    train_dataloader,
    val_dataloader,
    num_epochs
)
test_preds, test_labels = evaluate_model(model, test_dataloader)