In [None]:
import torch

# Show which GPU the runtime exposes. get_device_name() raises when no CUDA device
# is present, so report that case as a message instead of failing the first cell.
device_name = (
    torch.cuda.get_device_name()
    if torch.cuda.is_available()
    else "No CUDA device available (running on CPU)"
)
device_name

# Clone the <b>simpletransformers</b> library 

In [None]:
# Fetch the simpletransformers fork, enter the checkout and switch to its `deb` branch.
! git clone https://github.com/Debapriya-Tula/simpletransformers.git
%cd simpletransformers
! git checkout deb

# Define the requirements file

In [None]:
%%writefile req.txt

transformers>=4.2.0
datasets
scipy
scikit-learn
seqeval
tensorboardx
pandas
tokenizers
wandb
streamlit
sentencepiece
tqdm>=4.47.0

# Download and install the requirements (restart the runtime afterwards if using Google Colab)

In [None]:
%%capture
# Python dependencies: everything listed in req.txt, plus the packages imported
# later for language tagging (polyglot and its helpers, pyenchant).
! pip install -r req.txt
! pip install langdetect
! pip install polyglot
! pip install pyicu
! pip install pycld2
! pip install morfessor
! pip install pyenchant
# System Enchant library. `-y` answers apt's confirmation prompt, which cannot be
# answered from a notebook cell (and %%capture would hide the resulting abort).
! sudo apt-get install -y libenchant1c2a

In [None]:
! pwd

# STOP HERE! Restart and run from below.

In [None]:
%%capture
%cd simpletransformers/

# Download the datasets(files converted to xlsx) 

In [None]:
# Fetch the dataset spreadsheets from Google Drive with gdown: three files per language.
# NOTE(review): which file id is the train / dev / test split is not stated here —
# presumably they match the *_train / *_dev / *_test_with_labels .xlsx names used
# when the language is chosen; verify.
# ## Tamil
! gdown https://drive.google.com/uc?id=10pPg_WI0Qzgvi-qwxcyWbtfqYda0DsoM
! gdown https://drive.google.com/uc?id=1iF4sZ1XFL4pG6YVGrPWEsN1NMiPDQ0cA
! gdown https://drive.google.com/uc?id=1CgoMCL-ZKda6G8xfrVeNPPiSt2S6hBio


# ## Malayalam
! gdown https://drive.google.com/uc?id=1aA-cxg_iRtM83NgCDSluBIL7fsNUiJg_
! gdown https://drive.google.com/uc?id=1y50Xnd685oCoziVytpVtBcJ8CZizE5Nt
! gdown https://drive.google.com/uc?id=1h7vrLgccRuEanpDipHMFv2q4ptX65HlW


# ## Kannada
! gdown https://drive.google.com/uc?id=1k6on-7xMJ6zyaFpCxrV3CZii4y1UD4KP
! gdown https://drive.google.com/uc?id=1Dx-TByQ2gIjvHmmNJTA-Aj8rqwGjHo7i
! gdown https://drive.google.com/uc?id=1zG-K2hdpx4n-Geqpww7s5bd8CYCgeJBH

In [None]:
# Install the checked-out fork from the current directory, then step back to the parent directory.
! python setup.py install
%cd ..

# Import required modules

In [None]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, classification_report, matthews_corrcoef
from sklearn.metrics import precision_recall_fscore_support as score
import os
import tarfile
import warnings

warnings.filterwarnings('ignore')

# Choose language 

In [None]:
# Settings per menu choice: the display name (also the suffix of the dataset's
# `not-<language>` label), the language code used by the tagging step, and the
# names of the three spreadsheets.
LANGUAGE_OPTIONS = {
    '1': {
        'language': 'Tamil',
        'lang': 'ta',
        'train': 'tamil_offensive_full_train.xlsx',
        'dev': 'tamil_offensive_full_dev.xlsx',
        'test': 'tamil_offensive_full_test_with_labels.xlsx',
    },
    '2': {
        'language': 'malayalam',
        'lang': 'ml',
        'train': 'mal_full_offensive_train.xlsx',
        'dev': 'mal_full_offensive_dev.xlsx',
        'test': 'mal_full_offensive_test_with_labels.xlsx',
    },
    '3': {
        'language': 'Kannada',
        'lang': 'kn',
        'train': 'kannada_offensive_train.xlsx',
        'dev': 'kannada_offensive_dev.xlsx',
        'test': 'kannada_offensive_test_with_labels.xlsx',
    },
}

language_choice = input('Choose language: 1 for tamil, 2 for malayalam, 3 for kannada: ').strip()
if language_choice not in LANGUAGE_OPTIONS:
    # Fail here rather than later with an undefined `lang` / `test_file_name`.
    raise ValueError(
        f"Unknown language choice {language_choice!r}; expected '1', '2' or '3'."
    )

selected_language = LANGUAGE_OPTIONS[language_choice]
language = selected_language['language']
lang = selected_language['lang']
train_file_name = selected_language['train']
dev_file_name = selected_language['dev']
test_file_name = selected_language['test']

# Label names; a label's position in this list is its integer class id.
class_list = ['Not_offensive',
 'Offensive_Targeted_Insult_Group',
 'Offensive_Targeted_Insult_Individual',
 'Offensive_Targeted_Insult_Other',
 'Offensive_Untargetede']

if language_choice == '2':
    # Malayalam contains only 5 classes ('Offensive_Targeted_Insult_Other' is not present).
    class_list.remove('Offensive_Targeted_Insult_Other')

# The language-specific "not in this language" class always comes last.
class_list.append(f'not-{language}')

# Load data into dataframes

In [None]:
choose_not_class = 'y' #input(f"Do you want to keep the not-{language} class: y or n: ")
# Anything other than 'y' / 'Y' drops the not-<language> class.
choose_not_class = choose_not_class.lower() == 'y'


def _locate_data_file(file_name):
    """Return a path at which the dataset spreadsheet `file_name` exists.

    The working directory is tried first, then the `simpletransformers/` checkout:
    the download cell runs inside that directory, and the notebook changes back to
    the parent directory before the files are read.

    Raises:
        FileNotFoundError: if the file is in neither location.
    """
    candidates = (file_name, os.path.join('simpletransformers', file_name))
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(
        f"Could not find {file_name!r}; looked in: {', '.join(candidates)}"
    )


def _read_split(file_name, engine=None):
    """Read a header-less two-column spreadsheet into an ['Input', 'Label'] frame."""
    frame = pd.read_excel(_locate_data_file(file_name), header=None, engine=engine)
    frame.columns = ['Input', 'Label']
    return frame


def _encode_labels(frame, split_name):
    """Return a copy of `frame` whose 'Label' holds each label's index in `class_list`.

    Raises:
        ValueError: if the split contains a label that is not in `class_list`.
    """
    known_labels = set(class_list)
    unknown = sorted({str(label) for label in frame['Label'].unique() if label not in known_labels})
    if unknown:
        raise ValueError(f"Unexpected label(s) in the {split_name} set: {unknown}")
    # assign() builds a new frame, so a filtered slice is never written to in place.
    return frame.assign(Label=frame['Label'].map(class_list.index))


# Only the training split is cleaned of missing values and duplicates.
train_df = _read_split(train_file_name).dropna().drop_duplicates().reset_index(drop=True)
dev_df = _read_split(dev_file_name)
test_df = _read_split(test_file_name, engine='openpyxl')

not_language_label = f'not-{language}'
if not choose_not_class:
    train_df = train_df[train_df['Label'] != not_language_label]
    dev_df = dev_df[dev_df['Label'] != not_language_label]
    test_df = test_df[test_df['Label'] != not_language_label]

    if not_language_label in class_list:
        class_list.remove(not_language_label)

# Labels mapped to integers
train_df = _encode_labels(train_df, 'train')
dev_df = _encode_labels(dev_df, 'validation')
test_df = _encode_labels(test_df, 'test')


print(f'Number of examples in the train set: {train_df.shape[0]}')
print(f'Number of examples in the validation set: {dev_df.shape[0]}')
print(f'Number of examples in the test set: {test_df.shape[0]}')

# What the sample data looks like

In [None]:
test_df.head(10)

In [None]:
dev_df.head(10)

# Class-weighting with inverse of #samples in the class

In [None]:
# Number of training examples per class, ordered by class id; each class weight is
# the total number of examples divided by that class's count.
inverse_weights = train_df['Label'].value_counts().sort_index().to_numpy()
weights = inverse_weights.sum() / inverse_weights

# Load pre-trained model

In [None]:
# Training options handed to ClassificationModel.
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "fp16": False,
    "num_train_epochs": 10,  # 10 passes over the training data
    # NOTE(review): False leaves saving enabled; the flags below are what switch off the
    # per-epoch / eval-checkpoint / per-step saves — confirm against the library's option docs.
    "no_save": False,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "save_steps": -1,
    "use_mixup": False,
    "loss_type": 'cmi_loss',
    "use_cosnorm": True,
}


class_weighting = 'y' #input('Do you want to use class weighting:\nPress\nY for Yes\nN for No: ')

# Arguments shared by both variants; the class weights are added only when requested.
model_kwargs = {"num_labels": len(class_list), "args": train_args}

if class_weighting.lower() == 'n':
    # Plain model, no class weights.
    print('Model not using class-weighting')
else:
    # Any answer other than 'n' / 'N' enables class weighting.
    print('Model using class-weighting')
    model_kwargs["weight"] = list(weights)

model = ClassificationModel(
    "distilbert", "distilbert-base-multilingual-cased",
    **model_kwargs
)

# Train the model

In [None]:
## used by cmi
from IPython.utils import io
import polyglot
from polyglot.text import Text, Word
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import warnings
import enchant
from enchant.checker import SpellChecker
import re

# pyenchant objects for US English: a SpellChecker, and the dictionary whose check()
# parse_input uses to decide whether a Roman-script token is an English word.
chkr = SpellChecker("en_US")
d = enchant.Dict("en_US")

def parse_input(text, base_lang):
    """Tag every token of `text` with a language code.

    The text is split with NLTK's TweetTokenizer and each token is emitted as
    `<token>/<tag>`, where the tag is `base_lang`, `en` or `O` (other). Tags are
    decided in this order:

    1. every character lies in the Unicode block of the base language's script
       -> `base_lang` (zero-width joiners inside the token are ignored);
    2. nothing is left after dropping non-ASCII characters (emoji, other
       scripts) -> `O`;
    3. the ASCII part is alphabetic and found in the English dictionary -> `en`;
    4. otherwise polyglot's detected language decides.

    The script test runs on the original token. Running it after the non-ASCII
    characters are dropped leaves nothing to test, so every emoji or
    foreign-script token would be tagged as the base language.

    Args:
        text: the sentence to tag.
        base_lang: 'ml', 'ta' or 'kn'.

    Returns:
        The tagged tokens joined by single spaces. Native-script tokens keep
        their text; every other token is emitted in its ASCII-only form.

    Raises:
        KeyError: if `base_lang` is not one of the supported codes.
    """
    # Inclusive code-point ranges of the Malayalam, Tamil and Kannada Unicode blocks.
    native_script_ranges = {
        "ml": (0x0D00, 0x0D7F),
        "ta": (0x0B80, 0x0BFF),
        "kn": (0x0C80, 0x0CFF),
    }
    block_start, block_end = native_script_ranges[base_lang]
    zero_width_joiners = ('\u200c', '\u200d')

    tokens = nltk.tokenize.TweetTokenizer().tokenize(text)
    labelled_input = []

    for token in tokens:
        # 1) Native script: nobody writes English in the native script.
        script_chars = [char for char in token if char not in zero_width_joiners]
        if script_chars and all(block_start <= ord(char) <= block_end for char in script_chars):
            labelled_input.append(f'{token}/{base_lang}')
            continue

        ascii_word = token.encode('ascii', 'ignore').decode('ascii')

        # 2) Nothing Roman in the token and it is not native script either.
        if not ascii_word:
            labelled_input.append(f'{token}/O')
            continue

        # 3) A Roman-script word that the English dictionary knows.
        if ascii_word.isalpha() and d.check(ascii_word):
            labelled_input.append(ascii_word + '/en')
            continue

        # 4) Let polyglot decide (this covers romanised native-language words).
        detected = Text(ascii_word).language.code
        if detected == base_lang:
            labelled_input.append(ascii_word + f'/{base_lang}')
        elif detected == 'en':
            labelled_input.append(ascii_word + '/en')
        else:
            labelled_input.append(ascii_word + '/O')

    return ' '.join(labelled_input)

In [None]:
%%capture
## used by cmi
# `%%capture` must be the first line of the cell: IPython only recognises a cell
# magic there, and rejects it when a comment precedes it.

processed_df = train_df.copy(deep=True)
processed_df['Input'] = processed_df['Input'].apply(str)
processed_df['Input'] = processed_df['Input'].apply(lambda x: parse_input(x, lang))
if not choose_not_class:
    # 'Label' already holds integer class ids at this point, so this comparison with
    # the label name keeps every row.
    processed_df = processed_df[processed_df['Label'] != f'not-{language}']

In [None]:
def to_str(text):
    """Return the row as a two-element Series: its 'Input' coerced to str, then its 'Label'."""
    raw_input = text['Input']
    input_as_text = raw_input if isinstance(raw_input, str) else str(raw_input)
    return pd.Series([input_as_text, text['Label']])

# First training pass: every row goes through to_str (Input coerced to str), and the
# language-tagged copy of the training data is passed alongside as processed_df.
model.train_model(train_df=train_df.apply(lambda row: to_str(row), axis=1), processed_df=processed_df, lang=lang)

# Check performance over the validation set

## Metrics

In [None]:
# precision_recall_fscore_support
def scr(labels, preds):
    """Macro-averaged result of `precision_recall_fscore_support` for `preds` against `labels`."""
    macro_scores = score(labels, preds, average='macro')
    return macro_scores

# f1 score
def f1_multiclass(labels, preds):
    """Micro-averaged F1 score of `preds` against `labels`."""
    micro_f1 = f1_score(labels, preds, average='micro')
    return micro_f1
    
# classification report
def classification_rprt(labels, preds):
    """Classification report as a dict, with rows named after the entries of `class_list`."""
    report = classification_report(
        labels,
        preds,
        output_dict=True,
        target_names=class_list,
    )
    return report

# matthews correlation coefficient
def mathews_coff(labels, pred):
    """Matthews correlation coefficient of `pred` against `labels`."""
    coefficient = matthews_corrcoef(labels, pred)
    return coefficient

## Evaluate over the validation set

In [None]:
# Language-tagged copy of the validation set; anything printed while tagging is captured.
processed_df = dev_df.copy(deep=True)

with io.capture_output() as captured:
    inputs_as_text = processed_df['Input'].apply(str)
    processed_df['Input'] = inputs_as_text
    processed_df['Input'] = inputs_as_text.apply(lambda sentence: parse_input(sentence, lang))
    if not choose_not_class:
        keep_rows = processed_df['Label'] != f'not-{language}'
        processed_df = processed_df[keep_rows]

In [None]:
# Evaluate on the validation rows (Input coerced to str via to_str). Each keyword
# argument is a metric function; its value is read back from `result` under the
# same key, as the print below does for 'acc', 'f1' and 'mathews_coff'.
result, model_outputs, wrong_predictions = model.eval_model(dev_df.apply(lambda x: to_str(x), axis=1), processed_df, lang, f1=f1_multiclass, acc=accuracy_score, 
                                                            cls_report=classification_rprt, mathews_coff = mathews_coff, score=scr)

print(f"Val acc: {result['acc']}\nEval_loss: {result['eval_loss']}\nf1 score: {result['f1']}\nMatthews corrcoeff: {result['mathews_coff']}")

## The Classification report

In [None]:
pd.DataFrame(result['cls_report']).transpose()

## The precision, recall and f1 scores over the dev-set



In [None]:
print(f"Precision: \t{result['score'][0]} \nRecall: \t{result['score'][1]} \nF1-score: \t{result['score'][2]}")

# Make a dataframe to write the results to an excel file

In [None]:
# One row per evaluated example: the input, its true and predicted class names, and
# one `Score_<class>` column per class holding the raw model output. The columns
# follow `class_list`, so every language / class-set combination is handled by the
# same code, and the table is built once instead of one tiny DataFrame per row.
score_matrix = np.asarray(model_outputs)
evaluated_rows = dev_df.iloc[:len(score_matrix)]

result_df = pd.DataFrame({
    'Input': evaluated_rows['Input'].to_numpy(),
    'Correct Label': [class_list[label] for label in evaluated_rows['Label']],
    'Predicted Label': [class_list[best] for best in score_matrix.argmax(axis=1)],
})
for class_index, class_name in enumerate(class_list):
    result_df[f'Score_{class_name}'] = score_matrix[:, class_index]

## Normalize the scores

In [None]:
# To normalize the scores between 0 and 1 (both inclusive)
req_cols = [col for col in result_df.columns if col.startswith('Score')]

def normalize(row, score_cols=None):
    """Replace the raw scores of one result row by their softmax.

    Args:
        row: a row of the results table (a mapping from column name to value).
        score_cols: names of the columns to normalise together. Defaults to the
            notebook-level `req_cols` list.

    Returns:
        A new pandas Series equal to `row`, except that the score columns hold
        probabilities in [0, 1] that sum to 1. `row` itself is left untouched.
    """
    columns = req_cols if score_cols is None else list(score_cols)
    normalized = pd.Series(row).copy()
    if not columns:
        return normalized

    raw_scores = np.array([row[col] for col in columns])
    # Subtracting the maximum leaves the softmax unchanged but keeps exp() from
    # overflowing (inf / inf = nan) when the raw scores are large.
    exp_scores = np.exp(raw_scores - np.max(raw_scores))
    probabilities = exp_scores / exp_scores.sum()

    for col, probability in zip(columns, probabilities):
        normalized[col] = probability

    return normalized

# Write results to file

In [None]:
# Turn each row's raw scores into probabilities, then save the table.
result_df = result_df.apply(normalize, axis=1)
# No `encoding` argument: DataFrame.to_excel deprecated it in pandas 1.5 (it was
# unused) and pandas 2.0 rejects it with a TypeError.
result_df.to_excel(f'mbert_val_{language}_without_pseudo-labelling_cmiloss.xlsx')

# Pseudo-labelling

### 1) Obtain predictions over test-set

In [None]:
# Language-tagged copy of the test set; anything printed while tagging is captured.
processed_df = test_df.copy(deep=True)

with io.capture_output() as captured:
    processed_df['Input'] = processed_df['Input'].apply(str)
    tagged_inputs = processed_df['Input'].apply(lambda sentence: parse_input(sentence, lang))
    processed_df['Input'] = tagged_inputs
    if not choose_not_class:
        not_language_label = f'not-{language}'
        processed_df = processed_df[processed_df['Label'] != not_language_label]

In [None]:
# Coerce every test input to str before handing the list to predict().
test_df['Input'] = test_df['Input'].apply(str)

# Predict the whole test set in a single call; `processed_df` is the language-tagged
# copy of the test set built in the previous cell.
predictions, raw_outputs = model.predict(test_df['Input'].values.tolist(), processed_df, lang)

### 2) Combine the test-set inputs and predictions obtained with the train-set

In [None]:
# Attach the predictions to a COPY of the test rows. `test_df` keeps its true labels,
# because it is evaluated against later and its labels are written out as
# 'Correct Label'; overwriting them would score the model against its own pseudo-labels.
pseudo_test_df = test_df.assign(Label=predictions)
pseudo_label_df = pd.concat([train_df, pseudo_test_df], axis=0).reset_index(drop=True)

### 3) Train the model again with pseudo-labelled data included 

In [None]:
# Language-tagged copy of the combined train + pseudo-labelled data; anything printed
# while tagging is captured.
processed_df = pseudo_label_df.copy(deep=True)

with io.capture_output() as captured:
    text_column = processed_df['Input'].apply(str)
    processed_df['Input'] = text_column
    processed_df['Input'] = text_column.apply(lambda sentence: parse_input(sentence, lang))
    if not choose_not_class:
        is_kept = processed_df['Label'] != f'not-{language}'
        processed_df = processed_df[is_kept]

In [None]:
# Second training pass on the train + pseudo-labelled rows. As in the first pass, every
# row goes through to_str so that non-string inputs (e.g. purely numeric comments read
# from the spreadsheet) reach the model as text.
model.train_model(pseudo_label_df.apply(lambda row: to_str(row), axis=1), processed_df=processed_df, lang=lang)

# Check performance over the test set with the new model

### Evaluate over the test set

In [None]:
# Language-tagged copy of the test set for the evaluation below; anything printed
# while tagging is captured.
processed_df = test_df.copy(deep=True)

with io.capture_output() as captured:
    stringified = processed_df['Input'].apply(str)
    processed_df['Input'] = stringified
    processed_df['Input'] = stringified.apply(lambda sentence: parse_input(sentence, lang))
    if not choose_not_class:
        row_mask = processed_df['Label'] != f'not-{language}'
        processed_df = processed_df[row_mask]

In [None]:
# Evaluate the retrained model on the rows of test_df (Input coerced to str via to_str),
# with the same metric functions as in the earlier evaluation.
# NOTE(review): the printed label still says "Val acc" although test_df is evaluated here.
result, model_outputs, wrong_predictions = model.eval_model(test_df.apply(lambda x: to_str(x), axis=1), processed_df, lang, f1=f1_multiclass, acc=accuracy_score, 
                                                            cls_report=classification_rprt, mathews_coff = mathews_coff, score=scr)

print(f"Val acc: {result['acc']}\nEval_loss: {result['eval_loss']}\nf1 score: {result['f1']}\nMatthews corrcoeff: {result['mathews_coff']}")

## The Classification Report

In [None]:
pd.DataFrame(result['cls_report']).transpose()

## The precision, recall and f1 scores over the test set

In [None]:
print(f"Precision: {result['score'][0]} \nRecall: {result['score'][1]} \nF1-score: {result['score'][2]}")

# Make a dataframe to write the results to an excel file

In [None]:
# One row per evaluated test example: the input, the class names for its label in
# test_df and for the prediction, and one `Score_<class>` column per class holding
# the raw model output. The columns follow `class_list`, so every language /
# class-set combination is handled by the same code, and the table is built once
# instead of one tiny DataFrame per row.
score_matrix = np.asarray(model_outputs)
evaluated_rows = test_df.iloc[:len(score_matrix)]

result_df = pd.DataFrame({
    'Input': evaluated_rows['Input'].to_numpy(),
    'Correct Label': [class_list[label] for label in evaluated_rows['Label']],
    'Predicted Label': [class_list[best] for best in score_matrix.argmax(axis=1)],
})
for class_index, class_name in enumerate(class_list):
    result_df[f'Score_{class_name}'] = score_matrix[:, class_index]


## Write results to file

In [None]:
# Turn each row's raw scores into probabilities, then save the table.
result_df = result_df.apply(normalize, axis=1)
# No `encoding` argument: DataFrame.to_excel deprecated it in pandas 1.5 (it was
# unused) and pandas 2.0 rejects it with a TypeError.
result_df.to_excel(f'mbert_val_{language}_with_pseudo-labelling_cmiloss.xlsx')