<a href="https://colab.research.google.com/github/Aanisha/ACL_Abusive_Tamil_Comment_Classification/blob/main/MuRIL_on_original_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuned MuRIL on original dataset

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 34.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from transformers import AutoTokenizer
from transformers import TFAutoModel

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K
import logging

#### Pre-processing the data

In [3]:
# Read the three dataset splits in one pass; order of the paths matches the
# order of the names on the left-hand side.
train, test, valid = map(
    pd.read_csv,
    (
        '/content/Tamil_train_data.csv',
        '/content/Tamil_test_data.csv',
        '/content/Tamil_valid_data.csv',
    ),
)

In [4]:
# Nested mapping in the form DataFrame.replace expects:
# {column name: {original label: integer class id}}.
# Entries are listed in class-id order.
tags = {
    "tag": {
        'Hope-Speech': 0,
        'Homophobia': 1,
        'Misandry': 2,
        'Counter-speech': 3,
        'Misogyny': 4,
        'Xenophobia': 5,
        'Transphobic': 6,
        'None-of-the-above': 7,
    }
}

In [5]:
# Apply the label -> class-id mapping to the test frame's 'tag' column.
# NOTE(review): the merged frame built later reports only 2 columns, which
# suggests `test` has no 'tag' column and this line changes nothing - confirm.
test = test.replace(tags)

#### Loading the model

In [6]:
# Central run configuration. The cells visible in this notebook read only
# 'model' and 'max_length'; the remaining keys are kept unchanged.
config = {
    # --- run identity ---
    'seed': 42,
    'model': '/content/drive/MyDrive/Muril-base-cased',
    'group': 'MURIL',

    # --- batching / sequence length ---
    'batch_size': 16,
    'max_length': 64,

    # --- training setup ---
    'device': 'GPU',
    'epochs': 2,
    'test_size': 0.1,
    'lr': 5e-6,
    'use_transfer_learning': False,

    # --- experiment tracking ---
    'use_wandb': True,
    'wandb_mode': 'online',
}

In [7]:
def get_keras_model():
    """Build the MuRIL-based classifier used for inference.

    Loads the pretrained encoder found at ``config['model']`` and stacks a
    small classification head on its first output:
    Dropout(0.2) -> Conv1D(1 filter, kernel size 1) -> Flatten ->
    Dense(8, softmax).

    Reads the module-level ``config`` dict (keys ``'model'`` and
    ``'max_length'``).

    Returns:
        keras.Model: accepts the inputs ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` (int32, ``config['max_length']`` tokens each) and
        produces 8 softmax scores per example.
    """
    pretrained_model = TFAutoModel.from_pretrained(config['model'])

    # One explicit shape tuple shared by all three inputs. The original
    # `shape=(config['max_length'])` for input_ids lacked the trailing comma,
    # so it passed a bare int instead of a tuple like its siblings.
    seq_shape = (config['max_length'],)

    input_ids = layers.Input(shape=seq_shape,
                             name='input_ids',
                             dtype=tf.int32)
    token_type_ids = layers.Input(shape=seq_shape,
                                  name='token_type_ids',
                                  dtype=tf.int32)
    attention_mask = layers.Input(shape=seq_shape,
                                  name='attention_mask',
                                  dtype=tf.int32)

    # [0] takes the first output of the encoder.
    # NOTE(review): presumably the per-token hidden states - confirm.
    embedding = pretrained_model(input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask)[0]

    # Classification head. Keep the layers and their order exactly as-is:
    # the saved .h5 weights are loaded into this architecture afterwards.
    x1 = tf.keras.layers.Dropout(0.2)(embedding)
    x1 = tf.keras.layers.Conv1D(1, 1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    # 8 output units: one per class id defined in `tags`.
    x1 = tf.keras.layers.Dense(8, activation='softmax')(x1)

    model = keras.Model(inputs=[input_ids,
                                token_type_ids,
                                attention_mask],
                        outputs=x1)

    return model

In [8]:
# Recreate the architecture first, then restore the fine-tuned weights
# from the checkpoint stored next to the base model.
# NOTE(review): the path lives on Google Drive; no cell visible here mounts
# the drive, so it must already be mounted - confirm.
model = get_keras_model()
model.load_weights(
    '/content/drive/MyDrive/Muril-base-cased/best_model_25.h5'
)

Some layers from the model checkpoint at /content/drive/MyDrive/Muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Muril-base-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Preparing test data

In [9]:
# Build the labelled test frame in one chain: read the gold labels, map tag
# names to class ids, join with the test comments, drop incomplete rows.
# NOTE(review): the join key is the raw comment text; repeated comments
# would multiply rows in the result - confirm the key is unique.
test_labels = (
    pd.read_csv("/content/Tamil_test_labels_data.csv")
    .replace(tags)
    .merge(test, on=['comments'])
    .dropna()
)

In [10]:
# Display (rows, columns) of the merged label frame as a quick sanity check.
test_labels.shape

(2555, 2)

In [11]:
# Load the tokenizer from the same location as the pretrained model
# (config['model']).
tokenizer = AutoTokenizer.from_pretrained(config['model'])

In [12]:
# Tokenize every test comment into fixed-length TensorFlow tensors.
# max_length is read from config (previously a hard-coded 64) so it cannot
# drift from the input shape the model is built with in get_keras_model().
x_test = tokenizer(
    text=test_labels.comments.tolist(),
    add_special_tokens=True,
    max_length=config['max_length'],
    padding='max_length',   # pad every sequence up to max_length
    truncation=True,        # cut sequences longer than max_length
    return_tensors='tf',
    return_token_type_ids=True,
    return_attention_mask=True,
    verbose=True)

In [13]:
# Number of tokenized examples; the recorded output (2555) equals the row
# count of test_labels displayed earlier.
len(x_test['input_ids'])

2555

In [14]:
# Run inference on the whole test set. The dict keys must match the names
# of the model's Input layers.
preds = model.predict(
    x={
        key: x_test[key]
        for key in ('input_ids', 'token_type_ids', 'attention_mask')
    },
    verbose=1,
    workers=4,
)



In [17]:
# Predicted class id per example = index of the highest score in each row.
pr = [np.argmax(scores) for scores in preds]

#### Testing the model on unseen test data

In [18]:
import sklearn
import pandas as pd, numpy as np
# NOTE(review): LogisticRegression and TfidfVectorizer are not used by any
# cell visible here; kept in case later cells rely on them.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule is loaded, so `sklearn.metrics.<name>`
# only worked as a side effect of the other sklearn imports.
from sklearn.metrics import classification_report

# zero_division=0 reports 0.00 for classes that were never predicted (the
# same values as the default) without emitting a warning per average.
print(classification_report(test_labels['tag'], pr, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        95
           1       0.00      0.00      0.00        64
           2       0.52      0.68      0.59       419
           3       0.25      0.31      0.28       135
           4       0.00      0.00      0.00       105
           5       0.72      0.11      0.19       120
           6       0.00      0.00      0.00        60
           7       0.76      0.89      0.82      1557

    accuracy                           0.68      2555
   macro avg       0.28      0.25      0.23      2555
weighted avg       0.60      0.68      0.62      2555



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
