# MuRIL fine-tuned on resampled data

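This notebook fine-tunes MuRIL on a training set that has been rebalanced: the `None-of-the-above` class is undersampled and five of the other classes are oversampled.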

In [45]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from transformers import AutoTokenizer
from transformers import TFAutoModel,AutoModel

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K
import logging
import wandb

In [46]:
# Central run configuration for the notebook.
config = dict(
    # Reproducibility / model identity
    seed=42,
    model='../input/murilbasecased',   # path handed to the tokenizer and TFAutoModel loaders
    group='MURIL',

    # Data shaping
    batch_size=16,
    max_length=64,                      # length of the model's input layers

    # Training
    device='GPU',
    epochs=25,
    test_size=0.1,
    lr=5e-6,
    use_transfer_learning=False,        # when True the pretrained encoder layer is frozen

    # Experiment tracking
    use_wandb=True,
    wandb_mode='online',
)


In [47]:
def seed_everything(seed = 42):
    """Seed every random number source this notebook uses.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and seeds
    Python's ``random`` module, NumPy's global generator and TensorFlow's
    global generator with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Creating a logger 📃
def init_logger(log_file:str ='training.log'):
    """Create (or reconfigure) the shared ``'MURIL'`` logger.

    Every record is written to the console and to ``log_file`` using the
    ``LEVEL:name:message`` format. Calling the function again (for example
    when the notebook cell is re-run) replaces the handlers installed by the
    previous call instead of stacking new ones, so each message is emitted
    exactly once per destination.

    Args:
        log_file: Path of the file that receives a copy of every log record.

    Returns:
        The configured ``logging.Logger`` named ``'MURIL'``.
    """
    logger = logging.getLogger('MURIL')
    logger.setLevel(logging.DEBUG)

    # getLogger returns the same object for the same name, so handlers from an
    # earlier call are still attached; detach and close them first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')

    # Console output
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)

    # Persistent copy on disk
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    return logger


# Notebook-wide logger used by all later cells.
LOGGER = init_logger()
LOGGER.info("Logger Initialized")

# Seed from the configuration so that editing config['seed'] actually changes
# the seed used for the run.
seed_everything(config['seed'])
LOGGER.info("Seed Setting done")

#### Preparing the train and validation dataset

In [48]:
# Load the pre-split training and validation CSVs; later cells read their
# `tag` (label) and `comments` (text) columns.
train_data = pd.read_csv("../input/tamil-data/Tamil_train_data.csv")
valid_data = pd.read_csv("../input/tamil-data/Tamil_valid_data.csv")

In [49]:
# Drop rows labelled 'Not-Tamil': that label has no entry in the `tags`
# mapping, so it cannot be turned into an integer class id. The validation
# split is filtered the same way so both splits share one label set.
train_data = train_data[train_data.tag != 'Not-Tamil']
valid_data = valid_data[valid_data.tag != 'Not-Tamil']

In [50]:
# Mapping handed to DataFrame.replace: for the `tag` column, each label string
# becomes its integer class id.
tags = {
    "tag": {
        'Hope-Speech': 0,
        'Homophobia': 1,
        'Misandry': 2,
        'Counter-speech': 3,
        'Misogyny': 4,
        'Xenophobia': 5,
        'Transphobic': 6,
        'None-of-the-above': 7,
    }
}

In [51]:
# Swap the label strings in the `tag` column for their integer class ids,
# using the same mapping for both splits.
train_data, valid_data = (
    frame.replace(tags) for frame in (train_data, valid_data)
)

In [52]:
# Show how many training rows carry each class id after label encoding.
train_data['tag'].value_counts()

In [53]:
# Preview the first rows of the training frame.
train_data.head()

In [54]:
# Undersampling: randomly drop 40% of the rows whose tag is 7
# ('None-of-the-above'). random_state pins the draw to the configured seed so
# the same rows are removed even when cells are re-run or run out of order.
rows_to_drop = train_data[train_data['tag'] == 7].sample(
    frac=0.4, random_state=config['seed']
)
train_data = train_data.drop(rows_to_drop.index)

In [55]:
def oversample(df, classes=(4, 0, 1, 6, 5), n_samples=250, random_state=None):
    """Add randomly duplicated rows for the selected classes.

    For every class id in ``classes`` that occurs in ``df['tag']``,
    ``n_samples`` rows are drawn with replacement from that class. The drawn
    rows are placed before the untouched original rows and the index is
    rebuilt as ``0..n-1``.

    Args:
        df: Frame with an integer ``tag`` column holding the class ids.
        classes: Class ids to oversample, in the order their samples should
            appear in the result.
        n_samples: Number of extra rows to draw per class.
        random_state: Forwarded to ``DataFrame.sample``; ``None`` keeps using
            NumPy's global random state.

    Returns:
        A new DataFrame containing the extra samples followed by all rows
        of ``df``.
    """
    class_subsets = (df[df['tag'] == label] for label in classes)
    extra_rows = [
        subset.sample(n_samples, replace=True, random_state=random_state)
        for subset in class_subsets
        # Sampling from an empty frame raises, so absent classes are skipped.
        if not subset.empty
    ]
    # A single concat also works when no class produced any samples.
    return pd.concat([*extra_rows, df], axis=0).reset_index(drop=True)

In [56]:
# Replace the training frame with its oversampled version.
train_data = oversample(train_data)

In [57]:
# Class distribution of the training frame after oversampling.
train_data['tag'].value_counts()

In [58]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids. The width is fixed to the size of the
# label mapping so both matrices always have one column per class (matching
# the 8-unit softmax output of the model), even if the highest class id
# happens to be absent from a split.
NUM_CLASSES = len(tags['tag'])
y_train = to_categorical(train_data.tag, num_classes=NUM_CLASSES)
y_valid = to_categorical(valid_data.tag, num_classes=NUM_CLASSES)

In [59]:
from sklearn.model_selection import train_test_split

# The data arrives pre-split, so the frames are only re-bound to the names the
# remaining cells use.
df_train, df_valid = train_data, valid_data

# Write both splits to disk without the index column; the W&B cell uploads
# these two files as an artifact.
df_train.to_csv('df_train.csv', index=False)
df_valid.to_csv('df_valid.csv', index=False)

In [60]:
import wandb

if config['use_wandb']:

    # The API key must not live in the notebook. It is read from the
    # WANDB_API_KEY environment variable; when that is unset, key=None lets
    # wandb fall back to its own credential lookup.
    # NOTE: the key that used to be hard-coded here has been exposed and
    # should be revoked.
    wandb.login(key=os.environ.get('WANDB_API_KEY'))

    # Start the run with the grouping, mode and hyper-parameters from config.
    wandb.init(group=config['group'],
               mode=config['wandb_mode'],
               config=config)

    # Upload the exact train/validation splits used for this run.
    artifact = wandb.Artifact(name="folds", type="dataset")
    artifact.add_file('./df_train.csv')
    artifact.add_file('./df_valid.csv')

    LOGGER.info("Logging df_train.csv and df_valid.csv to W&B Artifacts")
    wandb.log_artifact(artifact)

#### Loading the tokeniser and tokenising the data

In [61]:
# Load the tokenizer stored at the same path as the pretrained model
# (config['model']) and print it for inspection.
tokenizer = AutoTokenizer.from_pretrained(config['model'])
print(tokenizer)

LOGGER.info('Tokenizer loaded')

In [62]:
# Tokenizer settings shared by both splits. Sequences are padded/truncated to
# config['max_length'], the same value the model's input layers are built
# with, so the two can never drift apart.
tokenizer_kwargs = dict(
    add_special_tokens=True,
    max_length=config['max_length'],
    padding='max_length',
    truncation=True,
    return_tensors='tf',
    return_token_type_ids=True,
    return_attention_mask=True,
    verbose=True,
)

x_train = tokenizer(text=df_train.comments.tolist(), **tokenizer_kwargs)

# NOTE: despite its name, x_test holds the *validation* split at this point;
# the name is kept because the training cell reads it.
x_test = tokenizer(text=df_valid.comments.tolist(), **tokenizer_kwargs)

In [63]:
# Inspect the token-id tensor produced for the training comments.
x_train["input_ids"]

In [64]:
# (rows, columns) of the final training frame.
df_train.shape

In [65]:
# Per-class row counts of the final training frame.
df_train['tag'].value_counts()

In [66]:
# Per-class row counts of the validation frame.
df_valid['tag'].value_counts()

#### Defining the model architecture

In [67]:
def get_keras_model(num_classes=8):
    """Build the classifier: pretrained encoder plus a small conv/dense head.

    The encoder stored at ``config['model']`` is loaded with ``TFAutoModel``
    and fed three int32 inputs of length ``config['max_length']``. The first
    element of the encoder output (presumably the per-token hidden states;
    confirm against the model class) goes through dropout, a single-filter
    kernel-size-1 ``Conv1D``, a ``Flatten`` and a softmax ``Dense`` layer.

    Args:
        num_classes: Width of the softmax output. The default of 8 equals the
            number of labels in the ``tags`` mapping.

    Returns:
        An uncompiled ``keras.Model`` taking
        ``[input_ids, token_type_ids, attention_mask]`` and returning class
        probabilities.
    """
    pretrained_model = TFAutoModel.from_pretrained(config['model'])

    # All three inputs share the same shape; written as a proper 1-tuple
    # (the original had a bare int for input_ids).
    seq_shape = (config['max_length'],)
    input_ids = layers.Input(shape=seq_shape,
                             name='input_ids',
                             dtype=tf.int32)
    token_type_ids = layers.Input(shape=seq_shape,
                                  name='token_type_ids',
                                  dtype=tf.int32)
    attention_mask = layers.Input(shape=seq_shape,
                                  name='attention_mask',
                                  dtype=tf.int32)

    embedding = pretrained_model(input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask)[0]

    # Classification head.
    x1 = layers.Dropout(0.2)(embedding)
    x1 = layers.Conv1D(1, 1)(x1)
    x1 = layers.Flatten()(x1)
    x1 = layers.Dense(num_classes, activation='softmax')(x1)

    model = keras.Model(inputs=[input_ids,
                                token_type_ids,
                                attention_mask],
                        outputs=x1)

    return model

In [68]:
# Instantiate the network that will be trained.
model = get_keras_model()
LOGGER.info("Model Loaded")

In [69]:
if config['use_transfer_learning']:
    # Freeze the pretrained encoder so only the classification head is trained.
    for layer in model.layers:
        if 'tf_bert_model' in layer.name:
            layer.trainable = False
    # The logger instance is called LOGGER; `Logger` was undefined and raised
    # NameError whenever this branch ran.
    LOGGER.info("Transfer learning is enabled")

#### Setting hyperparams for the model

In [70]:
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

# Labels are one-hot encoded, hence the categorical loss/metric variants.
loss = CategoricalCrossentropy()
# NOTE(review): this is plain categorical accuracy, not class-balanced
# accuracy. The name 'balanced_accuracy' is kept so existing log keys stay
# the same.
metric = CategoricalAccuracy('balanced_accuracy')

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate=config['lr'],
                                  epsilon=1e-08,
                                  clipnorm=1.0)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [71]:
# Render the model graph, annotated with tensor shapes and dtypes, to an image.
# No to_file is given, so Keras' default file name is used - presumably
# 'model.png', which is the file the following W&B cell uploads; TODO confirm.
tf.keras.utils.plot_model(model, show_shapes=True,show_dtype=True)

In [72]:
# Attach the rendered architecture diagram to the W&B run.
if config['use_wandb']:
    wandb.log({"model": wandb.Image('model.png')})

In [73]:
def get_callbacks():
    """Assemble the Keras callbacks used during training.

    Returns:
        A list with two weight-only checkpoint callbacks - one that keeps the
        weights with the lowest ``val_loss`` and one that is overwritten after
        every epoch - followed by a W&B callback when ``config['use_wandb']``
        is truthy.
    """
    make_checkpoint = tf.keras.callbacks.ModelCheckpoint

    callbacks = [
        # Best weights so far, judged by validation loss.
        make_checkpoint('best_model_sampled_25.h5',
                        verbose=1,
                        monitor='val_loss',
                        mode='min',
                        save_best_only=True,
                        save_weights_only=True),
        # Weights of the most recent epoch.
        make_checkpoint('last_model_25.h5',
                        verbose=1,
                        save_best_only=False,
                        save_weights_only=True),
    ]

    if config['use_wandb']:
        callbacks.append( wandb.keras.WandbCallback(save_model=False) )
    return callbacks

In [74]:
from sklearn.utils import class_weight

# "Balanced" weights for the classes that actually occur in the training frame.
present_classes = np.unique(df_train['tag'])
balanced_weights = class_weight.compute_class_weight(class_weight="balanced",
                                                     classes=present_classes,
                                                     y=np.array(df_train['tag']))

# Key each weight by its real class id. Keying by position (enumerate) is only
# correct when the ids present are exactly 0..k-1.
class_weights = dict(zip(present_classes.tolist(), balanced_weights))
# NOTE(review): the training cell in this notebook does not pass
# class_weights to model.fit; confirm whether that is intentional.
class_weights

#### Training the model

In [75]:
# Names of the model's three input layers; used to pick the matching tensors
# out of the tokenizer output.
input_names = ('input_ids', 'token_type_ids', 'attention_mask')

# batch_size comes from the configuration; without it Keras falls back to its
# own default and config['batch_size'] is silently ignored.
# x_test holds the tokenized validation split at this point.
history = model.fit(
    x={name: x_train[name] for name in input_names},
    y=y_train,
    batch_size=config['batch_size'],
    epochs=config['epochs'],
    callbacks=get_callbacks(),
    validation_data=({name: x_test[name] for name in input_names}, y_valid),
)

In [76]:
def plot_hist(hist):
    """Plot training and validation loss per epoch.

    The figure is written to ``loss.png`` and then displayed.

    Args:
        hist: Object whose ``history`` mapping provides the ``"loss"`` and
            ``"val_loss"`` sequences (as returned by ``model.fit``).
    """
    plt.figure(figsize=(15,5))

    recorded = hist.history
    epoch_axis = np.arange(len(recorded["loss"]), step=1)

    # (history key, legend label, line colour)
    curves = (
        ("loss", 'Train Loss', '#ff7f0e'),
        ("val_loss", 'Val Loss', '#1f77b4'),
    )
    for key, legend_label, colour in curves:
        plt.plot(epoch_axis, recorded[key], '-o', label=legend_label, color=colour)

    plt.xlabel('Epoch',size=14)
    plt.ylabel('Loss',size=14)
    plt.legend(loc=2)

    plt.savefig('loss.png')
    plt.show()


plot_hist(history)

#### Loading the model and preparing the test data

In [77]:
# Build a fresh copy of the architecture and restore the checkpoint written by
# the 'val_loss'-monitoring ModelCheckpoint callback (lowest validation loss).
model_eval = get_keras_model()

model_eval.load_weights('./best_model_sampled_25.h5')

In [78]:
# Read the test comments and apply the label-to-id mapping in one go.
test = pd.read_csv('../input/tamil-data/Tamil_test_data.csv').replace(tags)

In [79]:
# Read the test labels, encode them as class ids, join them to the test
# comments on the `comments` column and discard rows with missing values.
test_labels = (
    pd.read_csv("../input/tamil-data/Tamil_test_labels_data.csv")
    .replace(tags)
    .merge(test, on=['comments'])
    .dropna()
)

In [80]:
# Tokenize the test comments. From here on x_test refers to the real test
# split (it previously held the validation split).
# max_length is taken from config so it always matches the model's inputs.
x_test = tokenizer(
    text=test_labels.comments.tolist(),
    add_special_tokens=True,
    max_length=config['max_length'],
    padding='max_length',
    truncation=True,
    return_tensors='tf',
    return_token_type_ids=True,
    return_attention_mask=True,
    verbose=True)

In [81]:
# Predict with model_eval, which carries the best-checkpoint weights. `model`
# still holds the weights of the final training epoch, so using it would
# ignore the checkpoint that was just loaded.
# `workers` only applies to generator/Sequence inputs, so it is dropped for
# these in-memory tensors.
preds = model_eval.predict(
    x={'input_ids': x_test['input_ids'],
       'token_type_ids': x_test['token_type_ids'],
       'attention_mask': x_test['attention_mask']},
    verbose=1)

In [82]:
# Predicted class id per comment: index of the highest softmax probability.
pr = [np.argmax(row) for row in preds]

#### Testing the performance of the model on unseen test data

In [83]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Import the function explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the true labels.
print(classification_report(test_labels['tag'], pr))