<a href="https://colab.research.google.com/github/Aanisha/ACL_Abusive_Tamil_Comment_Classification/blob/main/Gradient_Boosting_Classifier_on_sampled_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Boosting Classifier on the dataset

The experiment here uses the data after the oversampling and under-sampling of the data.

In [119]:
# Downloading library

!pip install indic-nlp-library



In [120]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

#### Preprocessing the data

In [121]:
train = pd.read_csv('/content/Tamil_train_data.csv')
test = pd.read_csv('/content/Tamil_test_data.csv')
valid = pd.read_csv('/content/Tamil_valid_data.csv')

In [122]:
train = train[train.tag != 'Not-Tamil']

In [123]:
tags = {"tag":     {'Hope-Speech':0, 'None-of-the-above':7, 'Homophobia':1, 'Misandry':2,
       'Counter-speech':3, 'Misogyny':4, 'Xenophobia':5, 'Transphobic':6}}

In [124]:
train = train.replace(tags)
valid = valid.replace(tags)
test = test.replace(tags)

In [125]:
train = pd.concat([train,valid],axis=0)

In [126]:
train.shape

(10227, 2)

In [127]:
train = train.drop(train[train['tag'] == 7].sample(frac=0.4).index)

In [128]:
test_labels = pd.read_csv("/content/Tamil_test_labels_data.csv")

test_labels = test_labels.replace(tags)
test_labels = pd.merge(test_labels, test, on=['comments'])
test_labels = test_labels.dropna()

In [129]:
gt = []
co = []
for com in range(len(list(test_labels['comments']))):
  if test_labels['comments'][com] in list(test['comments']):

    gt.append(test_labels['tag'][com])
    co.append(test_labels['comments'][com])

In [130]:
len(co)

2555

In [131]:
def oversample(df):
    classes = [4,0,1,6,5]
    most = 250
    classes_list = []
    for key in classes:
        classes_list.append(df[df['tag'] == key]) 
    classes_sample = []
    for i in range(len(classes_list)):
        classes_sample.append(classes_list[i].sample(most, replace=True))
    df_maybe = pd.concat(classes_sample)
    final_df = pd.concat([df_maybe,df], axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

In [132]:
train = oversample(train)

#### Tokenization of the train data

In [133]:
import re, string
from indicnlp.tokenize import indic_tokenize
def tokenize(s): return indic_tokenize.trivial_tokenize(s)

In [134]:
n = train.shape[0]

vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                      strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1)


trn_term_doc = vec.fit_transform(train['comments'])
test_term_doc = vec.transform(co)

#### Training the model on the training data

In [135]:
label_cols = ['tag']

In [136]:
x = trn_term_doc
test_x = test_term_doc

In [137]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [138]:
import sklearn.svm as svm
import sklearn.ensemble

def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = sklearn.ensemble.GradientBoostingClassifier()
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [139]:
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds = m.predict(test_x.multiply(r))

fit tag


In [140]:
len(preds)

2555

### Preparing the test data

In [141]:
test_labels = pd.read_csv("/content/Tamil_test_labels_data.csv")

In [142]:
test_labels.shape

(2559, 2)

In [143]:
test_labels = test_labels.replace(tags)

In [144]:
test_labels = test_labels.dropna()

In [145]:
test_labels.shape

(2556, 2)

### Testing the model using unseen test data

In [146]:
import sklearn

print(sklearn.metrics.classification_report(gt, preds))

              precision    recall  f1-score   support

           0       0.39      0.13      0.19        95
           1       0.62      0.39      0.48        64
           2       0.71      0.47      0.56       419
           3       0.63      0.18      0.28       135
           4       0.44      0.23      0.30       105
           5       0.65      0.33      0.44       120
           6       0.37      0.25      0.30        60
           7       0.72      0.93      0.81      1557

    accuracy                           0.70      2555
   macro avg       0.57      0.36      0.42      2555
weighted avg       0.67      0.70      0.66      2555

