<a href="https://colab.research.google.com/github/Aanisha/ACL_Abusive_Tamil_Comment_Classification/blob/main/Gradient_Boosting_Classifier_on_sampled_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Boosting Classifier on the dataset

The experiment here uses the data after the oversampling and under-sampling of the data.

In [2]:
import warnings 
warnings.filterwarnings('ignore') 

In [29]:
# Downloading library

!pip install indic-nlp-library

Collecting indic-nlp-library
  Obtaining dependency information for indic-nlp-library from https://files.pythonhosted.org/packages/ec/21/61240bcf965cedfec993497b38c42f054b149b9669e6d6cddeb1dee09d51/indic_nlp_library-0.92-py3-none-any.whl.metadata
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Obtaining dependency information for sphinx-argparse from https://files.pythonhosted.org/packages/21/98/d32f45b19b60e52b4ddc714dee139a92c6ea8fa9115f994884d321c3454d/sphinx_argparse-0.4.0-py3-none-any.whl.metadata
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl.metadata (1.9 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Obtaining dependency information for sphinx-rtd-theme from https://files.pythonhosted.org/packages/ea/46/00fda84467815c29951a9c91e3ae7503c409ddad04373e7cfc78daad4300/sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl.metadata
  Downloading sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl.metadata (4

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

  from pandas.core import (


#### Preprocessing the data

In [3]:
train = pd.read_csv('data/Tamil_train_data.csv')
test = pd.read_csv('data/Tamil_test_data.csv')
valid = pd.read_csv('data/Tamil_valid_data.csv')

In [4]:
train = train[train.tag != 'Not-Tamil']

In [5]:
tags = {"tag":     {'Hope-Speech':0, 'None-of-the-above':7, 'Homophobia':1, 'Misandry':2,
       'Counter-speech':3, 'Misogyny':4, 'Xenophobia':5, 'Transphobic':6}}

In [6]:
train = train.replace(tags)
valid = valid.replace(tags)
test = test.replace(tags)

In [7]:
train = pd.concat([train,valid],axis=0)

In [8]:
train.shape

(10227, 2)

In [9]:
train = train.drop(train[train['tag'] == 7].sample(frac=0.4).index)

In [10]:
test_labels = pd.read_csv("data/Tamil_test_labels_data.csv")

test_labels = test_labels.replace(tags)
test_labels = pd.merge(test_labels, test, on=['comments'])
test_labels = test_labels.dropna()

In [11]:
gt = []
co = []
for com in range(len(list(test_labels['comments']))):
  if test_labels['comments'][com] in list(test['comments']):

    gt.append(test_labels['tag'][com])
    co.append(test_labels['comments'][com])

In [12]:
len(co)

2555

In [13]:
def oversample(df):
    classes = [4,0,1,6,5]
    most = 250
    classes_list = []
    for key in classes:
        classes_list.append(df[df['tag'] == key]) 
    classes_sample = []
    for i in range(len(classes_list)):
        classes_sample.append(classes_list[i].sample(most, replace=True))
    df_maybe = pd.concat(classes_sample)
    final_df = pd.concat([df_maybe,df], axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

In [14]:
train = oversample(train)

#### Tokenization of the train data

In [15]:
import re, string
from indicnlp.tokenize import indic_tokenize
def tokenize(s): return indic_tokenize.trivial_tokenize(s)

In [16]:
n = train.shape[0]

vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                      strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1)


trn_term_doc = vec.fit_transform(train['comments'])
test_term_doc = vec.transform(co)

#### Training the model on the training data

In [17]:
label_cols = ['tag']

In [18]:
x = trn_term_doc
test_x = test_term_doc

In [19]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [20]:
import sklearn.svm as svm
import sklearn.ensemble

def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = sklearn.ensemble.GradientBoostingClassifier()
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [21]:
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds = m.predict(test_x.multiply(r))

fit tag


In [22]:
len(preds)

2555

### Preparing the test data

In [23]:
test_labels = pd.read_csv("data/Tamil_test_labels_data.csv")

In [24]:
test_labels.shape

(2559, 2)

In [25]:
test_labels = test_labels.replace(tags)

In [26]:
test_labels = test_labels.dropna()

In [27]:
test_labels.shape

(2556, 2)

### Testing the model using unseen test data

In [28]:
import sklearn

print(sklearn.metrics.classification_report(gt, preds))

              precision    recall  f1-score   support

           0       0.36      0.13      0.19        95
           1       0.52      0.38      0.44        64
           2       0.69      0.46      0.55       419
           3       0.49      0.17      0.25       135
           4       0.38      0.18      0.25       105
           5       0.67      0.33      0.44       120
           6       0.44      0.33      0.38        60
           7       0.72      0.92      0.81      1557

    accuracy                           0.69      2555
   macro avg       0.53      0.36      0.41      2555
weighted avg       0.66      0.69      0.65      2555

