<a href="https://colab.research.google.com/github/Aanisha/ACL_Abusive_Tamil_Comment_Classification/blob/main/SVM_on_sampled_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SVM on the dataset

The experiment here uses the data after the oversampling and under-sampling of the data.

In [1]:
import warnings 
warnings.filterwarnings('ignore') 

In [2]:
# Downloading library

#!pip install indic-nlp-library

In [3]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

#### Preprocessing the data

In [4]:
train = pd.read_csv('data/Tamil_train_data.csv')
test = pd.read_csv('data/Tamil_test_data.csv')
valid = pd.read_csv('data/Tamil_valid_data.csv')

In [5]:
train = train[train.tag != 'Not-Tamil']

In [6]:
tags = {"tag":     {'Hope-Speech':0, 'None-of-the-above':7, 'Homophobia':1, 'Misandry':2,
       'Counter-speech':3, 'Misogyny':4, 'Xenophobia':5, 'Transphobic':6}}

In [7]:
train = train.replace(tags)
valid = valid.replace(tags)
test = test.replace(tags)

In [8]:
train = pd.concat([train,valid],axis=0)

In [9]:
train.shape

(10227, 2)

In [10]:
train = train.drop(train[train['tag'] == 7].sample(frac=0.4).index)

In [11]:
test_labels = pd.read_csv("data/Tamil_test_labels_data.csv")

test_labels = test_labels.replace(tags)
test_labels = pd.merge(test_labels, test, on=['comments'])
test_labels = test_labels.dropna()

In [12]:
gt = []
co = []
for com in range(len(list(test_labels['comments']))):
  if test_labels['comments'][com] in list(test['comments']):

    gt.append(test_labels['tag'][com])
    co.append(test_labels['comments'][com])

In [13]:
len(co)

2555

In [14]:
def oversample(df):
    classes = [4,0,1,6,5]
    most = 250
    classes_list = []
    for key in classes:
        classes_list.append(df[df['tag'] == key]) 
    classes_sample = []
    for i in range(len(classes_list)):
        classes_sample.append(classes_list[i].sample(most, replace=True))
    df_maybe = pd.concat(classes_sample)
    final_df = pd.concat([df_maybe,df], axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

In [15]:
train = oversample(train)

#### Tokenization of the train data

In [16]:
import re, string
from indicnlp.tokenize import indic_tokenize
def tokenize(s): return indic_tokenize.trivial_tokenize(s)

In [17]:
n = train.shape[0]

vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                      strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1)


trn_term_doc = vec.fit_transform(train['comments'])
test_term_doc = vec.transform(co)

#### Training the model on the training data

In [18]:
label_cols = ['tag']

In [19]:
x = trn_term_doc
test_x = test_term_doc

In [20]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [21]:
import sklearn.svm as svm
import sklearn.ensemble

def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    #m = LogisticRegression(solver='newton-cg')
    m = svm.SVC(kernel='poly', degree=8, C=1)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [22]:
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds = m.predict(test_x.multiply(r))

fit tag


In [23]:
len(preds)

2555

### Preparing the test data

In [24]:
test_labels = pd.read_csv("data/Tamil_test_labels_data.csv")

In [25]:
test_labels.shape

(2559, 2)

In [26]:
test_labels = test_labels.replace(tags)

In [27]:
test_labels = test_labels.dropna()

In [28]:
test_labels.shape

(2556, 2)

### Testing the model using unseen test data

In [29]:
import sklearn

print(sklearn.metrics.classification_report(gt, preds))

              precision    recall  f1-score   support

           0       0.17      0.23      0.20        95
           1       0.24      0.14      0.18        64
           2       0.60      0.08      0.14       419
           3       0.07      0.01      0.01       135
           4       0.12      0.05      0.07       105
           5       0.00      0.00      0.00       120
           6       0.09      0.07      0.07        60
           7       0.61      0.88      0.72      1557

    accuracy                           0.56      2555
   macro avg       0.24      0.18      0.17      2555
weighted avg       0.50      0.56      0.48      2555

