In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, classification_report
)



In [2]:
# Read the pre-cleaned train and test CSVs (paths are relative to the notebook).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Dataset/tamil_train_cleaned.csv",
        "../Dataset/tamil_test_cleaned.csv",
    )
)

In [3]:
# Preview five training rows. The column-width option is overridden only inside
# this block so long posts are shown in full (as in the saved output), and the
# fixed seed keeps the preview identical across Restart & Run All.
with pd.option_context('display.max_colwidth', 0):
    display(train_df.sample(n=5, random_state=42))

Unnamed: 0,content,labels
172,மாட்ட கடிச்சு ஆட்ட கடிச்சு அடுத்து கோழிய கடிச்சு.என்ன செய்யப்போறானுக? நாங்கள் என்ன சாப்பிடனும் சாப்பிடக்கூடாதுன்னு சொல்ல நீங்க யாருடா?\nமக்களின்_சின்னம்_மைக்\nமக்களின்_சின்னம்_ஒலிவாங்கி,Neutral
1431,₹752 கோடி போச்சா சோனமுத்தா\n\nசோனியா - ராகுல்காந்தி கதறல் \n\ncongressfails விடியாஅரசு dmkfailstn dmkfails rejectdmk திமுக,Sarcastic
2838,‘கை’ சின்னத்திற்கு ஆதரவு கொடுக்கும் ‘நம்ம ஊரு இந்தியன் தாத்தா’…\n\nvote4india rahulgandhi priyankagandhivadra mkstalin dmkitwing vote4sudha indiaallaince vendammodi novoteforbjp \n\n | | | |,Positive
1027,naaready ft edappadi version\nமொத்தமா செதச்சு விட்டானுங்க எப்பாசாமி யாருப்ப அந்த creater எனக்கே அவர பாக்கனும் போல இருக்கே. \n\nleo edappadipalanisamy modi admk politics\n\n,Sarcastic
949,மத்திய சென்னையை தட்டி தூக்க தயாராகும் மருத்துவர் கார்த்திகேயன் \n\nசீமான்_சின்னம்_ஒலிவாங்கி,Opinionated


In [4]:
# Number of training rows per class; the saved output shows the labels are
# still class-name strings at this point.
train_df.groupby("labels").size()

labels
Negative              406
Neutral               637
None of the above     171
Opinionated          1361
Positive              575
Sarcastic             790
Substantiated         412
dtype: int64

In [7]:
# Number of test rows per class.
# NOTE(review): the saved output lists integer ids (0-6) because this cell was
# executed (In [7]) after the label-encoding cell (In [6]) that appears below
# it. On a fresh top-to-bottom run it would presumably show the class names,
# since the encoding cell looks them up by name.
test_df.groupby("labels").size()

labels
0     70
1     51
2    171
3     75
4    106
5     46
6     25
dtype: int64

In [6]:
# Integer id assigned to each annotation class; the models and the reports
# further down work with these ids.
labeling = {
    'Neutral': 0,
    'Substantiated': 1,
    'Opinionated': 2,
    'Positive' : 3,
    'Sarcastic': 4,
    'Negative': 5,
    'None of the above': 6
}


def _encode_labels(labels):
    """Return `labels` with class names replaced by their integer ids.

    Safe to re-run: a column that already holds only ids from `labeling` is
    returned unchanged instead of failing on the second lookup.

    Raises:
        KeyError: if a value is neither a known class name nor (for an
            already-encoded column) a known id, matching the strictness of a
            plain dict lookup.
    """
    observed = set(labels)
    if observed <= set(labeling.values()):
        return labels  # already encoded by an earlier run of this cell
    unmapped = observed - set(labeling)
    if unmapped:
        raise KeyError(f"Unmapped label values: {sorted(unmapped, key=str)}")
    return labels.map(labeling)


train_df['labels'] = _encode_labels(train_df['labels'])
test_df['labels'] = _encode_labels(test_df['labels'])

In [8]:
from sklearn.utils import resample

# Number of EXTRA rows drawn with replacement for each listed class. Despite
# the name this is not a target class size: the rows are appended on top of
# the ones already present.
target_minority_count = 200

# Ids 0, 1, 3, 5, 6 are Neutral, Substantiated, Positive, Negative and
# "None of the above" in `labeling`; Opinionated (2) and Sarcastic (4) are
# left as they are.
# Every bootstrap sample is drawn from the frame as it is before this cell
# runs, then appended with a single concat instead of growing `train_df`
# inside the loop. The resulting rows and their order are the same.
# NOTE: this cell is not idempotent. Re-running it appends another batch of
# duplicates, so reload `train_df` before executing it again.
oversampled_parts = [
    resample(
        train_df[train_df['labels'] == minority_class],
        replace=True,
        n_samples=target_minority_count,
        random_state=42,
    )
    for minority_class in [0, 1, 3, 5, 6]
]

train_df = pd.concat([train_df, *oversampled_parts], ignore_index=True)

In [9]:
# Class counts after the oversampling cell: each oversampled class has gained
# 200 rows compared with the counts shown earlier.
train_df.groupby('labels').size()

labels
0     837
1     612
2    1361
3     775
4     790
5     606
6     371
dtype: int64

In [12]:
# Text column is the model input, `labels` column is the target, per split.
X_train, y_train = train_df['content'], train_df['labels']
X_test, y_test = test_df['content'], test_df['labels']

In [13]:
# Learn the vocabulary and IDF weights from the training text only, then
# project both splits into that same feature space.
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf, X_test_tfidf = (
    tfidf_vect.transform(texts) for texts in (X_train, X_test)
)

In [14]:
# Baseline: logistic regression with default hyperparameters on TF-IDF features.
# `fit` returns the estimator itself, so it can be chained onto the constructor.
lr = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = lr.predict(X_test_tfidf)

In [15]:


# Per-class precision/recall/F1 first, then the raw confusion counts.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Macro averages: the unweighted mean of the per-class scores.
macro_f1, macro_precision, macro_recall = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(y_test, y_pred)

# Summary lines, in the same order as before.
for caption, value in (
    ("Macro F1 Score:", macro_f1),
    ("Macro Precision:", macro_precision),
    ("Macro Recall:", macro_recall),
    ("Accuracy:", accuracy),
):
    print(caption, value)

Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.20      0.21        70
           1       0.10      0.06      0.07        51
           2       0.41      0.68      0.51       171
           3       0.37      0.29      0.33        75
           4       0.41      0.25      0.31       106
           5       0.15      0.09      0.11        46
           6       0.85      0.68      0.76        25

    accuracy                           0.37       544
   macro avg       0.36      0.32      0.33       544
weighted avg       0.35      0.37      0.35       544

Confusion Matrix:
 [[ 14   6  35   3   8   3   1]
 [ 10   3  26   6   3   3   0]
 [  8   9 116  14  19   4   1]
 [ 12   5  24  22   6   5   1]
 [  9   6  48   9  27   7   0]
 [  8   0  28   5   1   4   0]
 [  2   0   3   1   2   0  17]]
Macro F1 Score: 0.3294978305150858
Macro Precision: 0.35993713456767645
Macro Recall: 0.32174184838795983
Accuracy: 0.37316176470588236


In [16]:
# Support vector classifier with a linear kernel on the TF-IDF features.
# `fit` returns the estimator itself, so it can be chained onto the constructor.
svc = SVC(kernel='linear').fit(X_train_tfidf, y_train)
y_pred = svc.predict(X_test_tfidf)

In [17]:


# Per-class precision/recall/F1 first, then the raw confusion counts.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Macro averages: the unweighted mean of the per-class scores.
macro_f1, macro_precision, macro_recall = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(y_test, y_pred)

# Summary lines, in the same order as before.
for caption, value in (
    ("Macro F1 Score:", macro_f1),
    ("Macro Precision:", macro_precision),
    ("Macro Recall:", macro_recall),
    ("Accuracy:", accuracy),
):
    print(caption, value)

Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.23      0.23        70
           1       0.11      0.08      0.09        51
           2       0.41      0.65      0.50       171
           3       0.31      0.27      0.29        75
           4       0.43      0.25      0.31       106
           5       0.19      0.09      0.12        46
           6       0.86      0.72      0.78        25

    accuracy                           0.37       544
   macro avg       0.36      0.33      0.33       544
weighted avg       0.35      0.37      0.34       544

Confusion Matrix:
 [[ 16   7  31   4   5   6   1]
 [ 11   4  26   7   2   1   0]
 [ 11  12 111  16  19   1   1]
 [ 15   4  28  20   5   3   0]
 [ 11   8  45   9  26   6   1]
 [  6   0  26   8   2   4   0]
 [  2   0   3   1   1   0  18]]
Macro F1 Score: 0.3318222439319295
Macro Precision: 0.36232339089481946
Macro Recall: 0.3250045450588162
Accuracy: 0.36580882352941174


In [24]:
# Random forest settings kept in one mapping so they are easy to scan and tweak.
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    bootstrap=False,  # every tree is fit on the full training set, not a bootstrap sample
    random_state=42,
)

rf = RandomForestClassifier(**rf_params).fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

In [25]:
# Per-class precision/recall/F1 first, then the raw confusion counts.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Macro averages: the unweighted mean of the per-class scores.
macro_f1, macro_precision, macro_recall = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(y_test, y_pred)

# Summary lines, in the same order as before.
for caption, value in (
    ("Macro F1 Score:", macro_f1),
    ("Macro Precision:", macro_precision),
    ("Macro Recall:", macro_recall),
    ("Accuracy:", accuracy),
):
    print(caption, value)

Classification Report:
               precision    recall  f1-score   support

           0       0.26      0.13      0.17        70
           1       0.33      0.10      0.15        51
           2       0.39      0.82      0.53       171
           3       0.42      0.24      0.31        75
           4       0.44      0.22      0.29       106
           5       0.13      0.04      0.07        46
           6       0.70      0.76      0.73        25

    accuracy                           0.40       544
   macro avg       0.38      0.33      0.32       544
weighted avg       0.38      0.40      0.34       544

Confusion Matrix:
 [[  9   1  48   5   4   2   1]
 [  6   5  30   3   4   3   0]
 [  3   2 140   7  12   3   4]
 [  5   4  38  18   7   2   1]
 [  4   3  66   6  23   3   1]
 [  5   0  33   4   1   2   1]
 [  3   0   2   0   1   0  19]]
Macro F1 Score: 0.3208305343966898
Macro Precision: 0.3829403476755441
Macro Recall: 0.32939764107073394
Accuracy: 0.39705882352941174


In [18]:
# Gradient-boosted trees on the TF-IDF features.
# The target has seven classes (ids 0-6), so the multiclass objective and the
# multiclass error metric are spelled out here; the previous
# 'binary:logistic' / 'error' pair describes a two-class problem.
# NOTE(review): `eval_metric` should only matter when an evaluation set is
# passed to `fit`, which this cell does not do; confirm against the installed
# xgboost version.
xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    eval_metric='merror',
    random_state=42
)

xgb.fit(X_train_tfidf, y_train)
y_pred = xgb.predict(X_test_tfidf)

In [19]:


# Per-class precision/recall/F1 first, then the raw confusion counts.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Macro averages: the unweighted mean of the per-class scores.
macro_f1, macro_precision, macro_recall = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (f1_score, precision_score, recall_score)
)
accuracy = accuracy_score(y_test, y_pred)

# Summary lines, in the same order as before.
for caption, value in (
    ("Macro F1 Score:", macro_f1),
    ("Macro Precision:", macro_precision),
    ("Macro Recall:", macro_recall),
    ("Accuracy:", accuracy),
):
    print(caption, value)

Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.06      0.07        70
           1       0.14      0.06      0.08        51
           2       0.38      0.67      0.48       171
           3       0.31      0.27      0.29        75
           4       0.42      0.21      0.28       106
           5       0.15      0.07      0.09        46
           6       0.51      0.80      0.62        25

    accuracy                           0.34       544
   macro avg       0.29      0.30      0.27       544
weighted avg       0.30      0.34      0.30       544

Confusion Matrix:
 [[  4   3  40   7   7   4   5]
 [  5   3  28  11   3   1   0]
 [  8   4 115  15  16   7   6]
 [ 10   6  35  20   1   2   1]
 [  9   5  53   9  22   3   5]
 [  4   1  30   2   4   3   2]
 [  0   0   5   0   0   0  20]]
Macro F1 Score: 0.27392968312577415
Macro Precision: 0.2860850688958347
Macro Recall: 0.30398746203142835
Accuracy: 0.34375
