In [136]:
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter

In [137]:
## Load the training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='train_all_tasks.csv')

In [138]:
## Data pre-processing

In [139]:
# Task A: texts paired with the binary 'label_sexist' column.
# 20% of the rows are held out as the dev set; the seed is fixed at 10.
xtrain_taskA, xdev_taskA, ytrain_taskA, ydev_taskA = train_test_split(
    df['text'],
    df['label_sexist'],
    test_size=0.2,
    random_state=10,
)

# Task B: the same texts paired with the 'label_category' column,
# split with the same test_size and random_state as task A.
xtrain_taskB, xdev_taskB, ytrain_taskB, ydev_taskB = train_test_split(
    df['text'],
    df['label_category'],
    test_size=0.2,
    random_state=10,
)

In [140]:
# Label distribution of each split, one Counter per line:
# task A train, task A dev, task B train, task B dev.
print(
    *map(Counter, (ytrain_taskA, ydev_taskA, ytrain_taskB, ydev_taskB)),
    sep="\n",
)

Counter({'not sexist': 8496, 'sexist': 2704})
Counter({'not sexist': 2106, 'sexist': 694})
Counter({'none': 8496, '2. derogation': 1279, '3. animosity': 912, '4. prejudiced discussions': 258, '1. threats, plans to harm and incitement': 255})
Counter({'none': 2106, '2. derogation': 311, '3. animosity': 253, '4. prejudiced discussions': 75, '1. threats, plans to harm and incitement': 55})


In [141]:
def encode_labels_taskA(labels):
    """Encode task A string labels as integers.

    Mapping: "not sexist" -> 0, "sexist" -> 1.

    Args:
        labels: Iterable of label strings.

    Returns:
        np.ndarray of ints with exactly one entry per input label, in
        input order.

    Raises:
        ValueError: If a label is not one of the two known strings.
            Skipping such a label would make the result shorter than the
            input and misalign it with the corresponding texts.
    """
    label_to_id = {"not sexist": 0, "sexist": 1}
    encoded = []
    for label in labels:
        if label not in label_to_id:
            # Fail loudly: this also catches accidentally encoding labels
            # that have already been converted to integers.
            raise ValueError(f"Unknown task A label: {label!r}")
        encoded.append(label_to_id[label])
    return np.array(encoded, dtype=int)

In [142]:
def encode_labels_taskB(labels):
    """Encode task B category labels as integers.

    Mapping:
        "none"                                      -> 0
        "1. threats, plans to harm and incitement"  -> 1
        "2. derogation"                             -> 2
        "3. animosity"                              -> 3
        "4. prejudiced discussions"                 -> 4

    Args:
        labels: Iterable of label strings.

    Returns:
        np.ndarray of ints with exactly one entry per input label, in
        input order.

    Raises:
        ValueError: If a label is not one of the five known strings.
            Skipping such a label would make the result shorter than the
            input and misalign it with the corresponding texts.
    """
    label_to_id = {
        "none": 0,
        "1. threats, plans to harm and incitement": 1,
        "2. derogation": 2,
        "3. animosity": 3,
        "4. prejudiced discussions": 4,
    }
    encoded = []
    for label in labels:
        if label not in label_to_id:
            # Fail loudly: this also catches accidentally encoding labels
            # that have already been converted to integers.
            raise ValueError(f"Unknown task B label: {label!r}")
        encoded.append(label_to_id[label])
    return np.array(encoded, dtype=int)

In [143]:
def svm_train(xtrain, ytrain, xdev, ydev):
    """Train a linear-kernel SVM on bag-of-words counts and print dev metrics.

    Args:
        xtrain: Training documents passed to CountVectorizer.
        ytrain: Training labels, one per document in xtrain.
        xdev: Development documents to predict on.
        ydev: Gold labels for xdev; used only for the printed report.

    Returns:
        None. The classification report and confusion matrix for the dev
        predictions are printed.
    """
    model = SVC(kernel='linear')
    vec = CountVectorizer()

    # Learn the vocabulary from the training texts only; the dev texts are
    # transformed with that vocabulary and never fitted on. (The second
    # positional argument of CountVectorizer.fit is the ignored `y`, so dev
    # texts must not be passed there.)
    xtrain_enc = vec.fit_transform(xtrain)
    xdev_enc = vec.transform(xdev)

    model.fit(xtrain_enc, ytrain)
    pred = model.predict(xdev_enc)
    print("Classification Report:\n{0}\nConfusion Matrix:\n{1}".format(classification_report(ydev, pred), confusion_matrix(ydev, pred)))

In [2]:
## Running Task A (Binary) 

In [144]:
# Encode from the untouched DataFrame column (rows selected by the split's
# index) rather than re-encoding ytrain_taskA / ydev_taskA in place, so that
# re-running this cell gives the same result instead of trying to encode
# values that are already integers.
# NOTE(review): relies on train_test_split keeping df's row index on
# xtrain_taskA / xdev_taskA and on that index being unique — confirm.
ytrain_taskA = encode_labels_taskA(df.loc[xtrain_taskA.index, 'label_sexist'])
ydev_taskA = encode_labels_taskA(df.loc[xdev_taskA.index, 'label_sexist'])

In [145]:
# Fit on the task A training split and report metrics on the task A dev split.
svm_train(
    xtrain=xtrain_taskA,
    ytrain=ytrain_taskA,
    xdev=xdev_taskA,
    ydev=ydev_taskA,
)

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      2106
           1       0.60      0.53      0.56       694

    accuracy                           0.80      2800
   macro avg       0.72      0.71      0.71      2800
weighted avg       0.79      0.80      0.79      2800

Confusion Matrix:
[[1857  249]
 [ 324  370]]


In [51]:
## Running Task B (Multi-class) 

In [146]:
# Encode from the untouched DataFrame column (rows selected by the split's
# index) rather than re-encoding ytrain_taskB / ydev_taskB in place, so that
# re-running this cell gives the same result instead of trying to encode
# values that are already integers.
# NOTE(review): relies on train_test_split keeping df's row index on
# xtrain_taskB / xdev_taskB and on that index being unique — confirm.
ytrain_taskB = encode_labels_taskB(df.loc[xtrain_taskB.index, 'label_category'])
ydev_taskB = encode_labels_taskB(df.loc[xdev_taskB.index, 'label_category'])

In [147]:
# Fit on the task B training split and report metrics on the task B dev split.
svm_train(
    xtrain=xtrain_taskB,
    ytrain=ytrain_taskB,
    xdev=xdev_taskB,
    ydev=ydev_taskB,
)

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      2106
           1       0.21      0.16      0.18        55
           2       0.36      0.34      0.35       311
           3       0.40      0.25      0.30       253
           4       0.22      0.11      0.14        75

    accuracy                           0.75      2800
   macro avg       0.41      0.35      0.37      2800
weighted avg       0.72      0.75      0.74      2800

Confusion Matrix:
[[1926   18  101   47   14]
 [  35    9    6    4    1]
 [ 147   10  107   36   11]
 [ 110    6   72   62    3]
 [  50    0   11    6    8]]
