In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
## Load the labelled data; the path is relative to the current working directory.
## Later cells read the 'text', 'label_sexist' and 'label_category' columns.
csv_path = 'train_all_tasks.csv'
df = pd.read_csv(csv_path)

In [3]:
## Data pre-processing: split the data into train and dev sets

In [4]:
# Both tasks use the same split recipe: 20% of the rows held out as the dev
# set, with a fixed seed so the split is reproducible.
split_options = {'test_size': 0.2, 'random_state': 10}

# Task A: text paired with the `label_sexist` column
xtrain_taskA, xdev_taskA, ytrain_taskA, ydev_taskA = train_test_split(
    df['text'], df['label_sexist'], **split_options
)

# Task B: text paired with the `label_category` column
xtrain_taskB, xdev_taskB, ytrain_taskB, ydev_taskB = train_test_split(
    df['text'], df['label_category'], **split_options
)

In [5]:
# Label counts per split: task A train/dev first, then task B train/dev.
for label_split in (ytrain_taskA, ydev_taskA, ytrain_taskB, ydev_taskB):
  print(Counter(label_split))

Counter({'not sexist': 8496, 'sexist': 2704})
Counter({'not sexist': 2106, 'sexist': 694})
Counter({'none': 8496, '2. derogation': 1279, '3. animosity': 912, '4. prejudiced discussions': 258, '1. threats, plans to harm and incitement': 255})
Counter({'none': 2106, '2. derogation': 311, '3. animosity': 253, '4. prejudiced discussions': 75, '1. threats, plans to harm and incitement': 55})


In [6]:
def svm_train(xtrain, ytrain, xdev, ydev, le):
  """Train a linear-kernel SVM on token counts and print a dev-set evaluation.

  Args:
    xtrain: iterable of raw training texts.
    ytrain: training labels, in the encoded form that `le.inverse_transform`
      can map back to the original labels.
    xdev: iterable of raw dev texts.
    ydev: dev labels in their original (un-encoded) form; the decoded
      predictions are compared against these.
    le: fitted label encoder; only its `inverse_transform` is used here.

  Returns:
    None. The classification report and confusion matrix are printed.
  """
  vec = CountVectorizer()

  # Learn the vocabulary from the training texts only. The optional second
  # argument of fit/fit_transform is the label slot `y`, which CountVectorizer
  # ignores, so the dev texts must not be passed there.
  xtrain_enc = vec.fit_transform(xtrain)
  xdev_enc = vec.transform(xdev)

  model = SVC(kernel='linear')
  model.fit(xtrain_enc, ytrain)

  # Decode predictions so they are comparable with the un-encoded `ydev`.
  pred = le.inverse_transform(model.predict(xdev_enc))

  print("Classification Report:\n{0}\nConfusion Matrix:\n{1}".format(classification_report(ydev, pred), confusion_matrix(ydev, pred)))

In [7]:
## Task A: binary classification (sexist vs. not sexist)

In [8]:
# Encode the Task A training labels as integers.
# The labels are re-read from `df` (rows selected by the training split's
# index) rather than from `ytrain_taskA` itself, so re-running this cell never
# fits the encoder on already-encoded values.
# Assumes `df` still has the unique row index it had when the split was made.
le = LabelEncoder()
ytrain_taskA = le.fit_transform(df.loc[xtrain_taskA.index, 'label_sexist'])

In [9]:
# Train and evaluate on the Task A split; `le` decodes predictions back to label strings.
svm_train(
    xtrain=xtrain_taskA,
    ytrain=ytrain_taskA,
    xdev=xdev_taskA,
    ydev=ydev_taskA,
    le=le,
)

Classification Report:
              precision    recall  f1-score   support

  not sexist       0.85      0.88      0.87      2106
      sexist       0.60      0.53      0.56       694

    accuracy                           0.80      2800
   macro avg       0.72      0.71      0.71      2800
weighted avg       0.79      0.80      0.79      2800

Confusion Matrix:
[[1857  249]
 [ 324  370]]


In [10]:
## Task B: multi-class classification (category of sexism, or none)

In [11]:
# Encode the Task B training labels as integers with a fresh encoder, so this
# cell does not depend on an encoder object created by an earlier cell.
# The labels are re-read from `df` (rows selected by the training split's
# index) rather than from `ytrain_taskB` itself, so re-running this cell never
# fits the encoder on already-encoded values.
# Assumes `df` still has the unique row index it had when the split was made.
le = LabelEncoder()
ytrain_taskB = le.fit_transform(df.loc[xtrain_taskB.index, 'label_category'])

In [12]:
# Train and evaluate on the Task B split; `le` decodes predictions back to label strings.
svm_train(
    xtrain=xtrain_taskB,
    ytrain=ytrain_taskB,
    xdev=xdev_taskB,
    ydev=ydev_taskB,
    le=le,
)

Classification Report:
                                          precision    recall  f1-score   support

1. threats, plans to harm and incitement       0.20      0.22      0.21        55
                           2. derogation       0.33      0.38      0.35       311
                            3. animosity       0.38      0.25      0.30       253
               4. prejudiced discussions       0.20      0.11      0.14        75
                                    none       0.86      0.89      0.87      2106

                                accuracy                           0.74      2800
                               macro avg       0.39      0.37      0.38      2800
                            weighted avg       0.73      0.74      0.73      2800

Confusion Matrix:
[[  12    6    4    1   32]
 [  10  117   37   11  136]
 [   7   80   64    4   98]
 [   2   16    6    8   43]
 [  30  132   56   17 1871]]
