In [1]:
import csv
from pathlib import Path

import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

## Reading the dataset

In [2]:
# Directories holding the subtask 1 training and development splits.
# Both paths are relative, so they resolve against the kernel's working directory.
train_dir_path = '../data/subtask1/train'
dev_dir_path = '../data/subtask1/dev'

In [3]:
def read_subtask1_corpus(file_path):
    """Load every tab-separated file in a directory into parallel lists.

    Each non-empty row must hold the text in its first column and an integer
    label in its second column. Fields may be quoted with double quotes.

    Args:
        file_path: Directory (``str`` or ``Path``) containing the TSV files.

    Returns:
        A ``(corpus, labels)`` tuple: ``corpus`` is the list of texts with
        surrounding whitespace stripped, ``labels`` the matching list of
        ``int`` labels. Files are read in sorted path order.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
        ValueError: If a label cannot be converted to ``int``.
    """
    corpus = []
    labels = []
    # iterdir() yields entries in arbitrary order; sorting keeps the example
    # order (and anything order-sensitive downstream) stable across runs.
    for file in sorted(Path(file_path).iterdir()):
        # Skip subdirectories such as Jupyter's ``.ipynb_checkpoints``, which
        # would otherwise make ``open`` fail.
        if not file.is_file():
            continue
        # newline='' lets the csv module handle line endings (including
        # newlines inside quoted fields) itself; the explicit encoding avoids
        # depending on the platform default.
        with file.open(encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quotechar='"')
            for row in reader:
                # csv.reader returns [] for blank lines (e.g. trailing ones).
                if not row:
                    continue
                corpus.append(row[0].strip())
                labels.append(int(row[1]))
    return corpus, labels

In [4]:
# Load the training split: texts go to `corpus`, integer labels to `Y_train`.
corpus, Y_train = read_subtask1_corpus(train_dir_path)

In [5]:
# Sanity check: number of training examples loaded (one label per example).
len(Y_train)

16659

In [6]:
# Load the development split: texts go to `dev_corpus`, integer labels to `Y_dev`.
dev_corpus, Y_dev = read_subtask1_corpus(dev_dir_path)

In [7]:
# Sanity check: number of development examples loaded (one label per example).
len(Y_dev)

810

## Evaluate a linear SVM baseline on the dev set

In [8]:
def evaluate_setup(pipeline):
    """Train ``pipeline`` on the training split and score it on the dev split.

    Relies on the notebook-level variables ``corpus``/``Y_train`` (training
    texts and labels) and ``dev_corpus``/``Y_dev`` (development texts and
    labels), so those cells must have been run first.

    Args:
        pipeline: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
            It is fitted in place.

    Returns:
        The dev-set confusion matrix as a ``pandas.DataFrame`` (rows are the
        true labels, columns the predicted labels). The classification
        report is printed as a side effect.
    """
    pipeline.fit(corpus, Y_train)
    predictions = pipeline.predict(dev_corpus)

    report = metrics.classification_report(Y_dev, predictions)
    print(report)

    confusion = metrics.confusion_matrix(Y_dev, predictions)
    return pd.DataFrame(confusion)

In [9]:
# Baseline: case-sensitive word 1- to 3-gram counts, normalised term
# frequencies (no IDF weighting), and a linear SVM classifier.
evaluate_setup(
    Pipeline([
        ('vect', CountVectorizer(lowercase=False, ngram_range=(1,3))),
        # NOTE(review): the step name 'tidf' looks like a typo for 'tfidf';
        # it is kept unchanged so parameter names derived from it stay the same.
        ('tidf', TfidfTransformer(use_idf=False)),
        # Fixed seed: LinearSVC shuffles the data in its dual solver, so
        # without it the reported scores can differ between runs.
        ('clf', LinearSVC(random_state=0))
    ])
)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       537
           1       0.71      0.71      0.71       273

    accuracy                           0.80       810
   macro avg       0.78      0.78      0.78       810
weighted avg       0.80      0.80      0.80       810



Unnamed: 0,0,1
0,459,78
1,80,193
