In [65]:
import csv
import os
from pathlib import Path
import pickle
import zipfile

import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

## Reading the dataset

In [66]:
# Corpus locations, given relative to the notebook's working directory.
# Both are passed to read_subtask1_corpus, which accepts either a directory
# of files or a single file.
train_path = '../data/deft_split/subtask1_raw/train_dev/'
test_path = '../data/deft_split/subtask1_raw/test/'

In [67]:
def read_subtask1_file(file_path):
    """Read one tab-separated subtask 1 file into parallel lists.

    Every non-empty row must hold the sentence in its first column and an
    integer label in its second column; any further columns are ignored.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Location of the file to read.

    Returns
    -------
    tuple of (list of str, list of int)
        The sentences and their labels, in file order.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    ValueError
        If a label cannot be parsed as an integer.
    """
    sentences = []
    labels = []
    # newline='' hands line-ending handling to the csv module (needed for
    # quoted fields that contain line breaks); the explicit encoding keeps
    # decoding independent of the platform's default locale.
    with Path(file_path).open(newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quotechar='"')
        for row in reader:
            if not row:
                # Blank lines are parsed as empty rows; skip them instead of
                # failing on row[0].
                continue
            sentences.append(row[0])
            labels.append(int(row[1]))
    return sentences, labels
    
def read_subtask1_corpus(file_path):
    """Load sentences and labels from a single file or a directory of files.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Either one subtask 1 file, or a directory whose files are all read
        with read_subtask1_file and concatenated.

    Returns
    -------
    tuple of (list of str, list of int)
        All sentences and their labels. For a directory, files are visited
        in sorted path order so the result is the same on every run.
    """
    input_path = Path(file_path)
    if not input_path.is_dir():
        print('loading file')
        return read_subtask1_file(input_path)

    corpus = []
    labels = []
    # iterdir() yields entries in arbitrary order; sorting keeps the corpus
    # (and therefore anything trained on it) reproducible across machines.
    for file_name in sorted(input_path.iterdir()):
        if not file_name.is_file():
            # Skip sub-directories (e.g. a checkpoint folder) that cannot be
            # opened as data files.
            continue
        sentences, sent_labels = read_subtask1_file(file_name)
        corpus.extend(sentences)
        labels.extend(sent_labels)
    return corpus, labels

In [68]:
# Read the data under train_path into parallel lists: sentences and labels.
corpus, Y_train = read_subtask1_corpus(train_path)

In [69]:
# Number of labels loaded from train_path (one is appended per sentence).
len(Y_train)

16659

In [70]:
# Read the data under test_path into parallel lists: sentences and labels.
test_corpus, Y_test = read_subtask1_corpus(test_path)

In [71]:
# Number of labels loaded from test_path (one is appended per sentence).
len(Y_test)

810

## Test SVM

In [72]:
def evaluate_setup(pipeline, fit=True):
    """Score a pipeline on the test corpus and print the results.

    Reads the notebook-level variables ``corpus``, ``Y_train``,
    ``test_corpus`` and ``Y_test``; they must be defined before the call.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    fit : bool, default True
        When True the pipeline is (re-)fitted on ``corpus``/``Y_train``
        before predicting. Pass False to score an already trained pipeline
        (for example one restored from disk) without modifying it.

    Returns
    -------
    The predictions returned by ``pipeline.predict(test_corpus)``.
    """
    if fit:
        pipeline.fit(corpus, Y_train)
    Y_test_pred = pipeline.predict(test_corpus)
    # Per-class precision/recall/F1 followed by the confusion matrix.
    print(metrics.classification_report(Y_test, Y_test_pred))
    print(pd.DataFrame(metrics.confusion_matrix(Y_test, Y_test_pred)))
    return Y_test_pred

In [73]:
# Baseline: case-sensitive word 1- to 3-gram counts, term-frequency scaling
# with IDF weighting switched off, and a linear SVM on top.
# The step name 'tidf' is kept as-is because it is part of the pipeline's
# parameter names (e.g. 'tidf__use_idf').
pipeline = Pipeline([
    ('vect', CountVectorizer(lowercase=False, ngram_range=(1, 3))),
    ('tidf', TfidfTransformer(use_idf=False)),
    # Fixed seed so that repeated runs train the same model.
    ('clf', LinearSVC(random_state=0))
])

In [74]:
# Fit the pipeline on the training corpus, print the test-set report and keep
# the test predictions.
Y_test_pred = evaluate_setup(pipeline)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       537
           1       0.71      0.71      0.71       273

    accuracy                           0.80       810
   macro avg       0.78      0.78      0.78       810
weighted avg       0.80      0.80      0.80       810

     0    1
0  459   78
1   80  193


## Write model and predictions

In [75]:
# Files to predict on, and the directory that receives the outputs.
predictions_input_path = '../data/deft_split/subtask1_raw/test/'
results_path = '../data/results/subtask1__svm_baseline__dev/'
# parents=True creates '../data/results' on a fresh checkout.
# exist_ok=False is deliberate: the cell stops with FileExistsError rather
# than writing into (and overwriting) the results of an earlier run.
Path(results_path).mkdir(parents=True, exist_ok=False)

In [76]:
# Predict every input file and write '<sentence>\t<label>' rows to a file
# named 'task_1_<input name>' inside results_path.
# Entries are sorted for a stable processing order; non-files (e.g. a
# checkpoint folder) are skipped because they cannot be read as data.
for input_file in sorted(Path(predictions_input_path).iterdir()):
    if not input_file.is_file():
        continue
    text, _ = read_subtask1_file(input_file)
    pred_Y = pipeline.predict(text)
    output_file_name = "task_1_" + input_file.name
    output_file = os.path.join(results_path, output_file_name)
    # newline='' is required by the csv module so that it alone controls the
    # line terminator; the explicit encoding avoids locale-dependent output.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerows(zip(text, pred_Y))

In [77]:
# Pack every '.deft' prediction file from results_path into one archive,
# stored flat under its bare file name. The prediction files themselves are
# left on disk.
with zipfile.ZipFile(Path(results_path, "task_1_svm_submission.zip"), 'w') as zf:
    for pred_file in Path(results_path).iterdir():
        if pred_file.suffix != '.deft':
            # Anything else in the directory, including the archive being
            # written, stays out of the submission.
            continue
        zf.write(pred_file, arcname=pred_file.name)

In [78]:
# Persist the fitted pipeline next to the predictions.
model_file = os.path.join(results_path, 'svm_pipeline.pickle')
with Path(model_file).open('wb') as f:
    pickle.dump(pipeline, f)

## Test model deserialization

In [15]:
# NOTE: pickle.load executes arbitrary code; only load files you wrote yourself.
with open(model_file, 'rb') as f:
    loaded_pipeline = pickle.load(f)

# Score the restored model exactly as it was loaded. Calling
# evaluate_setup(loaded_pipeline) would fit the pipeline again before
# predicting, so matching numbers would say nothing about the pickle.
Y_test_loaded = loaded_pipeline.predict(test_corpus)
assert (Y_test_loaded == Y_test_pred).all(), \
    "deserialized pipeline disagrees with the in-memory pipeline"
print(metrics.classification_report(Y_test, Y_test_loaded))
print(pd.DataFrame(metrics.confusion_matrix(Y_test, Y_test_loaded)))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       537
           1       0.71      0.71      0.71       273

    accuracy                           0.80       810
   macro avg       0.78      0.78      0.78       810
weighted avg       0.80      0.80      0.80       810

     0    1
0  459   78
1   80  193
