In [1]:
from scipy.sparse import dok_matrix
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from stemming.porter2 import stem
import pandas as pd
import numpy as np
import csv
import re

In [2]:
def read_stop_words(file_name):
    """Load a stop-word list from a text file, one entry per line.

    Args:
        file_name: Path of the file to read. It is decoded as UTF-8 and a
            leading byte-order mark, if present, is discarded.

    Returns:
        A list with one string per line of the file, in file order, with the
        newline character removed. Blank lines are kept as empty strings.
    """
    with open(file_name, 'r', encoding='UTF-8-sig') as handle:
        # Only the newline is removed; any other surrounding whitespace stays.
        return [raw_line.replace('\n', '') for raw_line in handle]

In [3]:
def expr_preprocess_text(text, stop_words, stop=False, stem=False):
    """Split ``text`` into lowercase alphanumeric tokens, optionally filtered and stemmed.

    Args:
        text: The raw string to tokenise.
        stop_words: Iterable of tokens to drop when ``stop`` is true.
        stop: When true, remove every token that appears in ``stop_words``.
        stem: When true, replace every remaining token by its Porter2 stem.

    Returns:
        The list of processed tokens, in order of appearance.
    """
    tokens = [match.lower() for match in re.findall(r'[a-zA-Z0-9]+', text, re.I)]
    if stop:
        # A set gives constant-time membership tests instead of scanning a list
        # once per token.
        blocked = set(stop_words)
        tokens = [token for token in tokens if token not in blocked]
    if stem:
        # The boolean parameter ``stem`` hides the module-level Porter2 ``stem``
        # function inside this scope, so ``stem(token)`` would try to call the
        # flag itself and raise TypeError. Import the stemmer under another name.
        from stemming.porter2 import stem as porter_stem
        tokens = [porter_stem(token) for token in tokens]
    return tokens

In [4]:
def make_dataset(file_name, data_type, stop_words, stop=False, stem=False):
    """Read a two-column, tab-separated corpus file and tokenise every document.

    The file is read without a header row; column 0 is used as the label and
    column 1 as the document text.

    Args:
        file_name: Path of the TSV file.
        data_type: ``'train'`` additionally returns the vocabulary and label
            mappings; any other value returns only the tokens and labels.
        stop_words: Stop-word collection forwarded to ``expr_preprocess_text``.
        stop: Forwarded to ``expr_preprocess_text``.
        stem: Forwarded to ``expr_preprocess_text``.

    Returns:
        ``(data, label)`` where ``data`` is a list of token lists and ``label``
        the list of column-0 values, or, when ``data_type == 'train'``,
        ``(data, label, token_dict, label_dict)`` where ``token_dict`` maps
        each distinct token to a column index and ``label_dict`` is the fixed
        mapping ``{'Quran': 0, 'OT': 1, 'NT': 2}``.
    """
    # NOTE(review): read_csv uses its default quote handling here; if the text
    # column can contain unbalanced double quotes, rows may be merged -- confirm
    # against the data files.
    frame = pd.read_csv(file_name, sep='\t', header=None)
    data, label = [], []
    # Iterate the two columns directly rather than building a row Series per index.
    for corpora_name, text in zip(frame[0], frame[1]):
        data.append(expr_preprocess_text(text, stop_words, stop=stop, stem=stem))
        label.append(corpora_name)
    if data_type == 'train':
        # Sort the vocabulary so each token gets the same column index on every
        # run; iteration order of a set of strings is not stable across
        # interpreter sessions.
        unique_tokens = sorted({token for line in data for token in line})
        token_dict = {token: index for index, token in enumerate(unique_tokens)}
        label_dict = {'Quran': 0, 'OT': 1, 'NT': 2}
        return data, label, token_dict, label_dict
    return data, label

In [5]:
def encode_data_label_bow(data, label, token_dict, label_dict):
    """Encode tokenised documents as a sparse term-count matrix plus integer labels.

    Args:
        data: List of token lists, one per document.
        label: List of label names, aligned with ``data``.
        token_dict: Mapping from token to column index. Tokens missing from it
            are ignored.
        label_dict: Mapping from label name to integer class id.

    Returns:
        ``(encoded_data, encoded_label)``: a ``dok_matrix`` of shape
        ``(len(data), len(token_dict))`` holding per-document token counts, and
        the list of integer class ids.
    """
    n_docs, n_features = len(data), len(token_dict)
    counts_matrix = dok_matrix((n_docs, n_features))
    for row, tokens in enumerate(data):
        # Unknown tokens are mapped to the sentinel -1 and dropped before counting.
        columns = (token_dict.get(token, -1) for token in tokens)
        column_counts = Counter(col for col in columns if col != -1)
        for col, count in column_counts.items():
            counts_matrix[row, col] = count
    class_ids = [label_dict[name] for name in label]
    return counts_matrix, class_ids

In [6]:
def encode_data_label_tfidf(vectorizer, data, label, label_dict):
    """Encode tokenised documents with an already-fitted vectorizer.

    Args:
        vectorizer: Object exposing ``transform(list_of_strings)``.
        data: List of token lists; each is re-joined with single spaces before
            being handed to the vectorizer.
        label: List of label names, aligned with ``data``.
        label_dict: Mapping from label name to integer class id.

    Returns:
        ``(encoded_data, encoded_label)``: whatever ``vectorizer.transform``
        returns for the joined documents, and the list of integer class ids.
    """
    documents = [' '.join(tokens) for tokens in data]
    features = vectorizer.transform(documents)
    class_ids = [label_dict[name] for name in label]
    return features, class_ids

In [7]:
def three_wrong_prediction_samples(X_dev_text, label_dict, y_dev, X_dev_pred):
    """Write one short misclassified example per class to ``wrong_samples.txt``.

    For each of the class ids 0, 1 and 2, the first misclassified document
    with at most six tokens is recorded together with the name of the class
    that was predicted for it. Classes for which no such document exists are
    written with ``None`` for both the sample and the prediction.

    Args:
        X_dev_text: List of token lists, aligned with ``y_dev``.
        label_dict: Mapping from label name to integer class id; it is inverted
            to turn predicted ids back into names.
        y_dev: True class ids.
        X_dev_pred: Predicted class ids.

    Returns:
        None. The result is written to ``wrong_samples.txt`` in the current
        working directory, replacing any existing file of that name.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}
    class_ids = (0, 1, 2)
    found_sample = dict.fromkeys(class_ids)
    found_pred = dict.fromkeys(class_ids)
    for position, (truth, pred) in enumerate(zip(y_dev, X_dev_pred)):
        if truth != pred:
            for class_id in class_ids:
                still_missing = found_pred[class_id] is None
                # Only documents of six tokens or fewer are accepted as examples.
                if truth == class_id and still_missing and len(X_dev_text[position]) <= 6:
                    found_sample[class_id] = X_dev_text[position]
                    found_pred[class_id] = id_to_name[pred]
        # Stop scanning once every class has an example.
        if all(found_pred.values()):
            break
    wrong_quran_sample, quran_pred = found_sample[0], found_pred[0]
    wrong_ot_sample, ot_pred = found_sample[1], found_pred[1]
    wrong_nt_sample, nt_pred = found_sample[2], found_pred[2]
    with open('wrong_samples.txt', 'w') as f:
        f.writelines([
            f'wrong_sample: {wrong_quran_sample}, label: Quran, pred: {quran_pred}\n',
            f'wrong_sample: {wrong_ot_sample}, label: OT, pred: {ot_pred}\n',
            f'wrong_sample: {wrong_nt_sample}, label: NT, pred: {nt_pred}\n',
        ])
    return

In [8]:
def prepare_train_val_test_dataset(stop=False, stem=False):
    """Build bag-of-words train/dev/test splits for the baseline model.

    Reads ``./englishST.txt``, ``./train_and_dev.tsv`` and ``./test.tsv``. The
    train/dev file is split 80/20, stratified by label, with a fixed random
    state. The vocabulary is built from the whole train/dev file before the
    split.

    Args:
        stop: Forwarded to ``make_dataset`` (stop-word removal).
        stem: Forwarded to ``make_dataset`` (stemming).

    Returns:
        A dict with the encoded splits (``X_train``, ``y_train``, ``X_dev``,
        ``y_dev``, ``X_test``, ``y_test``), the tokenised dev documents
        (``X_dev_text``) and the label mapping (``label_dict``).
    """
    stop_words = read_stop_words('./englishST.txt')
    all_tokens, all_labels, token_dict, label_dict = make_dataset(
        './train_and_dev.tsv', 'train', stop_words, stop=stop, stem=stem)
    train_tokens, dev_tokens, train_labels, dev_labels = train_test_split(
        all_tokens, all_labels, test_size=0.2, stratify=all_labels, random_state=42)

    def encode(tokens, labels):
        # Every split is encoded against the same vocabulary and label mapping.
        return encode_data_label_bow(tokens, labels, token_dict, label_dict)

    X_train, y_train = encode(train_tokens, train_labels)
    X_dev, y_dev = encode(dev_tokens, dev_labels)
    test_tokens, test_labels = make_dataset(
        './test.tsv', 'test', stop_words, stop=stop, stem=stem)
    X_test, y_test = encode(test_tokens, test_labels)
    return {
        'X_train': X_train, 'y_train': y_train,
        'X_dev': X_dev, 'y_dev': y_dev,
        'X_test': X_test, 'y_test': y_test,
        'X_dev_text': dev_tokens,
        'label_dict': label_dict,
    }

In [9]:
def prepare_improved_train_val_test_dataset(stop=False, stem=False):
    """Build TF-IDF train/dev/test splits for the improved model.

    Reads ``./englishST.txt``, ``./train_and_dev.tsv`` and ``./test.tsv``. The
    train/dev file is split 80/20, stratified by label, with a fixed random
    state. A ``TfidfVectorizer`` with default settings is fitted on the
    training portion only and then applied to all three splits.

    Args:
        stop: Forwarded to ``make_dataset`` (stop-word removal).
        stem: Forwarded to ``make_dataset`` (stemming).

    Returns:
        A dict with the encoded splits (``X_train``, ``y_train``, ``X_dev``,
        ``y_dev``, ``X_test``, ``y_test``), the tokenised dev documents
        (``X_dev_text``) and the label mapping (``label_dict``).
    """
    stop_words = read_stop_words('./englishST.txt')
    # The token index returned by make_dataset is not needed here: the
    # vectorizer builds its own vocabulary.
    all_tokens, all_labels, _, label_dict = make_dataset(
        './train_and_dev.tsv', 'train', stop_words, stop=stop, stem=stem)
    train_tokens, dev_tokens, train_labels, dev_labels = train_test_split(
        all_tokens, all_labels, test_size=0.2, stratify=all_labels, random_state=42)
    vectorizer = TfidfVectorizer()
    train_documents = [' '.join(tokens) for tokens in train_tokens]
    vectorizer.fit(train_documents)

    def encode(tokens, labels):
        # Every split goes through the same fitted vectorizer and label mapping.
        return encode_data_label_tfidf(vectorizer, tokens, labels, label_dict)

    X_train, y_train = encode(train_tokens, train_labels)
    X_dev, y_dev = encode(dev_tokens, dev_labels)
    test_tokens, test_labels = make_dataset(
        './test.tsv', 'test', stop_words, stop=stop, stem=stem)
    X_test, y_test = encode(test_tokens, test_labels)
    return {
        'X_train': X_train, 'y_train': y_train,
        'X_dev': X_dev, 'y_dev': y_dev,
        'X_test': X_test, 'y_test': y_test,
        'X_dev_text': dev_tokens,
        'label_dict': label_dict,
    }

In [10]:
def train_experiment_model(tfidf=False, stop=False, stem=False, param=False):
    """Train a ``LinearSVC`` on the chosen features and report per-split metrics.

    As a side effect, short misclassified dev examples are written to
    ``wrong_samples.txt`` through ``three_wrong_prediction_samples``; every
    call replaces that file.

    Args:
        tfidf: Truthy selects the TF-IDF dataset builder, falsy the
            bag-of-words builder.
        stop: Forwarded to the dataset builder (stop-word removal).
        stem: Forwarded to the dataset builder (stemming).
        param: Truthy trains with ``C=1.0``; falsy trains with ``C=1000``.

    Returns:
        ``(train_report, val_report, test_report)``: the dictionaries produced
        by ``classification_report(..., output_dict=True)`` for each split.
    """
    # Branch on truthiness with an explicit else. The previous pair of
    # ``is True`` / ``is False`` identity checks left ``dataset`` unassigned
    # for values such as 1, 0, None or numpy booleans.
    if tfidf:
        dataset = prepare_improved_train_val_test_dataset(stop=stop, stem=stem)
    else:
        dataset = prepare_train_val_test_dataset(stop=stop, stem=stem)
    C = 1.0 if param else 1000
    model = LinearSVC(C=C, random_state=42)
    model.fit(dataset['X_train'], dataset['y_train'])
    X_train_pred = model.predict(dataset['X_train'])
    X_dev_pred = model.predict(dataset['X_dev'])
    X_test_pred = model.predict(dataset['X_test'])
    three_wrong_prediction_samples(dataset['X_dev_text'], dataset['label_dict'],
                                   dataset['y_dev'], X_dev_pred)
    # Order the display names by class id so that each name lines up with its
    # label regardless of the insertion order of ``label_dict``.
    label_dict = dataset['label_dict']
    target_names = sorted(label_dict, key=label_dict.get)
    train_report = classification_report(
        dataset['y_train'], X_train_pred, target_names=target_names, output_dict=True)
    val_report = classification_report(
        dataset['y_dev'], X_dev_pred, target_names=target_names, output_dict=True)
    test_report = classification_report(
        dataset['y_test'], X_test_pred, target_names=target_names, output_dict=True)
    return train_report, val_report, test_report

In [11]:
# Baseline configuration: bag-of-words features, no stop-word removal, no
# stemming; param=False makes train_experiment_model use C=1000.
params = dict(tfidf=False, stop=False, stem=False, param=False)
base_train_report, base_val_report, base_test_report = train_experiment_model(**params)



In [12]:
# Improved configuration: TF-IDF features, no stop-word removal, no stemming;
# param=True makes train_experiment_model use C=1.0.
params = dict(tfidf=True, stop=False, stem=False, param=True)
improv_train_report, improv_val_report, improv_test_report = train_experiment_model(**params)

In [13]:
def read_metrics_from_report(report):
    """Flatten a classification report into twelve formatted metric strings.

    Args:
        report: Nested mapping with the keys ``'Quran'``, ``'OT'``, ``'NT'``
            and ``'macro avg'``, each holding ``'precision'``, ``'recall'``
            and ``'f1-score'`` values.

    Returns:
        A list of strings formatted to three decimal places, ordered as
        precision, recall, f1-score for Quran, then OT, NT and the macro
        average.
    """
    row_keys = ('Quran', 'OT', 'NT', 'macro avg')
    score_keys = ('precision', 'recall', 'f1-score')
    return [
        '{:.3f}'.format(report[row][score])
        for row in row_keys
        for score in score_keys
    ]

In [14]:
def write_classification_csv(base_train_report, base_val_report, base_test_report,
    improv_train_report, improv_val_report, improv_test_report):
    """Write the metrics of both systems on all three splits to ``classification.csv``.

    The file gets a header row followed by six data rows: baseline
    train/dev/test, then improved train/dev/test. Each data row holds the
    system name, the split name and the values returned by
    ``read_metrics_from_report`` for that report.

    Args:
        base_train_report: Baseline report for the training split.
        base_val_report: Baseline report for the dev split.
        base_test_report: Baseline report for the test split.
        improv_train_report: Improved-system report for the training split.
        improv_val_report: Improved-system report for the dev split.
        improv_test_report: Improved-system report for the test split.

    Returns:
        None. ``classification.csv`` in the current working directory is
        created or replaced.
    """
    header = ['system', 'split', 'p-quran', 'r-quran', 'f-quran',
              'p-ot', 'r-ot', 'f-ot', 'p-nt', 'r-nt', 'f-nt',
              'p-macro', 'r-macro', 'f-macro']
    labelled_reports = [
        ('baseline', 'train', base_train_report),
        ('baseline', 'dev', base_val_report),
        ('baseline', 'test', base_test_report),
        ('improved', 'train', improv_train_report),
        ('improved', 'dev', improv_val_report),
        ('improved', 'test', improv_test_report),
    ]
    with open('classification.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for system, split, report in labelled_reports:
            writer.writerow([system, split] + read_metrics_from_report(report))
    return

In [15]:
# Persist the metrics of both runs to classification.csv.
write_classification_csv(
    base_train_report=base_train_report,
    base_val_report=base_val_report,
    base_test_report=base_test_report,
    improv_train_report=improv_train_report,
    improv_val_report=improv_val_report,
    improv_test_report=improv_test_report,
)