This reproduction study requires that you have obtained a copy of the [ETS Corpus of Non-Native Written English](https://catalog.ldc.upenn.edu/LDC2014T06).
Next, install the dependencies by running `pip3 install -r requirements.txt`.
Then execute this notebook.

In [6]:
import requests
import os
from nltk import download

# NLTK downloads (NLTK reports "already up-to-date" when the packages are present)
download('punkt')
download('averaged_perceptron_tagger')

# Download the onix stopwords corpus (http://www.lextek.com/manuals/onix/stopwords1.html)
# into the current working directory, unless a copy is already there.
onix_path = os.path.join(os.getcwd(), 'onix.txt')
if not os.path.exists(onix_path):
    print("Downloading to " + onix_path)
    url = 'https://raw.githubusercontent.com/igorbrigadir/stopwords/master/en/onix.txt'
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Fail loudly on HTTP errors; otherwise an error page would be saved as the
    # stopword list and the existence check above would accept it on every later run.
    r.raise_for_status()
    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated onix.txt behind.
    tmp_path = onix_path + '.part'
    with open(tmp_path, 'wb') as f:
        f.write(r.content)
    os.replace(tmp_path, onix_path)

Downloading to /home/fohlen/Documents/Bachelor/koppl-reproduction/onix.txt


[nltk_data] Downloading package punkt to /home/fohlen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fohlen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Load datasets and vectorize test and training data.

In [23]:
from KopplVectorizer import KopplVectorizer
from datasets import load_dataset
import numpy as np

# Directory holding the corpus texts. It can be overridden through the
# TOEFL11_DATA_DIR environment variable; the fallback is the path used originally.
# (`os` is imported by the setup cell at the top of this notebook.)
data_dir = os.environ.get(
    "TOEFL11_DATA_DIR",
    "/home/fohlen/Documents/Bachelor/Data/ETS_Corpus_of_Non-Native_Written_English_LDC2014T06/ETS_Corpus_of_Non-Native_Written_English/data/text",
)

# Requesting a list of splits yields the datasets in the same order.
dataset = load_dataset('toefl11.py',
    data_dir=data_dir,
    split=['train', 'test']
)

train = dataset[0]
test = dataset[1]

# Sort the label set: plain set iteration order depends on string hashing and can
# change between kernel restarts, which would silently change the class indices.
labels = sorted(set(train["Language"]))
# Constant-time lookup instead of a linear `labels.index` scan per sample.
label_to_index = {language: index for index, language in enumerate(labels)}

# Fit the vectorizer on the training texts only, then reuse it for the test texts.
vectorizer = KopplVectorizer()
X_train = vectorizer.fit_transform(train["Text"])
y_train = np.array([label_to_index[language] for language in train["Language"]])
X_test = vectorizer.transform(test["Text"])
y_test = np.array([label_to_index[language] for language in test["Language"]])

Using custom data configuration default-0d2e219b700317ed
Reusing dataset toef_l11 (/home/fohlen/.cache/huggingface/datasets/toef_l11/default-0d2e219b700317ed/0.0.0/bdabab2a1366fe9b74e89df1120d27eef5bc909b9c54ce78766eb5b7345f8d85)
Using custom data configuration default-1647815034adb763
Reusing dataset text (/home/fohlen/.cache/huggingface/datasets/text/default-1647815034adb763/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
Exception ignored in: <function tqdm.__del__ at 0x7feca4ec7280>
Traceback (most recent call last):
  File "/home/fohlen/Documents/Bachelor/koppl-reproduction/venv/lib/python3.8/site-packages/tqdm/std.py", line 1090, in __del__
    self.close()
  File "/home/fohlen/Documents/Bachelor/koppl-reproduction/venv/lib/python3.8/site-packages/tqdm/notebook.py", line 260, in close
    self.sp(close=True)
AttributeError: 'tqdm' object has no attribute 'sp'


Train a multi-class linear SVM according to Koppel et al.
Try optimisation for different feature sets (see KopplVectorizer).
Depending on the configuration, also perform 10-fold cross-validation of the data for more reliable results.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Hyper-parameter grids explored by the search: one for the RBF kernel and one
# for the polynomial kernel.
# NOTE(review): the accompanying text mentions a linear SVM, but only 'rbf' and
# 'poly' kernels are searched here — confirm this is intended.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree': [1, 10, 100]}]

# Each metric gets its own full grid search, optimised for its macro average.
scores = ['precision', 'recall']

# Integer class ids used in y_train / y_test, with their human-readable names.
# Passing both to classification_report makes it print language names rather than
# bare indices and keeps every class in the report.
class_ids = list(range(len(labels)))
class_names = [str(language) for language in labels]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # The spread is reported as two standard deviations of the fold scores.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred,
                                labels=class_ids, target_names=class_names))
    print()
