This reproduction study requires that you have obtained a copy of the [ETS Corpus of Non-Native Written English](https://catalog.ldc.upenn.edu/LDC2014T06).
Proceed to download and set up dependencies by running `pip3 install -r requirements.txt`.
Then execute this notebook.

In [1]:
# Path to the `data/text` directory of the local ETS corpus copy; it is passed
# as `data_dir` to `load_dataset` in the cells below.
# NOTE(review): this is a machine-specific absolute path — adjust it to your own copy before running.
DATA_DIR = "/home/fohlen/Documents/Bachelor/Data/ETS_Corpus_of_Non-Native_Written_English_LDC2014T06/ETS_Corpus_of_Non-Native_Written_English/data/text"

import requests
import os
from nltk import download

# NLTK downloads: fetch each required NLTK resource by name.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    download(nltk_resource)

# Download the onix stopwords corpus (http://www.lextek.com/manuals/onix/stopwords1.html)
# The file is stored as `onix.txt` in the current working directory and is only
# fetched when it does not exist there yet.
onix_path = os.path.join(os.getcwd(), 'onix.txt')
if not os.path.exists(onix_path):
    print("Downloading to " + onix_path)
    url = 'https://raw.githubusercontent.com/igorbrigadir/stopwords/master/en/onix.txt'
    # The timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Fail loudly on HTTP errors: otherwise an error page would be written to
    # disk and silently reused as the stopword list on every later run.
    r.raise_for_status()
    with open(onix_path, 'wb') as f:
        f.write(r.content)

[nltk_data] Downloading package punkt to /home/fohlen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fohlen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Load datasets and vectorize test and training data.

In [13]:
from KopplVectorizer import KopplVectorizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np

# The actual training dataset is broken in the LDC, which is why we fall back to a random split.
# It would be better to create a dedicated development set.
development = load_dataset('toefl11.py',
    data_dir=DATA_DIR,
    split='train[:10%]'
)

vectorizer = KopplVectorizer()
# Hold out 33% of the development slice for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(development["Text"], development["Language"], test_size=0.33, random_state=42)

# Sort the distinct languages so that every label gets the same integer id on
# every run. Iterating a plain set of strings can yield a different order per
# interpreter session, which would make the ids in later reports non-reproducible.
# `labels` stays a list because later cells look labels up by position.
labels = sorted(set(development["Language"]))
label_to_id = {language: index for index, language in enumerate(labels)}

# Fit the vectorizer on the training texts only, then reuse it for the test texts.
X_train = vectorizer.fit_transform(X_train)
y_train = np.array([label_to_id[language] for language in y_train])
X_test = vectorizer.transform(X_test)
y_test = np.array([label_to_id[language] for language in y_test])

Using custom data configuration default-0d2e219b700317ed
Reusing dataset toef_l11 (/home/fohlen/.cache/huggingface/datasets/toef_l11/default-0d2e219b700317ed/0.0.0/bdabab2a1366fe9b74e89df1120d27eef5bc909b9c54ce78766eb5b7345f8d85)
Using custom data configuration default-1647815034adb763
Reusing dataset text (/home/fohlen/.cache/huggingface/datasets/text/default-1647815034adb763/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


Train a multi-class linear SVM according to Koppel et al.
Optionally one could try retraining for different feature sets (see KopplVectorizer).

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Candidate SVC settings for the grid search: each kernel is combined with each C value.
tuned_parameters = [
    {'kernel': ['rbf', 'poly'], 'C': [100, 1000]},
]

# One grid search is run per metric, using its macro-averaged scorer.
scores = ['precision', 'recall']

for score in scores:
    print(f"# Tuning hyper-parameters for {score}", end="\n\n")

    clf = GridSearchCV(SVC(), tuned_parameters, scoring=f'{score}_macro')
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:", end="\n\n")
    print(clf.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")

    # Report the mean cross-validated score and twice its standard deviation per candidate.
    cv_results = clf.cv_results_
    per_candidate = zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
    for mean_score, std_score, candidate in per_candidate:
        print(f"{mean_score:.3f} (+/-{std_score * 2:.3f}) for {candidate!r}")
    print()

    print("Detailed classification report:", end="\n\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.", end="\n\n")

    # Evaluate the fitted search on the held-out test split.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred), end="\n\n")

# Tuning hyper-parameters for precision



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best parameters set found on development set:

{'C': 1000, 'kernel': 'rbf'}

Grid scores on development set:

0.295 (+/-0.068) for {'C': 100, 'kernel': 'rbf'}
0.061 (+/-0.193) for {'C': 100, 'kernel': 'poly'}
0.344 (+/-0.088) for {'C': 1000, 'kernel': 'rbf'}
0.059 (+/-0.186) for {'C': 1000, 'kernel': 'poly'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.34      0.39      0.37        33
           1       0.29      0.36      0.32        28
           2       0.32      0.21      0.26        28
           3       0.29      0.37      0.32        27
           4       0.29      0.42      0.34        24
           5       0.48      0.29      0.36        35
           6       0.48      0.42      0.45        36
           7       0.22      0.15      0.18        33
           8       0.28      0.24      0.26        29
           9

Perform 10-fold cross-validation as described in Koppel et al.

In [19]:
from statistics import mean
from sklearn.metrics import accuracy_score

# Build ten folds: fold k evaluates on the slice [k%, k+10%) and trains on
# everything in `train` outside that slice.
# The loader script is referenced by the same literal path as in the
# development cell, because no TOEFL11_SCRIPT_PATH constant is defined in this notebook.
# NOTE(review): the held-out folds are sliced from the 'validation' split while
# the training folds come from 'train'; for a classic 10-fold CV both would be
# slices of the same split — confirm this is intended.
vals_ds = load_dataset('toefl11.py',
    data_dir=DATA_DIR,
    split=[
        f'validation[{k}%:{k+10}%]' for k in range(0, 100, 10)
    ]
)
trains_ds = load_dataset('toefl11.py',
    data_dir=DATA_DIR,
    split=[
        f'train[:{k}%]+train[{k+10}%:]' for k in range(0, 100, 10)
    ]
)

# Accuracy of every fold, plus the best-scoring model and its predictions/targets.
scores = []
# Chained assignment: unpacking a single None into three names raises TypeError.
max_model = max_pred = max_test = None

# Map each label to its position in `labels` once, instead of scanning the list per sample.
label_to_id = {language: index for index, language in enumerate(labels)}

for val_ds, train_ds in zip(vals_ds, trains_ds):
    # Reuse the vectorizer fitted earlier in the notebook; it is not refitted per fold.
    X_train = vectorizer.transform(train_ds["Text"])
    X_test = vectorizer.transform(val_ds["Text"])
    y_train = np.array([label_to_id[language] for language in train_ds["Language"]])
    y_test = np.array([label_to_id[language] for language in val_ds["Language"]])

    model = SVC(kernel='rbf', C=1000)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score = accuracy_score(y_test, y_predict)

    # Keep the first fold unconditionally, so a model is retained even when
    # every fold scores 0; afterwards keep only strictly better folds.
    if max_model is None or score > max(scores):
        max_model = model
        # Store this fold's predictions (`y_predict`), not the stale `y_pred`
        # left over from the grid-search cell.
        max_pred, max_test = y_predict, y_test
    scores.append(score)

Using custom data configuration default-0d2e219b700317ed
Reusing dataset toef_l11 (/home/fohlen/.cache/huggingface/datasets/toef_l11/default-0d2e219b700317ed/0.0.0/bdabab2a1366fe9b74e89df1120d27eef5bc909b9c54ce78766eb5b7345f8d85)
Using custom data configuration default-0d2e219b700317ed
Reusing dataset toef_l11 (/home/fohlen/.cache/huggingface/datasets/toef_l11/default-0d2e219b700317ed/0.0.0/bdabab2a1366fe9b74e89df1120d27eef5bc909b9c54ce78766eb5b7345f8d85)


KeyboardInterrupt: 