In [1]:
import numpy as np
from scipy.stats import pearsonr
import spacy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from tqdm import tqdm
from essay_evaluation.corpus import read_flip_texts
from essay_evaluation.lexical_variation_taaled import TaaledTokenClassifier, LexicalAccuracyTaaled
import pandas as pd

## 0. Setup
Set paths here:

In [2]:
# Path to the FLIP CSV that read_flip_texts loads.
# NOTE(review): this is an absolute, machine-specific path — adjust before running.
flip_path = '/home/simon/Downloads/flip.csv'

# Level labels forwarded to read_flip_texts through its `levels` argument.
# NOTE(review): no A2.x labels are listed — confirm this is intentional.
levels = [
    'A1.1', 'A1.2',
    'B1.1', 'B1.2',
    'B2.1', 'B2.2',
    'C1.1',
]


In [3]:
# Load the spaCy model, then append the two TAALED components to the end of
# the pipeline (token classifier first, lexical accuracy second).
nlp = spacy.load("en_core_web_sm")
for component_cls in (TaaledTokenClassifier, LexicalAccuracyTaaled):
    nlp.add_pipe(component_cls(), name=component_cls.name, last=True)

# Drop the 'ner' component from the pipeline.
nlp.remove_pipe('ner')

# read_flip_texts returns three values; only the first two are used here.
texts, grades, _ = read_flip_texts(flip_path, levels=levels)

## 1. Preprocessing

In [4]:
# Run every text through the pipeline and collect one feature row per document.
feature_names = None
X = []
for parsed_doc in tqdm(nlp.pipe(texts), total=len(texts)):
    doc_features = parsed_doc._.features
    # Column names are taken from the first document only.
    if feature_names is None:
        feature_names = [*doc_features.keys()]
    X.append([*doc_features.values()])

100%|██████████| 332/332 [01:01<00:00,  5.38it/s]


In [5]:
# Convert the collected feature rows and the grades to NumPy arrays.
X = np.array(X)
y = np.array(grades)

# Tabular view of the features with the grade inserted as the last column.
df = pd.DataFrame(X, columns=feature_names)
df.insert(len(feature_names), 'AVG_VOCAB_GRADE', y)

# Cross validation split: 10 shuffled folds.
# `shuffle` is passed by keyword because current scikit-learn accepts it only
# as a keyword argument. The fixed random_state makes the fold assignment
# reproducible, so every model evaluated with `cv` sees the same splits.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def pearsonr_scorer(estimator, X, y):
    """Score a fitted estimator by the Pearson correlation of its predictions.

    Has the ``(estimator, X, y)`` signature accepted by the ``scoring``
    argument of ``cross_val_score``.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X: Samples to predict on.
        y: True target values for ``X``.

    Returns:
        The correlation coefficient between ``y`` and the predictions; the
        p-value computed by ``pearsonr`` is discarded.
    """
    predictions = estimator.predict(X)
    correlation = pearsonr(y, predictions)[0]
    return correlation

## 2. Regression

In [6]:
# Random forest regressor, scored with Pearson r on each cross-validation fold.
# random_state fixes the forest's own randomness so the model does not vary
# between runs (fold assignment is governed separately by `cv`).
clf = RandomForestRegressor(n_estimators=100, random_state=42)
reg_pearson = cross_val_score(clf, X, y, cv=cv, scoring=pearsonr_scorer)

# Report the mean over folds with two standard deviations as the spread,
# followed by the per-fold scores.
print("Pearson r: %0.2f (+/- %0.2f)" % (reg_pearson.mean(), reg_pearson.std() * 2))
print(reg_pearson)

Pearson r: 0.46 (+/- 0.25)
[0.54556715 0.41488162 0.54180242 0.67122565 0.31240396 0.47114191
 0.46733895 0.22452411 0.41031483 0.56044299]


## 3. Classification

In [7]:
# Random forest classifier on integer class labels, scored with Pearson r on
# each cross-validation fold. random_state fixes the forest's own randomness
# so the model does not vary between runs (fold assignment is governed
# separately by `cv`).
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# NOTE(review): astype(int) truncates fractional values rather than rounding;
# confirm this is the intended way to derive class labels from the grades.
y_int = y.astype(int)
clf_pearson = cross_val_score(clf, X, y_int, cv=cv, scoring=pearsonr_scorer)

# Report the mean over folds with two standard deviations as the spread,
# followed by the per-fold scores.
print("Pearson r: %0.2f (+/- %0.2f)" % (clf_pearson.mean(), clf_pearson.std() * 2))
print(clf_pearson)

Pearson r: 0.20 (+/- 0.35)
[ 0.2868838   0.43577712  0.28986377  0.19234218  0.29385147  0.09188115
 -0.03071838  0.47610294  0.         -0.01785714]
