In [1]:
import os

import numpy as np
import pandas as pd
import spacy
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from tqdm import tqdm

from essay_evaluation.corpus import read_flip_texts
from essay_evaluation.lexical_variation_taaled import TaaledTokenClassifier, LexicalAccuracyTaaled

## Experiment 2b)
Remove A1 & A2 from the dataset and add the level as a feature (e.g. B1 = 1, B2 = 2, C1 = 3).

In [2]:
# Location of the FLIP corpus CSV. The default is the original author's local path;
# set the FLIP_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
flip_path = os.environ.get('FLIP_CSV_PATH', '/home/simon/Downloads/flip.csv')
# Proficiency levels passed to the corpus reader (A1 and A2 are not in this list).
levels = ['B1.1', 'B1.2', 'B2.1', 'B2.2', 'C1.1']


In [3]:
# Load the small English spaCy model and append the two TAALED components,
# in this order, to the end of its pipeline.
nlp = spacy.load("en_core_web_sm")
for component_cls in (TaaledTokenClassifier, LexicalAccuracyTaaled):
    nlp.add_pipe(component_cls(), name=component_cls.name, last=True)
# Remove the 'ner' pipe (presumably not needed for the lexical features -- TODO confirm).
nlp.remove_pipe('ner')

# Read the essays together with their grades and proficiency levels,
# restricted to the levels configured above.
texts, grades, text_levels = read_flip_texts(flip_path, levels=levels)

## 1. Preprocessing

In [4]:
# Numeric encoding of the proficiency level: both sub-levels of a band share one value.
level_mapping = {'B1.1': 1, 'B1.2': 1, 'B2.1': 2, 'B2.2': 2, 'C1.1': 3}

# Build one feature row per essay: the features attached to the parsed document,
# followed by the essay's numeric level as the final 'LEVEL' column.
X = []
feature_names = None
parsed_docs = nlp.pipe(texts)
for position, parsed in tqdm(enumerate(parsed_docs), total=len(texts)):
    # Column names are taken once, from the first document's feature keys.
    if feature_names is None:
        feature_names = [*parsed._.features.keys(), 'LEVEL']
    numeric_level = level_mapping[text_levels[position]]
    X.append([*parsed._.features.values(), numeric_level])

100%|██████████| 308/308 [01:30<00:00,  3.42it/s]


In [5]:
# Convert the collected feature rows and the grades to NumPy arrays for scikit-learn.
X = np.array(X)
y = np.array(grades)

# Tabular view for inspection: one column per feature name, with the grade
# inserted as the last column.
df = pd.DataFrame(X, columns=feature_names)
df.insert(len(feature_names),'AVG_VOCAB_GRADE', y)

# Cross validation split
# `shuffle` must be passed by keyword: current scikit-learn releases declare it
# keyword-only, so the positional form `KFold(10, True)` raises a TypeError.
# A fixed random_state keeps the shuffled fold assignment identical across runs
# and across every cross_val_score call that reuses this splitter.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def pearsonr_scorer(estimator, X, y):
    """Score a fitted estimator by the Pearson correlation of its predictions.

    Follows the ``scorer(estimator, X, y)`` call signature so it can be passed
    as ``scoring=`` to ``cross_val_score``.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X: Samples to predict on.
        y: True target values for ``X``.

    Returns:
        The Pearson correlation coefficient between ``y`` and the predictions;
        the accompanying p-value is discarded.
    """
    # pearsonr yields (r, p-value); only the coefficient is used as the score.
    return pearsonr(y, estimator.predict(X))[0]

## 2. Regression

In [6]:
# Random-forest regression on the grade, scored per fold with Pearson's r.
# random_state pins the forest's own randomness (bootstrap samples and feature
# selection); the fold assignment is governed separately by `cv`.
clf = RandomForestRegressor(n_estimators=100, random_state=42)
reg_pearson = cross_val_score(clf, X, y, cv=cv, scoring=pearsonr_scorer)
# Report the mean over folds with a spread of two standard deviations,
# followed by the individual per-fold scores.
print("Pearson r: %0.2f (+/- %0.2f)" % (reg_pearson.mean(), reg_pearson.std() * 2))
print(reg_pearson)

Pearson r: 0.42 (+/- 0.25)
[0.41478631 0.580842   0.37581343 0.12146043 0.5073765  0.55783741
 0.48344183 0.45410671 0.30289985 0.4279086 ]


## 3. Classification

In [7]:
# Random-forest classification on integer grades, scored per fold with Pearson's r.
# random_state pins the forest's own randomness (bootstrap samples and feature
# selection); the fold assignment is governed separately by `cv`.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE(review): astype(int) truncates, so half grades such as 2.5 fall into the
# same class as 2.0 -- confirm this binning is intended.
y_int = y.astype(int)
clf_pearson = cross_val_score(clf, X, y_int, cv=cv, scoring=pearsonr_scorer)
# Report the mean over folds with a spread of two standard deviations,
# followed by the individual per-fold scores.
print("Pearson r: %0.2f (+/- %0.2f)" % (clf_pearson.mean(), clf_pearson.std() * 2))
print(clf_pearson)

Pearson r: 0.20 (+/- 0.38)
[ 0.1738854   0.44263638  0.33495786  0.20262471  0.08783671  0.01575047
  0.38953985 -0.01309794 -0.08908708  0.45331954]


In [8]:
# Display the assembled table: feature columns, LEVEL, and AVG_VOCAB_GRADE as the last column.
df

Unnamed: 0,TAALED_TTR_AW,TAALED_MAAS_TTR_AW,TAALED_MTLD_MA_WRAP_AW,TAALED_MTLD_MA_WRAP_CW,TAALED_MAAS_TTR_CW,TAALED_BASIC_NCONTENT_TOKENS,TAALED_BASIC_NFUNCTION_TYPES,LEVEL,AVG_VOCAB_GRADE
0,5.659453,0.068980,28.911765,25.467742,0.059244,62.0,26.0,1.0,3.0
1,6.310373,0.063681,49.346821,54.632911,0.048140,79.0,30.0,2.0,2.0
2,6.851458,0.061581,44.561947,80.941176,0.038706,85.0,42.0,2.0,3.0
3,6.634387,0.066385,40.698113,50.840000,0.061031,100.0,51.0,1.0,3.0
4,6.794507,0.063674,53.036290,51.575472,0.051780,106.0,42.0,2.0,2.5
...,...,...,...,...,...,...,...,...,...
303,7.435027,0.053840,67.199052,75.033708,0.041314,89.0,46.0,3.0,3.0
304,8.277330,0.051174,64.425532,58.528169,0.042753,142.0,49.0,3.0,3.5
305,6.949586,0.061839,50.764228,36.646465,0.058306,99.0,51.0,2.0,3.0
306,6.709937,0.058237,53.441860,39.073529,0.053392,68.0,43.0,1.0,2.0
