## Experiment 3
<h3>Testing for modified labels Level+Grade+BAWE_VERSION_3</h3>

<h3>Imports</h3>

In [1]:
from __future__ import unicode_literals

import numpy as np
import spacy
import csv
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, SGDClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from tqdm import tqdm_notebook
from essay_evaluation.corpus import read_flip_texts
from essay_evaluation.lexical_variation_taaled import TaaledTokenClassifier, LexicalAccuracyTaaled
import pandas as pd

<h3>Setting up the paths</h3>

In [2]:
# Location of the corpus CSV; the loading cell reads its 'Essay' and 'Level' columns.
flip_path = "/usr/local/datasets/flip_new.csv"

In [3]:
# Read every row of the CSV at flip_path up front, then split it into two
# parallel lists: the essay text and its level label.
with open(flip_path, 'r') as csvfile:
    rows = list(csv.DictReader(csvfile))

# Newlines inside an essay are replaced by spaces so each text is one line;
# the levels are kept exactly as read from the file.
texts = [row['Essay'].replace('\n', ' ') for row in rows]
levels = [row['Level'] for row in rows]

<h3>Create the pipeline</h3>

In [4]:
# Load the small English spaCy model and append the two custom components to
# the end of the pipeline, in this order.
nlp = spacy.load("en_core_web_sm")
for component_cls in (TaaledTokenClassifier, LexicalAccuracyTaaled):
    nlp.add_pipe(component_cls(), name=component_cls.name, last=True)
# The named-entity recogniser is removed from the pipeline.
nlp.remove_pipe('ner')

# Run one essay through the pipeline; `doc` is reassigned by the feature
# extraction loop further down.
doc = nlp(texts[1])

<h3>Create the data and the labels</h3>

In [5]:
# Build the feature matrix one row per document. The column names are taken
# from the first processed document and stay None if there are no texts.
X = []
feature_names = None
processed_docs = nlp.pipe(list(texts))
for doc in tqdm_notebook(processed_docs, total=len(texts)):
    if feature_names is None:
        feature_names = list(doc._.features.keys())
    X.append(list(doc._.features.values()))

HBox(children=(IntProgress(value=0, max=489), HTML(value='')))




<h3>Prepare data</h3>

In [6]:
# Feature matrix and numeric targets as NumPy arrays. The builtin ``float`` is
# used because the ``np.float`` alias was removed in NumPy 1.24.
X = np.array(X)
y = np.array(levels).astype(float)

#df = pd.DataFrame(X, columns=feature_names)
#df.insert(len(feature_names),'AVG_VOCAB_GRADE', y)

# Cross validation split: 10 shuffled folds. ``shuffle`` must be passed by
# keyword on current scikit-learn, and the fixed seed makes the fold
# assignment repeatable from run to run.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

def pearsonr_scorer(estimator, X, y):
    """Score an estimator by the Pearson correlation of its predictions with ``y``.

    The estimator's ``predict`` is called on ``X``; only the correlation
    coefficient is returned, the p-value is discarded.
    """
    predictions = estimator.predict(X)
    correlation, _p_value = pearsonr(y, predictions)
    return correlation

<h3>Random Forest Regression</h3>

In [21]:
# Cross-validated Pearson r (folds from `cv`) for a 100-tree random forest regressor.
clf = RandomForestRegressor(n_estimators=100)
reg_pearson = cross_val_score(clf, X, y, cv=cv, scoring=pearsonr_scorer)
mean_r = reg_pearson.mean()
two_sigma = reg_pearson.std() * 2  # spread reported as two standard deviations
print("Pearson r: %0.2f (+/- %0.2f)" % (mean_r, two_sigma))
print(reg_pearson)

Pearson r: 0.78 (+/- 0.14)
[0.68485407 0.80894287 0.68653772 0.72255685 0.87145353 0.85769831
 0.70824863 0.86608115 0.82410039 0.77329637]


In [19]:
import pickle

# cross_val_score fits clones of the estimator, so `clf` itself is still
# unfitted here. Fit it on the full data set before persisting it; otherwise
# the pickle would hold a model that cannot predict.
clf.fit(X, y)
# NOTE(review): the fixed-split section later writes to this same path —
# confirm which of the two models should end up in the file.
with open("/usr/local/models/model_reg.pkl", 'wb') as save_file:
    pickle.dump(clf, save_file)

<h3>Random Forest Classifier</h3>

In [26]:
# Cross-validated Pearson r for a 100-tree random forest classifier. The
# targets are cast to integers so they can serve as class labels.
y_int = y.astype(int)
clf = RandomForestClassifier(n_estimators=100)
clf_pearson = cross_val_score(clf, X, y_int, cv=cv, scoring=pearsonr_scorer)
mean_r = clf_pearson.mean()
two_sigma = clf_pearson.std() * 2  # spread reported as two standard deviations
print("Pearson r: %0.2f (+/- %0.2f)" % (mean_r, two_sigma))
print(clf_pearson)

Pearson r: 0.69 (+/- 0.13)
[0.63743688 0.71915163 0.67803161 0.79073625 0.81930132 0.70543781
 0.62299692 0.68887356 0.60963596 0.65062315]


In [25]:
# cross_val_score only fits clones of the estimator, so `clf` itself is still
# unfitted here. Fit it on the full data set (integer class labels, as in the
# cross-validation) before persisting it.
clf.fit(X, y.astype(int))
# NOTE(review): the fixed-split section later writes to this same path —
# confirm which of the two models should end up in the file.
with open("/usr/local/models/model_clf.pkl", 'wb') as save_file:
    pickle.dump(clf, save_file)

<h3>ElasticNet Regressor (Linear Regression with L1 and L2 regularization)</h3>

In [9]:
# Cross-validated Pearson r for ElasticNet with its default settings.
clf = ElasticNet()
reg_pearson = cross_val_score(clf, X, y, cv=cv, scoring=pearsonr_scorer)
mean_r = reg_pearson.mean()
two_sigma = reg_pearson.std() * 2  # spread reported as two standard deviations
print("Pearson r: %0.2f (+/- %0.2f)" % (mean_r, two_sigma))
print(reg_pearson)

Pearson r: 0.68 (+/- 0.17)
[0.74261764 0.60385669 0.69122213 0.63877133 0.78652647 0.70751696
 0.6501254  0.70874293 0.75679104 0.47097049]


<h3>Linear Regression</h3>

In [10]:
# Cross-validated Pearson r for ordinary least-squares linear regression.
clf = LinearRegression()
reg_pearson = cross_val_score(clf, X, y, cv=cv, scoring=pearsonr_scorer)
mean_r = reg_pearson.mean()
two_sigma = reg_pearson.std() * 2  # spread reported as two standard deviations
print("Pearson r: %0.2f (+/- %0.2f)" % (mean_r, two_sigma))
print(reg_pearson)

Pearson r: 0.67 (+/- 0.23)
[0.72703339 0.71371838 0.72094075 0.64502339 0.34191284 0.67280092
 0.69065372 0.75296406 0.76112287 0.71161658]


<h3>Stochastic Gradient Descent Classifier (Linear model with SGD learning)</h3>

In [11]:
# Cross-validated Pearson r for SGDClassifier with its default settings. The
# targets are cast to integers so they can serve as class labels.
y_int = y.astype(int)
clf = SGDClassifier()
clf_pearson = cross_val_score(clf, X, y_int, cv=cv, scoring=pearsonr_scorer)
mean_r = clf_pearson.mean()
two_sigma = clf_pearson.std() * 2  # spread reported as two standard deviations
print("Pearson r: %0.2f (+/- %0.2f)" % (mean_r, two_sigma))
print(clf_pearson)

Pearson r: 0.49 (+/- 0.19)
[0.46719591 0.49344017 0.67331827 0.59510868 0.39239381 0.51676156
 0.3200492  0.51576044 0.48673938 0.40359295]


<h3>Perceptron (SGD with parameters as follows: <br/>
    <i>SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None)</i> <br/>
)</h3>

In [12]:
from sklearn.linear_model import Perceptron

# Perceptron is a classifier, so it is scored on integer class labels like the
# other classifiers in this notebook, not on the float targets.
# NOTE(review): penalty='elasticnet' differs from the penalty=None named in the
# heading of this section and from the default Perceptron() used in the
# fixed-split section — confirm which setting is intended.
clf = Perceptron(penalty='elasticnet')
reg_pearson = cross_val_score(clf, X, y.astype(int), cv=cv, scoring=pearsonr_scorer)
print("Pearson r: %0.2f (+/- %0.2f)" % (reg_pearson.mean(), reg_pearson.std() * 2))
print(reg_pearson)

Pearson r: 0.52 (+/- 0.34)
[0.6843704  0.10913775 0.65869171 0.65726463 0.50591574 0.60919334
 0.57032507 0.43274906 0.60628717 0.35105489]


<h3>Fixed train/test split (since there is not a lot of data, I also tested on a fixed split rather than cross-validation)</h3>
<h4>20% of the data is held out for testing</h4>

In [28]:
# Hold out the first 20% of the rows for testing and train on the rest. The
# test size is derived from the data (98 rows for the 489 essays processed
# above), so the split stays at 20% if the corpus changes.
test_fraction = 0.2
n_test = int(round(test_fraction * len(X)))
x_test, x_train = X[:n_test], X[n_test:]
y_test, y_train = y[:n_test], y[n_test:]
# NOTE(review): the split is positional, without shuffling — confirm the CSV
# rows are not ordered by level.

<h3>Random forest regressor</h3>

In [43]:
# Fit a 100-tree random forest regressor on the fixed training split and
# score it once on the held-out rows.
clf = RandomForestRegressor(n_estimators=100)
clf.fit(x_train, y_train)
reg_pearson = pearsonr_scorer(clf, x_test, y_test)
# A single hold-out score has no spread: the standard deviation of one number
# is always 0, so only the correlation itself is reported.
print("Pearson r: %0.2f" % reg_pearson)
print(reg_pearson)

Pearson r: 0.77 (+/- 0.00)
0.7747449331756743


In [34]:
# Persist the current `clf` (fitted in the cell above) as the regression model.
with open("/usr/local/models/model_reg.pkl", 'wb') as model_file:
    pickle.dump(clf, model_file)

<h3>Random forest classifier</h3>

In [55]:
# Fit a 100-tree random forest classifier on the fixed training split. The
# targets are cast to integers so the classifier sees class labels, matching
# the cross-validation section.
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train.astype(int))
reg_pearson = pearsonr_scorer(clf, x_test, y_test.astype(int))
# A single hold-out score has no spread: the standard deviation of one number
# is always 0, so only the correlation itself is reported.
print("Pearson r: %0.2f" % reg_pearson)
print(reg_pearson)

Pearson r: 0.70 (+/- 0.00)
0.7009074912893876


In [57]:
import pickle

# Persist the current `clf` (fitted in the cell above) as the classification model.
with open("/usr/local/models/model_clf.pkl", 'wb') as model_file:
    pickle.dump(clf, model_file)

<h3>Linear Regression</h3>

In [16]:
# Fit ordinary least-squares linear regression on the fixed training split
# and score it once on the held-out rows.
clf = LinearRegression()
clf.fit(x_train, y_train)
reg_pearson = pearsonr_scorer(clf, x_test, y_test)
# A single hold-out score has no spread: the standard deviation of one number
# is always 0, so only the correlation itself is reported.
print("Pearson r: %0.2f" % reg_pearson)
print(reg_pearson)

Pearson r: 0.66 (+/- 0.00)
0.6616187699318261


<h3>Elastic Net (Linear Regression with L1 and L2 regularization)</h3>

In [17]:
# Fit ElasticNet (default settings) on the fixed training split and score it
# once on the held-out rows.
clf = ElasticNet()
clf.fit(x_train, y_train)
reg_pearson = pearsonr_scorer(clf, x_test, y_test)
# A single hold-out score has no spread: the standard deviation of one number
# is always 0, so only the correlation itself is reported.
print("Pearson r: %0.2f" % reg_pearson)
print(reg_pearson)

Pearson r: 0.65 (+/- 0.00)
0.6472582402335995


<h3>Stochastic Gradient Descent Classifier (Linear model with SGD learning)</h3>

In [18]:
# Fit SGDClassifier (default settings) on the fixed training split. The
# targets are cast to integers so the classifier sees class labels, matching
# the cross-validation section.
clf = SGDClassifier()
clf.fit(x_train, y_train.astype(int))
reg_pearson = pearsonr_scorer(clf, x_test, y_test.astype(int))
# A single hold-out score has no spread: the standard deviation of one number
# is always 0, so only the correlation itself is reported.
print("Pearson r: %0.2f" % reg_pearson)
print(reg_pearson)

Pearson r: 0.35 (+/- 0.00)
0.3450056965484282


<h3>Perceptron (SGD with parameters as follows: <br/>
    <i>SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None)</i> <br/>
)</h3>

In [19]:
# Fit a default Perceptron on the fixed training split. The targets are cast
# to integers so the classifier sees class labels, matching the other
# classifier cells.
clf = Perceptron()
clf.fit(x_train, y_train.astype(int))
reg_pearson = pearsonr_scorer(clf, x_test, y_test.astype(int))
# A single hold-out score has no spread: the standard deviation of one number
# is always 0, so only the correlation itself is reported.
print("Pearson r: %0.2f" % reg_pearson)
print(reg_pearson)

Pearson r: 0.49 (+/- 0.00)
0.4921779940742049
