# Project: Automated Essay Scoring
### Ambroise Decouttere and Harry Solomons

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import metrics

## Preparing data: loading csv, filling data, train-test splits and tf-idf vectorization

In [2]:
df = pd.read_csv("./for_analysis.csv")
df = df.fillna(method='ffill')
essays = df.essay
scores = df.rater1_domain1
X_train, X_test, y_train, y_test = train_test_split(essays, scores, test_size=0.2, random_state=0)

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=7,
                                 stop_words='english')
# Tokenization included in vectorization

features_train = vectorizer.fit_transform(X_train)
features_test = vectorizer.transform(X_test)



## Naive Bayes Implementation

In [4]:
# Some code here taken from https://towardsdatascience.com/training-a-naive-bayes-model-to-identify-the-author-of-an-email-or-document-17dc85fa630a

from time import time

t0 = time()
clf = MultinomialNB()
clf.fit(features_train, y_train)
print(f"\nTraining time: {round(time()-t0, 3)}s")

t1 = time()
training_score = clf.score(features_train, y_train)
print(f"\nPrediction time: {round(time()-t1, 3)}s")
print("Train set score:", training_score)

t2 = time()
test_score = clf.score(features_test, y_test)
print(f"\nPrediction time (test): {round(time()-t2, 3)}s")

qwke = metrics.cohen_kappa_score(clf.predict(features_test), y_test)

print("Test set score:", test_score)
print("QWKE score:", qwke)



Training time: 0.009s

Prediction time: 0.009s
Train set score: 0.511698642957417

Prediction time (test): 0.002s
Test set score: 0.46024321796071094
QWKE score: 0.30716829260964196


## SVR Implementation

In [9]:
from sklearn import svm

t0 = time()
svrmodel = svm.SVR(C=1.0, epsilon = 0.2)
svrmodel.fit(features_train, y_train)
print(f"\nTraining time: {round(time()-t0, 3)}s")

t1 = time()
training_score = svrmodel.score(features_train, y_train)
print(f"\nPrediction time: {round(time()-t1, 3)}s")
print("Train set score:", training_score)

t2 = time()
test_score = svrmodel.score(features_test, y_test)
print(f"\nPrediction time (test): {round(time()-t2, 3)}s")

qwke = metrics.cohen_kappa_score(np.around(svrmodel.predict(features_test), decimals=0), y_test)

print("Test set score:", test_score)
print("QWKE score:", qwke)


Training time: 31.698s

Prediction time: 28.227s
Train set score: 0.9581370162870131

Prediction time (test): 7.091s
Test set score: 0.7799543053683166
QWKE score: 0.4516881623356501


### SVM Parameter Tuning Using GridSearchCV

In [14]:
# Adapted from https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0

from sklearn.model_selection import GridSearchCV
def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    epsilons = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'epsilon' : epsilons}
    grid_search = GridSearchCV(svm.SVR(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_


In [15]:
svc_param_selection(features_train, y_train, 5)

{'C': 10, 'epsilon': 0.001}

### Rerunning SVM with updated parameters

In [16]:
t0 = time()
svrmodel = svm.SVR(C=10, epsilon = 0.001)
svrmodel.fit(features_train, y_train)
print(f"\nTraining time: {round(time()-t0, 3)}s")

t1 = time()
training_score = svrmodel.score(features_train, y_train)
print(f"\nPrediction time: {round(time()-t1, 3)}s")
print("Train set score:", training_score)

t2 = time()
test_score = svrmodel.score(features_test, y_test)
print(f"\nPrediction time (test): {round(time()-t2, 3)}s")

qwke = metrics.cohen_kappa_score(np.around(svrmodel.predict(features_test), decimals=0), y_test)

print("Test set score:", test_score)
print("QWKE score:", qwke)


Training time: 132.088s

Prediction time: 38.514s
Train set score: 0.9999650928273224

Prediction time (test): 9.518s
Test set score: 0.7850412027725532
QWKE score: 0.46787489875811816
