In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import language_tool_python

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

language_tool = language_tool_python.LanguageTool('en-US')
stop_words = stopwords.words('english')

In [2]:
df = pd.read_excel("training_set_rel3.xls")
essay_set = 6
df = df[df["essay_set"] == essay_set]

In [3]:
def word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(essay)
    return len(words)

def unique_word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    unique_words = set(words)
    return len(unique_words)

def sentence_count(essay):
    sentences = nltk.sent_tokenize(essay)
    return len(sentences)

def avg_word_len(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    return sum(len(word) for word in words) / len(words)

def grammar_errors(essay):
    errors = language_tool.check(essay)
    return len(errors)

def autocorrect_essay(essay):
    corrected_essay = language_tool.correct(essay)
    return corrected_essay

In [4]:
def define_dataframe(df):
    clean_df = df[['essay', 'domain1_score']].copy()
    clean_df = clean_df.rename(columns={'domain1_score': 'actual_score'})

    print("Getting Word Count")
    clean_df['word_count'] = clean_df['essay'].apply(word_count)
    print("Getting Unique Word Count")
    clean_df['unique_word_count'] = clean_df['essay'].apply(unique_word_count)
    print("Getting Sentence Count")
    clean_df['sentence_count'] = clean_df['essay'].apply(sentence_count)
    print("Getting Average Word Length")
    clean_df['avg_word_len'] = clean_df['essay'].apply(avg_word_len)

    print("Getting Grammatical Errors")
    clean_df['grammar_errors'] = clean_df['essay'].apply(grammar_errors)

    print("Autocorrecting Essay")
    clean_df['essay'] = clean_df['essay'].apply(autocorrect_essay)

    print("Preprocess for tokenization")
    clean_df.reset_index(drop=True, inplace=True)
    clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")
    clean_df['essay'] = clean_df['essay'].apply(lambda x: x.lower())

    print("Tokenization Start")
    tokenized_doc = clean_df['essay'].apply(lambda x: x.split())

    print("Removing Stop Words")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    print("Word Stemming")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    print("Detokenize")
    detokenized_doc = []
    for i in range(len(clean_df)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    clean_df['essay'] = detokenized_doc

    return clean_df

In [5]:
main_df = define_dataframe(df)

Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Tokenization Start
Removing Stop Words
Word Stemming
Detokenize


In [6]:
source = pd.read_csv('source_essays.txt', sep="|", header=None)
stacked_source = source.stack().reset_index()
source_essay = stacked_source.drop(['level_0', 'level_1'], axis=1).rename(columns={0: 'essay'})
source_essay.insert(0, "essay_set", [6, 5, 4, 3], True)
source_essay = source_essay.sort_values(by=['essay_set'], ascending=True)
source_essay = source_essay.loc[source_essay['essay_set'] == essay_set]
source_essay.head()

Unnamed: 0,essay_set,essay
0,6,"When the Empire State Building was conceived, ..."


In [7]:
def clean_source_essay(source_essay):
    print("Preprocess for tokenization")
    source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")
    source_essay['essay'] = source_essay['essay'].apply(lambda x: x.lower())

    print("Tokenization Start")
    tokenized_doc = source_essay['essay'].apply(lambda x: x.split())

    print("Removing Stop Words")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    print("Word Stemming")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    print("Detokenize")
    detokenized_doc = []
    for i in range(len(source_essay)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    source_essay['essay'] = detokenized_doc

    return source_essay

In [8]:
cleaned_source_essay = clean_source_essay(source_essay)

Preprocess for tokenization
Tokenization Start
Removing Stop Words
Word Stemming
Detokenize


  source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")


In [9]:
frames = [main_df, cleaned_source_essay]
combined_df = pd.concat(frames)

combined_df

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors,essay_set
0,mani obstacl builder face attempt dirig dock e...,2.0,134.0,90.0,6.0,4.560976,3.0,
1,start would mani problem allow dirig dock num ...,3.0,201.0,116.0,9.0,4.733333,2.0,
2,builder empir state build face mani obstacl at...,4.0,180.0,104.0,8.0,4.491124,2.0,
3,passag moor mast marcia amid cap builder empir...,1.0,213.0,118.0,7.0,4.417085,12.0,
4,builder empir state build face mani obstacl at...,3.0,176.0,93.0,10.0,4.654321,1.0,
...,...,...,...,...,...,...,...,...
1796,problem construct dock dirig natur caus like h...,2.0,72.0,53.0,3.0,4.833333,2.0,
1797,builder empir state build face obstacl attempt...,3.0,119.0,71.0,5.0,4.657143,3.0,
1798,obstacl builder empir state build could move e...,2.0,71.0,54.0,2.0,4.485294,2.0,
1799,want tell go attempt allow dirig dock well tel...,2.0,179.0,87.0,9.0,4.177215,7.0,


In [10]:
combined_essay = combined_df[combined_df['actual_score'] >= 3]
combined_essay = combined_essay.groupby('actual_score').sample(10, random_state=26)

combined_df = combined_df.drop(index = combined_essay.index)
combined_df

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors,essay_set
0,mani obstacl builder face attempt dirig dock e...,2.0,134.0,90.0,6.0,4.560976,3.0,
1,start would mani problem allow dirig dock num ...,3.0,201.0,116.0,9.0,4.733333,2.0,
2,builder empir state build face mani obstacl at...,4.0,180.0,104.0,8.0,4.491124,2.0,
3,passag moor mast marcia amid cap builder empir...,1.0,213.0,118.0,7.0,4.417085,12.0,
4,builder empir state build face mani obstacl at...,3.0,176.0,93.0,10.0,4.654321,1.0,
...,...,...,...,...,...,...,...,...
1796,problem construct dock dirig natur caus like h...,2.0,72.0,53.0,3.0,4.833333,2.0,
1797,builder empir state build face obstacl attempt...,3.0,119.0,71.0,5.0,4.657143,3.0,
1798,obstacl builder empir state build could move e...,2.0,71.0,54.0,2.0,4.485294,2.0,
1799,want tell go attempt allow dirig dock well tel...,2.0,179.0,87.0,9.0,4.177215,7.0,


In [15]:
tokenizer = RegexpTokenizer(r'\w+')
tfidf_lsa_vectorizer = TfidfVectorizer(lowercase=True,
                                        stop_words='english',
                                        ngram_range = (1,3),
                                        tokenizer = tokenizer.tokenize,
                                        max_features=350,
                                        max_df=0.8,
                                        min_df=3)

tfidf_lsa_matrix = tfidf_lsa_vectorizer.fit_transform(combined_essay["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_lsa_matrix.shape}")

Train TFIDF Matrix Shape: (20, 284)


In [16]:
svd_lsa_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)
    
svd_lsa = svd_lsa_model.fit_transform(tfidf_lsa_matrix)
normalized_svd = Normalizer(copy=False).fit_transform(svd_lsa)

In [17]:
def lsa_score(essay):
    essay_matrix = tfidf_lsa_vectorizer.transform([essay])

    essay_svd = svd_lsa_model.transform(essay_matrix)
    normalized_essay_svd = Normalizer(copy=False).fit_transform(essay_svd)

    similarities = cosine_similarity(normalized_svd, normalized_essay_svd).max()

    return similarities.max()

In [20]:
combined_df['lsa_score'] = combined_df['essay'].apply(lsa_score)
combined_df = combined_df.fillna(0)
combined_df

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors,essay_set,lsa_score
0,mani obstacl builder face attempt dirig dock e...,2.0,134.0,90.0,6.0,4.560976,3.0,0.0,0.589376
1,start would mani problem allow dirig dock num ...,3.0,201.0,116.0,9.0,4.733333,2.0,0.0,0.658643
2,builder empir state build face mani obstacl at...,4.0,180.0,104.0,8.0,4.491124,2.0,0.0,0.770081
3,passag moor mast marcia amid cap builder empir...,1.0,213.0,118.0,7.0,4.417085,12.0,0.0,0.620654
4,builder empir state build face mani obstacl at...,3.0,176.0,93.0,10.0,4.654321,1.0,0.0,0.772386
...,...,...,...,...,...,...,...,...,...
1796,problem construct dock dirig natur caus like h...,2.0,72.0,53.0,3.0,4.833333,2.0,0.0,0.665028
1797,builder empir state build face obstacl attempt...,3.0,119.0,71.0,5.0,4.657143,3.0,0.0,0.630899
1798,obstacl builder empir state build could move e...,2.0,71.0,54.0,2.0,4.485294,2.0,0.0,0.774887
1799,want tell go attempt allow dirig dock well tel...,2.0,179.0,87.0,9.0,4.177215,7.0,0.0,0.645588


In [37]:
# VECTORIZER FOR: Training data
tokenizer = RegexpTokenizer(r'\w+')
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                   stop_words='english',
                                   ngram_range = (1,3),
                                   tokenizer = tokenizer.tokenize,
                                   max_features=10000,
                                   max_df=0.8,
                                   min_df=5)

tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_matrix.shape}")
svd_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)

svd = svd_model.fit_transform(tfidf_matrix)
print(type(svd))

Train TFIDF Matrix Shape: (1781, 6098)
<class 'numpy.ndarray'>


In [38]:
x_df_features = combined_df[['word_count', 'unique_word_count', 'sentence_count', 'avg_word_len', 'grammar_errors', 'lsa_score']]

x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
y_features = combined_df['actual_score'].to_numpy()

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

In [40]:
print("Building Linear Regression Model")
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

print("Building SVR Model")
svr_model = SVR()
svr_model.fit(x_train, y_train)

print("Building Decision Tree Model")
tree_model = DecisionTreeRegressor()
tree_model.fit(x_train, y_train)

print("Building Bayesian Regressor")
bayes_model = BayesianRidge()
bayes_model.fit(x_train, y_train)

print("Building AdaBoost Regressor")
ada_model = AdaBoostRegressor(n_estimators=100)
ada_model.fit(x_train, y_train)

print("Building Random Forest Regressor")
ran_model = RandomForestRegressor()
ran_model.fit(x_train, y_train)

print("Building Gradient Boosting Regressor")
grad_model = GradientBoostingRegressor(n_estimators=200)
grad_model.fit(x_train, y_train)

print("Building Logistic Regression Model")
log_model = LogisticRegression(solver="saga", max_iter=10000)
log_model.fit(x_train, y_train)

Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model


In [41]:
predictions = [lr_model.predict(x_test),
               svr_model.predict(x_test),
               tree_model.predict(x_test),
               bayes_model.predict(x_test),
               ada_model.predict(x_test),
               ran_model.predict(x_test),
               grad_model.predict(x_test),
               log_model.predict(x_test)]

In [42]:
for idx, pred in enumerate(predictions):
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r_score = r2_score(y_test, pred)

    print(f"Model {idx}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 score: {r_score}\n")

Model 0
Mean Absolute Error: 0.4179766701514603
Mean Squared Error: 0.27705123201788273
Root Mean Squared Error: 0.5263565635744297
R2 score: 0.7028586368196518

Model 1
Mean Absolute Error: 0.49692305073962595
Mean Squared Error: 0.44865528420549466
Root Mean Squared Error: 0.6698173513768471
R2 score: 0.5188108647779547

Model 2
Mean Absolute Error: 0.5098039215686274
Mean Squared Error: 0.6106442577030813
Root Mean Squared Error: 0.7814373024773524
R2 score: 0.34507540056550423

Model 3
Mean Absolute Error: 0.4148862176016638
Mean Squared Error: 0.27066810071822184
Root Mean Squared Error: 0.5202577252845189
R2 score: 0.7097046353807337

Model 4
Mean Absolute Error: 0.44412564161380175
Mean Squared Error: 0.3167705741978974
Root Mean Squared Error: 0.5628237505630848
R2 score: 0.6602590807951745

Model 5
Mean Absolute Error: 0.4153221288515406
Mean Squared Error: 0.2894798319327731
Root Mean Squared Error: 0.5380333000221948
R2 score: 0.6895287877002827

Model 6
Mean Absolute Error:

In [43]:
from sklearn.model_selection import cross_val_score

scores = [cross_val_score(lr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(svr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(tree_model, x_train, y_train, cv=10).mean(),
          cross_val_score(bayes_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ada_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ran_model, x_train, y_train, cv=10).mean(),
          cross_val_score(grad_model, x_train, y_train, cv=10).mean(),
          cross_val_score(log_model, x_train, y_train, cv=10).mean()]

In [45]:
for idx, score in enumerate(scores):
    print(f"Model: {idx}")
    print(f"Overall Score: {score}\n")

Model: 0
Overall Score: 0.7204673236068857

Model: 1
Overall Score: 0.5125873393932061

Model: 2
Overall Score: 0.353916279625689

Model: 3
Overall Score: 0.7233026350033527

Model: 4
Overall Score: 0.6232416101511905

Model: 5
Overall Score: 0.6700420912002907

Model: 6
Overall Score: 0.6987423607990226

Model: 7
Overall Score: 0.5730424505072393

