In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import language_tool_python

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

language_tool = language_tool_python.LanguageTool('en-US')
stop_words = stopwords.words('english')

In [4]:
df = pd.read_excel("training_set_rel3.xls")
essay_set = 6
df = df[df["essay_set"] == essay_set]

In [5]:
def word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(essay)
    return len(words)

def unique_word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    unique_words = set(words)
    return len(unique_words)

def sentence_count(essay):
    sentences = nltk.sent_tokenize(essay)
    return len(sentences)

def avg_word_len(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    return sum(len(word) for word in words) / len(words)

def grammar_errors(essay):
    errors = language_tool.check(essay)
    return len(errors)

def autocorrect_essay(essay):
    corrected_essay = language_tool.correct(essay)
    return corrected_essay

In [6]:
def define_dataframe(df):
    clean_df = df[['essay', 'domain1_score']].copy()
    clean_df = clean_df.rename(columns={'domain1_score': 'actual_score'})

    print("Getting Word Count")
    clean_df['word_count'] = clean_df['essay'].apply(word_count)
    print("Getting Unique Word Count")
    clean_df['unique_word_count'] = clean_df['essay'].apply(unique_word_count)
    print("Getting Sentence Count")
    clean_df['sentence_count'] = clean_df['essay'].apply(sentence_count)
    print("Getting Average Word Length")
    clean_df['avg_word_len'] = clean_df['essay'].apply(avg_word_len)

    print("Getting Grammatical Errors")
    clean_df['grammar_errors'] = clean_df['essay'].apply(grammar_errors)

    print("Autocorrecting Essay")
    clean_df['essay'] = clean_df['essay'].apply(autocorrect_essay)

    print("Preprocess for tokenization")
    clean_df.reset_index(drop=True, inplace=True)
    clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")
    clean_df['essay'] = clean_df['essay'].apply(lambda x: x.lower())

    print("Tokenization Start")
    tokenized_doc = clean_df['essay'].apply(lambda x: x.split())

    print("Removing Stop Words")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    print("Word Stemming")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    print("Detokenize")
    detokenized_doc = []
    for i in range(len(clean_df)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    clean_df['essay'] = detokenized_doc

    return clean_df

In [7]:
main_df = define_dataframe(df)

Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Tokenization Start
Removing Stop Words
Word Stemming
Detokenize


In [21]:
source = pd.read_csv('source_essays.txt', sep="|", header=None)
stacked_source = source.stack().reset_index()
source_essay = stacked_source.drop(['level_0', 'level_1'], axis=1).rename(columns={0: 'essay'})
source_essay.insert(0, "essay_set", [6, 5, 4, 3], True)
source_essay = source_essay.sort_values(by=['essay_set'], ascending=True)
source_essay.head()

Unnamed: 0,essay_set,essay
3,3,FORGET THAT OLD SAYING ABOUT NEVER taking cand...
2,4,"Saeng, a teenage girl, and her family have mov..."
1,5,"My parents, originally from Cuba, arrived in t..."
0,6,"When the Empire State Building was conceived, ..."


In [8]:
def clean_source_essay(source_essay):
    print("Preprocess for tokenization")
    source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")
    source_essay['essay'] = source_essay['essay'].apply(lambda x: x.lower())

    print("Tokenization Start")
    tokenized_doc = source_essay['essay'].apply(lambda x: x.split())

    print("Removing Stop Words")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    print("Word Stemming")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    print("Detokenize")
    detokenized_doc = []
    for i in range(len(source_essay)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    source_essay['essay'] = detokenized_doc

    return source_essay

In [9]:
cleaned_source_essay = clean_source_essay(source_essay)

Preprocess for tokenization
Tokenization Start
Removing Stop Words
Word Stemming
Detokenize


  source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")


In [10]:
frames = [main_df, cleaned_source_essay]
combined_df = pd.concat(frames)

combined_df

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors
0,mani obstacl builder face attempt dirig dock e...,2.0,134.0,90.0,6.0,4.560976,3.0
1,start would mani problem allow dirig dock num ...,3.0,201.0,116.0,9.0,4.733333,2.0
2,builder empir state build face mani obstacl at...,4.0,180.0,104.0,8.0,4.491124,2.0
3,passag moor mast marcia amid cap builder empir...,1.0,213.0,118.0,7.0,4.417085,12.0
4,builder empir state build face mani obstacl at...,3.0,176.0,93.0,10.0,4.654321,1.0
...,...,...,...,...,...,...,...
1799,want tell go attempt allow dirig dock well tel...,2.0,179.0,87.0,9.0,4.177215,7.0
0,empir state build conceiv plan world tallest b...,,,,,,
1,parent origin cuba arriv unit state live year ...,,,,,,
2,saeng teenag girl famili move unit state vietn...,,,,,,


In [11]:
combined_essay = combined_df[combined_df['actual_score'] >= 3]
combined_essay = combined_essay.groupby('actual_score').sample(10, random_state=26)

combined_df = combined_df.drop(index = combined_essay.index)
combined_df

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors
0,mani obstacl builder face attempt dirig dock e...,2.0,134.0,90.0,6.0,4.560976,3.0
1,start would mani problem allow dirig dock num ...,3.0,201.0,116.0,9.0,4.733333,2.0
2,builder empir state build face mani obstacl at...,4.0,180.0,104.0,8.0,4.491124,2.0
3,passag moor mast marcia amid cap builder empir...,1.0,213.0,118.0,7.0,4.417085,12.0
4,builder empir state build face mani obstacl at...,3.0,176.0,93.0,10.0,4.654321,1.0
...,...,...,...,...,...,...,...
1799,want tell go attempt allow dirig dock well tel...,2.0,179.0,87.0,9.0,4.177215,7.0
0,empir state build conceiv plan world tallest b...,,,,,,
1,parent origin cuba arriv unit state live year ...,,,,,,
2,saeng teenag girl famili move unit state vietn...,,,,,,


In [13]:
tokenizer = RegexpTokenizer(r'\w+')
tfidf_lsa_vectorizer = TfidfVectorizer(lowercase=True,
                                        stop_words='english',
                                        ngram_range = (1,3),
                                        tokenizer = tokenizer.tokenize,
                                        max_features=350,
                                        max_df=0.8,
                                        min_df=3)

tfidf_lsa_matrix = tfidf_lsa_vectorizer.fit_transform(combined_essay["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_lsa_matrix.shape}")

Train TFIDF Matrix Shape: (20, 284)


In [14]:
svd_lsa_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)
    
svd_lsa = svd_lsa_model.fit_transform(tfidf_lsa_matrix)
normalized_svd = Normalizer(copy=False).fit_transform(svd_lsa)

In [15]:
def lsa_score(essay):
    essay_matrix = tfidf_lsa_vectorizer.transform([essay])

    essay_svd = svd_lsa_model.transform(essay_matrix)
    normalized_essay_svd = Normalizer(copy=False).fit_transform(essay_svd)

    similarities = cosine_similarity(normalized_svd, normalized_essay_svd).max()

    return similarities.max()

In [28]:
combined_df['lsa_score'] = combined_df['essay'].apply(lsa_score)
combined_df = combined_df.fillna(0)
combined_df

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors,lsa_score
0,mani obstacl builder face attempt dirig dock e...,2.0,134.0,90.0,6.0,4.560976,3.0,0.589376
1,start would mani problem allow dirig dock num ...,3.0,201.0,116.0,9.0,4.733333,2.0,0.658643
2,builder empir state build face mani obstacl at...,4.0,180.0,104.0,8.0,4.491124,2.0,0.770081
3,passag moor mast marcia amid cap builder empir...,1.0,213.0,118.0,7.0,4.417085,12.0,0.620654
4,builder empir state build face mani obstacl at...,3.0,176.0,93.0,10.0,4.654321,1.0,0.772386
...,...,...,...,...,...,...,...,...
1799,want tell go attempt allow dirig dock well tel...,2.0,179.0,87.0,9.0,4.177215,7.0,0.645588
0,empir state build conceiv plan world tallest b...,0.0,0.0,0.0,0.0,0.000000,0.0,0.693081
1,parent origin cuba arriv unit state live year ...,0.0,0.0,0.0,0.0,0.000000,0.0,0.609972
2,saeng teenag girl famili move unit state vietn...,0.0,0.0,0.0,0.0,0.000000,0.0,0.462469


In [29]:
# VECTORIZER FOR: Training data
tokenizer = RegexpTokenizer(r'\w+')
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                   stop_words='english',
                                   ngram_range = (1,3),
                                   tokenizer = tokenizer.tokenize,
                                   max_features=10000,
                                   max_df=0.8,
                                   min_df=5)

tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_matrix.shape}")
svd_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)

svd = svd_model.fit_transform(tfidf_matrix)
print(type(svd))

Train TFIDF Matrix Shape: (1784, 6124)
<class 'numpy.ndarray'>


In [30]:
x_df_features = combined_df[['word_count', 'unique_word_count', 'sentence_count', 'avg_word_len', 'grammar_errors', 'lsa_score']]

x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
y_features = combined_df['actual_score'].to_numpy()

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

In [32]:
print("Building Linear Regression Model")
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

print("Building SVR Model")
svr_model = SVR()
svr_model.fit(x_train, y_train)

print("Building Decision Tree Model")
tree_model = DecisionTreeRegressor()
tree_model.fit(x_train, y_train)

print("Building Bayesian Regressor")
bayes_model = BayesianRidge()
bayes_model.fit(x_train, y_train)

print("Building AdaBoost Regressor")
ada_model = AdaBoostRegressor(n_estimators=100)
ada_model.fit(x_train, y_train)

print("Building Random Forest Regressor")
ran_model = RandomForestRegressor()
ran_model.fit(x_train, y_train)

print("Building Gradient Boosting Regressor")
grad_model = GradientBoostingRegressor(n_estimators=200)
grad_model.fit(x_train, y_train)

print("Building Logistic Regression Model")
log_model = LogisticRegression(solver="saga", max_iter=10000)
log_model.fit(x_train, y_train)

Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model


In [33]:
predictions = [lr_model.predict(x_test),
               svr_model.predict(x_test),
               tree_model.predict(x_test),
               bayes_model.predict(x_test),
               ada_model.predict(x_test),
               ran_model.predict(x_test),
               grad_model.predict(x_test),
               log_model.predict(x_test)]

In [34]:
for idx, pred in enumerate(predictions):
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r_score = r2_score(y_test, pred)

    print(f"Model {idx}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 score: {r_score}\n")

Model 0
Mean Absolute Error: 0.41013054420938155
Mean Squared Error: 0.2682114130074931
Root Mean Squared Error: 0.5178913138946174
R2 score: 0.7092517106711576

Model 1
Mean Absolute Error: 0.4996685564394106
Mean Squared Error: 0.4420318028633608
Root Mean Squared Error: 0.6648547231263089
R2 score: 0.5208257953293147

Model 2
Mean Absolute Error: 0.5378151260504201
Mean Squared Error: 0.6218487394957983
Root Mean Squared Error: 0.7885738643245782
R2 score: 0.3258994641490175

Model 3
Mean Absolute Error: 0.4055375364743919
Mean Squared Error: 0.260572049596366
Root Mean Squared Error: 0.5104625839337943
R2 score: 0.7175329833375245

Model 4
Mean Absolute Error: 0.4544088515643958
Mean Squared Error: 0.33359512489924026
Root Mean Squared Error: 0.5775769428389955
R2 score: 0.6383740318679656

Model 5
Mean Absolute Error: 0.42523809523809525
Mean Squared Error: 0.2948282913165266
Root Mean Squared Error: 0.5429809308958525
R2 score: 0.6803983082418984

Model 6
Mean Absolute Error: 0.4

In [35]:
from sklearn.model_selection import cross_val_score

scores = [cross_val_score(lr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(svr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(tree_model, x_train, y_train, cv=10).mean(),
          cross_val_score(bayes_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ada_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ran_model, x_train, y_train, cv=10).mean(),
          cross_val_score(grad_model, x_train, y_train, cv=10).mean(),
          cross_val_score(log_model, x_train, y_train, cv=10).mean()]

In [36]:
for idx, score in enumerate(scores):
    print(f"Model: {idx}")
    print(f"Overall Score: {score}\n")

Model: 0
Overall Score: 0.7247304896062313

Model: 1
Overall Score: 0.515950047408619

Model: 2
Overall Score: 0.2850713848038679

Model: 3
Overall Score: 0.726816770861288

Model: 4
Overall Score: 0.6242683968268962

Model: 5
Overall Score: 0.664171638869558

Model: 6
Overall Score: 0.6947171034120672

Model: 7
Overall Score: 0.5837387964148528

