In [200]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import language_tool_python
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor


language_tool = language_tool_python.LanguageTool('en-US')
stop_words = stopwords.words('english')

In [201]:
df = pd.read_excel("training_set_rel3.xls")
df.head()
df.shape

(12978, 28)

In [202]:
df = df[df["essay_set"]==1]

print(df.shape)

(1783, 28)


In [203]:
# essay structure
import re
import nltk

def word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(essay)

    return len(words)

def unique_word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    unique_words = set(words)

    return len(unique_words)

def sentence_count(essay):
    sentences = nltk.sent_tokenize(essay)
    
    return len(sentences)

def avg_word_len(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    
    return sum(len(word) for word in words) / len(words)

# get grammar errors
def grammar_errors(essay):
    errors = language_tool.check(essay)
    return len(errors)

# fix errors
def autocorrect_essay(essay):
    corrected_essay = language_tool.correct(essay)
    return corrected_essay

In [204]:
def preprocess_dataframe(df):
    clean_df = df[['essay', 'domain1_score']].copy()
    clean_df = clean_df.rename(columns={'domain1_score': 'actual_score'})

    # get essay structure
    print("Getting Word Count")
    clean_df['word_count'] = clean_df['essay'].apply(word_count)
    print("Getting Unique Word Count")
    clean_df['unique_word_count'] = clean_df['essay'].apply(unique_word_count)
    print("Getting Sentence Count")
    clean_df['sentence_count'] = clean_df['essay'].apply(sentence_count)
    print("Getting Average Word Length")
    clean_df['avg_word_len'] = clean_df['essay'].apply(avg_word_len)

    # get grammatical errors
    print("Getting Grammatical Errors")
    clean_df['grammar_errors'] = clean_df['essay'].apply(grammar_errors)
    
    # autocorrect errors
    print("Autocorrecting Essay")
    clean_df['essay'] = clean_df['essay'].apply(autocorrect_essay)

    # preprocess essay for tokenization
    print("Preprocess for tokenization")
    clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")
    clean_df['essay'] = clean_df['essay'].apply(lambda x: x.lower())

    # tokenization
    print("Tokenization Start")
    tokenized_doc = clean_df['essay'].apply(lambda x: x.split())

    # remove stop-words
    print("Removing Stop Words")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    # stemming
    print("Word Stemming")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    # de-tokenization
    print("Detokenize")
    detokenized_doc = []
    for i in range(len(clean_df)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    clean_df['essay'] = detokenized_doc

    return clean_df

In [205]:
clean_df = preprocess_dataframe(df)

Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization
Tokenization Start
Removing Stop Words


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Word Stemming
Detokenize


In [206]:
clean_df.shape

(1783, 7)

In [207]:
# Create a vectorizer for the training data
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                        stop_words='english',
                                        ngram_range = (1,3),
                                        tokenizer = tokenizer.tokenize,
                                        max_features=10000,
                                        max_df=0.8,
                                        min_df=5)

tfidf_matrix = tfidf_vectorizer.fit_transform(clean_df["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_matrix.shape}")

Train TFIDF Matrix Shape: (1783, 10000)


In [208]:
print(type(tfidf_matrix))

<class 'scipy.sparse._csr.csr_matrix'>


In [209]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# TFIDF to SVD
svd_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)
    
svd = svd_model.fit_transform(tfidf_matrix)
#normalized_svd = Normalizer(copy=False).fit_transform(svd)
print(type(svd))

<class 'numpy.ndarray'>


In [210]:
clean_df.head()

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors
0,dear local newspap think effect comput peopl g...,8,386,173,16,4.237143,16
1,dear cap cap believ use comput benefit us mani...,9,464,205,20,4.312057,25
2,dear cap cap cap peopl use comput everyon agre...,7,313,160,14,4.342756,17
3,dear local newspap cap found mani expert say c...,10,611,260,27,4.813208,29
4,dear locat know comput posit effect peopl comp...,8,517,210,30,4.334038,17


In [211]:
x_df_features = clean_df[['word_count', 'unique_word_count', 'sentence_count', 'avg_word_len', 'grammar_errors']]

x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
y_features = clean_df['actual_score'].to_numpy()

In [212]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

In [213]:
# linear regression, LogisticRegression, SVR

print("Building Linear Regression Model")
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

print("Building SVR Model")
svr_model = SVR()
svr_model.fit(x_train, y_train)

print("Building Decision Tree Model")
tree_model = DecisionTreeRegressor()
tree_model.fit(x_train, y_train)

print("Building Bayesian Regressor")
bayes_model = BayesianRidge()
bayes_model.fit(x_train, y_train)

print("Building AdaBoost Regressor")
ada_model = AdaBoostRegressor(n_estimators=100)
ada_model.fit(x_train, y_train)

print("Building Random Forest Regressor")
ran_model = RandomForestRegressor()
ran_model.fit(x_train, y_train)

print("Building Gradient Boosting Regressor")
grad_model = GradientBoostingRegressor(n_estimators=200)
grad_model.fit(x_train, y_train)

print("Building Logistic Regression Model")
log_model = LogisticRegression(solver="saga", max_iter=10000)
log_model.fit(x_train, y_train)

Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model


In [214]:
predictions = [lr_model.predict(x_test),
               svr_model.predict(x_test),
               tree_model.predict(x_test),
               bayes_model.predict(x_test),
               ada_model.predict(x_test),
               ran_model.predict(x_test),
               grad_model.predict(x_test),
               log_model.predict(x_test)]

In [215]:
for idx, pred in enumerate(predictions):
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r_score = r2_score(y_test, pred)

    print(f"Model {idx}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 score: {r_score}\n")

Model 0
Mean Absolute Error: 0.6962940315898762
Mean Squared Error: 0.7564628276790326
Root Mean Squared Error: 0.8697487152500041
R2 score: 0.7033926762996239

Model 1
Mean Absolute Error: 0.6457503899350246
Mean Squared Error: 0.6485110830277652
Root Mean Squared Error: 0.8053018583287668
R2 score: 0.7457203054946233

Model 2
Mean Absolute Error: 0.8151260504201681
Mean Squared Error: 1.3025210084033614
Root Mean Squared Error: 1.1412804249628403
R2 score: 0.48928452763318186

Model 3
Mean Absolute Error: 0.6653825727816108
Mean Squared Error: 0.7034018230485286
Root Mean Squared Error: 0.8386905406933648
R2 score: 0.7241977733915657

Model 4
Mean Absolute Error: 0.6806680819693842
Mean Squared Error: 0.6746980830037042
Root Mean Squared Error: 0.8214000748744208
R2 score: 0.7354524434207704

Model 5
Mean Absolute Error: 0.6171148459383754
Mean Squared Error: 0.6293649859943977
Root Mean Squared Error: 0.7933252712440199
R2 score: 0.7532274458227193

Model 6
Mean Absolute Error: 0.65

In [216]:
from sklearn.model_selection import cross_val_score

scores = [cross_val_score(lr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(svr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(tree_model, x_train, y_train, cv=10).mean(),
          cross_val_score(bayes_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ada_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ran_model, x_train, y_train, cv=10).mean(),
          cross_val_score(grad_model, x_train, y_train, cv=10).mean(),
          cross_val_score(log_model, x_train, y_train, cv=10).mean()]



In [217]:
for idx, score in enumerate(scores):
    print(f"Model {idx}")
    print(f"Overall Score: {score}\n")

Model 0
Overall Score: 0.6846740218545614

Model 1
Overall Score: 0.717342881527865

Model 2
Overall Score: 0.41126713856826014

Model 3
Overall Score: 0.7026016271830992

Model 4
Overall Score: 0.6964191507053006

Model 5
Overall Score: 0.7224860731647178

Model 6
Overall Score: 0.713184690037685

Model 7
Overall Score: 0.47543090712104796



# Machine Learning with LSA score

In [250]:
df_lsa = clean_df.copy()

chosen_essay = df_lsa[df_lsa['actual_score'] >= 11]
chosen_essay = chosen_essay.groupby('actual_score').sample(10, random_state=26)

df_lsa = df_lsa.drop(index = chosen_essay.index)

In [251]:
# Create a vectorizer for lsa similarity
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf_lsa_vectorizer = TfidfVectorizer(lowercase=True,
                                        stop_words='english',
                                        ngram_range = (1,3),
                                        tokenizer = tokenizer.tokenize,
                                        max_features=250,
                                        max_df=0.8,
                                        min_df=3)

tfidf_lsa_matrix = tfidf_lsa_vectorizer.fit_transform(chosen_essay["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_lsa_matrix.shape}")

Train TFIDF Matrix Shape: (20, 250)


In [252]:
# TFIDF to SVD
svd_lsa_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)
    
svd_lsa = svd_lsa_model.fit_transform(tfidf_lsa_matrix)
normalized_svd = Normalizer(copy=False).fit_transform(svd_lsa)

In [253]:
def lsa_score(essay):
    essay_matrix = tfidf_lsa_vectorizer.transform([essay])

    essay_svd = svd_lsa_model.transform(essay_matrix)
    normalized_essay_svd = Normalizer(copy=False).fit_transform(essay_svd)

    similarities = cosine_similarity(normalized_svd, normalized_essay_svd).max()

    return similarities.max()

In [255]:
df_lsa['lsa_score'] = df_lsa['essay'].apply(lsa_score)

df_lsa.head(20)

Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,grammar_errors,lsa_score
0,dear local newspap think effect comput peopl g...,8,386,173,16,4.237143,16,0.645607
1,dear cap cap believ use comput benefit us mani...,9,464,205,20,4.312057,25,0.746648
2,dear cap cap cap peopl use comput everyon agre...,7,313,160,14,4.342756,17,0.676962
3,dear local newspap cap found mani expert say c...,10,611,260,27,4.813208,29,0.729032
4,dear locat know comput posit effect peopl comp...,8,517,210,30,4.334038,17,0.728624
5,dear locat think comput neg effect us mani peo...,8,274,135,15,4.052632,17,0.722233
6,know peopl day depend comput safeti natur educ...,10,580,231,30,4.385827,6,0.664469
7,percent peopl agre comput make life less compl...,10,556,223,39,4.242126,7,0.930333
8,dear reader organ dramat effect human life cha...,9,512,224,35,4.190687,13,0.721024
9,locat technolog comput say comput good societi...,9,561,220,26,3.982659,20,0.624251


In [256]:
# Create a vectorizer for the training data
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                   stop_words='english',
                                   ngram_range = (1,3),
                                   tokenizer = tokenizer.tokenize,
                                   max_features=10000,
                                   max_df=0.8,
                                   min_df=5)

tfidf_matrix = tfidf_vectorizer.fit_transform(df_lsa["essay"])
print(f"Train TFIDF Matrix Shape: {tfidf_matrix.shape}")

# TFIDF to SVD
svd_model = TruncatedSVD(n_components=100,
                         n_iter=200,
                         random_state=69)
    
svd = svd_model.fit_transform(tfidf_matrix)
#normalized_svd = Normalizer(copy=False).fit_transform(svd)
print(type(svd))

Train TFIDF Matrix Shape: (1763, 10000)
<class 'numpy.ndarray'>


In [257]:
x_df_features = df_lsa[['word_count', 'unique_word_count', 'sentence_count', 'avg_word_len', 'grammar_errors', 'lsa_score']]

x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
y_features = df_lsa['actual_score'].to_numpy()

In [258]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

In [259]:
# linear regression, LogisticRegression, SVR

print("Building Linear Regression Model")
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

print("Building SVR Model")
svr_model = SVR()
svr_model.fit(x_train, y_train)

print("Building Decision Tree Model")
tree_model = DecisionTreeRegressor()
tree_model.fit(x_train, y_train)

print("Building Bayesian Regressor")
bayes_model = BayesianRidge()
bayes_model.fit(x_train, y_train)

print("Building AdaBoost Regressor")
ada_model = AdaBoostRegressor(n_estimators=100)
ada_model.fit(x_train, y_train)

print("Building Random Forest Regressor")
ran_model = RandomForestRegressor()
ran_model.fit(x_train, y_train)

print("Building Gradient Boosting Regressor")
grad_model = GradientBoostingRegressor(n_estimators=200)
grad_model.fit(x_train, y_train)

print("Building Logistic Regression Model")
log_model = LogisticRegression(solver="saga", max_iter=10000)
log_model.fit(x_train, y_train)

Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model


In [260]:
predictions = [lr_model.predict(x_test),
               svr_model.predict(x_test),
               tree_model.predict(x_test),
               bayes_model.predict(x_test),
               ada_model.predict(x_test),
               ran_model.predict(x_test),
               grad_model.predict(x_test),
               log_model.predict(x_test)]

In [261]:
for idx, pred in enumerate(predictions):
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r_score = r2_score(y_test, pred)

    print(f"Model {idx}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 score: {r_score}\n")

Model 0
Mean Absolute Error: 0.6526003711290022
Mean Squared Error: 0.7103185147316082
Root Mean Squared Error: 0.8428039598457094
R2 score: 0.7107316027459067

Model 1
Mean Absolute Error: 0.6413297427444046
Mean Squared Error: 0.6365106386053946
Root Mean Squared Error: 0.7978161684281627
R2 score: 0.7407889440497943

Model 2
Mean Absolute Error: 0.6912181303116147
Mean Squared Error: 1.048158640226629
Root Mean Squared Error: 1.0237961907658326
R2 score: 0.5731504055741112

Model 3
Mean Absolute Error: 0.6494706226345482
Mean Squared Error: 0.6963751743662591
Root Mean Squared Error: 0.8344909672166974
R2 score: 0.7164098550175328

Model 4
Mean Absolute Error: 0.6799157946086015
Mean Squared Error: 0.6780406364736538
Root Mean Squared Error: 0.8234322294358254
R2 score: 0.7238763679699545

Model 5
Mean Absolute Error: 0.6102266288951842
Mean Squared Error: 0.5878518413597734
Root Mean Squared Error: 0.7667149674812495
R2 score: 0.7606046351793873

Model 6
Mean Absolute Error: 0.6129

In [262]:
from sklearn.model_selection import cross_val_score

scores = [cross_val_score(lr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(svr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(tree_model, x_train, y_train, cv=10).mean(),
          cross_val_score(bayes_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ada_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ran_model, x_train, y_train, cv=10).mean(),
          cross_val_score(grad_model, x_train, y_train, cv=10).mean(),
          cross_val_score(log_model, x_train, y_train, cv=10).mean()]



In [263]:
for idx, score in enumerate(scores):
    print(f"Model {idx}")
    print(f"Overall Score: {score}\n")

Model 0
Overall Score: 0.6762906202765445

Model 1
Overall Score: 0.7204772810151014

Model 2
Overall Score: 0.4326863192518579

Model 3
Overall Score: 0.7001464221689246

Model 4
Overall Score: 0.702954700103647

Model 5
Overall Score: 0.7219493410579478

Model 6
Overall Score: 0.7071275213281665

Model 7
Overall Score: 0.48723404255319147



# Summary:

### Best Performing Model for Essay Set 1
**Model 5 (Random Forest Regressor, no LSA score)**<br>
Mean Absolute Error: 0.6238935574229693<br>
Mean Squared Error: 0.6290591036414567<br>
Root Mean Squared Error: 0.7931324628594247<br>
R2 score: 0.7533473815852623<br><br>

**Model 5 (Random Forest Regressor, no LSA score, 10-Fold Validation)**<br>
Overall Score: 0.7219474896299766<br><br>

**Model 5 (Random Forest Regressor, with LSA score)**<br>
Mean Absolute Error: 0.563399433427762<br>
Mean Squared Error: 0.5285376770538244<br>
Root Mean Squared Error: 0.727005967687903<br>
R2 score: 0.7660819899557456<br><br>

**Model 5 (Random Forest Regressor, with LSA score, 10-Fold Validation)**<br>
Overall Score: 0.7126748931617651<br>