In [3]:
import pandas as pd
import numpy as np
import string
import nltk
import time
import re
import language_tool_python

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from time import sleep
from threading import Thread
from textstat import flesch_reading_ease

stop_words = stopwords.words('english')

# Persuasive/Narrative/Expository Essays Summary
## Essay Set 1, 2, 7, 8

Features Used
- Topic Relevance
    - LSA (TF-IDF) Matrix
    - Cosine similarity between an essay to the top scoring essays by their TruncatedSVD Matrix
- Word Usage and Sentence complexity
    - Number of words, Sentences, Unique words, Average word length
    - Parts-Of-Speech Tagging
- Grammar and Mechanics
    - Language Tool (number of mistakes)
- Readability (Text Complexity)
    - Flesch Reading Ease

In [4]:
# Thread for LanguageTool
class LanguageCorrect(Thread):
    def __init__(self, df, idx, lt_servers):
        Thread.__init__(self)
        self.value = None
        self.df = df
        self.index = idx
        self.lt_servers = lt_servers
 
    def run(self):
        self.df['essay'] = self.df['essay'].apply(self.autocorrect_essay)
        self.value = self.df
        return
    
    def autocorrect_essay(self, essay):
        corrected_essay = self.lt_servers[self.index].correct(essay)
        return corrected_essay

class LanguageCheck(Thread):
    def __init__(self, df, idx, lt_servers):
        Thread.__init__(self)
        self.value = None
        self.df = df
        self.index = idx
        self.lt_servers = lt_servers

    def run(self):
        self.df['grammar_errors'] = self.df['essay'].apply(self.grammar_errors)
        self.value = self.df
        return
    
    def grammar_errors(self, essay):
        errors = self.lt_servers[self.index].check(essay)
        return len(errors)

In [44]:
# essay structure

def word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)

    return len(words)

def unique_word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    unique_words = set(words)

    return len(unique_words)

def sentence_count(essay):
    sentences = nltk.sent_tokenize(essay)
    
    return len(sentences)

def avg_word_len(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    
    return sum(len(word) for word in words) / len(words)


def sentence_to_wordlist(raw_sentence):
    
    clean_sentence = re.sub("[^a-zA-Z0-9]"," ", raw_sentence)
    tokens = nltk.word_tokenize(clean_sentence)
    
    return tokens

def tokenize(essay):
    stripped_essay = essay.strip()
    
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(stripped_essay)
    
    tokenized_sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            tokenized_sentences.append(sentence_to_wordlist(raw_sentence))
    
    return tokenized_sentences

def count_pos(essay):
    
    tokenized_sentences = tokenize(essay)
    
    noun_count = 0
    adj_count = 0
    verb_count = 0
    adv_count = 0
    
    for sentence in tokenized_sentences:
        tagged_tokens = nltk.pos_tag(sentence)
        
        for token_tuple in tagged_tokens:
            pos_tag = token_tuple[1]
        
            if pos_tag.startswith('N'): 
                noun_count += 1
            elif pos_tag.startswith('J'):
                adj_count += 1
            elif pos_tag.startswith('V'):
                verb_count += 1
            elif pos_tag.startswith('R'):
                adv_count += 1
            
    return noun_count, adj_count, verb_count, adv_count


def readability_score(essay):
    score = flesch_reading_ease(essay)
    return score

In [51]:
def preprocess_dataframe(essay_set):
    lt_servers = []
    for _ in range(3):
        lt_servers.append(language_tool_python.LanguageTool('en-US'))

    thread_list = []

    df = pd.read_excel("training_set_rel3.xls")
    df = df[df["essay_set"]==essay_set]

    print(f"Retrieving Essay Set: #{essay_set}")
    print(f"DataFrame Shape: {df.shape}")

    clean_df = df[['essay', 'domain1_score']].copy()
    clean_df = clean_df.rename(columns={'domain1_score': 'actual_score'})

    if essay_set == 2:
        clean_df = df[['essay', 'domain1_score', 'domain2_score']].copy()
        clean_df['actual_score'] = clean_df['domain1_score'] + clean_df['domain2_score']
        clean_df.drop(['domain1_score', 'domain2_score'], axis=1, inplace=True)

    # get essay structure
    print("\n- PREPROCESSING ESSAY SETS -\n")
    print("Getting Word Count..")
    clean_df['word_count'] = clean_df['essay'].apply(word_count)
    print("Getting Unique Word Count..")
    clean_df['unique_word_count'] = clean_df['essay'].apply(unique_word_count)
    print("Getting Sentence Count..")
    clean_df['sentence_count'] = clean_df['essay'].apply(sentence_count)
    print("Getting Average Word Length..")
    clean_df['avg_word_len'] = clean_df['essay'].apply(avg_word_len)
    print("POS Tagging..")
    clean_df['noun_count'], clean_df['adj_count'], clean_df['verb_count'], clean_df['adv_count'] = zip(*clean_df['essay'].map(count_pos))
    print("Getting Readability..")
    clean_df['readability_score'] = clean_df['essay'].apply(readability_score)

    # get grammatical errors
    print("Getting Grammatical Errors..")
    df_split = np.array_split(clean_df, len(lt_servers))
    # put threads into list
    for idx, i in enumerate(df_split):
        thread_langcheck = LanguageCheck(df=i, idx=idx, lt_servers=lt_servers)
        thread_list.append(thread_langcheck)

    # start thread list
    for thread in thread_list:
        thread.start()

    # join all threads
    for thread in thread_list:
        thread.join()
    
    clean_df = pd.concat([thread.value for thread in thread_list], axis=0)
    
    thread_list.clear()

    # autocorrect errors
    print("Autocorrecting Essay..")
    df_split = np.array_split(clean_df, len(lt_servers))
    # put threads into list
    for idx, i in enumerate(df_split):
        thread_langcheck = LanguageCorrect(df=i, idx=idx, lt_servers=lt_servers)
        thread_list.append(thread_langcheck)

    # start thread list
    for thread in thread_list:
        thread.start()

    # join all threads
    for thread in thread_list:
        thread.join()
    
    clean_df = pd.concat([thread.value for thread in thread_list], axis=0)

    for tool in lt_servers:
        tool.close()
    
    thread_list.clear()
    lt_servers.clear()

    # preprocess essay for tokenization
    clean_df.reset_index(drop=True, inplace=True)
    clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")
    clean_df['essay'] = clean_df['essay'].apply(lambda x: x.lower())

    # tokenization
    print("Tokenizing..")
    tokenized_doc = clean_df['essay'].apply(lambda x: x.split())

    # remove stop-words
    print("Removing Stop Words..")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    # stemming
    print("Stemming Words..")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    # de-tokenization
    print("Detokenizing..")
    detokenized_doc = []
    for i in range(len(clean_df)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    clean_df['essay'] = detokenized_doc
    print("\n- FINISHED -\n")

    return clean_df

In [46]:
def vectorization_process(df, sample_essays, max_features):
    df_lsa = df.copy()
    largest_possible_score = df_lsa.nlargest(1, 'actual_score')['actual_score'].values[0]

    top_score = largest_possible_score - (largest_possible_score * 0.10)

    chosen_essay = df_lsa[df_lsa['actual_score'] >= top_score]
    chosen_essay = chosen_essay.groupby('actual_score').sample(sample_essays, random_state=26)

    df_lsa = df_lsa.drop(index = chosen_essay.index)

    # Create a vectorizer for lsa similarity
    tokenizer = RegexpTokenizer(r'\w+')

    # Vectorize document using TF-IDF
    tfidf_lsa_vectorizer = TfidfVectorizer(lowercase=True,
                                            stop_words='english',
                                            ngram_range = (1,3),
                                            tokenizer = tokenizer.tokenize)

    tfidf_lsa_matrix = tfidf_lsa_vectorizer.fit_transform(chosen_essay["essay"])

    # TFIDF to SVD
    svd_lsa_model = TruncatedSVD(n_components=100,
                            n_iter=200,
                            random_state=69)
        
    svd_lsa = svd_lsa_model.fit_transform(tfidf_lsa_matrix)
    normalized_svd = Normalizer(copy=False).fit_transform(svd_lsa)

    def lsa_score(essay):
        essay_matrix = tfidf_lsa_vectorizer.transform([essay])
        essay_svd = svd_lsa_model.transform(essay_matrix)
        normalized_essay_svd = Normalizer(copy=False).fit_transform(essay_svd)

        # Compare current essay to the top 10% scored essay
        similarities = cosine_similarity(normalized_svd, normalized_essay_svd).max()

        return similarities.max()
    
    df_lsa['lsa_score'] = df_lsa['essay'].apply(lsa_score)

    # Create a vectorizer for the training data
    tokenizer = RegexpTokenizer(r'\w+')

    # Vectorize document using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                    stop_words='english',
                                    ngram_range = (1,3),
                                    tokenizer = tokenizer.tokenize,
                                    max_features=max_features)

    tfidf_matrix = tfidf_vectorizer.fit_transform(df_lsa["essay"])

    # TFIDF to SVD
    svd_model = TruncatedSVD(n_components=100,
                            n_iter=200,
                            random_state=69)
        
    svd = svd_model.fit_transform(tfidf_matrix)

    return df_lsa, svd

In [47]:
def training_process(df_lsa, svd):
    print("Getting Features")
    x_df_features = df_lsa[['word_count', 
                            'unique_word_count',
                            'sentence_count',
                            'avg_word_len',
                            'grammar_errors',
                            'lsa_score', 
                            'readability_score',
                            'noun_count',
                            'adj_count',
                            'verb_count',
                            'adv_count']]

    x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
    y_features = df_lsa['actual_score'].to_numpy()

    print("Splitting Dataset")
    x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

    print("Building Linear Regression Model")
    lr_model = LinearRegression()
    lr_model.fit(x_train, y_train)

    print("Building SVR Model")
    svr_model = SVR()
    svr_model.fit(x_train, y_train)

    print("Building Decision Tree Model")
    tree_model = DecisionTreeRegressor()
    tree_model.fit(x_train, y_train)

    print("Building Bayesian Regressor")
    bayes_model = BayesianRidge()
    bayes_model.fit(x_train, y_train)

    print("Building AdaBoost Regressor")
    ada_model = AdaBoostRegressor(n_estimators=100)
    ada_model.fit(x_train, y_train)

    print("Building Random Forest Regressor")
    ran_model = RandomForestRegressor()
    ran_model.fit(x_train, y_train)

    print("Building Gradient Boosting Regressor")
    grad_model = GradientBoostingRegressor(n_estimators=200)
    grad_model.fit(x_train, y_train)

    print("Building Logistic Regression Model")
    log_model = LogisticRegression(solver="saga", max_iter=10000)
    log_model.fit(x_train, y_train)

    print("Getting Predictions")
    predictions = [ lr_model.predict(x_test),
                    svr_model.predict(x_test),
                    tree_model.predict(x_test),
                    bayes_model.predict(x_test),
                    ada_model.predict(x_test),
                    ran_model.predict(x_test),
                    grad_model.predict(x_test),
                    log_model.predict(x_test)]
    scores = []
    
    for idx, pred in enumerate(predictions):
        mae = mean_absolute_error(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        rmse = np.sqrt(mse)
        r_score = r2_score(y_test, pred)

        scores.append([idx, mae, mse, rmse, r_score])
    
    print("\nResults:")
    best_score = max(scores, key=lambda sublist: sublist[-1])
    print(f"Model {best_score[0]}")
    print(f"Mean Absolute Error: {best_score[1]}")
    print(f"Mean Squared Error: {best_score[2]}")
    print(f"Root Mean Squared Error: {best_score[3]}")
    print(f"R2 score: {best_score[4]}\n")

    print("Cross Validation 10-Folds")

    kf = KFold(n_splits=10)

    scores = [cross_val_score(lr_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(svr_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(tree_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(bayes_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(ada_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(ran_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(grad_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(log_model, x_features, y_features, cv=kf).mean()]
    
    print(f"Model {scores.index(max(scores))}")
    print(f"Overall Score: {max(scores)}\n")

    return best_score, (scores.index(max(scores)), max(scores))# average_score

In [14]:
# not source summary
# (essay_set, sample_essay, max_features)
summaries = [(1, 10, 10000),
            (2, 5, 10000), 
            (7, 10, 10000), 
            (8, 1, 1000)]

summary_scores = []

for summary in summaries:
    print("Preprocess Start")
    clean_df = preprocess_dataframe(summary[0])
    
    print("Vectorization Start")
    df_lsa, svd = vectorization_process(clean_df, summary[1], summary[2])
    
    print("Training Start")
    best_score, average_score = training_process(df_lsa, svd)

    summary_scores.append([best_score, average_score])

    print("\n")
    

Preprocess Start
Retrieving Essay Set #1
Dataframe shape: (1783, 28)
Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
POS Tagging
Getting Readability
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization
Tokenization Start
Removing Stop Words


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Word Stemming
Detokenize
Vectorization Start
Training Start
Getting Features
Splitting Dataset
Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model
Getting Predictions

Results:
Model 5
Mean Absolute Error: 0.5931728045325778
Mean Squared Error: 0.5577917847025495
Root Mean Squared Error: 0.746854594082777
R2 score: 0.7728462200885008

Cross Validation 10-Folds
Model 1
Overall Score: 0.7318932133096692



Preprocess Start
Retrieving Essay Set #2
Dataframe shape: (1800, 28)
Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
POS Tagging
Getting Readability
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization
Tokenization Start
Removing Stop Words


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Word Stemming
Detokenize
Vectorization Start
Training Start
Getting Features
Splitting Dataset
Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model
Getting Predictions

Results:
Model 6
Mean Absolute Error: 0.6484774183609476
Mean Squared Error: 0.6747706545099783
Root Mean Squared Error: 0.8214442491794427
R2 score: 0.5977613666762005

Cross Validation 10-Folds
Model 6
Overall Score: 0.6286681761170474



Preprocess Start
Retrieving Essay Set #7
Dataframe shape: (1569, 28)
Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
POS Tagging
Getting Readability
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization
Tokenization Start
Removing Stop Words


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Word Stemming
Detokenize
Vectorization Start
Training Start
Getting Features
Splitting Dataset
Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model
Getting Predictions

Results:
Model 6
Mean Absolute Error: 2.0788689025580025
Mean Squared Error: 6.874569049361013
Root Mean Squared Error: 2.621939940075099
R2 score: 0.6652576217071945

Cross Validation 10-Folds
Model 6
Overall Score: 0.6322037224022616



Preprocess Start
Retrieving Essay Set #8
Dataframe shape: (723, 28)
Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
POS Tagging
Getting Readability
Getting Grammatical Errors
Autocorrecting Essay
Preprocess for tokenization
Tokenization Start
Removing Stop Words


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Word Stemming
Detokenize
Vectorization Start
Training Start
Getting Features
Splitting Dataset
Building Linear Regression Model
Building SVR Model
Building Decision Tree Model
Building Bayesian Regressor
Building AdaBoost Regressor
Building Random Forest Regressor
Building Gradient Boosting Regressor
Building Logistic Regression Model
Getting Predictions

Results:
Model 5
Mean Absolute Error: 2.9186896551724137
Mean Squared Error: 14.778853103448276
Root Mean Squared Error: 3.8443273928540838
R2 score: 0.5665732280751676

Cross Validation 10-Folds
Model 5
Overall Score: 0.5403300813474997





# Source Dependent Essays Summary
## Essay Sets 3, 4, 5 and 6

Features Used
- Topic Relevance
    - LSA (TF-IDF) Matrix
    - Cosine similarity between an essay to the top scoring essays by their TruncatedSVD Matrix
- Word Usage and Sentence complexity
    - Number of words, Sentences, Unique words, Average word length
    - Parts-Of-Speech Tagging
- Grammar and Mechanics
    - Language Tool (number of mistakes)
- Readability (Text Complexity)
    - Flesch Reading Ease

In [48]:
def preprocess_source_essay(essay_set):
    source = pd.read_csv('source_essays.txt', sep="|", header=None)
    stacked_source = source.stack().reset_index()
    source_essay = stacked_source.drop(['level_0', 'level_1'], axis=1).rename(columns={0: 'essay'})
    source_essay.insert(0, "essay_set", [6, 5, 4, 3], True)
    source_essay = source_essay.sort_values(by=['essay_set'], ascending=True)
    source_essay = source_essay.loc[source_essay['essay_set'] == essay_set]

    source_essay.reset_index(drop=True, inplace=True)

    print("\n- PREPROCESSING SOURCE ESSAY -\n")
    print("Tokenizing..")
    source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")
    source_essay['essay'] = source_essay['essay'].apply(lambda x: x.lower())

    tokenized_doc = source_essay['essay'].apply(lambda x: x.split())

    print("Removing Stop Words..")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    print("Stemming Words..")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    print("Detokenizing..")
    detokenized_doc = []
    for i in range(len(source_essay)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    source_essay['essay'] = detokenized_doc
    print("\n- FINISHED -\n")
    
    return source_essay

In [49]:
def sd_vectorization_process(df, sdf, sample_essays, max_features):
    df_lsa = df.copy()
    largest_possible_score = df_lsa.nlargest(1, 'actual_score')['actual_score'].values[0]
    top_score = largest_possible_score - (largest_possible_score * 0.10)

    df_source = sdf.copy()
    frames = [df_lsa, df_source]
    combined_df = pd.concat(frames)

    combined_essay = combined_df[combined_df['actual_score'] >= top_score]
    combined_essay = combined_essay.groupby('actual_score').sample(sample_essays, random_state=26)
    combined_df = combined_df.drop(index = combined_essay.index)

    # Create a vectorizer for LSA Similarity
    tokenizer = RegexpTokenizer(r'\w+')

    # Vectorize using TF-IDF
    tfidf_lsa_vectorizer = TfidfVectorizer(lowercase=True,
                                            stop_words='english',
                                            ngram_range = (1,3),
                                            tokenizer = tokenizer.tokenize)

    tfidf_lsa_matrix = tfidf_lsa_vectorizer.fit_transform(combined_essay["essay"])

    # TFIDF to SVD
    svd_lsa_model = TruncatedSVD(n_components=100,
                            n_iter=200,
                            random_state=69)
        
    svd_lsa = svd_lsa_model.fit_transform(tfidf_lsa_matrix)
    normalized_svd = Normalizer(copy=False).fit_transform(svd_lsa)

    def lsa_score(essay):
        essay_matrix = tfidf_lsa_vectorizer.transform([essay])
        essay_svd = svd_lsa_model.transform(essay_matrix)
        normalized_essay_svd = Normalizer(copy=False).fit_transform(essay_svd)

        # Compare current essay to the top 10% scored essay
        similarities = cosine_similarity(normalized_svd, normalized_essay_svd).max()

        return similarities.max()
    
    combined_df['lsa_score'] = combined_df['essay'].apply(lsa_score)
    combined_df = combined_df.fillna(0)

    # VECTORIZE: Training Data
    tokenizer = RegexpTokenizer(r'\w+')
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                    stop_words='english',
                                    ngram_range = (1,3),
                                    tokenizer = tokenizer.tokenize,
                                    max_features=max_features)

    tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df["essay"])

    # TFIDF to SVD
    svd_model = TruncatedSVD(n_components=100,
                            n_iter=200,
                            random_state=69)
   
    svd = svd_model.fit_transform(tfidf_matrix)

    return combined_df, svd

In [56]:
def sd_training_process(df_lsa, svd):
    print("Getting Features..")
    x_df_features = df_lsa[['word_count', 
                            'unique_word_count',
                            'sentence_count',
                            'avg_word_len',
                            'grammar_errors',
                            'lsa_score', 
                            'readability_score',
                            'noun_count',
                            'adj_count',
                            'verb_count',
                            'adv_count']]

    x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
    y_features = df_lsa['actual_score'].to_numpy()

    print("Splitting Dataset..")
    x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

    print("Building Linear Regression Model..")
    lr_model = LinearRegression()
    lr_model.fit(x_train, y_train)

    print("Building SVR Model..")
    svr_model = SVR()
    svr_model.fit(x_train, y_train)

    print("Building Decision Tree Model..")
    tree_model = DecisionTreeRegressor()
    tree_model.fit(x_train, y_train)

    print("Building Bayesian Regressor..")
    bayes_model = BayesianRidge()
    bayes_model.fit(x_train, y_train)

    print("Building AdaBoost Regressor..")
    ada_model = AdaBoostRegressor(n_estimators=100)
    ada_model.fit(x_train, y_train)

    print("Building Random Forest Regressor..")
    ran_model = RandomForestRegressor()
    ran_model.fit(x_train, y_train)

    print("Building Gradient Boosting Regressor..")
    grad_model = GradientBoostingRegressor(n_estimators=200)
    grad_model.fit(x_train, y_train)

    print("Building Logistic Regression Model..")
    log_model = LogisticRegression(solver="saga", max_iter=10000)
    log_model.fit(x_train, y_train)

    print("Getting Predictions..")
    predictions = [ lr_model.predict(x_test),
                    svr_model.predict(x_test),
                    tree_model.predict(x_test),
                    bayes_model.predict(x_test),
                    ada_model.predict(x_test),
                    ran_model.predict(x_test),
                    grad_model.predict(x_test),
                    log_model.predict(x_test)]
    scores = []
    
    for idx, pred in enumerate(predictions):
        mae = mean_absolute_error(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        rmse = np.sqrt(mse)
        r_score = r2_score(y_test, pred)

        scores.append([idx, mae, mse, rmse, r_score])
    
    print("\nResults:")
    best_score = max(scores, key=lambda sublist: sublist[-1])
    print(f"MODEL: {best_score[0]}")
    print(f"Mean Absolute Error: {best_score[1]}")
    print(f"Mean Squared Error: {best_score[2]}")
    print(f"Root Mean Squared Error: {best_score[3]}")
    print(f"R2 score: {best_score[4]}\n")

    print("[Cross Validation 10-Folds]")

    kf = KFold(n_splits=10)

    scores = [cross_val_score(lr_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(svr_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(tree_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(bayes_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(ada_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(ran_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(grad_model, x_features, y_features, cv=kf).mean(),
          cross_val_score(log_model, x_features, y_features, cv=kf).mean()]
    
    print(f"MODEL: {scores.index(max(scores))}")
    print(f"Overall Score: {max(scores)}\n")

    return best_score, (scores.index(max(scores)), max(scores))# average_score

In [57]:
clean_df = preprocess_dataframe(3)
source_df = preprocess_source_essay(3)
combined_df, svd = sd_vectorization_process(clean_df, source_df, 10, 10000)

combined_df

Retrieving Essay Set: #3
DataFrame Shape: (1726, 28)

- PREPROCESSING ESSAY SETS -

Getting Word Count..
Getting Unique Word Count..
Getting Sentence Count..
Getting Average Word Length..
POS Tagging..
Getting Readability..
Getting Grammatical Errors..
Autocorrecting Essay..


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Tokenizing..
Removing Stop Words..
Stemming Words..
Detokenizing..

- FINISHED -


- PREPROCESSING SOURCE ESSAY -

Tokenizing..
Removing Stop Words..
Stemming Words..
Detokenizing..

- FINISHED -



  source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,essay,actual_score,word_count,unique_word_count,sentence_count,avg_word_len,noun_count,adj_count,verb_count,adv_count,readability_score,grammar_errors
0,featur set affect cyclist mani way featur set ...,1,51,32,3,4.098039,15,1,9,0,71.14,1
1,featur set affect cyclist neg desert dri hot t...,2,179,106,12,4.418994,49,13,29,7,70.63,19
2,everyon travel unfamiliar place sometim get lo...,1,97,73,8,4.164948,22,8,19,11,83.05,1
3,believ featur cyclist affect impati trustworth...,1,87,62,3,3.896552,19,11,13,3,67.72,14
4,set effect cyclist set differ stori would made...,2,134,80,3,4.126866,31,15,21,8,63.06,11
...,...,...,...,...,...,...,...,...,...,...,...,...
1721,stori set affect cyclist mani way exampl condi...,2,66,52,6,4.136364,19,5,14,1,85.69,1
1722,featur set affect cyclist like group hill alon...,1,54,48,3,4.333333,15,5,10,7,78.59,2
1723,set greatli affect cyclist tri get yosemit nat...,2,113,74,5,4.159292,26,7,21,7,69.62,3
1724,featur set affect cyclist author say californi...,2,152,90,7,4.302632,33,11,30,15,66.67,2


In [53]:
sd_summaries = [(3, 10, 10000),
            (4, 5, 10000), 
            (5, 10, 10000), 
            (6, 1, 1000)]

sd_summary_scores = []

for summary in sd_summaries:
    print("START: Preprocessing..\n")
    clean_df = preprocess_dataframe(summary[0])
    source_df = preprocess_source_essay(summary[0])
    
    print("PROCESS: Vectorization..\n")
    df_lsa, svd = sd_vectorization_process(clean_df, source_df, summary[1], summary[2])
    df_lsa
    
    print("PROCESS: Training..\n")
    best_score, average_score = training_process(df_lsa, svd)

    summary_scores.append([best_score, average_score])
    print("\n")

START: Preprocessing..

Retrieving Essay Set: #3
DataFrame Shape: (1726, 28)

- PREPROCESSING ESSAY SETS -

Getting Word Count..
Getting Unique Word Count..
Getting Sentence Count..
Getting Average Word Length..
POS Tagging..
Getting Readability..
Getting Grammatical Errors..
Autocorrecting Essay..


  clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")


Tokenizing..
Removing Stop Words..
Stemming Words..
Detokenizing..

- FINISHED -


- PREPROCESSING SOURCE ESSAY -

Tokenizing..
Removing Stop Words..
Stemming Words..
Detokenizing..

- FINISHED -

PROCESS: Vectorization..



  source_essay['essay'] = source_essay['essay'].str.replace("[^a-zA-Z#]", " ")


PROCESS: Training..

Getting Features


KeyError: "['lsa_score'] not in index"