In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import language_tool_python
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from time import sleep
from threading import Thread

lt_servers = []
thread_list = []
    
stop_words = stopwords.words('english')

In [26]:
df = pd.read_excel("training_set_rel3.xls")
df.head()
df.shape

(12978, 28)

In [27]:
df = df[df["essay_set"]==2]
print(f"Retrieving Essay Set #{1}")
print(f"Dataframe shape: {df.shape}")
clean_df = df[['essay', 'domain1_score', 'domain2_score']].copy()
clean_df['actual_score'] = clean_df['domain1_score'] + clean_df['domain2_score']
clean_df.drop(['domain1_score', 'domain2_score'], axis=1, inplace=True)

Retrieving Essay Set #1
Dataframe shape: (1800, 28)


In [25]:
clean_df.head()

Unnamed: 0,essay,actual_score
1783,Certain materials being removed from libraries...,8.0
1784,Write a persuasive essay to a newspaper reflec...,2.0
1785,Do you think that libraries should remove cert...,5.0
1786,"In @DATE1's world, there are many things found...",8.0
1787,In life you have the 'offensive things'. The l...,8.0


In [4]:
# essay structure
import re
import nltk

def word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(essay)

    return len(words)

def unique_word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    unique_words = set(words)

    return len(unique_words)

def sentence_count(essay):
    sentences = nltk.sent_tokenize(essay)
    
    return len(sentences)

def avg_word_len(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    
    return sum(len(word) for word in words) / len(words)

In [5]:
# custom thread
class LanguageCheck(Thread):
    # constructor
    def __init__(self, df, idx):
        # execute the base constructor
        Thread.__init__(self)
        # set a default value
        self.value = None
        self.df = df
        self.index = idx
 
    # function executed in a new thread
    def run(self):
        self.df['grammar_errors'] = self.df['essay'].apply(self.grammar_errors)
        self.value = self.df
    
    def grammar_errors(self, essay):
        errors = lt_servers[self.index].check(essay)
        return len(errors)

In [6]:
# fix errors
def autocorrect_essay(essay):
    language_tool = language_tool_python.LanguageTool('en-US')
    corrected_essay = language_tool.correct(essay)
    return corrected_essay

# pool autocorrect
def pool_autocorrect(essay):
    language_tool = language_tool_python.LanguageTool('en-US')
    corrected_essay = language_tool.correct(essay)
    return corrected_essay

df_split = np.array_split(clean_df, len(lt_servers))

# put threads into list
for idx, i in enumerate(df_split):
    thread_langcheck = LanguageCheck(df=i, idx=idx)
    thread_list.append(thread_langcheck)

# start thread list
for thread in thread_list:
    thread.start()

# join all threads
for thread in thread_list:
    thread.join()

In [15]:
df_join = pd.concat([thread.value for thread in thread_list], axis=0)

Unnamed: 0,essay,actual_score,grammar_errors
0,"Dear local newspaper, I think effects computer...",8,16
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,25
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,17
3,"Dear Local Newspaper, @CAPS1 I have found that...",10,29
4,"Dear @LOCATION1, I know having computers has a...",8,17
...,...,...,...
1778,"Dear @CAPS1, @CAPS2 several reasons on way I t...",8,33
1779,Do a adults and kids spend to much time on the...,7,16
1780,My opinion is that people should have computer...,8,15
1781,"Dear readers, I think that its good and bad to...",2,0


In [4]:
def preprocess_dataframe(df, essay_set):
    df = df[df["essay_set"]==essay_set]
    print(f"Retrieving Essay Set #{essay_set}")
    print(f"Dataframe shape: {df.shape}")
    clean_df = df[['essay', 'domain1_score']].copy()
    clean_df = clean_df.rename(columns={'domain1_score': 'actual_score'})

    # get essay structure
    print("Getting Word Count")
    clean_df['word_count'] = clean_df['essay'].apply(word_count)
    print("Getting Unique Word Count")
    clean_df['unique_word_count'] = clean_df['essay'].apply(unique_word_count)
    print("Getting Sentence Count")
    clean_df['sentence_count'] = clean_df['essay'].apply(sentence_count)
    print("Getting Average Word Length")
    clean_df['avg_word_len'] = clean_df['essay'].apply(avg_word_len)

    # get grammatical errors
    print("Getting Grammatical Errors")
    clean_df['grammar_errors'] = clean_df['essay'].apply(grammar_errors)

    # autocorrect errors
    print("Autocorrecting Essay")
    clean_df['essay'] = clean_df['essay'].apply(autocorrect_essay)

    # preprocess essay for tokenization
    print("Preprocess for tokenization")
    clean_df.reset_index(drop=True, inplace=True)
    clean_df['essay'] = clean_df['essay'].str.replace("[^a-zA-Z#]", " ")
    clean_df['essay'] = clean_df['essay'].apply(lambda x: x.lower())

    # tokenization
    print("Tokenization Start")
    tokenized_doc = clean_df['essay'].apply(lambda x: x.split())

    # remove stop-words
    print("Removing Stop Words")
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    # stemming
    print("Word Stemming")
    porter_stemmer = PorterStemmer()
    tokenized_doc = tokenized_doc.apply(lambda x: [porter_stemmer.stem(item) for item in x])

    # de-tokenization
    print("Detokenize")
    detokenized_doc = []
    for i in range(len(clean_df)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)

    clean_df['essay'] = detokenized_doc

    return clean_df

In [5]:
def scorer_no_lsa_similarity():
    # Essay Set 1, max_features = 10000, min_df = 5

    essay_set = 1
    max_features = 10000
    min_df = 5

    print("Preprocess Start")
    clean_df = preprocess_dataframe(df, essay_set)

    print("Creating TF-IDF Vectorizer")
    # Create a vectorizer for the training data
    tokenizer = RegexpTokenizer(r'\w+')

    # Vectorize document using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                            stop_words='english',
                                            ngram_range = (1,3),
                                            tokenizer = tokenizer.tokenize,
                                            max_features=max_features,
                                            max_df=0.8,
                                            min_df=min_df)
    print("Building Matrix")
    tfidf_matrix = tfidf_vectorizer.fit_transform(clean_df["essay"])
    print(f"Train TFIDF Matrix Shape: {tfidf_matrix.shape}")

    print("Convert TF-IDF matrix to SVD")
    # TFIDF to SVD
    svd_model = TruncatedSVD(n_components=100,
                            n_iter=200,
                            random_state=69)
        
    svd = svd_model.fit_transform(tfidf_matrix)
    #normalized_svd = Normalizer(copy=False).fit_transform(svd)

    print("Training Start")

    print("Getting Features")
    x_df_features = clean_df[['word_count', 'unique_word_count', 'sentence_count', 'avg_word_len', 'grammar_errors']]
    x_features = np.concatenate((x_df_features.to_numpy(), svd), axis=1)
    y_features = clean_df['actual_score'].to_numpy()

    print("Splitting Dataset")
    x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size = 0.2, train_size = 0.8, random_state = 420)

    print("Building Linear Regression Model")
    lr_model = LinearRegression()
    lr_model.fit(x_train, y_train)

    print("Building SVR Model")
    svr_model = SVR()
    svr_model.fit(x_train, y_train)

    print("Building Decision Tree Model")
    tree_model = DecisionTreeRegressor()
    tree_model.fit(x_train, y_train)

    print("Building Bayesian Regressor")
    bayes_model = BayesianRidge()
    bayes_model.fit(x_train, y_train)

    print("Building AdaBoost Regressor")
    ada_model = AdaBoostRegressor(n_estimators=100)
    ada_model.fit(x_train, y_train)

    print("Building Random Forest Regressor")
    ran_model = RandomForestRegressor()
    ran_model.fit(x_train, y_train)

    print("Building Gradient Boosting Regressor")
    grad_model = GradientBoostingRegressor(n_estimators=200)
    grad_model.fit(x_train, y_train)

    print("Building Logistic Regression Model")
    log_model = LogisticRegression(solver="saga", max_iter=10000)
    log_model.fit(x_train, y_train)

    print("Getting Predictions")
    predictions = [ lr_model.predict(x_test),
                    svr_model.predict(x_test),
                    tree_model.predict(x_test),
                    bayes_model.predict(x_test),
                    ada_model.predict(x_test),
                    ran_model.predict(x_test),
                    grad_model.predict(x_test),
                    log_model.predict(x_test)]
    scores = []
    
    for idx, pred in enumerate(predictions):
        mae = mean_absolute_error(y_test, pred)
        mse = mean_squared_error(y_test, pred)
        rmse = np.sqrt(mse)
        r_score = r2_score(y_test, pred)

        scores.append([idx, mae, mse, rmse, r_score])
    
    print("\nResults:")
    best_score = max(scores, key=lambda sublist: sublist[0])
    print(f"Model {best_score[0]}")
    print(f"Mean Absolute Error: {best_score[1]}")
    print(f"Mean Squared Error: {best_score[2]}")
    print(f"Root Mean Squared Error: {best_score[3]}")
    print(f"R2 score: {best_score[4]}\n")

    print("Cross Validation 10-Folds")
    scores = [cross_val_score(lr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(svr_model, x_train, y_train, cv=10).mean(),
          cross_val_score(tree_model, x_train, y_train, cv=10).mean(),
          cross_val_score(bayes_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ada_model, x_train, y_train, cv=10).mean(),
          cross_val_score(ran_model, x_train, y_train, cv=10).mean(),
          cross_val_score(grad_model, x_train, y_train, cv=10).mean(),
          cross_val_score(log_model, x_train, y_train, cv=10).mean()]
    
    print(f"Model {scores.index(max(scores))}")
    print(f"Overall Score: {max(scores)}\n")

In [6]:
scorer_no_lsa_similarity()

Preprocess Start
Retrieving Essay Set #1
Dataframe shape: (1783, 28)
Getting Word Count
Getting Unique Word Count
Getting Sentence Count
Getting Average Word Length
Getting Grammatical Errors
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
THREAD!
