In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
# Global notebook configuration: seed NumPy's legacy global RNG and pick the plot theme.
RANDOM_SEED = 0
PLOT_STYLE = 'ggplot'

np.random.seed(RANDOM_SEED)
plt.style.use(PLOT_STYLE)

In [2]:
# Load the training and test tables from CSV files in the working directory
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [4]:
# Narrow the training table to the three columns selected here: id, excerpt and target
TRIM_COLUMNS = ["id", "excerpt", "target"]
train_data_trim = train_data[TRIM_COLUMNS]

In [19]:
# TF-IDF settings used to turn each excerpt into a sparse feature row
tfidf_options = dict(
    strip_accents='unicode',             # accents are stripped during preprocessing
    stop_words='english',                # use scikit-learn's built-in English stop-word list
    token_pattern=r'(?u)\b[A-Za-z]+\b',  # a token is a run of ASCII letters only
    lowercase=True,
    max_features=13000,                  # several sizes were tried; 13000 gave the lowest RMSE
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary / IDF weights from all training excerpts and encode them.
# NOTE(review): the vectorizer is fitted before the train/validation split below, so
# validation rows also contribute to the vocabulary and IDF weights — confirm this is
# acceptable for any validation metric computed from X_val.
train_excerpts = train_data['excerpt'].values
X = vectorizer.fit_transform(train_excerpts)

# Regression target, taken from the same table (row order matches X)
y = train_data.loc[:, 'target']

In [6]:
# Hold out 20% of the rows as a validation set; the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Hyperparameters for the final MLP regressor (described in the notebook as the best found)
mlp_settings = dict(
    hidden_layer_sizes=(50, 50, 50),  # three hidden layers with 50 units each
    activation='relu',
    alpha=0.0001,
    learning_rate='constant',
    max_iter=1000,
    n_iter_no_change=100,
    random_state=42,                  # fixed seed for the estimator
)

# Construct the regressor and fit it on the training split; fit() returns the estimator itself
best_mlp = MLPRegressor(**mlp_settings).fit(X_train, y_train)

In [23]:
# Load the table of the top 100 books from CSV
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like
# a row index that was saved into the file — consider whether it is needed downstream.
BOOKS_CSV = 'top100books.csv'
df = pd.read_csv(BOOKS_CSV)

In [24]:
# Preview the first five rows of the books table
df.head(n=5)

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content
0,1342,Pride and Prejudice,Jane Austen,54222,"It is a truth universally acknowledged, that ..."
1,84,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,43135,"St. Petersburgh, Dec. 11th, 17—. You will rej..."
2,11,Alice's Adventures in Wonderland,Lewis Carroll,27199,Alice was beginning to get very tired of sitt...
3,16328,Beowulf: An Anglo-Saxon Epic Poem,,25709,CONTENTS. The Heyne-Socin text and glossary h...
4,25344,The Scarlet Letter,Nathaniel Hawthorne,25582,BY NATHANIEL HAWTHORNE. Illustrated. The auth...


In [25]:
# Drop rows with no book content.
# The frame is modified in place: every row whose 'book_content' value is missing is removed.
# The remaining rows keep their original index labels, so the index may have gaps afterwards.
df.dropna(subset=['book_content'], inplace=True)

In [26]:
# Encode each book's text with the existing vectorizer.
# Only transform() is called here, so the vectorizer is not re-fitted on the book texts.
book_texts = df['book_content'].values
X_books = vectorizer.transform(book_texts)

In [29]:
# Run the trained MLP regressor over the vectorized book texts to get one score per book.
# NOTE(review): despite the name, these are predictions for the books, not for the
# validation split (X_val).
val_preds = best_mlp.predict(X=X_books)

In [30]:
# Add predicted scores to dataframe.
# val_preds is the output of best_mlp.predict(X_books), and X_books was built from this
# frame's 'book_content' column, so the predictions are stored in a new 'book_score' column.
# NOTE(review): assumes val_preds is a plain array assigned by position — confirm.
df['book_score'] = val_preds

In [31]:
# Preview the first five rows again, now including the 'book_score' column
df.head(n=5)

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
0,1342,Pride and Prejudice,Jane Austen,54222,"It is a truth universally acknowledged, that ...",-1.350155
1,84,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,43135,"St. Petersburgh, Dec. 11th, 17—. You will rej...",-3.09869
2,11,Alice's Adventures in Wonderland,Lewis Carroll,27199,Alice was beginning to get very tired of sitt...,-0.13825
3,16328,Beowulf: An Anglo-Saxon Epic Poem,,25709,CONTENTS. The Heyne-Socin text and glossary h...,-2.585912
4,25344,The Scarlet Letter,Nathaniel Hawthorne,25582,BY NATHANIEL HAWTHORNE. Illustrated. The auth...,-3.256031


In [33]:
# Write the scored books table to CSV without the DataFrame index.
# The "utf-8-sig" codec writes a UTF-8 byte-order mark at the start of the file.
SCORED_CSV = 'top100books_scored.csv'
df.to_csv(SCORED_CSV, encoding="utf-8-sig", index=False)