In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
np.random.seed(0)
plt.style.use('ggplot')

In [2]:
# Import training and testing data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
train_data_trim = train_data[["id", "excerpt", "target"]]

In [19]:
# Vectorize words of train data
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    lowercase=True,
    max_features=13000) #looped through multiple numbers and had minimal rmse with 13000 features

X = vectorizer.fit_transform(train_data['excerpt'].values)

# Get target values of train data
y = train_data.loc[:,'target']

In [6]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
# Train MLP Regressor using best parameters
best_mlp = MLPRegressor(hidden_layer_sizes = (50, 50, 50), 
                    activation = 'relu',
                    alpha = 0.0001,
                    learning_rate = 'constant',
                    max_iter= 1000,
                    n_iter_no_change=100,
                    random_state=42
                    ).fit(X_train, y_train)

In [23]:
# Read csv file of top 100 books
df = pd.read_csv('top100books.csv')

In [24]:
df.head()

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content
0,1342,Pride and Prejudice,Jane Austen,54222,"It is a truth universally acknowledged, that ..."
1,84,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,43135,"St. Petersburgh, Dec. 11th, 17—. You will rej..."
2,11,Alice's Adventures in Wonderland,Lewis Carroll,27199,Alice was beginning to get very tired of sitt...
3,16328,Beowulf: An Anglo-Saxon Epic Poem,,25709,CONTENTS. The Heyne-Socin text and glossary h...
4,25344,The Scarlet Letter,Nathaniel Hawthorne,25582,BY NATHANIEL HAWTHORNE. Illustrated. The auth...


In [25]:
# Drop rows with no book content
df.dropna(subset=['book_content'], inplace=True)

In [26]:
# Vectorize text
X_books = vectorizer.transform(df['book_content'].values)

In [29]:
# predict texts using model
val_preds = best_mlp.predict(X_books)

In [30]:
# Add predicted scores to dataframe
df['book_score'] = val_preds

In [31]:
df.head()

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
0,1342,Pride and Prejudice,Jane Austen,54222,"It is a truth universally acknowledged, that ...",-1.350155
1,84,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,43135,"St. Petersburgh, Dec. 11th, 17—. You will rej...",-3.09869
2,11,Alice's Adventures in Wonderland,Lewis Carroll,27199,Alice was beginning to get very tired of sitt...,-0.13825
3,16328,Beowulf: An Anglo-Saxon Epic Poem,,25709,CONTENTS. The Heyne-Socin text and glossary h...,-2.585912
4,25344,The Scarlet Letter,Nathaniel Hawthorne,25582,BY NATHANIEL HAWTHORNE. Illustrated. The auth...,-3.256031


In [33]:
df.to_csv('top100books_scored.csv', index=False, encoding="utf-8-sig")

In [34]:
df_sorted = df.sort_values(by=['book_score'])

In [56]:
df_sorted.iloc[0]

book_code                                                      1497
book_title                                             The Republic
book_author                                                   Plato
book_downloads                                                12673
book_content       Note: See also “The Republic” by Plato, Jowet...
book_score                                                -4.209269
Name: 23, dtype: object

In [60]:
# Year 13
df[df['book_score'] < -3.5]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
21,6130,The Iliad,Homer,14133,Scepticism is as much the result of knowledge...,-3.868761
23,1497,The Republic,Plato,12673,"Note: See also “The Republic” by Plato, Jowet...",-4.209269
39,7370,Second Treatise of Government,John Locke,9646,SALUS POPULI SUPREMA LEX ESTO LONDON PRINTED ...,-3.603892
67,3600,Essays of Michel de Montaigne — Complete,Michel de Montaigne,6266,The present publication is intended to supply...,-3.786203
75,7142,The History of the Peloponnesian War,Thucydides,5876,The State of Greece from the earliest Times t...,-3.579429
95,4363,Beyond Good and Evil,Friedrich Wilhelm Nietzsche,4918,The following is a reprint of the Helen Zimme...,-4.145548


In [58]:
# Year 13
df_sorted[0:9]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
23,1497,The Republic,Plato,12673,"Note: See also “The Republic” by Plato, Jowet...",-4.209269
95,4363,Beyond Good and Evil,Friedrich Wilhelm Nietzsche,4918,The following is a reprint of the Helen Zimme...,-4.145548
21,6130,The Iliad,Homer,14133,Scepticism is as much the result of knowledge...,-3.868761
67,3600,Essays of Michel de Montaigne — Complete,Michel de Montaigne,6266,The present publication is intended to supply...,-3.786203
39,7370,Second Treatise of Government,John Locke,9646,SALUS POPULI SUPREMA LEX ESTO LONDON PRINTED ...,-3.603892
75,7142,The History of the Peloponnesian War,Thucydides,5876,The State of Greece from the earliest Times t...,-3.579429
42,15399,The Interesting Narrative of the Life of Olaud...,Olaudah Equiano,9156,"Sold also by Mr. Johnson, St. Paul's Church-Y...",-3.395914
4,25344,The Scarlet Letter,Nathaniel Hawthorne,25582,BY NATHANIEL HAWTHORNE. Illustrated. The auth...,-3.256031
54,36,The War of the Worlds,H. G. Wells,7315,No one would have believed in the last years ...,-3.212231


In [52]:
# Year 12
df_sorted[11:20]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
78,27827,The Kama Sutra of Vatsyayana,Vatsyayana,5621,[1] [2] [3] While the introduction will bear ...,-3.093092
96,3090,Complete Original Short Stories of Guy De Maup...,Guy de Maupassant,4868,A STUDY BY POL. NEVEUX “I entered literary li...,-3.019937
89,1998,Thus Spake Zarathustra: A Book for All and None,Friedrich Wilhelm Nietzsche,5210,PG Editor’s Note: Archaic spelling and punctu...,-3.016426
27,23,"Narrative of the Life of Frederick Douglass, a...",Frederick Douglass,11721,Note from the original file: This electronic ...,-2.991007
99,3825,Pygmalion,Bernard Shaw,4784,"As will be seen later on, Pygmalion needs, no...",-2.964467
24,408,The Souls of Black Folk,W. E. B. Du Bois,12527,Herein lie buried many things which if read w...,-2.877077
86,11030,"Incidents in the Life of a Slave Girl, Written...",Harriet A. Jacobs,5322,[Transcriberâs note: The spelling irregular...,-2.850805
68,135,Les Misérables,Victor Hugo,6261,"So long as there shall exist, by virtue of la...",-2.831422
52,41,The Legend of Sleepy Hollow,Washington Irving,7808,In the bosom of one of those spacious coves w...,-2.735644


In [66]:
# Year 12
df[(df['book_score'] < -3) & (df['book_score'] > -3.5)]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
1,84,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,43135,"St. Petersburgh, Dec. 11th, 17—. You will rej...",-3.09869
4,25344,The Scarlet Letter,Nathaniel Hawthorne,25582,BY NATHANIEL HAWTHORNE. Illustrated. The auth...,-3.256031
42,15399,The Interesting Narrative of the Life of Olaud...,Olaudah Equiano,9156,"Sold also by Mr. Johnson, St. Paul's Church-Y...",-3.395914
54,36,The War of the Worlds,H. G. Wells,7315,No one would have believed in the last years ...,-3.212231
56,244,A Study in Scarlet,Arthur Conan Doyle,7147,Original Transcriber’s Note: This etext is pr...,-3.094899
78,27827,The Kama Sutra of Vatsyayana,Vatsyayana,5621,[1] [2] [3] While the introduction will bear ...,-3.093092
89,1998,Thus Spake Zarathustra: A Book for All and None,Friedrich Wilhelm Nietzsche,5210,PG Editor’s Note: Archaic spelling and punctu...,-3.016426
96,3090,Complete Original Short Stories of Guy De Maup...,Guy de Maupassant,4868,A STUDY BY POL. NEVEUX “I entered literary li...,-3.019937


In [53]:
# Year 11
df_sorted[21:30]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
81,28054,The Brothers Karamazov,Fyodor Dostoyevsky,5546,Alexey Fyodorovitch Karamazov was the third s...,-2.651708
77,5827,The Problems of Philosophy,Bertrand Russell,5720,In the following pages I have confined myself...,-2.640564
9,174,The Picture of Dorian Gray,Oscar Wilde,18927,The artist is the creator of beautiful things...,-2.605979
92,3296,The Confessions of St. Augustine,Bishop of Hippo Saint Augustine,5128,"Great art Thou, O Lord, and greatly to be pra...",-2.603193
3,16328,Beowulf: An Anglo-Saxon Epic Poem,,25709,CONTENTS. The Heyne-Socin text and glossary h...,-2.585912
14,1080,A Modest Proposal,Jonathan Swift,16650,"It is a melancholy object to those, who walk ...",-2.553162
18,205,"Walden, and On The Duty Of Civil Disobedience",Henry David Thoreau,14647,"When I wrote the following pages, or rather t...",-2.552296
82,10007,Carmilla,Joseph Sheridan Le Fanu,5539,Upon a paper attached to the Narrative which ...,-2.548299
6,2701,"Moby Dick; Or, The Whale",Herman Melville,22079,"This text is a combination of etexts, one fro...",-2.534993


In [67]:
# Year 11
df[(df['book_score'] < -2.5) & (df['book_score'] > -3)]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
3,16328,Beowulf: An Anglo-Saxon Epic Poem,,25709,CONTENTS. The Heyne-Socin text and glossary h...,-2.585912
6,2701,"Moby Dick; Or, The Whale",Herman Melville,22079,"This text is a combination of etexts, one fro...",-2.534993
9,174,The Picture of Dorian Gray,Oscar Wilde,18927,The artist is the creator of beautiful things...,-2.605979
14,1080,A Modest Proposal,Jonathan Swift,16650,"It is a melancholy object to those, who walk ...",-2.553162
18,205,"Walden, and On The Duty Of Civil Disobedience",Henry David Thoreau,14647,"When I wrote the following pages, or rather t...",-2.552296
24,408,The Souls of Black Folk,W. E. B. Du Bois,12527,Herein lie buried many things which if read w...,-2.877077
27,23,"Narrative of the Life of Frederick Douglass, a...",Frederick Douglass,11721,Note from the original file: This electronic ...,-2.991007
52,41,The Legend of Sleepy Hollow,Washington Irving,7808,In the bosom of one of those spacious coves w...,-2.735644
68,135,Les Misérables,Victor Hugo,6261,"So long as there shall exist, by virtue of la...",-2.831422
71,829,Gulliver's Travels into Several Remote Nations...,Jonathan Swift,6222,"The author of these Travels, Mr. Lemuel Gulli...",-2.684519


In [54]:
# Year 10
df_sorted[31:40]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
8,1232,The Prince,Niccolò Machiavelli,20783,Nicolo Machiavelli was born at Florence on 3r...,-2.426964
19,1260,Jane Eyre: An Autobiography,Charlotte Brontë,14236,A preface to the first edition of “Jane Eyre”...,-2.408566
12,98,A Tale of Two Cities,Charles Dickens,17895,"It was the best of times, it was the worst of...",-2.373994
76,2500,Siddhartha,Hermann Hesse,5859,"In the shade of the house, in the sunshine of...",-2.356447
97,6133,"The Extraordinary Adventures of Arsene Lupin, ...",Maurice Leblanc,4846,It was a strange ending to a voyage that had ...,-2.347675
93,103,Around the World in Eighty Days,Jules Verne,4975,"Mr. Phileas Fogg lived, in 1872, at No. 7, Sa...",-2.312369
91,521,The Life and Adventures of Robinson Crusoe,Daniel Defoe,5147,"I was born in the year 1632, in the city of Y...",-2.281668
34,2554,Crime and Punishment,Fyodor Dostoyevsky,10143,A few words about Dostoevsky himself may help...,-2.275184
25,1727,The Odyssey,Homer,12502,This translation is intended to supplement a ...,-2.262257


In [68]:
# Year 10
df[(df['book_score'] < -2) & (df['book_score'] > -2.5)]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
8,1232,The Prince,Niccolò Machiavelli,20783,Nicolo Machiavelli was born at Florence on 3r...,-2.426964
12,98,A Tale of Two Cities,Charles Dickens,17895,"It was the best of times, it was the worst of...",-2.373994
16,43,The Strange Case of Dr. Jekyll and Mr. Hyde,Robert Louis Stevenson,15803,Mr. Utterson the lawyer was a man of a rugged...,-2.154277
19,1260,Jane Eyre: An Autobiography,Charlotte Brontë,14236,A preface to the first edition of “Jane Eyre”...,-2.408566
25,1727,The Odyssey,Homer,12502,This translation is intended to supplement a ...,-2.262257
34,2554,Crime and Punishment,Fyodor Dostoyevsky,10143,A few words about Dostoevsky himself may help...,-2.275184
51,996,Don Quixote,Miguel de Cervantes Saavedra,8014,The book cover and spine above and the images...,-2.044886
63,730,Oliver Twist,Charles Dickens,6424,Among other public buildings in a certain tow...,-2.126464
69,19942,Candide,Voltaire,6255,The Publishers will be glad to mail complete ...,-2.134335
76,2500,Siddhartha,Hermann Hesse,5859,"In the shade of the house, in the sunshine of...",-2.356447


In [55]:
# Year 9
df_sorted[41:50]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
80,2148,The Works of Edgar Allan Poe — Volume 2,Edgar Allan Poe,5610,[Redactor’s Note—Some endnotes are by Poe and...,-2.143657
88,140,The Jungle,Upton Sinclair,5233,It was four o’clock when the ceremony was ove...,-2.141842
69,19942,Candide,Voltaire,6255,The Publishers will be glad to mail complete ...,-2.134335
63,730,Oliver Twist,Charles Dickens,6424,Among other public buildings in a certain tow...,-2.126464
83,766,David Copperfield,Charles Dickens,5482,I do not find it easy to get sufficiently far...,-2.125256
98,161,Sense and Sensibility,Jane Austen,4837,The family of Dashwood had long been settled ...,-2.114286
51,996,Don Quixote,Miguel de Cervantes Saavedra,8014,The book cover and spine above and the images...,-2.044886
57,120,Treasure Island,Robert Louis Stevenson,7142,"To S.L.O., an American gentleman in accordanc...",-1.966408
65,147,Common Sense,Thomas Paine,6374,"INHABITANTS AMERICA, Thomson. PHILADELPHIA Pr...",-1.926352


In [69]:
# Year 9
df[(df['book_score'] < -1.5) & (df['book_score'] > -2)]

Unnamed: 0,book_code,book_title,book_author,book_downloads,book_content,book_score
5,1661,The Adventures of Sherlock Holmes,Arthur Conan Doyle,22279,I had seen little of Holmes lately. My marria...,-1.813193
7,1952,The Yellow Wallpaper,Charlotte Perkins Gilman,21211,It is very seldom that mere ordinary people l...,-1.806235
13,64317,The Great Gatsby,F. Scott Fitzgerald,17454,In my younger and more vulnerable years my fa...,-1.742899
15,345,Dracula,Bram Stoker,15919,D R A C U L A Having had some time at my di...,-1.800629
26,219,Heart of Darkness,Joseph Conrad,12145,"The Nellie, a cruising yawl, swung to her anc...",-1.892097
32,160,"The Awakening, and Selected Short Stories",Kate Chopin,10184,"A green and yellow parrot, which hung in a ca...",-1.801302
48,58585,The Prophet,Kahlil Gibran,8496,“His power came from some great reservoir of ...,-1.735148
49,1184,"The Count of Monte Cristo, Illustrated",Alexandre Dumas,8440,"As usual, a pilot put off immediately, and ro...",-1.564179
53,768,Wuthering Heights,Emily Brontë,7589,1801—I have just returned from a visit to my ...,-1.898053
55,63256,The American Diary of a Japanese Girl,Yoné Noguchi,7264,"January, 1902 Ever since my childhood, thy ...",-1.731087
