 # Predicting Book Success

In [41]:
#Import Dependencies
from path import Path
import pandas as pd

In [42]:
#Import the dataset
data = Path('books.csv')
df = pd.read_csv(data)

# We can see from the preview of the DataFrame that multiple variables (also called features), such as the isbn13,
#published_year, average_rating, num_pages, ratings_count, can be used to predict the outcome: whether a book will have a good 
#rating (1) or will not (0) based on the fact that an average rating below of 4.5 will not likely be successful

df.loc[df['average_rating'] <= 4.5, 'rating_classification'] = 'Low_Rating' 
df.loc[df['average_rating'] > 4.5, 'rating_classification'] = 'High_Rating'

df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,rating_classification
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Low_Rating
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Low_Rating
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,Low_Rating
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Low_Rating
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,Low_Rating


In [43]:
# We will check for the variables from all columns
df.dtypes

isbn13                     int64
isbn10                    object
title                     object
subtitle                  object
authors                   object
categories                object
thumbnail                 object
description               object
published_year           float64
average_rating           float64
num_pages                float64
ratings_count            float64
rating_classification     object
dtype: object

In [44]:
#Count the current NaN values from the dataframe
df.isnull().sum().sum()

5369

In [45]:
#Drop all the NaN values
df.dropna(inplace=True)

In [46]:
#Recount all the NaN values to make sure they are dropped
df.isnull().sum().sum()

0

In [47]:
#Use method to convert String to int
def rating(x):
    if x == 'Low_Rating':
        return 0
    if x == 'High_Rating':
        return 1
    

In [48]:
#Apply the method to the rating_classification column
df['rating_classification'] = df['rating_classification'].apply(rating)

In [49]:
#Create a new dataframe with the target variables
df2 = df.drop(['isbn10', 'title', 'subtitle', 'thumbnail', 'subtitle', 'description', 'authors', 'categories'], axis=1)
df2.tail()

Unnamed: 0,isbn13,published_year,average_rating,num_pages,ratings_count,rating_classification
6790,9783856305581,1997.0,4.03,142.0,373.0,0
6793,9784766113389,2003.0,4.14,132.0,65.0,0
6796,9784770028037,2002.0,3.54,176.0,27.0,0
6805,9788185300535,1999.0,4.51,531.0,104.0,1
6809,9789042003408,1998.0,3.7,136.0,10.0,0


##  Separate the Features (X) from the Target (y)

In [50]:
#The Outcome column is defined as y, or the target.
#X, or features, is created by dropping the Outcome column from the DataFrame.

y = df2["rating_classification"]
X = df2.drop(columns="rating_classification")


 ## Split our data into training and testing

In [51]:
#We first split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape


(1636, 5)

In [52]:
#Examining the shape of the training set with X_train.shape returned (1636,5), meaning that there are 1636 samples (rows) and 
#five features (columns).

In [53]:
#The next step was to create a logistic regression model with the specified arguments for solver, max_iter, and random_state
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [54]:
#we trained the model with the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [55]:
#To create predictions for y-values, we used the X_test set
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)


Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [56]:
#The final step is to measure the accuracy of the logistic regression model created

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9816849816849816


In [57]:
#taking into account that the accuracy score is simply the percentage of predictions that are correct. In this case, 
#the model's accuracy score was 0.9816, meaning that the model was correct 98.16% of the time.