 # Predicting Book Success

In [1]:
#Import Dependencies
from path import Path
import pandas as pd

In [2]:
#Import the dataset
data = Path('books_clean.csv')
df = pd.read_csv(data)

# We can see from the preview of the DataFrame that multiple variables (also called features), such as the isbn13,
#published_year, average_rating, num_pages, ratings_count, can be used to predict the outcome: whether a book will have a good 
#rating (1) or will not (0) based on the fact that an average rating below of 4.5 will not likely be successful

df.loc[df['Rating'] <= 4.5, 'Rating_Classification'] = 'Low_Rating' 
df.loc[df['Rating'] > 4.5, 'Rating_Classification'] = 'High_Rating'

df.reset_index(inplace=True, drop=True)
df.head()

NameError: name 'Path' is not defined

In [3]:
# We will check for the variables from all columns
df.dtypes

ISBN                       int64
Rating                   float64
Price                    float64
Pages                      int64
Rating Count               int64
Language_Dummy            object
Size_Dummy                object
Famous_Dummy              object
Categories_Dummy          object
Serie_Dummy               object
Rating_Classification     object
dtype: object

In [4]:
#Count the current NaN values from the dataframe
df.isnull().sum().sum()

0

In [5]:
#Drop all the NaN values
df.dropna(inplace=True)

In [6]:
#Recount all the NaN values to make sure they are dropped
df.isnull().sum().sum()

0

In [7]:
#Use method to convert String to int
def rating(x):
    if x == 'Low_Rating':
        return 0
    if x == 'High_Rating':
        return 1
    

In [8]:
#Apply the method to the rating_classification column
df['Rating_Classification'] = df['Rating_Classification'].apply(rating)

In [9]:
#Use method to convert String to int
def binary(x):
    if x == 'Other':
        return 0
    if x == 'English'or 'Big' or 'Famous' or 'Serie' or 'Top_Category':
        return 1

In [10]:
#Apply the method to the rating_classification column
df['Size_Dummy'] = df['Size_Dummy'].apply(binary)

In [11]:
df['Serie_Dummy'] = df['Serie_Dummy'].apply(binary)

In [12]:
df['Famous_Dummy'] = df['Famous_Dummy'].apply(binary)

In [13]:
df['Language_Dummy'] = df['Language_Dummy'].apply(binary)

In [14]:
df['Categories_Dummy'] = df['Categories_Dummy'].apply(binary)

In [15]:
#Create a new dataframe with the target variables
df.tail(10)

Unnamed: 0,ISBN,Rating,Price,Pages,Rating Count,Language_Dummy,Size_Dummy,Famous_Dummy,Categories_Dummy,Serie_Dummy,Rating_Classification
1921,9780842360616,3.99,5.56,356,3967,1,1,1,1,1,0
1922,9780425206867,3.76,3.19,216,22369,1,1,1,1,1,0
1923,9780142302330,4.39,15.14,1008,504,1,1,1,1,0,0
1924,9780140437850,3.53,5.69,289,3471,1,1,0,1,0,0
1925,9780812975932,3.44,3.28,142,5028,1,1,0,1,0,0
1926,9780441731183,3.78,3.89,341,6092,1,1,1,1,1,0
1927,9780811213769,4.3,3.59,99,599,1,1,0,1,0,0
1928,9780553562606,4.08,3.12,518,5356,1,1,0,1,1,0
1929,9781400044870,4.02,3.59,320,4455,1,1,0,1,0,0
1930,9780553273861,3.64,3.52,291,580,1,1,1,1,0,0


##  Separate the Features (X) from the Target (y)

In [16]:
#The Outcome column is defined as y, or the target.
#X, or features, is created by dropping the Outcome column from the DataFrame.

y = df["Rating_Classification"]
X = df.drop(columns="Rating_Classification")


 ## Split our data into training and testing

In [17]:
#We first split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape


(1448, 10)

In [18]:
#Examining the shape of the training set with X_train.shape returned (1636,5), meaning that there are 1636 samples (rows) and 
#five features (columns).

In [19]:
#The next step was to create a logistic regression model with the specified arguments for solver, max_iter, and random_state
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [20]:
#we trained the model with the training data
classifier.fit(X_train, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [21]:
#To create predictions for y-values, we used the X_test set
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.tail(20)


Unnamed: 0,Prediction,Actual
463,0,0
464,0,0
465,0,0
466,0,0
467,0,0
468,0,0
469,0,0
470,0,0
471,0,0
472,0,0


In [22]:
#The final step is to measure the accuracy of the logistic regression model created

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9834368530020704


In [23]:
#taking into account that the accuracy score is simply the percentage of predictions that are correct. In this case, 
#the model's accuracy score was 0.9834, meaning that the model was correct 98.34% of the time.

In [25]:
#  import the relevant modules for validation and print the confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[475   0]
 [  8   0]]


In [26]:
# Report of sensitivity, precission and F1. La línea 0 se aplica al predictor de diabetes como hipótesis alternativa
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       475
           1       0.00      0.00      0.00         8

    accuracy                           0.98       483
   macro avg       0.49      0.50      0.50       483
weighted avg       0.97      0.98      0.98       483



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
