 # Predicting Book Success

In [1]:

# Initial imports
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import tensorflow as tf

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

In [2]:
#Import the dataset
data = Path('books_clean.csv')
df = pd.read_csv(data)

# We can see from the preview of the DataFrame that multiple variables (also called features), such as the isbn13,
#published_year, average_rating, num_pages, ratings_count, can be used to predict the outcome: whether a book will have a good 
#rating (1) or will not (0) based on the fact that an average rating below of 4.5 will not likely be successful

df.loc[df['Rating'] <= 4.2, 'Rating_Classification'] = 'Low_Rating' 
df.loc[df['Rating'] > 4.2, 'Rating_Classification'] = 'High_Rating'

df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,ISBN,Rating,Price,Pages,Rating Count,Language_Dummy,Size_Dummy,Famous_Dummy,Categories_Dummy,Serie_Dummy,Unnamed: 10,Rating_Classification
0,9780440000000.0,4.49,7.38,870,1996446,English,Big,Famous,Top_Category,Serie,,High_Rating
1,9780320000000.0,3.59,2.1,498,4367341,English,Big,Other,Other,Serie,,Low_Rating
2,9780350000000.0,4.59,21.15,1728,97731,English,Big,Famous,Top_Category,Serie,,High_Rating
3,9780390000000.0,3.84,2.73,464,1041597,English,Big,Other,Top_Category,Other,,Low_Rating
4,9780140000000.0,4.11,5.46,532,1328143,English,Big,Other,Top_Category,Other,,Low_Rating


In [3]:
# We will check for the variables from all columns
df.dtypes

ISBN                     float64
Rating                   float64
Price                    float64
Pages                      int64
Rating Count               int64
Language_Dummy            object
Size_Dummy                object
Famous_Dummy              object
Categories_Dummy          object
Serie_Dummy               object
Unnamed: 10              float64
Rating_Classification     object
dtype: object

In [4]:
# Generate our categorical variable list
cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat

['Language_Dummy',
 'Size_Dummy',
 'Famous_Dummy',
 'Categories_Dummy',
 'Serie_Dummy',
 'Rating_Classification']

In [5]:
# Check the number of unique values in each column. To check if it needs bucketing (more than 10 unique values)
df[cat].nunique()

Language_Dummy           2
Size_Dummy               2
Famous_Dummy             2
Categories_Dummy         2
Serie_Dummy              2
Rating_Classification    2
dtype: int64

In [6]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()



Unnamed: 0,Language_Dummy_English,Language_Dummy_Other,Size_Dummy_Big,Size_Dummy_Other,Famous_Dummy_Famous,Famous_Dummy_Other,Categories_Dummy_Other,Categories_Dummy_Top_Category,Serie_Dummy_Other,Serie_Dummy_Serie,Rating_Classification_High_Rating,Rating_Classification_Low_Rating
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0


In [7]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ISBN,Rating,Price,Pages,Rating Count,Unnamed: 10,Language_Dummy_English,Language_Dummy_Other,Size_Dummy_Big,Size_Dummy_Other,Famous_Dummy_Famous,Famous_Dummy_Other,Categories_Dummy_Other,Categories_Dummy_Top_Category,Serie_Dummy_Other,Serie_Dummy_Serie,Rating_Classification_High_Rating,Rating_Classification_Low_Rating
0,9780440000000.0,4.49,7.38,870,1996446,,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,9780320000000.0,3.59,2.1,498,4367341,,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,9780350000000.0,4.59,21.15,1728,97731,,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,9780390000000.0,3.84,2.73,464,1041597,,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
4,9780140000000.0,4.11,5.46,532,1328143,,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0


In [8]:
# Split our preprocessed data into our features and target arrays
y = df["Rating_Classification_High_Rating"].values
X = df.drop(["Rating_Classification_High_Rating","Rating_Classification_Low_Rating","ISBN","Rating","Rating Count"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count


In [10]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

NameError: name 'LogisticRegression' is not defined

In [33]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 4

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 112       
                                                                 
 dense_1 (Dense)             (None, 4)                 36        
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 153
Trainable params: 153
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [35]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 

In [36]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

16/16 - 0s - loss: nan - accuracy: 0.7950 - 377ms/epoch - 24ms/step
Loss: nan, Accuracy: 0.7950310707092285


In [4]:
#Count the current NaN values from the dataframe
df.isnull().sum().sum()

0

In [5]:
#Drop all the NaN values
df.dropna(inplace=True)

In [6]:
#Recount all the NaN values to make sure they are dropped
df.isnull().sum().sum()

0

In [7]:
#Use method to convert String to int
def rating(x):
    if x == 'Low_Rating':
        return 0
    if x == 'High_Rating':
        return 1
    

In [8]:
#Apply the method to the rating_classification column
df['Rating_Classification'] = df['Rating_Classification'].apply(rating)

In [9]:
#Use method to convert String to int
def binary(x):
    if x == 'Other':
        return 0
    if x == 'English'or 'Big' or 'Famous' or 'Serie' or 'Top_Category':
        return 1

In [10]:
#Apply the method to the rating_classification column
df['Size_Dummy'] = df['Size_Dummy'].apply(binary)

In [11]:
df['Serie_Dummy'] = df['Serie_Dummy'].apply(binary)

In [12]:
df['Famous_Dummy'] = df['Famous_Dummy'].apply(binary)

In [13]:
df['Language_Dummy'] = df['Language_Dummy'].apply(binary)

In [14]:
df['Categories_Dummy'] = df['Categories_Dummy'].apply(binary)

In [15]:
#Create a new dataframe with the target variables
df.tail(10)

Unnamed: 0,ISBN,Rating,Price,Pages,Rating Count,Language_Dummy,Size_Dummy,Famous_Dummy,Categories_Dummy,Serie_Dummy,Rating_Classification
1921,9780842360616,3.99,5.56,356,3967,1,1,1,1,1,0
1922,9780425206867,3.76,3.19,216,22369,1,1,1,1,1,0
1923,9780142302330,4.39,15.14,1008,504,1,1,1,1,0,1
1924,9780140437850,3.53,5.69,289,3471,1,1,0,1,0,0
1925,9780812975932,3.44,3.28,142,5028,1,1,0,1,0,0
1926,9780441731183,3.78,3.89,341,6092,1,1,1,1,1,0
1927,9780811213769,4.3,3.59,99,599,1,1,0,1,0,1
1928,9780553562606,4.08,3.12,518,5356,1,1,0,1,1,0
1929,9781400044870,4.02,3.59,320,4455,1,1,0,1,0,0
1930,9780553273861,3.64,3.52,291,580,1,1,1,1,0,0


##  Separate the Features (X) from the Target (y)

In [16]:
#The Outcome column is defined as y, or the target.
#X, or features, is created by dropping the Outcome column from the DataFrame.

y = df["Rating_Classification"]
X = df.drop(columns=["Rating_Classification","ISBN","Rating","Rating Count"])


 ## Split our data into training and testing

In [17]:

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(1448, 7)
(483, 7)
(1448,)
(483,)


In [18]:
# Creating StandardScaler instance (to scalate data)
scaler = StandardScaler()

In [19]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [20]:
# Scaling and trnsforming data 
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Fitting the Random-forest

In [21]:
# Create a random forest classifier. The n_estimators will allow us to set the number of trees that will be created by the algorithm
# The best practice is to use between 64 and 128 random forests
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [22]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making Predictions Using the Tree Model

In [23]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Model evaluation

In [24]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,361,23
Actual 1,81,18


In [25]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [26]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,361,23
Actual 1,81,18


Accuracy Score : 0.7846790890269151
Classification Report
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       384
           1       0.44      0.18      0.26        99

    accuracy                           0.78       483
   macro avg       0.63      0.56      0.57       483
weighted avg       0.74      0.78      0.75       483



In [27]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.4555133 , 0.45921355, 0.00140104, 0.01836751, 0.01995219,
       0.02248519, 0.02306722])

In [28]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4592135519182356, 'Pages'),
 (0.45551329705725374, 'Price'),
 (0.02306722445989212, 'Serie_Dummy'),
 (0.022485193005481453, 'Categories_Dummy'),
 (0.019952189641980558, 'Famous_Dummy'),
 (0.018367508006977833, 'Size_Dummy'),
 (0.0014010359101787133, 'Language_Dummy')]