# Importing the libraries

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

# Loading the dataset

In [12]:
books_data = pd.read_csv(r'C:\Users\user\Downloads\archive\best-selling-books.csv')

In [13]:
books_data

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales in millions,Genre
0,A Tale of Two Cities,Charles Dickens,English,1859,200.0,Historical fiction
1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200.0,Novella
2,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120.0,Fantasy
3,And Then There Were None,Agatha Christie,English,1939,100.0,Mystery
4,Dream of the Red Chamber (紅樓夢),Cao Xueqin,Chinese,1791,100.0,Family saga
...,...,...,...,...,...,...
169,The Goal,Eliyahu M. Goldratt,English,1984,10.0,
170,Fahrenheit 451,Ray Bradbury,English,1953,10.0,
171,Angela's Ashes,Frank McCourt,English,1996,10.0,
172,The Story of My Experiments with Truth (સત્યના...,Mohandas Karamchand Gandhi,Gujarati,1929,10.0,


In [14]:
df = pd.read_csv(r'C:\Users\user\Downloads\archive\best-selling-books.csv')
column_types = df.dtypes

print(column_types)

Book                              object
Author(s)                         object
Original language                 object
First published                    int64
Approximate sales in millions    float64
Genre                             object
dtype: object


# Sorting languages in terms of popularity 
### Source gotten from 'https://www.visualcapitalist.com/100-most-spoken-languages/'

In [15]:
language_mapping = {
    'English': 0,
    'Chinese': 1, 
    'Hindi': 2,
    'Spanish': 3,
    'French': 4,
    'Russian': 5,
    'Portuguese': 6,
    'German': 7,
    'Japanese': 8,
    'Italian': 9,
    'Dutch': 10,
    'Swedish': 11,
    'Norwegian': 12,
    'Czech': 13,
    'Yiddish': 14,
    'Gujarati': 15
}
books_data['Popularity'] = books_data['Original language'].map(language_mapping)

# Features and target

In [16]:
unique_languages = books_data['Original language'].unique()
print(unique_languages)

['English' 'French' 'Chinese' 'Hindi' 'Portuguese' 'Spanish' 'German'
 'Italian' 'Norwegian' 'Russian' 'Dutch' 'Swedish' 'Japanese' 'Czech'
 'Yiddish' 'Gujarati']


In [17]:
features = ['Book', 'Author(s)', 'Original language', 'Genre','First published', 'Popularity']
target = 'Approximate sales in millions'

In [18]:
books_data

Unnamed: 0,Book,Author(s),Original language,First published,Approximate sales in millions,Genre,Popularity
0,A Tale of Two Cities,Charles Dickens,English,1859,200.0,Historical fiction,0
1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200.0,Novella,4
2,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120.0,Fantasy,0
3,And Then There Were None,Agatha Christie,English,1939,100.0,Mystery,0
4,Dream of the Red Chamber (紅樓夢),Cao Xueqin,Chinese,1791,100.0,Family saga,1
...,...,...,...,...,...,...,...
169,The Goal,Eliyahu M. Goldratt,English,1984,10.0,,0
170,Fahrenheit 451,Ray Bradbury,English,1953,10.0,,0
171,Angela's Ashes,Frank McCourt,English,1996,10.0,,0
172,The Story of My Experiments with Truth (સત્યના...,Mohandas Karamchand Gandhi,Gujarati,1929,10.0,,15


# Data preprocessing

## Label encoding for categorical features

In [19]:

label_encoders = {}
for feature in features:
    label_encoders[feature] = LabelEncoder()
    books_data[feature] = label_encoders[feature].fit_transform(books_data[feature])

# Scale the target variable
scaler = StandardScaler()
books_data[target] = scaler.fit_transform(books_data[target].values.reshape(-1, 1))

# Split data into features and target
X = books_data[features]
y = books_data[target]


In [48]:
label_encoders

{'Book': LabelEncoder(),
 'Author(s)': LabelEncoder(),
 'Original language': LabelEncoder(),
 'Genre': LabelEncoder(),
 'Popularity': LabelEncoder()}

In [10]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network
model = Sequential()
model.add(Dense(64, input_dim=len(features), activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # Output layer

model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print(f"Mean Squared Error on test data: {loss}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error on test data: 0.5872443318367004


# Increase model complexity

In [11]:
model = Sequential()
model.add(Dense(128, input_dim=len(features), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Change optimizer and learning rate
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model with new settings

model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test))

loss = model.evaluate(X_test, y_test)
print(f"Mean Squared Error on test data: {loss}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [12]:
print(f"Mean Squared Error on test data: {loss}")

Mean Squared Error on test data: 0.4912330210208893


In [13]:
input_shape = model.layers[0].input_shape
print("Input Shape:", input_shape)

Input Shape: (None, 6)


In [14]:
import pickle

In [3]:
import pickle


built_model = model

# Save the model to a file
with open('galactus.pkl', 'wb') as file:
    pickle.dump(built_model, file)

NameError: name 'model' is not defined

In [20]:
with open('galactus.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [21]:
new_book = {
    'Book': 'A Tale of Two Cities',
    'Author(s)': 'Charles Dickens',
    'Original language': 'English',
    'Genre': 'Fantasy',
    'First published': 2008,
    'Popularity': 0  
}

# Preprocess the input features before predicting
# (For instance, encoding categorical variables, scaling numerical data, etc.)
# Make sure the data format is consistent with what the model expects
categorical_features = ['Book', 'Author(s)', 'Original language', 'Genre', 'Popularity']

# Extract the features from the new book data


# new_book_encoded = new_book.copy()

# for feature in categorical_features:
#     new_book_encoded[feature] = label_encoders[feature].fit_transform([new_book[feature]])

# books_data[target] = scaler.fit_transform(books_data[target].values.reshape(-1, 1))

# # new_book_features = [
# #     new_book['Book'],
# #     new_book['Author(s)'],
# #     new_book['Original language'],
# #     new_book['Original language'],
# #     new_book['First published'],
# #     new_book['Popularity']
# # ]
# print(new_book_encoded)
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    new_book[feature] = label_encoders[feature].fit_transform([new_book[feature]])

# Scale the numerical feature
scaler = StandardScaler()
new_book['First published'] = scaler.fit_transform([[new_book['First published']]])
print(new_book)

# Prepare data for prediction
new_input_features = [[new_book[feature] for feature in features]]

print("Input Features are")
print()
print(new_book_features)

{'Book': array([0], dtype=int64), 'Author(s)': array([0], dtype=int64), 'Original language': array([0], dtype=int64), 'Genre': array([0], dtype=int64), 'First published': array([[0.]]), 'Popularity': array([0], dtype=int64)}
Input Features are



NameError: name 'new_book_features' is not defined

In [22]:
label_encoders

{'Book': LabelEncoder(),
 'Author(s)': LabelEncoder(),
 'Original language': LabelEncoder(),
 'Genre': LabelEncoder(),
 'Popularity': LabelEncoder()}

In [30]:
# Fit the encoders to the data
label_encoders['Book'].fit(['The Last Kingdom'])
label_encoders['Author(s)'].fit(['Brandon Sanderson'])
label_encoders['Original language'].fit(['English'])
label_encoders['Genre'].fit(['Fantasy'])
label_encoders['Popularity'].fit([0])

new_book_encoded = {
    'Book': label_encoders['Book'].transform(['The Last Kingdom'])[0],
    'Author(s)': label_encoders['Author(s)'].transform(['Brandon Sanderson'])[0],
    'Original language': label_encoders['Original language'].transform(['English'])[0],
    'Genre': label_encoders['Genre'].transform(['Fantasy'])[0],
    'First published':  2008,
    'Popularity': label_encoders['Popularity'].transform([0])[0]
}

In [31]:
print(new_book_encoded)

{'Book': 0, 'Author(s)': 0, 'Original language': 0, 'Genre': 0, 'First published': 2008, 'Popularity': 0}


In [44]:
new_input_features = [new_book_encoded[feature] for feature in features]

In [45]:
print(new_input_features)
new_input_features = [1,2,4,2,2008,2]
new_input_features = np.array(new_input_features).reshape(1, -1)

[0, 0, 0, 0, 2008, 0]


In [46]:
new_input_features

array([[   1,    2,    4,    2, 2008,    2]])

In [49]:
# Assuming the model takes a 2D array-like input, you might need to reshape the features
#features_reshaped = new_book_encoded  # Reshape to a 2D array-like structure

# Make predictions using the loaded model
predicted_popularity = loaded_model.predict(new_input_features)

print("Predicted Popularity:", abs(predicted_popularity[0][0]))

Predicted Popularity: 21.87502
