In [1]:
import pandas as pd
import numpy as np
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from sklearn.metrics import precision_score, recall_score, f1_score

# ✅ Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [41]:

df = pd.read_csv("Cleaned_audiob_adv.csv")

In [42]:
df

Unnamed: 0,Book Name,Author,Rating,Number_of_Reviews,Price,Description,Listening Time,Ranks and Genre,Time
0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,4.9,371.0,10080.0,"Over the past three years, Jay Shetty has beco...",10 hours and 54 minutes,",#1 in Audible Audiobooks & Originals (See Top...",654.0
1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,4.6,3682.0,615.0,Brought to you by Penguin.,3 hours and 23 minutes,",#2 in Audible Audiobooks & Originals (See Top...",203.0
2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,4.4,20306.0,10378.0,"In this generation-defining self-help guide, a...",5 hours and 17 minutes,",#3 in Audible Audiobooks & Originals (See Top...",317.0
3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4.6,4678.0,888.0,Brought to you by Penguin.,5 hours and 35 minutes,",#5 in Audible Audiobooks & Originals (See Top...",335.0
4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4.6,4308.0,1005.0,"Stop going through life, Start growing throug...",6 hours and 25 minutes,",#6 in Audible Audiobooks & Originals (See Top...",385.0
...,...,...,...,...,...,...,...,...,...
4038,"Factfulness: Wie wir lernen, die Welt so zu se...",Hans Rosling,4.6,72.0,703.0,"Sorry, we just need to make sure you're not a ...",-1,-1,10.0
4039,Late-Talking Children: A Symptom or a Stage?,Stephen M. Camarata,4.6,92.0,703.0,"Sorry, we just need to make sure you're not a ...",-1,-1,10.0
4040,"The Marketing of Evil: How Radicals, Elitists ...",David Kupelian,4.7,490.0,586.0,"Americans have come to tolerate, embrace, and ...",-1,-1,10.0
4041,Things I Wish I'd Known Before We Got Married,Gary Chapman,4.7,1388.0,516.0,\n\nOops!\nIt's rush hour and traffic is pilin...,-1,-1,10.0


In [18]:
# Combine relevant features for NLP processing
df['combined_text'] = df['Book Name'] + " " + df['Description'] + " " + df['Ranks and Genre']
df['combined_text'] = df['combined_text'].fillna('')


In [19]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# ✅ Move to Tensor & GPU
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32, device=device)

# Save Vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score

# Classification Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Regression Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))


['vectorizer.pkl']

In [20]:
similarity_matrix = cosine_similarity(tfidf_matrix)

# ✅ Move to GPU
similarity_matrix = torch.tensor(similarity_matrix, dtype=torch.float32, device=device)

joblib.dump(similarity_matrix.cpu().numpy(), "similarity_matrix.pkl")


['similarity_matrix.pkl']

In [21]:
k = 10  
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(tfidf_matrix)
joblib.dump(kmeans, "kmeans_model.pkl")

dbscan = DBSCAN(eps=0.5, min_samples=5).fit(tfidf_matrix)
df["dbscan_cluster"] = dbscan.labels_

agglo = AgglomerativeClustering(n_clusters=10).fit(tfidf_matrix.toarray())
df["agglo_cluster"] = agglo.labels_

joblib.dump(dbscan, "dbscan_model.pkl")
joblib.dump(agglo, "agglo_model.pkl")


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score

# Classification Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Regression Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))


['agglo_model.pkl']

In [26]:
reader = Reader(rating_scale=(1, 5))
ratings = pd.DataFrame({
    'User ID': np.random.randint(1, 100, size=len(df)),  # Generate random User IDs
    'Book Name': np.arange(len(df)),  # Unique IDs for books
    'Rating': np.random.randint(1, 6, size=len(df))  # Ratings between 1-5
})


data = Dataset.load_from_df(ratings[['User ID','Book Name', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

svd = SVD()
svd.fit(trainset)
joblib.dump(svd, "svd_model.pkl")

knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
knn.fit(trainset)
joblib.dump(knn, "knn_model.pkl")


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score

# Classification Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Regression Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))


Computing the cosine similarity matrix...
Done computing similarity matrix.


['knn_model.pkl']

In [43]:
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return precision, recall, f1

# ✅ Evaluate Using SVD Model
y_true = ratings['Rating']
y_pred = [svd.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()]

precision, recall, f1 = evaluate_model(y_true, np.round(y_pred))
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score

# Classification Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Regression Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))


Precision: 0.1360, Recall: 0.2592, F1-Score: 0.1590


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return precision, recall, f1

# ✅ Evaluate Using SVD Model
y_true = ratings['Rating']
y_pred = [knn.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()]

precision, recall, f1 = evaluate_model(y_true, np.round(y_pred))
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score

# Classification Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Regression Metrics (modify based on the model's actual outputs)
if 'y_test' in locals() and 'y_pred' in locals():
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))


Precision: 0.9103, Recall: 0.8380, F1-Score: 0.8521


In [32]:
df.to_csv("processed_books.csv", index=False)
print("✅ All models trained, evaluated, and saved successfully on GPU!")


✅ All models trained, evaluated, and saved successfully on GPU!


In [3]:
df = pd.read_csv("processed_books.csv")