In [None]:
import pandas as pd
import numpy as np
import joblib
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from sklearn.metrics import precision_score, recall_score, f1_score

# ✅ Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [2]:

df = pd.read_csv("Cleaned_audiob_adv.csv")

In [3]:
# Combine relevant features for NLP processing
df['combined_text'] = df['Book Name'] + " "+ df['Author'] +" " + df['Description'] + " " + df['Ranks and Genre']
df['combined_text'] = df['combined_text'].fillna('')


In [4]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# ✅ Move to Tensor & GPU
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32, device=device)

# Save Vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']

In [5]:
similarity_matrix = cosine_similarity(tfidf_matrix)

# ✅ Move to GPU
similarity_matrix = torch.tensor(similarity_matrix, dtype=torch.float32, device=device)

joblib.dump(similarity_matrix.cpu().numpy(), "similarity_matrix.pkl")


['similarity_matrix.pkl']

In [7]:
k = 10  
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(tfidf_matrix)
joblib.dump(kmeans, "kmeans_model.pkl")


['agglo_model.pkl']

In [8]:
reader = Reader(rating_scale=(1, 5))
ratings = pd.DataFrame({
    'User ID': np.random.randint(1, 100, size=len(df)),  # Generate random User IDs
    'Book Name': np.arange(len(df)),  # Unique IDs for books
    'Rating': np.random.randint(1, 6, size=len(df))  # Ratings between 1-5
})


data = Dataset.load_from_df(ratings[['User ID','Book Name', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

svd = SVD()
svd.fit(trainset)
joblib.dump(svd, "svd_model.pkl")

knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
knn.fit(trainset)
joblib.dump(knn, "knn_model.pkl")


Computing the cosine similarity matrix...
Done computing similarity matrix.


['knn_model.pkl']

In [9]:
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return precision, recall, f1

# ✅ Evaluate Using SVD Model
y_true = ratings['Rating']
y_pred = [svd.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()]

precision, recall, f1 = evaluate_model(y_true, np.round(y_pred))
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Precision: 0.1309, Recall: 0.2533, F1-Score: 0.1522


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Convert predictions to a NumPy array
y_pred = np.array([svd.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()])

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(ratings['Rating'], y_pred))
mae = mean_absolute_error(ratings['Rating'], y_pred)

print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")


RMSE: 1.2713, MAE: 1.0983


In [14]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [50, 100, 150], 
    'lr_all': [0.002, 0.005, 0.01], 
    'reg_all': [0.02, 0.1, 0.2]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

print(gs.best_params['rmse'])  # Best hyperparameters


{'n_factors': 100, 'lr_all': 0.002, 'reg_all': 0.2}


In [15]:
from surprise import SVD

# Use the best parameters
svd = SVD(n_factors=100, lr_all=0.002, reg_all=0.2)

# Train on the full dataset
trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x149fc1cd820>

In [25]:
user_id = 9 # Replace with actual user ID
book_name = "Some Book"  # Replace with actual book name

predicted_rating = svd.predict(user_id, book_name).est
print(f"Predicted Rating: {predicted_rating:.2f}")


Predicted Rating: 3.34


In [None]:
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return precision, recall, f1

# ✅ Evaluate Using KNN Model
y_true = ratings['Rating']
y_pred = [knn.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()]

precision, recall, f1 = evaluate_model(y_true, np.round(y_pred))
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


Precision: 0.9140, Recall: 0.8439, F1-Score: 0.8582


In [64]:
df.to_csv("processed_books.csv", index=False)
print("✅ All models trained, evaluated, and saved successfully ")


✅ All models trained, evaluated, and saved successfully 
