# Setting Up Requirements

In [1]:
import pandas as pd
import numpy as np
import joblib
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from sklearn.metrics import precision_score, recall_score, f1_score

# ✅ Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


# Get Data

In [2]:

df = pd.read_csv("Cleaned_audiob_adv.csv")

# NLP Processing 

In [3]:
# Combine relevant features for NLP processing
df['combined_text'] = df['Book Name'] + " "+ df['Author'] +" " + df['Description'] + " " + df['Ranks and Genre']
df['combined_text'] = df['combined_text'].fillna('')


In [4]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# ✅ Move to Tensor & GPU
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32, device=device)

# Save Vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']

In [5]:
similarity_matrix = cosine_similarity(tfidf_matrix)

# ✅ Move to GPU
similarity_matrix = torch.tensor(similarity_matrix, dtype=torch.float32, device=device)

joblib.dump(similarity_matrix.cpu().numpy(), "similarity_matrix.pkl")


['similarity_matrix.pkl']

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarity between documents
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Extract upper triangle values (excluding diagonal)
cosine_sim_values = cosine_sim_matrix[np.triu_indices_from(cosine_sim_matrix, k=1)]

# Print mean and standard deviation of cosine similarity
print(f"Cosine Similarity: Mean = {np.mean(cosine_sim_values):.4f}, Std = {np.std(cosine_sim_values):.4f}")


Cosine Similarity: Mean = 0.0495, Std = 0.1115


# K-Means Clustering

In [6]:
k = 10  
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(tfidf_matrix)
joblib.dump(kmeans, "kmeans_model.pkl")


['kmeans_model.pkl']

# KNN Model & SVD Model

In [9]:
reader = Reader(rating_scale=(1, 5))
ratings = pd.DataFrame({
    'User ID': np.random.randint(1, 100, size=len(df)),  # Generate random User IDs
    'Book Name': np.arange(len(df)),  # Unique IDs for books
    'Rating': np.random.randint(1, 6, size=len(df))  # Ratings between 1-5
})


data = Dataset.load_from_df(ratings[['User ID','Book Name', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

svd = SVD()
svd.fit(trainset)
joblib.dump(svd, "svd_model.pkl")

knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
knn.fit(trainset)
joblib.dump(knn, "knn_model.pkl")


Computing the cosine similarity matrix...
Done computing similarity matrix.


['knn_model.pkl']

# Improving & Evaluating Models

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Convert predictions to a NumPy array
y_pred = np.array([svd.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()])

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(ratings['Rating'], y_pred))
mae = mean_absolute_error(ratings['Rating'], y_pred)

print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")


RMSE: 1.0966, MAE: 0.9353


In [12]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [50, 100, 150], 
    'lr_all': [0.002, 0.005, 0.01], 
    'reg_all': [0.02, 0.1, 0.2]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

print(gs.best_params['rmse'])  # Best hyperparameters


{'n_factors': 100, 'lr_all': 0.002, 'reg_all': 0.2}


In [None]:
from surprise import SVD

# Use the best parameters
svd = SVD(n_factors=100, lr_all=0.002, reg_all=0.2)

# Train on the full dataset
trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x29f3cdc15e0>

In [21]:
user_id = 9 # Replace with actual user ID
book_name = "Sherlock Holmes: The Definitive Collection"  # Replace with actual book name

predicted_rating = svd.predict(user_id, book_name).est
print(f"Predicted Rating: {predicted_rating:.2f}")


Predicted Rating: 3.22


In [24]:
def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return precision, recall, f1

# ✅ Evaluate Using KNN Model
y_true = ratings['Rating']
y_pred = [knn.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()]

precision, recall, f1 = evaluate_model(y_true, np.round(y_pred))
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


Precision: 0.9127, Recall: 0.8422, F1-Score: 0.8561


In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Convert predictions to a NumPy array
y_pred = np.array([knn.predict(row['User ID'], row['Book Name']).est for _, row in ratings.iterrows()])

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(ratings['Rating'], y_pred))
mae = mean_absolute_error(ratings['Rating'], y_pred)

print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")


RMSE: 0.6434, MAE: 0.2436


In [23]:
user_id = 1 # Replace with actual user ID
book_name = "The Intelligent Investor Rev Ed."  # Replace with actual book name

predicted_rating = knn.predict(user_id, book_name).est
print(f"Predicted Rating: {predicted_rating:.2f}")


Predicted Rating: 3.01


In [64]:
df.to_csv("processed_books.csv", index=False)
print("✅ All models trained, evaluated, and saved successfully ")


✅ All models trained, evaluated, and saved successfully 


In [34]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score

# Load models and data
vectorizer = joblib.load("vectorizer.pkl")
similarity_matrix = joblib.load("similarity_matrix.pkl")
kmeans = joblib.load("kmeans_model.pkl")
svd = joblib.load("svd_model.pkl")
processed_books = pd.read_csv("processed_books.csv")

# Ensure the 'cluster' column exists in the processed_books dataframe
if 'cluster' not in processed_books.columns:
    processed_books['cluster'] = kmeans.predict(vectorizer.transform(processed_books['Book Name']))

def recommend_content_based(book_title, min_rating=0, num_recommendations=5):
    if book_title not in processed_books['Book Name'].values:
        raise ValueError(f"Book '{book_title}' not found in the dataset.")
    idx = processed_books[processed_books['Book Name'] == book_title].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    filtered_books = processed_books.iloc[[i[0] for i in sim_scores]][['Book Name', 'Author', 'Rating']]
    return filtered_books[filtered_books['Rating'] >= min_rating].head(num_recommendations)

def recommend_clustering(book_title, min_rating=0, num_recommendations=5):
    if book_title not in processed_books['Book Name'].values:
        raise ValueError(f"Book '{book_title}' not found in the dataset.")
    idx = processed_books[processed_books['Book Name'] == book_title].index[0]
    cluster_label = processed_books.iloc[idx]['cluster']
    cluster_books = processed_books[processed_books['cluster'] == cluster_label]
    return cluster_books[cluster_books['Rating'] >= min_rating].sample(min(num_recommendations, len(cluster_books)))[['Book Name', 'Author', 'Rating']]

def recommend_hybrid(book_title, min_rating=0, num_recommendations=5):
    content_recs = recommend_content_based(book_title, min_rating, num_recommendations * 2)
    clustering_recs = recommend_clustering(book_title, min_rating, num_recommendations * 2)
    hybrid_recs = pd.concat([content_recs, clustering_recs]).drop_duplicates().nlargest(num_recommendations, 'Rating')
    return hybrid_recs

def evaluate_model(recommendations, ground_truth):
    predicted_ratings = recommendations['Rating']
    true_ratings = ground_truth['Rating']
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    
    # Converting ratings to binary for F1 score
    predicted_binary = predicted_ratings >= 3.0  # Example threshold
    true_binary = true_ratings >= 3.0
    f1 = f1_score(true_binary, predicted_binary)
    
    return rmse, mae, f1

# Initialize lists to store evaluation metrics
content_metrics = []
clustering_metrics = []
hybrid_metrics = []

# Loop through each book in the dataset
for book_title in processed_books['Book Name'].unique():
    try:
        # Get recommendations for each model
        content_based_recs = recommend_content_based(book_title)
        cluster_based_recs = recommend_clustering(book_title)
        hybrid_recs = recommend_hybrid(book_title)
        
        # Simulate ground truth for evaluation (replace with actual ground truth)
        ground_truth = processed_books.sample(len(content_based_recs))
        
        # Evaluate Content-Based Model
        content_metrics.append(evaluate_model(content_based_recs, ground_truth))
        
        # Evaluate Clustering-Based Model
        clustering_metrics.append(evaluate_model(cluster_based_recs, ground_truth))
        
        # Evaluate Hybrid Model
        hybrid_metrics.append(evaluate_model(hybrid_recs, ground_truth))
    
    except ValueError as e:
        print(e)

# Calculate average metrics for each model
def calculate_average_metrics(metrics):
    metrics = np.array(metrics)
    return np.mean(metrics, axis=0)

content_avg_metrics = calculate_average_metrics(content_metrics)
clustering_avg_metrics = calculate_average_metrics(clustering_metrics)
hybrid_avg_metrics = calculate_average_metrics(hybrid_metrics)

print(f"Content-Based Model - Avg RMSE: {content_avg_metrics[0]:.4f}, Avg MAE: {content_avg_metrics[1]:.4f}, Avg F1 Score: {content_avg_metrics[2]:.4f}")
print(f"Clustering-Based Model - Avg RMSE: {clustering_avg_metrics[0]:.4f}, Avg MAE: {clustering_avg_metrics[1]:.4f}, Avg F1 Score: {clustering_avg_metrics[2]:.4f}")
print(f"Hybrid Model - Avg RMSE: {hybrid_avg_metrics[0]:.4f}, Avg MAE: {hybrid_avg_metrics[1]:.4f}, Avg F1 Score: {hybrid_avg_metrics[2]:.4f}")


Content-Based Model - Avg RMSE: 0.4277, Avg MAE: 0.3309, Avg F1 Score: 0.9942
Clustering-Based Model - Avg RMSE: 0.4284, Avg MAE: 0.3297, Avg F1 Score: 0.9923
Hybrid Model - Avg RMSE: 0.4037, Avg MAE: 0.3152, Avg F1 Score: 0.9966
