In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import pickle

In [34]:
clustered_df = pd.read_csv("C:/Users/Admin/OneDrive/Documents/Book Recommendation/clustered_data.csv")
clustered_df.head()

Unnamed: 0.1,Unnamed: 0,Book Name,Author,Rating,Number of Reviews,Price,Rank,Genre,Listening Time (minutes),cleaned_description,cluster
0,0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,4.9,371.0,10080.0,1,Society & Culture (Books),654.0,over past three year jay shetty become one wor...,2
1,1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,4.6,3682.0,615.0,2,Personal Success,203.0,brought penguin,1
2,2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,4.4,20306.0,10378.0,3,Personal Development & Self-Help,317.0,in guide superstar blogger cut crap show u sto...,2
3,3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4.6,4678.0,888.0,5,Personal Success,335.0,brought penguin,1
4,4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4.6,4308.0,1005.0,6,Spiritualism,385.0,stop going life start growing life,2


In [35]:
clustered_df.drop('Unnamed: 0', axis = 1, inplace = True)	

In [36]:
# Content-Based Filtering: Cosine similarity using genres, authors, and cleaned descriptions
def calculate_similarity(df):

    # Fill NaN values with empty strings in the relevant columns
    df['Genre'] = df['Genre'].fillna('')
    df['Author'] = df['Author'].fillna('')
    df['cleaned_description'] = df['cleaned_description'].fillna('')
    
    
    df['combined_features'] = df['Genre'].astype(str) + ' ' + df['Author'] + ' ' + df['cleaned_description']
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['combined_features'])
    cosine_sim = cosine_similarity(tfidf_matrix)
    
    # Save the cosine similarity matrix using pickle
    with open('C:/Users/Admin/OneDrive/Documents/Book Recommendation/cosine_similarity_matrix.pkl', 'wb') as file:
        pickle.dump(cosine_sim, file)
    
    return cosine_sim

In [37]:
# Function to recommend books based on a given book (by index)
def recommend_books(book_index, cosine_sim, df, top_n=5):
    similar_books = list(enumerate(cosine_sim[book_index]))
    sorted_books = sorted(similar_books, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommendations = [df.iloc[i[0]]['Book Name'] for i in sorted_books]
    return recommendations

In [38]:
recommend_books(book_index=1, cosine_sim=calculate_similarity(clustered_df), df=clustered_df, top_n=5)

['Ikigai: Los secretos de Japón para una vida larga y feliz',
 'Atomic Habits: An Easy and Proven Way to Build Good Habits and Break Bad Ones',
 'The Power of Habit: Why We Do What We Do, and How to Change',
 'Thinking, Fast and Slow',
 'Decisive: How to Make Better Choices in Life and Work']

In [39]:
# Clustering-based recommendations using KMeans
def perform_clustering(df, n_clusters=5):
    # Using numerical features for clustering
    X = df[['Rating', 'Number of Reviews', 'Price', 'Rank', 'Listening Time (minutes)']]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(X)
    
    # Save the clustering model using pickle
    with open('C:/Users/Admin/OneDrive/Documents/Book Recommendation/kmeans_clustering_model.pkl', 'wb') as file:
        pickle.dump(kmeans, file)
    
    return kmeans

In [40]:
# Recommend books from the same cluster
def recommend_from_cluster(book_index, df, top_n=5):
    cluster_label = df.iloc[book_index]['cluster']
    cluster_books = df[df['cluster'] == cluster_label].head(top_n)['Book Name'].tolist()
    return cluster_books

In [43]:
recommend_from_cluster(2,clustered_df,3)

['Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now',
 'The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life',
 "Life's Amazing Secrets: How to Find Balance and Purpose in Your Life"]

In [44]:
# Hybrid: Combine content-based and clustering recommendations
def hybrid_recommendation(book_index, cosine_sim, df, top_n=5):
    content_recommendations = recommend_books(book_index, cosine_sim, df, top_n=top_n)
    cluster_recommendations = recommend_from_cluster(book_index, df, top_n=top_n)
    return list(set(content_recommendations + cluster_recommendations))

In [45]:
hybrid_recommendation(4,calculate_similarity(clustered_df),clustered_df,5)

['Conquering Procrastination: How to Stop Stalling and Start Achieving!',
 'Dark Psychology and Manipulation: For a Better Life: The Ultimate Guide to Learning the Art of Persuasion, Emotional Influence, NLP Secrets, Hypnosis, Body Language, and Mind Control Techniques',
 'The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life',
 'The Anatomy of Success: Management Lessons from a Surgeon',
 'Anything You Want: 40 Lessons for a New Kind of Entrepreneur',
 'Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now',
 "Life's Amazing Secrets: How to Find Balance and Purpose in Your Life",
 'As the Crow Flies',
 'Sapiens']

In [46]:
# Precision and Recall calculation
def calculate_precision_recall(recommended, actual):
    recommended_set = set(recommended)
    actual_set = set(actual)

    true_positives = len(recommended_set.intersection(actual_set))
    precision = true_positives / len(recommended_set) if recommended_set else 0
    recall = true_positives / len(actual_set) if actual_set else 0

    return precision, recall

In [47]:
# RMSE Calculation using Linear Regression for rating prediction
def calculate_rmse(df):
    # Train a simple Linear Regression model for rating prediction
    X = df[['Rank', 'Number of Reviews', 'Price', 'Listening Time (minutes)']]
    y = df['Rating']

    model = LinearRegression()
    model.fit(X, y)
    
    # Make predictions
    predicted_ratings = model.predict(X)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y, predicted_ratings)
    
    # Take square root of MSE to get RMSE
    rmse = np.sqrt(mse)
    
    # Save the linear regression model
    with open('C:/Users/Admin/OneDrive/Documents/Book Recommendation/linear_regression_model.pkl', 'wb') as file:
        pickle.dump(model, file)
    
    return rmse

In [48]:
# Step 1: Calculate similarity for content-based filtering
cosine_sim = calculate_similarity(clustered_df)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.03838458, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03838458, 1.        , ..., 0.        , 0.03623307,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.03623307, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [49]:
# Step 2: Perform clustering
perform_clustering(clustered_df)

In [51]:
# Step 3: Get hybrid recommendations for a given book index (example index 0)
book_index = 0
hybrid_recommendations = hybrid_recommendation(book_index, cosine_sim, clustered_df, top_n=5)
print(f'Hybrid Recommendations: {hybrid_recommendations}')

Hybrid Recommendations: ['Thank You for Arguing', 'The Defining Decade: Why Your Twenties Matter and How to Make the Most of Them Now', 'Influence: The Psychology of Persuasion', 'The Classic Tales Podcast, Season Five', 'The Intelligent Investor Rev Ed.', 'Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now', 'Secrets of the Millionaire Mind: Mastering the Inner Game of Wealth', 'The Facebook Effect: The Inside Story of the Company That Is Connecting the World', 'The 5AM Club: Own Your Morning. Elevate Your Life.', 'Das Think Like a Monk-Prinzip: Finde innere Ruhe und Kraft für ein erfülltes und sinnvolles Leben']


In [53]:
list(clustered_df['Book Name'].unique())

['Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now',
 'Ikigai: The Japanese Secret to a Long and Happy Life',
 'The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life',
 'Atomic Habits: An Easy and Proven Way to Build Good Habits and Break Bad Ones',
 "Life's Amazing Secrets: How to Find Balance and Purpose in Your Life",
 'Extraordinary Leadership',
 'Sapiens',
 'The Intelligent Investor Rev Ed.',
 'Rich Dad Poor Dad: What the Rich Teach Their Kids About Money - That the Poor and Middle Class Do Not!',
 'The 5AM Club: Own Your Morning. Elevate Your Life.',
 'Dream With Your Eyes Open: An Entrepreneurial Journey',
 'Influence: The Psychology of Persuasion',
 'The Sandman',
 'The Alchemist: A Fable About Following Your Dream',
 'How to Win Friends and Influence People',
 'Thinking, Fast and Slow',
 'Sherlock Holmes: The Definitive Collection',
 'Becoming',
 'Just Do It: The Nike Spirit in the Corporate World',
 'The

In [54]:
actual_books = ['Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now',
                'Thank You for Arguing','Chanakya Neeti','The 5AM Club: Own Your Morning. Elevate Your Life.',
                'वॉयर - एक कामुक लघुकथा']  # Replace with actual books liked by the user
precision, recall = calculate_precision_recall(hybrid_recommendations, actual_books)
print(f'Precision: {precision}, Recall: {recall}')

Precision: 0.3, Recall: 0.6


In [55]:
# Calculate RMSE
rmse = calculate_rmse(clustered_df)
print(f'RMSE: {rmse}')

RMSE: 0.33754594883402267
