<a href="https://colab.research.google.com/github/Choudharynipun/Book_reccomendation_system/blob/main/Book_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input mount and print the full path of every file found there.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_files:
    print(path)

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# NOTE(review): `<book_recommendation_data>` looks like an unfilled placeholder,
# not a real dataset identifier — replace it with the actual dataset slug
# before running this cell.
# The `name = !command` form captures the command's console output lines, so
# `data` holds the CLI's printed text rather than the dataset contents.
data = ! kaggle datasets download <book_recommendation_data>

In [None]:
from pathlib import Path

# `data` from the download cell is only the captured console output of the
# kaggle CLI (a list of strings), so it has no .shape/.columns/.head; the
# table itself has to be read from the downloaded file.
# NOTE(review): assumes the CLI left a `*.zip` archive containing a single CSV
# in the working directory — confirm the file name/layout for this dataset.
archives = sorted(Path('.').glob('*.zip'))
if not archives:
    raise FileNotFoundError(
        "No downloaded dataset archive (*.zip) found in the working directory"
    )
df = pd.read_csv(archives[0])

# Overall size of the table.
num_rows, num_cols = df.shape
print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

# Column labels as a plain list.
column_names = df.columns.tolist()
print("Column names:", column_names)

# First ten rows as a quick preview.
sample_data = df.head(10)
print("Sample data:")
print(sample_data)

# Summary statistics as produced by DataFrame.describe().
data_stats = df.describe()
print("Dataset statistics:")
print(data_stats)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the `average_rating` column using 20 bins.
ax = plt.gca()
ax.hist(df['average_rating'], bins=20, color='skyblue', edgecolor='black')
ax.set(
    title='Distribution of Average Ratings',
    xlabel='Average Rating',
    ylabel='Frequency',
)
plt.show()


In [None]:
# Scatter plot of `num_pages` (x) against `ratings_count` (y), half-transparent markers.
ax = plt.gca()
ax.scatter(df['num_pages'], df['ratings_count'], alpha=0.5, color='orange')
ax.set(
    title='Number of Pages vs Ratings Count',
    xlabel='Number of Pages',
    ylabel='Ratings Count',
)
plt.show()


In [None]:
import seaborn as sns
# Count of rows per `published_year` value on a 12x6 inch figure.
fig = plt.figure(figsize=(12, 6))
ax = fig.gca()
sns.countplot(x=df['published_year'], palette='viridis', ax=ax)
ax.set(
    title='Distribution of Published Years',
    xlabel='Published Year',
    ylabel='Count',
)
# Tilt the year labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# The ten rows with the largest `average_rating`.
top_rated_books = df.nlargest(10, 'average_rating')

# Horizontal bars: rating on x, title on y, coloured by the `categories` column.
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()
sns.barplot(
    data=top_rated_books,
    x='average_rating',
    y='title',
    hue='categories',
    palette='coolwarm',
    ax=ax,
)
ax.set(
    title='Top 10 Books with the Highest Ratings by Genre',
    xlabel='Average Rating',
    ylabel='Book Title',
)
ax.legend(title='Genre', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# The ten most frequent values of the `authors` column with their row counts.
top_authors = df['authors'].value_counts().head(10)

# Vertical bar chart of those counts on a 12x6 inch figure.
fig = plt.figure(figsize=(12, 6))
ax = fig.gca()
top_authors.plot(kind='bar', color='salmon', ax=ax)
ax.set(
    title='Top 10 Authors and Their Book Counts',
    xlabel='Authors',
    ylabel='Book Counts',
)
plt.show()

In [None]:
# Assign the filled columns back instead of calling fillna(inplace=True) on
# df[...]: the in-place form works on an intermediate object, so it is not
# guaranteed to update df (pandas warns about this chained pattern).
df['subtitle'] = df['subtitle'].fillna('')
df['description'] = df['description'].fillna('')

# astype('int', errors='ignore') silently leaves the column untouched when any
# year is missing. The nullable Int64 dtype converts the years to integers and
# keeps missing/unparseable entries as <NA>.
df['published_year'] = (
    pd.to_numeric(df['published_year'], errors='coerce')
    .round()
    .astype('Int64')
)


In [None]:
# Join title, subtitle and description into one text field per row.
# Each part is filled with '' first: string concatenation with a missing value
# yields NaN, which would make the whole row's text NaN and break the
# tokenising steps that consume `full_text`.
df['full_text'] = (
    df['title'].fillna('')
    + ' ' + df['subtitle'].fillna('')
    + ' ' + df['description'].fillna('')
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Vectorise every `full_text` entry with TF-IDF, dropping English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
corpus = df['full_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Pairwise dot products between all rows of the TF-IDF matrix.
# NOTE(review): named "cosine" presumably because the vectoriser's default
# settings normalise each row — confirm if the vectoriser options change.
cosine_similarities = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [None]:
pip install gensim

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize needs NLTK's Punkt tokenizer data. Newer NLTK releases look
# for the 'punkt_tab' resource rather than the older 'punkt' one, so fetch
# both to keep the cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# One list of tokens per row of `full_text`.
tokenized_text = df['full_text'].apply(word_tokenize)

# Train 100-dimensional word vectors on the tokenised texts; min_count=1 keeps
# every token that occurs at least once.
# NOTE(review): with workers=4 training is multi-threaded, so the learned
# vectors may differ slightly from run to run.
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
import numpy as np

def embed_text(text):
    """Turn a piece of text into a single averaged word vector.

    The text is split with ``word_tokenize``; every token that is present in
    ``word2vec_model.wv`` contributes its vector, and the element-wise mean of
    those vectors is returned. When no token is known to the model, a zero
    vector of length ``word2vec_model.vector_size`` is returned instead.
    """
    keyed_vectors = word2vec_model.wv

    known_vectors = []
    for token in word_tokenize(text):
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Nothing to average: fall back to an all-zero embedding.
    return np.zeros(word2vec_model.vector_size)

from sklearn.metrics.pairwise import cosine_similarity

# One averaged word vector (see embed_text) per row of `full_text`.
embedded_texts = df['full_text'].apply(embed_text)
df['word_embeddings'] = embedded_texts

# Pairwise cosine similarity between every pair of row embeddings.
cosine_similarities_word2vec = cosine_similarity(list(embedded_texts))

In [None]:
def get_recommendations_word2vec(book_title, top_n=10):
    """Return the titles of the books most similar to ``book_title``.

    Similarities are read from the module-level ``cosine_similarities_word2vec``
    matrix, whose row/column positions are taken to follow the row order of
    ``df``.

    Args:
        book_title: Value looked up in ``df['title']`` by exact equality. When
            several rows share the title, the first matching row is used.
        top_n: Maximum number of titles to return (default 10).

    Returns:
        The ``df['title']`` entries of up to ``top_n`` other rows, ordered from
        most to least similar.

    Raises:
        IndexError: If no row of ``df`` has the given title.
    """
    # Use the row *position*, not the index label: both the similarity matrix
    # and .iloc below are positional, so a label is wrong whenever df does not
    # carry a default 0..n-1 index.
    positions = np.flatnonzero((df['title'] == book_title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"No book titled {book_title!r} found in df['title']")
    idx = int(positions[0])

    scores = np.asarray(cosine_similarities_word2vec[idx])
    # Descending by score; the stable sort keeps the earlier row first on ties.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it is ranked first:
    # another row with an equal top score could otherwise push it into the results.
    book_indices = ranked[ranked != idx][:max(top_n, 0)]
    return df['title'].iloc[book_indices]

In [None]:
# Example usage: look up one title and print the titles returned for it.
# get_recommendations_word2vec matches df['title'] by exact equality, so
# `book_title` has to equal a title in the data exactly; otherwise the lookup
# finds no row and raises.
book_title = 'Star Wars'
recommendations_word2vec = get_recommendations_word2vec(book_title)
print(recommendations_word2vec)