In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then load the English list as a set.
# NOTE: a later cell rebinds `stop_words` to a small hand-written set, so this
# value is only in effect until that cell runs.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arifa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the book catalogue. on_bad_lines='skip' makes pandas drop rows it
# cannot parse instead of raising, so malformed lines disappear silently.
books = pd.read_csv(
    'data/data.csv',
    encoding='latin-1',
    on_bad_lines='skip',
    low_memory=False,
)

In [3]:
# Narrow the frame to the identifier, descriptive and cover-image columns.
books = books[[
    'isbn10',
    'title',
    'authors',
    'categories',
    'published_year',
    'thumbnail',
]]

Remove bad words and skip lines

In [5]:
# Normalise the cover-image column name to "thumbnail".
# Reassign instead of mutating in place so re-running the cell is harmless.
# NOTE: the column selection earlier in this notebook keeps no `image_url`
# column, and `rename` ignores missing labels by default, so this currently
# changes nothing.
books = books.rename(columns={"image_url": "thumbnail"})

In [6]:
# Preview the first five rows of the trimmed frame.
books.head(n=5)

Unnamed: 0,isbn10,title,authors,categories,published_year,thumbnail
0,2005883,Gilead,Marilynne Robinson,Fiction,2004.0,http://books.google.com/books/content?id=KQZCP...
1,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,2000.0,http://books.google.com/books/content?id=gA5GP...
2,6163831,The One Tree,Stephen R. Donaldson,American fiction,1982.0,http://books.google.com/books/content?id=OmQaw...
3,6178731,Rage of angels,Sidney Sheldon,Fiction,1993.0,http://books.google.com/books/content?id=FKo2T...
4,6280897,The Four Loves,Clive Staples Lewis,Christian life,2002.0,http://books.google.com/books/content?id=XhQ5X...


In [7]:
import pandas as pd
import re

def preprocess_text(text):
    """Normalise a string to lowercase letters separated by single spaces.

    Everything that is not an ASCII letter or whitespace (digits, punctuation,
    accented characters) is deleted, whitespace runs are collapsed to one
    space, and leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    collapsed = re.sub(r'\s+', ' ', letters_and_spaces)
    return collapsed.strip()

# Example DataFrame (replace with your actual books DataFrame)
# books = pd.read_csv('data/data.csv')

# Columns that feed the text representation of each book.
text_columns = ['title', 'authors', 'categories', 'published_year']

# Replace missing values with '' and force every source column to str so the
# string concatenation below cannot fail.
for column in text_columns:
    books[column] = books[column].fillna('').astype(str)

# Join the source columns with single spaces, in the order listed above.
combined = books[text_columns[0]]
for column in text_columns[1:]:
    combined = combined + ' ' + books[column]

# Clean the joined text.
# NOTE(review): preprocess_text deletes every non-letter character, so the
# digits of `published_year` never reach `content` -- confirm this is intended.
books['content'] = combined.apply(preprocess_text)



In [19]:
from collections import Counter
from math import log

def preprocess(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def compute_tf(doc):
    """Compute the relative frequency of every token in one document.

    Args:
        doc: Sequence of tokens.

    Returns:
        Dict mapping each distinct token to ``count / len(doc)``. An empty
        document yields an empty dict.
    """
    n_tokens = len(doc)
    frequencies = {}
    for token, occurrences in Counter(doc).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies

def compute_idf(corpus):
    """Compute smoothed inverse document frequencies for a tokenised corpus.

    Args:
        corpus: List of documents, each a list of tokens.

    Returns:
        Dict mapping every term that occurs in the corpus to
        ``log((N + 1) / (df + 1)) + 1``, where ``N`` is the number of
        documents and ``df`` the number of documents containing the term.
        An empty corpus yields an empty dict.
    """
    num_docs = len(corpus)
    # One pass over the corpus: `set(doc)` makes each term count once per
    # document, instead of rescanning every document list for every term.
    doc_counts = Counter(term for doc in corpus for term in set(doc))
    return {
        term: log((num_docs + 1) / (doc_count + 1)) + 1  # +1 smoothing
        for term, doc_count in doc_counts.items()
    }

def compute_tfidf(tf, idf):
    """Weight one document's term frequencies by the corpus IDF values.

    Args:
        tf: Dict of term -> term frequency for a single document.
        idf: Dict of term -> inverse document frequency; must contain every
            term in ``tf`` (a missing term raises ``KeyError``).

    Returns:
        Dict of term -> ``tf[term] * idf[term]`` for the terms in ``tf``.
    """
    weighted = {}
    for term, frequency in tf.items():
        weighted[term] = frequency * idf[term]
    return weighted


# Tokenise the `content` column; missing content becomes an empty document.
books['content'] = books['content'].fillna('')
processed_corpus = [preprocess(text) for text in books['content']]

# Per-document term frequencies and corpus-wide inverse document frequencies.
tf_list = [compute_tf(doc) for doc in processed_corpus]
idf = compute_idf(processed_corpus)

# TF-IDF weights per document; each dict holds only that document's terms.
tfidf_list = [compute_tfidf(tf, idf) for tf in tf_list]

# Sorted vocabulary fixes the column order of the matrix.
vocab = sorted(idf.keys())
term_to_column = {term: column for column, term in enumerate(vocab)}

# Fill the dense (documents x vocabulary) array directly. Absent terms stay 0.
# This avoids building one vocabulary-sized dict per document first.
tfidf_matrix = np.zeros((len(tfidf_list), len(vocab)), dtype=float)
for row, tfidf in enumerate(tfidf_list):
    for term, weight in tfidf.items():
        tfidf_matrix[row, term_to_column[term]] = weight

# NOTE: this is not identical to scikit-learn's default TfidfVectorizer output:
# TF here is divided by document length and rows are not L2-normalised. Both
# differences only rescale rows, which cosine similarity ignores.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the TF-IDF matrix.
# With a single argument scikit-learn compares the rows against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)

Function built by Arif Asyraf

---Takes a book title as input, finds similar books, and returns the titles of those other books.---

In [11]:
def recommend_books(title, n_recommendations=5):
    """Return the titles of the books most similar to the first title match.

    Reads the notebook-level ``books`` frame and ``cosine_sim`` matrix.
    Assumes row ``i`` of ``cosine_sim`` describes the book at position ``i``
    of ``books`` -- TODO confirm if ``books`` is ever filtered or re-sorted
    after the matrix is built.

    Args:
        title: Text to look for inside ``books['title']``. Matched as a
            literal, case-insensitive substring; the first match is used.
        n_recommendations: Maximum number of titles to return.

    Returns:
        A list of titles ordered from most to least similar, or the string
        "Book not found in the database." when no title contains ``title``.
    """
    # regex=False: treat the query as plain text so titles such as "C++" or
    # "(Untitled" are not parsed as regular expressions.
    # na=False: missing titles simply do not match.
    matches = books['title'].str.contains(title, case=False, regex=False, na=False)
    positions = matches.to_numpy().nonzero()[0]
    if len(positions) == 0:
        return "Book not found in the database."

    # Use the row position, not the index label: the similarity matrix and
    # the `.iloc` lookup below are both positional.
    idx = int(positions[0])
    scores = cosine_sim[idx]

    # Exclude the query book explicitly instead of assuming it sorts first;
    # an identical book earlier in the frame would otherwise push the query
    # book itself into the results. list.sort is stable, so ties keep their
    # original row order.
    candidates = [i for i in range(len(scores)) if i != idx]
    candidates.sort(key=lambda i: scores[i], reverse=True)
    top = candidates[:max(n_recommendations, 0)]

    return books['title'].iloc[top].tolist()

# Example usage: look up a placeholder title and print whatever comes back.
example_result = recommend_books("Some Book Title")
print(example_result)


Book not found in the database.


Creating TF-IDF model

In [13]:
# Hand-written set of common English words to drop during tokenisation.
# Rebinds `stop_words`, replacing any value assigned to that name earlier.
stop_words = {
    'the', 'is', 'in', 'and', 'to', 'of',
    'a', 'for', 'it', 'on', 'this', 'that',
    'with', 'as', 'was', 'by', 'an', 'be',
    'or', 'at', 'are', 'but', 'from', 'not',
}

def preprocess(text):
    """Lowercase ``text``, split on whitespace, and drop stop words.

    Membership is tested against the notebook-level ``stop_words`` collection.

    Returns:
        List of the remaining tokens in their original order.
    """
    tokens = []
    for word in text.lower().split():
        if word in stop_words:
            continue
        tokens.append(word)
    return tokens

def compute_tf(doc):
    """Return each token's share of the document.

    Args:
        doc: Sequence of tokens for one document.

    Returns:
        Dict of token -> occurrences divided by the document length; empty
        for an empty document.
    """
    doc_length = len(doc)
    counts = Counter(doc)
    shares = {}
    for token in counts:
        shares[token] = counts[token] / doc_length
    return shares

def compute_idf(corpus):
    """Compute smoothed inverse document frequencies for a tokenised corpus.

    Args:
        corpus: List of documents, each a list of tokens.

    Returns:
        Dict of term -> ``log((N + 1) / (df + 1)) + 1`` for every term in the
        corpus, with ``N`` the number of documents and ``df`` the number of
        documents containing the term. Empty when the corpus has no terms.
    """
    num_docs = len(corpus)

    # Count, in a single pass, how many documents contain each term. Using
    # set(doc) counts a term once per document and avoids rescanning every
    # document list for every vocabulary term.
    doc_counts = Counter()
    for doc in corpus:
        doc_counts.update(set(doc))

    idf = {}
    for term, doc_count in doc_counts.items():
        idf[term] = log((num_docs + 1) / (doc_count + 1)) + 1  # Smoothing
    return idf

def compute_tfidf(tf, idf):
    """Multiply each term frequency of a document by that term's IDF.

    Args:
        tf: Dict of term -> term frequency for one document.
        idf: Dict of term -> inverse document frequency. A term present in
            ``tf`` but absent here raises ``KeyError``.

    Returns:
        Dict of term -> TF-IDF weight, covering exactly the terms in ``tf``.
    """
    return dict((term, weight * idf[term]) for term, weight in tf.items())

# Tokenise the `content` column (stop words removed by `preprocess`);
# missing content becomes an empty document.
books['content'] = books['content'].fillna('')
processed_corpus = [preprocess(text) for text in books['content']]

# Per-document term frequencies and corpus-wide inverse document frequencies.
tf_list = [compute_tf(doc) for doc in processed_corpus]
idf = compute_idf(processed_corpus)

# TF-IDF weights per document; each dict holds only that document's terms.
tfidf_list = [compute_tfidf(tf, idf) for tf in tf_list]

# Sorted vocabulary gives a stable column order for the matrix.
vocab = sorted(idf.keys())
term_to_column = {term: column for column, term in enumerate(vocab)}

# Write each document's weights straight into a dense
# (documents x vocabulary) array; terms a document lacks stay at 0.
# This avoids building one vocabulary-sized dict per document first.
tfidf_matrix = np.zeros((len(tfidf_list), len(vocab)), dtype=float)
for row, tfidf in enumerate(tfidf_list):
    for term, weight in tfidf.items():
        tfidf_matrix[row, term_to_column[term]] = weight

Calculate Similarity 

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every TF-IDF row against every other row; the result
# has one row and one column per book.
similarity_matrix = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

In [17]:
import pickle

# Persist the TF-IDF state: the IDF weights and the sorted vocabulary.
tfidf_artifacts = {"idf": idf, "vocab": vocab}
with open('tfidf_model.pkl', 'wb') as model_file:
    pickle.dump(tfidf_artifacts, model_file)

# Persist the pairwise similarity matrix.
# NOTE: only unpickle these files from a trusted source; loading a pickle can
# execute arbitrary code.
with open('similarity_matrix.pkl', 'wb') as matrix_file:
    pickle.dump(similarity_matrix, matrix_file)
