# **TextExplorer - Tahap Pemodelan**

In [None]:
##----------------##
## LUKMANUL HAKIM ##
## A11.2022.14197 ##
##----------------##

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pickle
import os

In [3]:
# Read the book catalogue (path is relative to the notebook's working directory),
# report how many rows were loaded, and preview the first five rows.
df = pd.read_csv('dataset/bookv2.csv')
print(f"Loaded {len(df)} books")
df.head(5)




Loaded 133102 books


Unnamed: 0,asin,title,author,soldBy,imgUrl,productURL,stars,reviews,price,isKindleUnlimited,category_id,isBestSeller,isEditorsPick,isGoodReadsChoice,publishedDate,category
0,B00TZE87S4,Adult Children of Emotionally Immature Parents...,Lindsay C. Gibson,Amazon.com Services LLC,https://m.media-amazon.com/images/I/713KZTsaYp...,https://www.amazon.com/dp/B00TZE87S4,4.8,0,9.99,False,6,True,False,False,2015-06-01,Parenting & Relationships
1,B08WCKY8MB,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,Penguin Group (USA) LLC,https://m.media-amazon.com/images/I/A1LZcJFs9E...,https://www.amazon.com/dp/B08WCKY8MB,4.4,0,16.99,False,6,False,False,False,2022-02-15,Parenting & Relationships
2,B09KPS84CJ,Good Inside: A Guide to Becoming the Parent Yo...,Becky Kennedy,HarperCollins Publishers,https://m.media-amazon.com/images/I/71RIWM0sv6...,https://www.amazon.com/dp/B09KPS84CJ,4.8,0,16.99,False,6,False,True,False,2022-09-13,Parenting & Relationships
3,B07S7QPG6J,Everything I Know About Love: A Memoir,Dolly Alderton,HarperCollins Publishers,https://m.media-amazon.com/images/I/71QdQpTiKZ...,https://www.amazon.com/dp/B07S7QPG6J,4.2,0,9.95,True,6,False,True,False,2020-02-25,Parenting & Relationships
4,B00N6PEQV0,The Seven Principles for Making Marriage Work:...,John Gottman,Random House LLC,https://m.media-amazon.com/images/I/813o4WOs+w...,https://www.amazon.com/dp/B00N6PEQV0,4.7,0,13.99,False,6,False,False,False,2015-05-05,Parenting & Relationships


In [14]:
class BooleanTFIDFSearchEngine:
    """Search book titles with a boolean filter followed by TF-IDF ranking.

    A query is a whitespace-separated list of terms, optionally combined with
    the operators ``AND``, ``OR`` and ``NOT`` (matched case-insensitively).
    The query is evaluated strictly left to right, without precedence or
    parentheses; two adjacent terms with no operator between them are
    combined with ``AND``. Terms are matched as substrings of the
    preprocessed (lower-cased, punctuation-free) titles. Documents that pass
    the filter are then ranked by cosine similarity between their TF-IDF
    vector and the TF-IDF vector of the non-negated query terms.
    """

    # Query tokens that act as boolean operators instead of search terms.
    _OPERATORS = ('AND', 'OR', 'NOT')

    def __init__(self):
        self.vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        self.df = None                # source DataFrame, set by fit()/load_model()
        self.tfidf_matrix = None      # TF-IDF matrix of the processed titles
        self.processed_titles = None  # Series of normalised titles, aligned with df rows
        self.processed_year = None    # 'publishedDate' column with missing values as ''

    def preprocess_text(self, text):
        """Normalise text: cast to ``str``, lower-case, drop punctuation.

        Every character that is neither a word character nor whitespace is
        removed (not replaced by a space), so ``"Self-Help"`` becomes
        ``"selfhelp"``.
        """
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', '', text)
        return text

    def fit(self, df):
        """Build the search index from ``df``.

        Args:
            df: DataFrame with at least the columns ``'title'`` and
                ``'publishedDate'``. It is stored by reference, not copied.

        Returns:
            ``self``, so calls can be chained.
        """
        self.df = df
        self.processed_titles = self.df['title'].fillna('').apply(self.preprocess_text)
        self.processed_year = self.df['publishedDate'].fillna('')
        self.tfidf_matrix = self.vectorizer.fit_transform(self.processed_titles)
        return self

    def _check_fitted(self):
        """Raise ``RuntimeError`` when the index has not been built or loaded."""
        for attr in ('df', 'tfidf_matrix', 'processed_titles'):
            if getattr(self, attr, None) is None:
                raise RuntimeError(
                    "Search engine is not fitted; call fit() or load_model() first."
                )

    def _parse_query(self, query):
        """Split ``query`` into a list of ``(operator, term)`` clauses.

        Each term is normalised with ``preprocess_text`` so that it is
        comparable with the processed titles. The operator attached to a term
        is the last operator token seen before it, or ``'AND'`` when there is
        none. Tokens that are empty after normalisation (pure punctuation)
        are dropped.
        """
        clauses = []
        pending = None
        for token in query.split():
            upper = token.upper()
            if upper in self._OPERATORS:
                pending = upper
                continue
            term = self.preprocess_text(token)
            if term:
                clauses.append((pending or 'AND', term))
            # An operator only applies to the term that directly follows it.
            pending = None
        return clauses

    def _term_mask(self, term):
        """Return a boolean array marking the titles that contain ``term``."""
        titles = self.processed_titles
        if not isinstance(titles, pd.Series):
            titles = pd.Series(list(titles), dtype=object)
        # regex=False: the term is a literal substring, not a pattern.
        return titles.str.contains(term, regex=False, na=False).to_numpy(dtype=bool)

    def _filter_clauses(self, clauses):
        """Evaluate parsed clauses left to right; return matching row positions."""
        selected = None
        for operator, term in clauses:
            hits = self._term_mask(term)
            if selected is None:
                # First term: a leading NOT selects everything except the hits;
                # a leading AND/OR has nothing to combine with yet.
                selected = ~hits if operator == 'NOT' else hits
            elif operator == 'OR':
                selected = selected | hits
            elif operator == 'NOT':
                selected = selected & ~hits
            else:
                selected = selected & hits

        if selected is None:
            # No search terms at all: nothing restricts the result.
            return list(range(len(self.processed_titles)))
        return np.flatnonzero(selected).tolist()

    def boolean_filter(self, query):
        """Return the row positions of the titles that satisfy ``query``.

        Args:
            query: Query string as described in the class docstring.

        Returns:
            Ascending list of integer row positions. When the query contains
            no search terms, every position is returned.
        """
        return self._filter_clauses(self._parse_query(query))

    def calculate_query_similarity(self, query, doc_indices):
        """Score documents against ``query`` using TF-IDF cosine similarity.

        Args:
            query: Free-text query; it is normalised with ``preprocess_text``.
            doc_indices: Row positions of the documents to score.

        Returns:
            List of ``(row_position, similarity)`` tuples in the order of
            ``doc_indices``; an empty list when ``doc_indices`` is empty.
        """
        if len(doc_indices) == 0:
            return []
        processed_query = self.preprocess_text(query)
        query_vector = self.vectorizer.transform([processed_query])
        similarities = cosine_similarity(query_vector,
                                         self.tfidf_matrix[doc_indices]).flatten()
        return list(zip(doc_indices, similarities))

    def search(self, query, min_similarity=0.0):
        """Filter titles with the boolean query, then rank them by TF-IDF similarity.

        Terms negated with ``NOT`` take part in filtering only; they are left
        out of the ranking query so that they cannot influence the scores.

        Args:
            query: Query string as described in the class docstring.
            min_similarity: Results scoring below this value are discarded.

        Returns:
            Copy of the matching rows, sorted by descending score, with an
            added ``'Similarity_Score'`` column. An empty ``DataFrame`` is
            returned when the query has no search terms or nothing matches.

        Raises:
            RuntimeError: If the engine has not been fitted or loaded.
        """
        self._check_fitted()

        clauses = self._parse_query(query)
        if not clauses:
            return pd.DataFrame()

        filtered_indices = self._filter_clauses(clauses)
        if not filtered_indices:
            return pd.DataFrame()

        ranking_query = ' '.join(term for operator, term in clauses
                                 if operator != 'NOT')
        doc_similarities = self.calculate_query_similarity(ranking_query,
                                                           filtered_indices)
        # sorted() is stable, so equally scored rows keep dataset order.
        ranked_docs = sorted(
            ((idx, score) for idx, score in doc_similarities
             if score >= min_similarity),
            key=lambda pair: pair[1],
            reverse=True,
        )
        if not ranked_docs:
            return pd.DataFrame()

        indices, scores = zip(*ranked_docs)
        results = self.df.iloc[list(indices)].copy()
        results['Similarity_Score'] = list(scores)
        return results

    def save_model(self, model_dir='models'):
        """Persist the vectorizer, TF-IDF matrix, processed titles and DataFrame.

        Args:
            model_dir: Target directory; created if it does not exist.

        Raises:
            RuntimeError: If the engine has not been fitted or loaded
                (checked before any file is written).
        """
        self._check_fitted()
        os.makedirs(model_dir, exist_ok=True)

        # Save the vectorizer
        with open(os.path.join(model_dir, 'vectorizer.pkl'), 'wb') as f:
            pickle.dump(self.vectorizer, f)

        # Save the TF-IDF matrix
        with open(os.path.join(model_dir, 'tfidf_matrix.pkl'), 'wb') as f:
            pickle.dump(self.tfidf_matrix, f)

        # Save the processed titles
        with open(os.path.join(model_dir, 'processed_titles.pkl'), 'wb') as f:
            pickle.dump(self.processed_titles, f)

        # Save the DataFrame
        self.df.to_pickle(os.path.join(model_dir, 'books_df.pkl'))

    @classmethod
    def load_model(cls, model_dir='models'):
        """Load an engine previously written by ``save_model``.

        SECURITY: the files are read with pickle, which can execute arbitrary
        code while loading. Only load model directories you trust.

        Args:
            model_dir: Directory containing the saved files.

        Returns:
            A ready-to-use ``BooleanTFIDFSearchEngine`` instance.
        """
        model = cls()

        # Load the vectorizer
        with open(os.path.join(model_dir, 'vectorizer.pkl'), 'rb') as f:
            model.vectorizer = pickle.load(f)

        # Load the TF-IDF matrix
        with open(os.path.join(model_dir, 'tfidf_matrix.pkl'), 'rb') as f:
            model.tfidf_matrix = pickle.load(f)

        # Load the processed titles
        with open(os.path.join(model_dir, 'processed_titles.pkl'), 'rb') as f:
            model.processed_titles = pickle.load(f)

        # Load the DataFrame
        model.df = pd.read_pickle(os.path.join(model_dir, 'books_df.pkl'))

        # processed_year is not stored separately; rebuild it the way fit() does
        # so that a loaded engine has the same attributes as a fitted one.
        if 'publishedDate' in model.df.columns:
            model.processed_year = model.df['publishedDate'].fillna('')

        return model

## **Latih dan Simpan Model**

In [15]:
# Build the index over the loaded catalogue; fit() returns the engine itself,
# so construction and training can be chained.
search_engine = BooleanTFIDFSearchEngine().fit(df)

# Persist the fitted engine to the default 'models' directory.
search_engine.save_model()
print("Model trained and saved successfully!")

Model trained and saved successfully!


## **Uji Coba**

In [19]:
# Restore the engine that was trained and saved above.
loaded_model = BooleanTFIDFSearchEngine.load_model()

# Try a sample search, keeping only results with a similarity of at least 0.3.
test_query = "master"
results = loaded_model.search(test_query, min_similarity=0.3)

print(f"Ada {len(results)} buku untuk kata kunci: {test_query}")
if not results.empty:
    print("\nTop 5:")
    display(results.loc[:, ['title', 'author', 'Similarity_Score', 'publishedDate']].head(5))

Ada 176 buku untuk kata kunci: master

Top 5:


Unnamed: 0,title,author,Similarity_Score,publishedDate
70514,The Master: A Novel,Colm Toibin,0.82226,2010-12-21
95090,"The Power of One Thought: Master Your Mind, Ma...",BK Shivani,0.703129,2023-07-01
96511,Master Your Money,Christopher Noria,0.682804,2023-09-21
60070,Dream Master 3,Logan Jacobs,0.653276,
75575,The Master Teacher Within,James F Twyman,0.612321,2023-03-21
