# Preparing Import and Func

In [1]:
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text_id(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stop words
    stop_words = set(stopwords.words('indonesian'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

def preprocess_text_id(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [5]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
path_datasetIndo = '/content/drive/Shared drives/Capstone Project/Product Capstone/POV ML/Datasets/Books_ID_Data_Fix.csv'
path_datasetLN = '/content/drive/Shared drives/Capstone Project/Product Capstone/POV ML/Datasets/Books_Eng_Data_Fix.csv'

In [6]:
dataset_id = pd.read_csv(path_datasetIndo)

In [11]:
dataset_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6948 entries, 0 to 6947
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   books_id        6948 non-null   int64  
 1   language_code   6948 non-null   object 
 2   average_rating  6948 non-null   float64
 3   deskripsi       6948 non-null   object 
 4   penerbit        6948 non-null   object 
 5   jml_halaman     6948 non-null   float64
 6   tahun_terbit    6948 non-null   float64
 7   url             6948 non-null   object 
 8   image           6948 non-null   object 
 9   judul           6948 non-null   object 
 10  penulis         6948 non-null   object 
 11  isbn            6948 non-null   object 
 12  genre           6948 non-null   object 
dtypes: float64(3), int64(1), object(9)
memory usage: 705.8+ KB


In [22]:
dataset_id_numeric = dataset_id[["average_rating","jml_halaman","tahun_terbit"]]
dataset_id_desc = dataset_id["deskripsi"]
dataset_id_genre = dataset_id["genre"]

In [None]:
dataset_id_numeric.info()
dataset_id_desc.info()
dataset_id_genre.info()

# Creating Synthetic User Data

# Preprocessing Data

## Preprocess Numeric

## Preprocess Desc

In [12]:
proceed_dataset_id_desc = dataset_id_desc.apply(preprocess_text_id)

In [17]:
word_vector = Word2Vec(sentences=proceed_dataset_id_desc, vector_size=100, window=10, min_count=1, workers=4)

In [15]:
len(word_vector.wv)

54486

In [18]:
word_vector.wv[0]

array([-0.26818633,  0.4809145 ,  0.99416447,  1.1659833 , -1.4649794 ,
       -2.6219893 ,  1.4568212 ,  2.8071933 , -1.8088096 , -2.327324  ,
        0.53710043, -0.07547704, -0.9130683 ,  0.69559443, -0.5554993 ,
       -1.1307062 ,  0.7356636 , -0.4344965 , -1.1287323 , -2.1643846 ,
        0.1433782 , -0.52693903,  1.3050339 , -0.02256546,  1.3058091 ,
        0.73814595, -1.9950593 ,  0.24888307, -0.56531787,  0.88593733,
       -0.78803533, -0.02629294,  0.75345665, -1.8531111 , -0.6683318 ,
        1.0391866 , -1.0069298 , -0.9083378 , -0.5605741 , -2.7719188 ,
       -1.0793867 ,  0.47234607, -1.2530079 , -0.63365215,  0.03567794,
       -1.5402477 , -1.2844883 ,  0.37321773,  0.39392716,  1.5933696 ,
       -0.08912779,  0.34408924, -0.6279506 , -0.1006326 , -0.00641139,
        0.24618377,  0.51887774, -1.2539822 , -0.6880233 ,  1.2707222 ,
       -0.27506223, -0.73909503,  0.8353189 , -1.5501755 , -1.5809695 ,
        2.1382284 ,  0.82622933, -0.64625776, -2.4837706 ,  0.93

In [33]:
vocab_size = len(word_vector.wv)
embedding_dim = word_vector.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
word_index = {word: idx for idx, word in enumerate(word_vector.wv.index_to_key)}
for word, idx in word_index.items():
    embedding_matrix[idx] = word_vector.wv[word]

In [19]:
similar_words = word_vector.wv.most_similar('hidup', topn=10)
print(similar_words)

[('kepahitan', 0.9303456544876099), ('harapan', 0.9290563464164734), ('kebahagiaan', 0.9282523989677429), ('segala', 0.8981820344924927), ('impian', 0.8913659453392029), ('mampu', 0.8878403306007385), ('mengubah', 0.8823624849319458), ('terdalam', 0.8702794909477234), ('mimpi', 0.8686023950576782), ('kekuatan', 0.8685709834098816)]


In [20]:
id_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = 5000)
id_tokenizer.fit_on_texts(proceed_dataset_id_desc)

In [35]:
seq = id_tokenizer.texts_to_sequences(proceed_dataset_id_desc)
max_length = 100
pad = tf.keras.utils.pad_sequences(seq, maxlen=max_length)

## Preprocess Genre

In [26]:
from collections import Counter
import ast
def split_genres(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        if isinstance(genres, list):
            return genres
        else:
            return [genre_str]
    except:
        return [genre_str]

def count_unique_genres(df):
    all_genres = [genre for sublist in df for genre in sublist]
    genre_counts = Counter(all_genres)

    return genre_counts

In [41]:
split_dataset_id_genre = dataset_id_genre.apply(split_genres)
genre_counts = count_unique_genres(split_dataset_id_genre)

# Konversi hasil ke DataFrame
genre_counts_df = pd.DataFrame(genre_counts.items(), columns=['genre', 'jumlah_buku'])
genre_counts_df = genre_counts_df.sort_values(by='jumlah_buku', ascending=False)
genre_counts_df.head(50)

Unnamed: 0,genre,jumlah_buku
3,Fiction,2523
0,Romance,1754
4,Indonesian Literature,1066
18,Manga,1034
22,Fantasy,931
12,Young Adult,916
2,Novels,792
7,Comics,728
24,Mystery,610
61,Nonfiction,594


# Books Model


## Desc Model


In [None]:
book_desc_input = tf.keras.layers.Input(shape=(max_length,), name='desc_input')
embedding_layer_desc = tf.keras.layers.Embedding(input_dim=vocab_size,
                                                  output_dim=embedding_dim,
                                                  weights=[embedding_matrix],
                                                  trainable=False)(book_desc_input)
LSTM_desc_layer = tf.keras.layers.LSTM(50)(embedding_layer_desc)
Dense_desc_layer = tf.keras.layers.Dense(50)(LSTM_desc_layer)
Normalized_output_layer = tf.keras.layers.BatchNormalization()(Dense_desc_layer)

tf.keras.models.Model(book_desc_input,Normalized_output_layer).summary()

# Users Model