In [1]:
!pip install -q tensorflow-recommenders

In [2]:
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_recommenders as tfrs
from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
from wordcloud import WordCloud
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [3]:
rating_df = pd.read_csv('..\Dataset\dataset_dummy.csv')
rating_df.head(100)

Unnamed: 0,user_id,nama,genre_seni,genre_seni_id,rating
0,1,User_1,Renaissance,9,3
1,1,User_1,Abstract,1,5
2,1,User_1,Impressionism,5,9
3,1,User_1,Dadaism,3,7
4,2,User_2,Fauvism,4,5
...,...,...,...,...,...
95,18,User_18,Dadaism,3,1
96,18,User_18,Renaissance,9,3
97,18,User_18,Nouveau,6,1
98,18,User_18,Surrealism,10,2


In [4]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552 entries, 0 to 551
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   user_id        552 non-null    int64 
 1   nama           552 non-null    object
 2   genre_seni     552 non-null    object
 3   genre_seni_id  552 non-null    int64 
 4   rating         552 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 21.7+ KB


In [5]:
# Melihat jumlah masing-masing nilai unik dalam kolom "genre_seni"
genre_counts = rating_df['genre_seni'].value_counts()

# Membuat dataframe dari nilai unik dan jumlah kemunculannya
genre_counts_df = pd.DataFrame(genre_counts.items(), columns=[
                               'Genre Seni', 'Jumlah'])

# Mencetak dataframe
print(genre_counts_df)

      Genre Seni  Jumlah
0  Impressionism      61
1        Dadaism      60
2        Fauvism      59
3    Renaissance      57
4         Cubism      56
5        Realism      55
6     Surrealism      52
7        Nouveau      52
8            Pop      51
9       Abstract      49


In [6]:
# Menghitung jumlah unique user_id
jumlah_unique_user = rating_df['user_id'].nunique()

# Mencetak jumlah unique user
print("Jumlah Unique User:", jumlah_unique_user)

Jumlah Unique User: 100


In [7]:
# Menghitung jumlah masing-masing rating dan mengurutkannya secara ascending
jumlah_rating = rating_df['rating'].value_counts().sort_index()

# Mencetak jumlah masing-masing rating yang sudah diurutkan secara ascending
print("Jumlah Masing-masing Rating:")
print(jumlah_rating)

Jumlah Masing-masing Rating:
rating
1     51
2     61
3     48
4     56
5     60
6     50
7     53
8     60
9     53
10    60
Name: count, dtype: int64


In [8]:
genre_seni_df = rating_df[['genre_seni_id', 'genre_seni']]

In [9]:
unique_genre_ids = genre_seni_df['genre_seni_id'].unique()

In [10]:
unique_genres = genre_seni_df['genre_seni'].unique()

In [11]:
genre_df = pd.DataFrame(
    {'genre_seni_id': unique_genre_ids, 'genre_seni': unique_genres})
genre_df = genre_df.sort_values('genre_seni_id')
genre_df.head(10)

Unnamed: 0,genre_seni_id,genre_seni
1,1,Abstract
7,2,Cubism
3,3,Dadaism
4,4,Fauvism
2,5,Impressionism
9,6,Nouveau
6,7,Pop
5,8,Realism
0,9,Renaissance
8,10,Surrealism


In [12]:
rating_df.head(10)

Unnamed: 0,user_id,nama,genre_seni,genre_seni_id,rating
0,1,User_1,Renaissance,9,3
1,1,User_1,Abstract,1,5
2,1,User_1,Impressionism,5,9
3,1,User_1,Dadaism,3,7
4,2,User_2,Fauvism,4,5
5,3,User_3,Realism,8,3
6,3,User_3,Pop,7,1
7,3,User_3,Renaissance,9,2
8,3,User_3,Impressionism,5,7
9,3,User_3,Fauvism,4,1


In [13]:
rating_df['user_id'] = rating_df['user_id'].astype(str)
genre_df['genre_seni_id'] = rating_df['genre_seni_id'].astype(str)

ratings = tf.data.Dataset.from_tensor_slices(
    dict(rating_df[['user_id', 'genre_seni', 'rating']]))

genres = tf.data.Dataset.from_tensor_slices(
    dict(genre_df[['genre_seni']]))

ratings = ratings.map(lambda x: {
    "genre_seni": x["genre_seni"],
    "user_id": x["user_id"],
    "rating": int(x["rating"])
})

genres = genres.map(lambda x: x["genre_seni"])

In [14]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552 entries, 0 to 551
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   user_id        552 non-null    object
 1   nama           552 non-null    object
 2   genre_seni     552 non-null    object
 3   genre_seni_id  552 non-null    int64 
 4   rating         552 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 21.7+ KB


In [15]:
genre_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 1 to 8
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   genre_seni_id  10 non-null     object
 1   genre_seni     10 non-null     object
dtypes: object(2)
memory usage: 240.0+ bytes


In [16]:
SEED = 8
SHUFFLE = 16
BATCH_SIZE = 100

In [17]:
# Set the seed for reproducibility
tf.random.set_seed(SEED)
# shuffled = ratings.shuffle(SHUFFLE, seed=SEED, reshuffle_each_iteration=False)

# Calculate the size of the training and testing sets
train_size = int(len(ratings) * 0.9)
test_size = len(ratings) - train_size

# Split the dataset
train = ratings.take(train_size)
test = ratings.skip(train_size).take(test_size)

# Print the sizes of the training and testing sets
print('Training set size:', len(train))
print('Testing set size:', len(test))

Training set size: 496
Testing set size: 56


In [18]:
genre_seni = genres.batch(BATCH_SIZE)
user_ids = ratings.batch(BATCH_SIZE).map(lambda x: x["user_id"])

unique_genre_titles = np.unique(np.concatenate(list(genre_seni)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

print('Unique Movies: {}'.format(len(unique_genre_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 10
Unique users: 100


In [19]:
class GenreModels(tfrs.models.Model):

    def __init__(self, rating_weight: int, retrieval_weight: int) -> None:
        # We take the loss weights in the constructor: this allows us to instantiate
        # several model objects with different loss weights.

        super().__init__()

        embedding_dimension = 64

        # User and movie models.
        self.genre_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_genre_titles, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_genre_titles) + 1, embedding_dimension)
        ])
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_user_ids) + 1, embedding_dimension)
        ])

        # A small model to take in user and movie embeddings and predict ratings.
        # We can make this as complicated as we want as long as we output a scalar
        # as our prediction.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1)
        ])

        # The tasks.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=genres.batch(128).map(self.genre_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the movie features and pass them into the movie model.
        genre_embeddings = self.genre_model(features["genre_seni"])

        return (
            user_embeddings,
            genre_embeddings,
            # We apply the multi-layered rating model to a concatentation of
            # user and movie embeddings.
            self.rating_model(
                tf.concat([user_embeddings, genre_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        ratings = features.pop("rating")

        user_embeddings, genre_embeddings, rating_predictions = self(features)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, genre_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

In [20]:
model = GenreModels(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

cached_train = train.shuffle(SHUFFLE).batch(BATCH_SIZE).cache()
cached_test = test.batch(BATCH_SIZE).cache()

model.fit(cached_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x20eb9eb6520>

In [21]:
metrics = model.evaluate(cached_test, return_dict=True)

print(
    f"\nRetrieval top-5 accuracy: {metrics['factorized_top_k/top_5_categorical_accuracy'] * 100:.2f}%")
print(f"RMSE: {metrics['root_mean_squared_error']:.2f}")


Retrieval top-5 accuracy: 48.21%
RMSE: 3.28


In [22]:
def predict_genres(user, top_n=5):
    # Create a model that takes in raw query features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

    # Recommend genres out of the entire genres dataset.
    index.index_from_dataset(
        tf.data.Dataset.zip(
            (genres.batch(100), genres.batch(100).map(model.genre_model)))
    )

    # Ensure k doesn't exceed the number of unique genres
    num_genres = len(unique_genre_titles)
    k = min(top_n, num_genres)

    # Get recommendations.
    _, titles = index(tf.constant([str(user)]), k=k)

    print(f'Top {top_n} recommendations genre for user {user}:\n')
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print(f'{i+1}. {title.decode("utf-8")}')


def predict_rating(user, genre):
    trained_genre_embeddings, trained_user_embeddings, predicted_rating = model({
        "user_id": np.array([str(user)]),
        "genre_title": np.array([genre])
    })
    print(f"Predicted rating for {genre}: {predicted_rating.numpy()[0][0]}")

In [23]:
rating_df[rating_df['user_id'] == '1']

Unnamed: 0,user_id,nama,genre_seni,genre_seni_id,rating
0,1,User_1,Renaissance,9,3
1,1,User_1,Abstract,1,5
2,1,User_1,Impressionism,5,9
3,1,User_1,Dadaism,3,7


In [24]:
predict_genres(1, 5)

Top 5 recommendations genre for user 1:

1. Abstract
2. Dadaism
3. Impressionism
4. Renaissance
5. Nouveau


In [25]:
model.save_weights('tfrs.h5')

In [26]:
# Load the model weights
model.load_weights('tfrs.h5')

# Define genres dataset and unique_genre_titles if not already defined
# Example placeholder (you need to replace this with actual data)
unique_genre_titles = np.unique(np.concatenate(list(genre_seni)))

# Create a tf.data.Dataset of genres
genres = tf.data.Dataset.from_tensor_slices(unique_genre_titles)


def predict_genres(user, top_n=5):
    # Create a model that takes in raw query features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

    # Recommend genres out of the entire genres dataset.
    index.index_from_dataset(
        tf.data.Dataset.zip(
            (genres.batch(100), genres.batch(100).map(model.genre_model)))
    )

    # Ensure k doesn't exceed the number of unique genres
    num_genres = len(unique_genre_titles)
    k = min(top_n, num_genres)

    # Get recommendations.
    _, titles = index(tf.constant([str(user)]), k=k)

    print(f'Top {top_n} recommendations genre for user {user}:\n')
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print(f'{i+1}. {title.decode("utf-8")}')


# Example usage:
# Replace 'example_user_id' with the actual user ID you want to predict for.
predict_genres('1', top_n=5)

Top 5 recommendations genre for user 1:

1. Abstract
2. Dadaism
3. Impressionism
4. Renaissance
5. Nouveau


In [27]:
# import numpy as np
# import tensorflow as tf
# import tensorflow_recommenders as tfrs

# # Load the model weights
# model.load_weights('tfrs.h5')

# # Define genres dataset and unique_genre_titles if not already defined
# # Example placeholder (replace this with actual data)
# unique_genre_titles = np.unique(np.concatenate(list(genre_seni)))

# # Create a tf.data.Dataset of genres
# genres = tf.data.Dataset.from_tensor_slices(unique_genre_titles)


# def predict_genres(user, top_n=5):
#     # Create a model that takes in raw query features, and
#     index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

#     # Recommend genres out of the entire genres dataset.
#     index.index_from_dataset(
#         tf.data.Dataset.zip(
#             (genres.batch(100), genres.batch(100).map(model.genre_model)))
#     )

#     # Ensure k doesn't exceed the number of unique genres
#     num_genres = len(unique_genre_titles)
#     k = min(top_n, num_genres)

#     # Get recommendations.
#     _, titles = index(tf.constant([str(user)]), k=k)

#     print(f'Top {top_n} recommended genres for user {user}:\n')
#     for i, title in enumerate(titles[0, :top_n].numpy()):
#         print(f'{i+1}. {title.decode("utf-8")}')


# # Interactive loop to get user input
# while True:
#     user_id = input("Enter user ID (or 'exit' to quit): ")
#     if user_id.lower() == 'exit':
#         break
#     top_n = input("Enter the number of top genres to recommend: ")
#     try:
#         top_n = int(top_n)
#     except ValueError:
#         print("Invalid input for number of genres. Please enter an integer.")
#         continue

#     predict_genres(user_id, top_n)