# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from collections import defaultdict

# RecSys

### Data Prep

In [4]:
df = pd.read_csv('./dataset/table_interactions.csv')

In [5]:
df

Unnamed: 0,userID,courseID,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
962100,6040,1091,1
962101,6040,1094,5
962102,6040,562,5
962103,6040,1096,4


In [6]:
X = df[['userID', 'courseID']]
y = df[['rating']]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    ((X_train['userID'].values, X_train['courseID'].values), y_train.values)
)
valid_dataset = tf.data.Dataset.from_tensor_slices(
    ((X_valid['userID'].values, X_valid['courseID'].values), y_valid.values)
)

batch_size = 2048

dl_train = train_dataset.shuffle(buffer_size=len(X_train)).batch(batch_size)
dl_valid = valid_dataset.batch(batch_size)

### Model Class

In [9]:
@tf.keras.utils.register_keras_serializable(package='Custom')
class MF(tf.keras.Model):
    def __init__(self, num_users, num_items, emb_dim, init=True, bias=True, sigmoid=True, **kwargs):
        super(MF, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_items = num_items
        self.emb_dim = emb_dim
        self.init = init
        self.bias = bias
        self.sigmoid = sigmoid

        # Embedding layers
        self.user_emb = tf.keras.layers.Embedding(num_users, emb_dim)
        self.item_emb = tf.keras.layers.Embedding(num_items, emb_dim)
        
        if init:
            self.user_emb.embeddings_initializer = tf.keras.initializers.RandomUniform(0., 0.05)
            self.item_emb.embeddings_initializer = tf.keras.initializers.RandomUniform(0., 0.05)
        
        if bias:
            self.user_bias = self.add_weight(name="user_bias", shape=(num_users,), initializer="zeros", trainable=True)
            self.item_bias = self.add_weight(name="item_bias", shape=(num_items,), initializer="zeros", trainable=True)
            self.offset = self.add_weight(name="offset", shape=(), initializer="zeros", trainable=True)
    
    def call(self, inputs):
        user, item = inputs

        # Look up embeddings
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)

        # Compute dot product
        element_product = tf.reduce_sum(user_emb * item_emb, axis=1)

        if self.bias:
            # Add biases
            user_b = tf.gather(self.user_bias, user)
            item_b = tf.gather(self.item_bias, item)
            element_product += user_b + item_b + self.offset

        if self.sigmoid:
            return self.sigmoid_range(element_product, low=0, high=5.5)

        return element_product

    def predict(self, user_id, course_id, k=10):
        tensor_user = tf.convert_to_tensor([user_id] * len(course_id), dtype=tf.int32)
        tensor_course = tf.convert_to_tensor(course_id, dtype=tf.int32)
    
        pred = self.call((tensor_user, tensor_course))
        rank = tf.argsort(pred, direction='DESCENDING')[:k].numpy().flatten()
        rec_id = tf.gather(tensor_course, rank)
    
        return rec_id.numpy().tolist()
    
    @staticmethod
    def sigmoid_range(x, low=0, high=5.5):
        return tf.sigmoid(x) * (high - low) + low

    def get_config(self):
        config = super(MF, self).get_config()
        config.update({
            "num_users": self.num_users,
            "num_items": self.num_items,
            "emb_dim": self.emb_dim,
            "init": self.init,
            "bias": self.bias,
            "sigmoid": self.sigmoid,
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [10]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True
)

### Init Model

In [12]:
n_users = len(df['userID'].unique()) + 1
n_items = 3712 + 1 # len(df['courseID'].unique()) + 1

model = MF(n_users, n_items, emb_dim=64,
           init=False,
           bias=True,
           sigmoid=True)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=['mean_absolute_error']
)

### Training

In [253]:
model.fit(
    dl_train,
    epochs=100,
    verbose=1,
    validation_data=dl_valid,
    shuffle=True,
    initial_epoch=0,
    callbacks=[early_stopping]
)

Epoch 1/100
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 24ms/step - loss: 1.5584 - mean_absolute_error: 1.0395 - val_loss: 0.9307 - val_mean_absolute_error: 0.7815
Epoch 2/100
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - loss: 0.8645 - mean_absolute_error: 0.7466 - val_loss: 0.7996 - val_mean_absolute_error: 0.7091
Epoch 3/100
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - loss: 0.7446 - mean_absolute_error: 0.6838 - val_loss: 0.7633 - val_mean_absolute_error: 0.6899
Epoch 4/100
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - loss: 0.6763 - mean_absolute_error: 0.6499 - val_loss: 0.7430 - val_mean_absolute_error: 0.6795
Epoch 5/100
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - loss: 0.6135 - mean_absolute_error: 0.6184 - val_loss: 0.7334 - val_mean_absolute_error: 0.6740
Epoch 6/100
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x17fae0ff710>

In [255]:
model.summary()

model.save('./model/MF_model.keras')

In [13]:
model = tf.keras.models.load_model('./model/MF_model.keras', custom_objects={'MF': MF})




  instance.build_from_config(build_config)


# Vector Search

In [14]:
from sentence_transformers import SentenceTransformer
# import tensorflow_hub as hub




In [15]:
encoder = SentenceTransformer('all-MiniLM-L6-v2')
# encoder = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

### Vector DB

In [27]:
db = pd.read_csv('./dataset/table_courses_info.csv')
desc_embedding = encoder.encode(db['Description'])

In [None]:
def vector_search(encoder, skillset, k=10, threshold=None):
    tensor_skillset = encoder(skillset)
    tensor_course = desc_embedding
    tensor_skillset = tf.nn.l2_normalize(tensor_skillset, axis=-1)
    tensor_course = tf.nn.l2_normalize(tensor_course, axis=-1)

    cos_sim = tf.squeeze(tf.matmul(tensor_course, tensor_skillset, transpose_b=True))

    if threshold is not None:
        indices = tf.where(cos_sim >= threshold).numpy().flatten()
        top_idx = indices[tf.argsort(tf.gather(cos_sim, indices), direction='DESCENDING').numpy()]
    else:
        top_idx = tf.argsort(cos_sim, axis=0, direction='DESCENDING')[:k].numpy().flatten()

    rec_id = db.iloc[top_idx]['ID'].tolist()
    rec_name = db.iloc[top_idx]['Title'].tolist()

    return rec_id, rec_name

# End-to-End

In [None]:
def recommender(user_id, skillset, encoder, model, n=50, k=10):
    if user_id > model.user_emb.input_dim - 1:
        user = 0
    else:
        user = user_id

    course_ids, course_names = vector_search(encoder, skillset, k=n)
    course = course_ids

    pred = model.predict(user, course, k=k)
    rec = db[db['ID'].isin(pred)]

    return rec['ID'].tolist()

In [70]:
iUser = 1
iSkill = ['Math, Machine Learning, Computer Science', 'Python']
recommender(iUser, iSkill, encoder, model)

[364, 531, 1210, 1487, 1544, 1876, 1958, 2406, 2539]

def new_user_update(new_user_id, preferences, encoder, threshold=0.5):
    interactions = df.pivot_table(
        index='user_id',
        columns='course_id', 
        values='rating', 
        fill_value=0
    )
    course_ids, _ = vector_search(encoder, preferences, threshold=threshold)
    
    rating_new_user = np.zeros(len(df['course_encode'].unique()))
    indices = interactions.columns.get_indexer(course_ids)
    rating_new_user[indices] = 4

    interactions.loc[new_user_id] = rating_new_user
    new_df = interactions.reset_index().melt(
        id_vars='user_id',
        var_name='course_id',
        value_name='rating'
    )
    new_df = new_df[new_df['rating'] != 0].reset_index(drop=True)

    inv_user_map = df.groupby('user_id')['user_encode'].first().reset_index().set_index('user_encode').to_dict()['user_id']
    user_map = {v: k for k, v in inv_user_map.items()}
    user_map[new_user_id] = max(user_map.values()) + 1
    
    inv_course_map = df.groupby('course_id')['course_encode'].first().reset_index().set_index('course_encode').to_dict()['course_id']
    course_map = {v: k for k, v in inv_course_map.items()}
    course_map[new_user_id] = max(course_map.values()) + 1

    new_df['user_encode'] = new_df['user_id'].map(user_map)
    new_df['course_encode'] = new_df['course_id'].map(course_map)
    
    return new_df

def recommender(input_user, input_skillset, encoder, model, n=50, k=10):
    user_encode = df[df['user_id'] == input_user]['user_encode'].values[0]
    if user_encode > model.user_emb.input_dim - 1:
        interactions = df.pivot_table(
            index='user_id', 
            columns='course_id', 
            values='rating', 
            fill_value=0
        )
        new_interaction = interactions.loc[input_user]
        exist_interaction = interactions.drop(input_user)
        
        similarity = np.matmul(exist_interaction.values, new_interaction.values)
        position = tf.argsort(similarity, direction='DESCENDING').numpy()[0]
        user_sim = exist_interaction.index[position]
        user = df[df['user_id'] == user_sim]['user_encode'].unique().item()
    else:
        user = df[df['user_id'] == input_user]['user_encode'].unique().item()

    course_ids, course_names = vector_search(encoder, input_skillset, k=n)
    course = df[df['course_id'].isin(course_ids)]['course_encode'].unique().tolist()

    pred = model.predict(user, course, k=k)
    pred_id = df[df['course_encode'].isin(pred)]['course_id'].unique().tolist()

    rec = db[db['course_id'].isin(pred_id)][['course_id', 'Course Name', 'Course URL']]

    response = {
        idx: {
            'course_id': row['course_id'],
            'course_name': row['Course Name'],
            'course_url': row['Course URL'],
        }
        for idx, row in rec.iterrows()
    }
    return response

# --