<a href="https://colab.research.google.com/github/AsmaaBoudjenane/RAG/blob/main/SASREC_Mol_Vs_HybridALgo_100kmovieLEnsDATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


**SASREC**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau  # Removed EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

# Preprocess data
def preprocess_data(ratings_path, movies_path):
    """
    Load and preprocess the ratings and movie data.
    """
    # Load ratings data
    ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv(ratings_path, sep='\t', names=ratings_columns, encoding='latin-1')

    # Define correct column names for the movies dataset
    movie_columns = [
        'movie_id', 'title', 'release_date', 'unknown_col', 'imdb_url',
        'unknown', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5',
        'genre_6', 'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11',
        'genre_12', 'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17', 'genre_18'
    ]
    movies_df = pd.read_csv(movies_path, sep='|', names=movie_columns, encoding='latin-1')

    # Merge ratings and movie data
    merged_df = pd.merge(ratings_df, movies_df, on='movie_id')
    merged_df['rating_norm'] = merged_df['rating'] / merged_df['rating'].max()

    # Group by user and create sequences
    grouped = merged_df[['user_id', 'movie_id', 'rating_norm', 'timestamp']].sort_values(by=['user_id', 'timestamp'])
    user_sequences = []
    for user_id, user_group in grouped.groupby('user_id'):
        seq = list(zip(user_group['movie_id'], user_group['rating_norm']))
        user_sequences.append({'user_id': user_id, 'movie_sequence': seq})

    return pd.DataFrame(user_sequences), movies_df

# Prepare sequences
def prepare_sequences(sequences, max_seq_len, num_items):
    """
    Prepare padded sequences of movies, ratings, position IDs, and labels for training.
    """
    padded_seqs, rating_seqs, pos_ids, labels = [], [], [], []
    for _, row in sequences.iterrows():
        seq = row['movie_sequence']
        if not seq:
            continue
        seq_movies, seq_ratings = zip(*seq)

        # Truncate or pad the sequence to the maximum length
        if len(seq_movies) > max_seq_len:
            seq_movies = seq_movies[-max_seq_len:]
            seq_ratings = seq_ratings[-max_seq_len:]
        else:
            padding = max_seq_len - len(seq_movies)
            seq_movies = [0] * padding + list(seq_movies)
            seq_ratings = [0] * padding + list(seq_ratings)

        # Store the prepared sequence data
        padded_seqs.append(seq_movies[:-1])  # Remove the last item for input
        rating_seqs.append(seq_ratings[:-1])  # Remove the last item for input
        pos_ids.append(list(range(len(seq_movies[:-1]))))  # Position IDs
        labels.append(seq_movies[1:])  # Target labels (next movie to predict)

    return np.array(padded_seqs), np.array(rating_seqs), np.array(pos_ids), np.array(labels)

# SASRec model
class EnhancedSASRec(Model):
    """
    Define the Enhanced SASRec model with multi-head attention and feed-forward layers.
    """
    def __init__(self, num_items, embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate):
        super(EnhancedSASRec, self).__init__()
        self.item_embedding = Embedding(input_dim=num_items, output_dim=embed_dim)
        self.positional_embedding = Embedding(input_dim=max_seq_len, output_dim=embed_dim)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.layer_norm1 = LayerNormalization()
        self.feed_forward = Dense(ff_dim, activation='gelu')
        self.dropout2 = Dropout(dropout_rate)
        self.layer_norm2 = LayerNormalization()
        self.mol_layer = Dense(num_items)

    def call(self, inputs, training=False):
        """
        Implement the forward pass for the Enhanced SASRec model.
        """
        seqs, pos_ids = inputs
        seq_embeds = self.item_embedding(seqs) + self.positional_embedding(pos_ids)
        attention_out = self.attention(seq_embeds, seq_embeds)
        attention_out = self.dropout1(attention_out, training=training)
        attention_out = self.layer_norm1(attention_out + seq_embeds)  # Residual connection
        ff_out = self.feed_forward(attention_out)
        ff_out = self.dropout2(ff_out, training=training)
        output = self.layer_norm2(ff_out + attention_out)  # Residual connection
        logits = self.mol_layer(output)
        return logits

# Extract metadata embeddings
def extract_metadata_embeddings(movies_df, embedding_dim=256):
    """
    Generate metadata embeddings using movie titles and genres.
    """
    # Load SentenceTransformer for title embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    title_embeddings = model.encode(movies_df['title'].tolist())

    # Extract genre embeddings (e.g., one-hot or dense encoding)
    genre_columns = [col for col in movies_df.columns if col not in ['movie_id', 'title', 'release_date']]
    genre_embeddings = movies_df[genre_columns].values

    # Combine title and genre embeddings
    combined_embeddings = np.hstack([title_embeddings, genre_embeddings])

    # Ensure consistent embedding dimension
    combined_embeddings = combined_embeddings[:, :embedding_dim]
    return combined_embeddings

# Train model
def train_sasrec_mol(ratings_path, movies_path):
    """
    Train the SASRec model using the movie ratings and metadata embeddings.
    """
    # Preprocess the data and generate movie sequences
    user_sequences, movies_df = preprocess_data(ratings_path, movies_path)

    # Extract metadata embeddings (titles + genres)
    metadata_embeddings = extract_metadata_embeddings(movies_df)

    # Encode movie IDs into numerical values
    movie_encoder = LabelEncoder()
    all_movies = sorted(set([movie for seq in user_sequences['movie_sequence'] for movie, _ in seq]))
    movie_encoder.fit(all_movies)
    user_sequences['movie_sequence'] = user_sequences['movie_sequence'].apply(
        lambda seq: [(movie_encoder.transform([movie])[0], rating) for movie, rating in seq])

    # Hyperparameters for the model
    max_seq_len = 50  # Updated max_seq_len
    embed_dim = 128   # Updated embed_dim
    num_heads = 2     # Updated num_heads
    ff_dim = 128      # Updated ff_dim
    dropout_rate = 0.3
    batch_size = 128  # Updated batch_size
    epochs = 9        # Updated epochs

    # Prepare sequences for training
    X, _, pos_ids, y = prepare_sequences(user_sequences, max_seq_len, len(all_movies))

    # Split data into training and validation sets
    X_train, X_val, pos_train, pos_val, y_train, y_val = train_test_split(
        X, pos_ids, y, test_size=0.2, random_state=42)

    # Build and compile the Enhanced SASRec model
    model = EnhancedSASRec(len(all_movies), embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate)
    model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Define callbacks for learning rate reduction only
    callbacks = [
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)
    ]

    # Train the model
    model.fit([X_train, pos_train], y_train, validation_data=([X_val, pos_val], y_val),
              batch_size=batch_size, epochs=epochs, callbacks=callbacks)

    return model, metadata_embeddings

# Define paths to data files
ratings_path = '/content/drive/My Drive/u.data'
movies_path = '/content/drive/My Drive/u.item'

# Train the model
model, metadata_embeddings = train_sasrec_mol(ratings_path, movies_path)


Epoch 1/9
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step - accuracy: 0.0428 - loss: 13.8812 - val_accuracy: 0.1462 - val_loss: 12.5816 - learning_rate: 0.0010
Epoch 2/9
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.1563 - loss: 11.8070 - val_accuracy: 0.1471 - val_loss: 11.9220 - learning_rate: 0.0010
Epoch 3/9
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1541 - loss: 10.9222 - val_accuracy: 0.1476 - val_loss: 11.4527 - learning_rate: 0.0010
Epoch 4/9
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1496 - loss: 10.3403 - val_accuracy: 0.1488 - val_loss: 11.1435 - learning_rate: 0.0010
Epoch 5/9
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1575 - loss: 9.9164 - val_accuracy: 0.1498 - val_loss: 10.8334 - learning_rate: 0.0010
Epoch 6/9
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step 

**SASREC+HybridALgorithm**
 max_seq_len=128, embed_dim=256, num_heads=4, ff_dim=256,
 batch_size=128, epochs=10

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

# Preprocess data
def preprocess_data(ratings_path, movies_path):
    ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv(ratings_path, sep='\t', names=ratings_columns, encoding='latin-1')

    movie_columns = [
        'movie_id', 'title', 'release_date', 'unknown_col', 'imdb_url',
        'unknown', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5',
        'genre_6', 'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11',
        'genre_12', 'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17', 'genre_18'
    ]
    movies_df = pd.read_csv(movies_path, sep='|', names=movie_columns, encoding='latin-1')

    merged_df = pd.merge(ratings_df, movies_df, on='movie_id')
    merged_df['rating_norm'] = merged_df['rating'] / merged_df['rating'].max()

    grouped = merged_df[['user_id', 'movie_id', 'rating_norm', 'timestamp']].sort_values(by=['user_id', 'timestamp'])
    user_sequences = []
    for user_id, user_group in grouped.groupby('user_id'):
        seq = list(zip(user_group['movie_id'], user_group['rating_norm']))
        user_sequences.append({'user_id': user_id, 'movie_sequence': seq})

    return pd.DataFrame(user_sequences), movies_df

# Prepare sequences
def prepare_sequences(sequences, max_seq_len):
    padded_seqs, rating_seqs, pos_ids, labels = [], [], [], []
    for _, row in sequences.iterrows():
        seq = row['movie_sequence']
        if not seq:
            continue
        seq_movies, seq_ratings = zip(*seq)

        if len(seq_movies) > max_seq_len:
            seq_movies = seq_movies[-max_seq_len:]
            seq_ratings = seq_ratings[-max_seq_len:]
        else:
            padding = max_seq_len - len(seq_movies)
            seq_movies = [0] * padding + list(seq_movies)
            seq_ratings = [0] * padding + list(seq_ratings)

        padded_seqs.append(seq_movies[:-1])
        rating_seqs.append(seq_ratings[:-1])
        pos_ids.append(list(range(len(seq_movies[:-1]))))
        labels.append(seq_movies[1:])

    return np.array(padded_seqs), np.array(rating_seqs), np.array(pos_ids), np.array(labels)

# Extract metadata embeddings
def extract_metadata_embeddings(movies_df, embedding_dim=256):
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
    title_embeddings = model.encode(movies_df['title'].tolist())

    genre_columns = [col for col in movies_df.columns if col not in ['movie_id', 'title', 'release_date']]
    genre_embeddings = movies_df[genre_columns].values

    combined_embeddings = np.hstack([title_embeddings, genre_embeddings])
    combined_embeddings = combined_embeddings[:, :embedding_dim]
    return combined_embeddings

class EnhancedSASRecWithHybrid(Model):
    def __init__(self, num_items, embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate):
        super(EnhancedSASRecWithHybrid, self).__init__()
        self.item_embedding = Embedding(input_dim=num_items, output_dim=embed_dim)
        self.positional_embedding = Embedding(input_dim=max_seq_len, output_dim=embed_dim)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.layer_norm1 = LayerNormalization()
        self.feed_forward = Dense(ff_dim, activation='gelu')
        self.dropout2 = Dropout(dropout_rate)
        self.layer_norm2 = LayerNormalization()
        self.output_layer = Dense(num_items)

    def call(self, inputs, training=False):
        seqs, pos_ids = inputs
        seq_embeds = self.item_embedding(seqs) + self.positional_embedding(pos_ids)
        attention_out = self.attention(seq_embeds, seq_embeds)
        attention_out = self.dropout1(attention_out, training=training)
        attention_out = self.layer_norm1(attention_out + seq_embeds)
        ff_out = self.feed_forward(attention_out)
        ff_out = self.dropout2(ff_out, training=training)
        output = self.layer_norm2(ff_out + attention_out)
        logits = self.output_layer(output)
        return logits

    def compute_mol_scores(self, query_embed, candidate_embeds, component_weights):
        mol_scores = np.sum(
            [w * np.dot(query_embed, candidate_embeds[p].T) for p, w in enumerate(component_weights) if p < len(candidate_embeds)],
            axis=0
        )
        return mol_scores

@tf.function
def train_on_batch(model, optimizer, batch_X, batch_pos, batch_y):
    with tf.GradientTape() as tape:
        logits = model([batch_X, batch_pos], training=True)
        loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(batch_y, logits, from_logits=True))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Train function with validation accuracy
def train_hybrid_sasrec(ratings_path, movies_path, T_init, max_seq_len=128, embed_dim=256, num_heads=4,
                        ff_dim=256, dropout_rate=0.3, batch_size=128, epochs=10):
    user_sequences, movies_df = preprocess_data(ratings_path, movies_path)

    # Extract metadata embeddings
    metadata_embeddings = extract_metadata_embeddings(movies_df, embedding_dim=embed_dim)

    # Encode movie IDs
    movie_encoder = LabelEncoder()
    all_movies = sorted(set([movie for seq in user_sequences['movie_sequence'] for movie, _ in seq]))
    movie_encoder.fit(all_movies)
    user_sequences['movie_sequence'] = user_sequences['movie_sequence'].apply(
        lambda seq: [(movie_encoder.transform([movie])[0], rating) for movie, rating in seq]
    )

    # Prepare sequences for training
    X, _, pos_ids, y = prepare_sequences(user_sequences, max_seq_len)

    # Split data
    X_train, X_val, pos_train, pos_val, y_train, y_val = train_test_split(
        X, pos_ids, y, test_size=0.2, random_state=42)

    # Initialize the model
    model = EnhancedSASRecWithHybrid(len(all_movies), embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate)
    optimizer = Adam(0.001)

    for epoch in range(epochs):
        for step in range(0, len(X_train), batch_size):
            batch_X = X_train[step:step + batch_size]
            batch_pos = pos_train[step:step + batch_size]
            batch_y = y_train[step:step + batch_size]

            # Initial candidate retrieval
            query_embeddings = model.item_embedding(batch_X)
            candidate_embeddings = metadata_embeddings[batch_X.flatten()]

            # Compute MOL scores
            try:
                mol_scores = model.compute_mol_scores(query_embeddings, candidate_embeddings, np.ones(len(all_movies)))
                selected_candidates = np.argwhere(mol_scores >= T_init).flatten()
                selected_candidates = selected_candidates[selected_candidates < len(mol_scores)]

                if len(selected_candidates) > 0:
                    T_adaptive = np.min(mol_scores[selected_candidates])
                else:
                    T_adaptive = T_init

                refined_candidates = [x for x in selected_candidates if (mol_scores[x] >= T_adaptive).any()]

                # Train on the refined set
                train_on_batch(model, optimizer, batch_X, batch_pos, batch_y)
            except Exception as e:
                print(f"Error in MOL score computation: {e}")

        # Evaluate on validation data
        val_logits = model([X_val, pos_val], training=False)
        val_loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(y_val, val_logits, from_logits=True)
        )

        # Get the predicted indices (highest probability)
        val_predictions = tf.argmax(val_logits, axis=-1)

        # Calculate accuracy: check how many predictions match the true labels
        accuracy = tf.reduce_mean(tf.cast(tf.equal(val_predictions, y_val), tf.float32))

        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss.numpy()}, Validation Accuracy: {accuracy.numpy()}")

    return model, metadata_embeddings

# Define paths to data files
ratings_path = '/content/drive/My Drive/u.data'
movies_path = '/content/drive/My Drive/u.item'

# Train the hybrid SASRec model
T_init = 0.3  # Initial threshold
model, metadata_embeddings = train_hybrid_sasrec(ratings_path, movies_path, T_init)


Epoch 1/10, Validation Loss: 4.293770790100098, Validation Accuracy: 0.42340540885925293
Epoch 2/10, Validation Loss: 4.045614719390869, Validation Accuracy: 0.4235303997993469
Epoch 3/10, Validation Loss: 3.9535160064697266, Validation Accuracy: 0.4241553246974945
Epoch 4/10, Validation Loss: 3.9079554080963135, Validation Accuracy: 0.4255717992782593
Epoch 5/10, Validation Loss: 3.863077402114868, Validation Accuracy: 0.42648836970329285
Epoch 6/10, Validation Loss: 3.8215692043304443, Validation Accuracy: 0.42723825573921204
Epoch 7/10, Validation Loss: 3.7539122104644775, Validation Accuracy: 0.4280298352241516
Epoch 8/10, Validation Loss: 3.701047897338867, Validation Accuracy: 0.4288630485534668
Epoch 9/10, Validation Loss: 3.660278797149658, Validation Accuracy: 0.4294879734516144
Epoch 10/10, Validation Loss: 3.636460781097412, Validation Accuracy: 0.43015456199645996


**SASREC+MOL**

 max_seq_len=128, embed_dim=256, num_heads=4:,ff_dim=256,batch_size=128, epochs=10

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

# Preprocess data
def preprocess_data(ratings_path, movies_path):
    ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv(ratings_path, sep='\t', names=ratings_columns, encoding='latin-1')

    movie_columns = [
        'movie_id', 'title', 'release_date', 'unknown_col', 'imdb_url',
        'unknown', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5',
        'genre_6', 'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11',
        'genre_12', 'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17', 'genre_18'
    ]
    movies_df = pd.read_csv(movies_path, sep='|', names=movie_columns, encoding='latin-1')

    merged_df = pd.merge(ratings_df, movies_df, on='movie_id')
    merged_df['rating_norm'] = merged_df['rating'] / merged_df['rating'].max()

    grouped = merged_df[['user_id', 'movie_id', 'rating_norm', 'timestamp']].sort_values(by=['user_id', 'timestamp'])
    user_sequences = []
    for user_id, user_group in grouped.groupby('user_id'):
        seq = list(zip(user_group['movie_id'], user_group['rating_norm']))
        user_sequences.append({'user_id': user_id, 'movie_sequence': seq})

    return pd.DataFrame(user_sequences), movies_df

# Prepare sequences
def prepare_sequences(sequences, max_seq_len):
    padded_seqs, rating_seqs, pos_ids, labels = [], [], [], []
    for _, row in sequences.iterrows():
        seq = row['movie_sequence']
        if not seq:
            continue
        seq_movies, seq_ratings = zip(*seq)

        if len(seq_movies) > max_seq_len:
            seq_movies = seq_movies[-max_seq_len:]
            seq_ratings = seq_ratings[-max_seq_len:]
        else:
            padding = max_seq_len - len(seq_movies)
            seq_movies = [0] * padding + list(seq_movies)
            seq_ratings = [0] * padding + list(seq_ratings)

        padded_seqs.append(seq_movies[:-1])
        rating_seqs.append(seq_ratings[:-1])
        pos_ids.append(list(range(len(seq_movies[:-1]))))
        labels.append(seq_movies[1:])

    return np.array(padded_seqs), np.array(rating_seqs), np.array(pos_ids), np.array(labels)

# Extract metadata embeddings
def extract_metadata_embeddings(movies_df, embedding_dim=128):
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
    title_embeddings = model.encode(movies_df['title'].tolist())

    genre_columns = [col for col in movies_df.columns if col not in ['movie_id', 'title', 'release_date']]
    genre_embeddings = movies_df[genre_columns].values

    combined_embeddings = np.hstack([title_embeddings, genre_embeddings])
    combined_embeddings = combined_embeddings[:, :embedding_dim]
    return combined_embeddings

class MOL_SASRec(Model):
    def __init__(self, num_items, embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate):
        super(MOL_SASRec, self).__init__()
        self.item_embedding = Embedding(input_dim=num_items, output_dim=embed_dim)
        self.positional_embedding = Embedding(input_dim=max_seq_len, output_dim=embed_dim)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.layer_norm1 = LayerNormalization()
        self.feed_forward = Dense(ff_dim, activation='gelu')
        self.dropout2 = Dropout(dropout_rate)
        self.layer_norm2 = LayerNormalization()
        self.output_layer = Dense(num_items)

    def call(self, inputs, training=False):
        seqs, pos_ids = inputs
        seq_embeds = self.item_embedding(seqs) + self.positional_embedding(pos_ids)
        attention_out = self.attention(seq_embeds, seq_embeds)
        attention_out = self.dropout1(attention_out, training=training)
        attention_out = self.layer_norm1(attention_out + seq_embeds)
        ff_out = self.feed_forward(attention_out)
        ff_out = self.dropout2(ff_out, training=training)
        output = self.layer_norm2(ff_out + attention_out)
        logits = self.output_layer(output)
        return logits

    def compute_mol_scores(self, query_embed, candidate_embeds, component_weights):
        if len(component_weights) > len(candidate_embeds):
            component_weights = component_weights[:len(candidate_embeds)]
        mol_scores = np.sum(
            [w * np.dot(query_embed, candidate_embeds[p].T) for p, w in enumerate(component_weights) if p < len(candidate_embeds)],
            axis=0
        )
        return mol_scores
@tf.function
def train_on_batch(model, optimizer, batch_X, batch_pos, batch_y):
    with tf.GradientTape() as tape:
        logits = model([batch_X, batch_pos], training=True)
        loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(batch_y, logits, from_logits=True))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Train function
def train_mol_sasrec(ratings_path, movies_path,  max_seq_len=128, embed_dim=256, num_heads=4,ff_dim=256, dropout_rate=0.3, batch_size=128, epochs=10):
    user_sequences, movies_df = preprocess_data(ratings_path, movies_path)
    metadata_embeddings = extract_metadata_embeddings(movies_df, embedding_dim=embed_dim)

    movie_encoder = LabelEncoder()
    all_movies = sorted(set([movie for seq in user_sequences['movie_sequence'] for movie, _ in seq]))
    movie_encoder.fit(all_movies)
    user_sequences['movie_sequence'] = user_sequences['movie_sequence'].apply(
        lambda seq: [(movie_encoder.transform([movie])[0], rating) for movie, rating in seq]
    )

    X, _, pos_ids, y = prepare_sequences(user_sequences, max_seq_len)

    X_train, X_val, pos_train, pos_val, y_train, y_val = train_test_split(
        X, pos_ids, y, test_size=0.2, random_state=42)

    model = MOL_SASRec(len(all_movies), embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate)
    optimizer = Adam(0.001)

    for epoch in range(epochs):
        # Training phase
        for step in range(0, len(X_train), batch_size):
            batch_X = X_train[step:step + batch_size]
            batch_pos = pos_train[step:step + batch_size]
            batch_y = y_train[step:step + batch_size]

            loss = train_on_batch(model, optimizer, batch_X, batch_pos, batch_y)

        # Validation phase
        val_logits = model([X_val, pos_val], training=False)
        val_loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(y_val, val_logits, from_logits=True)
        )

        # Calculate validation accuracy
        val_preds = tf.argmax(val_logits, axis=-1)  # Get the index of the highest logit
        val_accuracy = tf.reduce_mean(tf.cast(tf.equal(val_preds, y_val), tf.float32))

        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss.numpy()}, Validation Accuracy: {val_accuracy.numpy()}")

    return model, metadata_embeddings

# Define paths to data files
ratings_path = '/content/drive/My Drive/u.data'
movies_path = '/content/drive/My Drive/u.item'

# Train the MOL_SASRec model and calculate accuracy
model, metadata_embeddings = train_mol_sasrec(ratings_path, movies_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch 1/10, Validation Loss: 4.326416015625, Validation Accuracy: 0.42340540885925293
Epoch 2/10, Validation Loss: 4.0661540031433105, Validation Accuracy: 0.42332208156585693
Epoch 3/10, Validation Loss: 3.9613349437713623, Validation Accuracy: 0.42332208156585693
Epoch 4/10, Validation Loss: 3.9056999683380127, Validation Accuracy: 0.4246552586555481
Epoch 5/10, Validation Loss: 3.8694026470184326, Validation Accuracy: 0.4254884719848633
Epoch 6/10, Validation Loss: 3.8316071033477783, Validation Accuracy: 0.42682164907455444
Epoch 7/10, Validation Loss: 3.763185501098633, Validation Accuracy: 0.4278215169906616
Epoch 8/10, Validation Loss: 3.7065272331237793, Validation Accuracy: 0.4293629825115204
Epoch 9/10, Validation Loss: 3.6619632244110107, Validation Accuracy: 0.42973795533180237
Epoch 10/10, Validation Loss: 3.637404680252075, Validation Accuracy: 0.42990460991859436
