<a href="https://colab.research.google.com/github/AsmaaBoudjenane/RAG/blob/main/SAS_mol_VS_Hybrid__1Mmovielens_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


Sas+hybrid
       max_seq_len=50,
        embed_dim=128,
        num_heads=2,
        ff_dim=128,
        dropout_rate=0.3,
        batch_size=128,
        epochs=10
        1 M dataset

        

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

# Preprocess data
def preprocess_data(ratings_path, movies_path, users_path):
    # Read ratings data
    ratings_df = pd.read_csv(ratings_path, sep='::',
                            names=['user_id', 'movie_id', 'rating', 'timestamp'],
                            encoding='latin1', engine='python')

    # Read movies data
    movies_df = pd.read_csv(movies_path, sep='::',
                           names=['movie_id', 'title', 'genres'],
                           encoding='latin1', engine='python')

    # Read users data
    users_df = pd.read_csv(users_path, sep='::',
                          names=['user_id', 'gender', 'age', 'occupation', 'zipcode'],
                          encoding='latin1', engine='python')

    # Process genres
    genres_list = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy',
                   'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                   'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                   'Thriller', 'War', 'Western']

    # Create genre columns
    for genre in genres_list:
        movies_df[genre] = movies_df['genres'].str.contains(genre).astype(int)

    # Merge ratings with movies
    merged_df = pd.merge(ratings_df, movies_df, on='movie_id')

    # Normalize ratings
    merged_df['rating_norm'] = merged_df['rating'] / merged_df['rating'].max()

    # Create user sequences
    grouped = merged_df[['user_id', 'movie_id', 'rating_norm', 'timestamp']].sort_values(by=['user_id', 'timestamp'])
    user_sequences = []
    for user_id, user_group in grouped.groupby('user_id'):
        seq = list(zip(user_group['movie_id'], user_group['rating_norm']))
        user_sequences.append({'user_id': user_id, 'movie_sequence': seq})

    return pd.DataFrame(user_sequences), movies_df

# Prepare sequences function remains the same
def prepare_sequences(sequences, max_seq_len):
    padded_seqs, rating_seqs, pos_ids, labels = [], [], [], []
    for _, row in sequences.iterrows():
        seq = row['movie_sequence']
        if not seq:
            continue
        seq_movies, seq_ratings = zip(*seq)

        if len(seq_movies) > max_seq_len:
            seq_movies = seq_movies[-max_seq_len:]
            seq_ratings = seq_ratings[-max_seq_len:]
        else:
            padding = max_seq_len - len(seq_movies)
            seq_movies = [0] * padding + list(seq_movies)
            seq_ratings = [0] * padding + list(seq_ratings)

        padded_seqs.append(seq_movies[:-1])
        rating_seqs.append(seq_ratings[:-1])
        pos_ids.append(list(range(len(seq_movies[:-1]))))
        labels.append(seq_movies[1:])

    return np.array(padded_seqs), np.array(rating_seqs), np.array(pos_ids), np.array(labels)

def extract_metadata_embeddings(movies_df, embedding_dim=256):
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
    title_embeddings = model.encode(movies_df['title'].tolist())

    # Extract genre features
    genre_columns = [col for col in movies_df.columns if col not in ['movie_id', 'title', 'genres']]
    genre_embeddings = movies_df[genre_columns].values

    # Combine embeddings
    combined_embeddings = np.hstack([title_embeddings, genre_embeddings])
    combined_embeddings = combined_embeddings[:, :embedding_dim]
    return combined_embeddings

# EnhancedSASRecWithHybrid class remains the same
class EnhancedSASRecWithHybrid(Model):
    def __init__(self, num_items, embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate):
        super(EnhancedSASRecWithHybrid, self).__init__()
        self.item_embedding = Embedding(input_dim=num_items, output_dim=embed_dim)
        self.positional_embedding = Embedding(input_dim=max_seq_len, output_dim=embed_dim)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.layer_norm1 = LayerNormalization()
        self.feed_forward = Dense(ff_dim, activation='gelu')
        self.dropout2 = Dropout(dropout_rate)
        self.layer_norm2 = LayerNormalization()
        self.output_layer = Dense(num_items)

    def call(self, inputs, training=False):
        seqs, pos_ids = inputs
        seq_embeds = self.item_embedding(seqs) + self.positional_embedding(pos_ids)
        attention_out = self.attention(seq_embeds, seq_embeds)
        attention_out = self.dropout1(attention_out, training=training)
        attention_out = self.layer_norm1(attention_out + seq_embeds)
        ff_out = self.feed_forward(attention_out)
        ff_out = self.dropout2(ff_out, training=training)
        output = self.layer_norm2(ff_out + attention_out)
        logits = self.output_layer(output)
        return logits

    def compute_mol_scores(self, query_embed, candidate_embeds, component_weights):
        mol_scores = np.sum(
            [w * np.dot(query_embed, candidate_embeds[p].T) for p, w in enumerate(component_weights) if p < len(candidate_embeds)],
            axis=0
        )
        return mol_scores

@tf.function
def train_on_batch(model, optimizer, batch_X, batch_pos, batch_y):
    with tf.GradientTape() as tape:
        logits = model([batch_X, batch_pos], training=True)
        loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(batch_y, logits, from_logits=True))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_hybrid_sasrec(ratings_path, movies_path, users_path, T_init,
                       max_seq_len=50, embed_dim=128, num_heads=2,
                       ff_dim=128, dropout_rate=0.3, batch_size=128, epochs=10):

    # Preprocess data with the new format
    user_sequences, movies_df = preprocess_data(ratings_path, movies_path, users_path)

    # Extract metadata embeddings
    metadata_embeddings = extract_metadata_embeddings(movies_df, embedding_dim=embed_dim)

    # Encode movie IDs
    movie_encoder = LabelEncoder()
    all_movies = sorted(set([movie for seq in user_sequences['movie_sequence'] for movie, _ in seq]))
    movie_encoder.fit(all_movies)
    user_sequences['movie_sequence'] = user_sequences['movie_sequence'].apply(
        lambda seq: [(movie_encoder.transform([movie])[0], rating) for movie, rating in seq]
    )

    # Prepare sequences for training
    X, _, pos_ids, y = prepare_sequences(user_sequences, max_seq_len)

    # Split data
    X_train, X_val, pos_train, pos_val, y_train, y_val = train_test_split(
        X, pos_ids, y, test_size=0.2, random_state=42)

    # Initialize model with increased capacity for larger dataset
    model = EnhancedSASRecWithHybrid(
        len(all_movies), embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate)
    optimizer = Adam(learning_rate=0.001)

    # Training loop
    print("Starting training...")
    for epoch in range(epochs):
        epoch_loss = []

        for step in range(0, len(X_train), batch_size):
            batch_X = X_train[step:step + batch_size]
            batch_pos = pos_train[step:step + batch_size]
            batch_y = y_train[step:step + batch_size]

            # Initial candidate retrieval
            query_embeddings = model.item_embedding(batch_X)
            candidate_embeddings = metadata_embeddings[batch_X.flatten()]

            try:
                # Compute MOL scores
                mol_scores = model.compute_mol_scores(
                    query_embeddings, candidate_embeddings, np.ones(len(all_movies)))
                selected_candidates = np.argwhere(mol_scores >= T_init).flatten()
                selected_candidates = selected_candidates[selected_candidates < len(mol_scores)]

                if len(selected_candidates) > 0:
                    T_adaptive = np.min(mol_scores[selected_candidates])
                else:
                    T_adaptive = T_init

                refined_candidates = [x for x in selected_candidates if (mol_scores[x] >= T_adaptive).any()]

                # Train on refined set
                loss = train_on_batch(model, optimizer, batch_X, batch_pos, batch_y)
                epoch_loss.append(loss.numpy())

            except Exception as e:
                print(f"Error in batch processing: {e}")
                continue

        # Validation
        val_logits = model([X_val, pos_val], training=False)
        val_loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(y_val, val_logits, from_logits=True))
        val_predictions = tf.argmax(val_logits, axis=-1)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(val_predictions, y_val), tf.float32))

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training Loss: {np.mean(epoch_loss):.4f}")
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation Accuracy: {accuracy:.4f}")
        print("-" * 50)

    return model, metadata_embeddings, movie_encoder

# Example usage
if __name__ == "__main__":
    # Define paths to MovieLens 1M dataset files
    ratings_path = "/content/drive/My Drive/ratings.dat"
    movies_path = "/content/drive/My Drive/movies.dat"
    users_path = "/content/drive/My Drive/users.dat"

    # Initial threshold for MOL scores
    T_init = 0.3

    # Train the model
    model, metadata_embeddings, movie_encoder = train_hybrid_sasrec(
        ratings_path, movies_path, users_path, T_init,
        max_seq_len=50,
        embed_dim=128,
        num_heads=2,
        ff_dim=128,
        dropout_rate=0.3,
        batch_size=128,
        epochs=10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Starting training...
Epoch 1/10
Training Loss: 7.3115
Validation Loss: 6.8123
Validation Accuracy: 0.1026
--------------------------------------------------
Epoch 2/10
Training Loss: 6.6661
Validation Loss: 6.5182
Validation Accuracy: 0.1100
--------------------------------------------------
Epoch 3/10
Training Loss: 6.2192
Validation Loss: 6.0720
Validation Accuracy: 0.1297
--------------------------------------------------
Epoch 4/10
Training Loss: 5.7461
Validation Loss: 5.7726
Validation Accuracy: 0.1368
--------------------------------------------------
Epoch 5/10
Training Loss: 5.4244
Validation Loss: 5.6091
Validation Accuracy: 0.1401
--------------------------------------------------
Epoch 6/10
Training Loss: 5.2000
Validation Loss: 5.5175
Validation Accuracy: 0.1421
--------------------------------------------------
Epoch 7/10
Training Loss: 5.0278
Validation Loss: 5.4667
Validation Accuracy: 0.1435
--------------------------------------------------
Epoch 8/10
Training Loss: 4

SAS+mol

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

def preprocess_data(ratings_path, movies_path, users_path):
    # Read ratings data
    ratings_df = pd.read_csv(ratings_path, sep='::',
                            names=['user_id', 'movie_id', 'rating', 'timestamp'],
                            encoding='latin1', engine='python')

    # Read movies data
    movies_df = pd.read_csv(movies_path, sep='::',
                           names=['movie_id', 'title', 'genres'],
                           encoding='latin1', engine='python')

    # Process genres
    genres_list = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy',
                   'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                   'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                   'Thriller', 'War', 'Western']

    # Create genre columns
    for genre in genres_list:
        movies_df[genre] = movies_df['genres'].str.contains(genre).astype(int)

    # Merge ratings with movies
    merged_df = pd.merge(ratings_df, movies_df, on='movie_id')

    # Normalize ratings
    merged_df['rating_norm'] = merged_df['rating'] / merged_df['rating'].max()

    # Create user sequences
    grouped = merged_df[['user_id', 'movie_id', 'rating_norm', 'timestamp']].sort_values(by=['user_id', 'timestamp'])
    user_sequences = []
    for user_id, user_group in grouped.groupby('user_id'):
        seq = list(zip(user_group['movie_id'], user_group['rating_norm']))
        user_sequences.append({'user_id': user_id, 'movie_sequence': seq})

    return pd.DataFrame(user_sequences), movies_df

def prepare_sequences(sequences, max_seq_len):
    padded_seqs, rating_seqs, pos_ids, labels = [], [], [], []
    for _, row in sequences.iterrows():
        seq = row['movie_sequence']
        if not seq:
            continue
        seq_movies, seq_ratings = zip(*seq)

        if len(seq_movies) > max_seq_len:
            seq_movies = seq_movies[-max_seq_len:]
            seq_ratings = seq_ratings[-max_seq_len:]
        else:
            padding = max_seq_len - len(seq_movies)
            seq_movies = [0] * padding + list(seq_movies)
            seq_ratings = [0] * padding + list(seq_ratings)

        padded_seqs.append(seq_movies[:-1])
        rating_seqs.append(seq_ratings[:-1])
        pos_ids.append(list(range(len(seq_movies[:-1]))))
        labels.append(seq_movies[1:])

    return np.array(padded_seqs), np.array(rating_seqs), np.array(pos_ids), np.array(labels)

def extract_metadata_embeddings(movies_df, embedding_dim=128):
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
    title_embeddings = model.encode(movies_df['title'].tolist())

    # Extract genre features
    genre_columns = [col for col in movies_df.columns if col not in ['movie_id', 'title', 'genres']]
    genre_embeddings = movies_df[genre_columns].values

    # Combine embeddings
    combined_embeddings = np.hstack([title_embeddings, genre_embeddings])
    combined_embeddings = combined_embeddings[:, :embedding_dim]
    return combined_embeddings

class MOL_SASRec(Model):
    def __init__(self, num_items, embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate):
        super(MOL_SASRec, self).__init__()
        self.item_embedding = Embedding(input_dim=num_items, output_dim=embed_dim)
        self.positional_embedding = Embedding(input_dim=max_seq_len, output_dim=embed_dim)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.layer_norm1 = LayerNormalization()
        self.feed_forward = Dense(ff_dim, activation='gelu')
        self.dropout2 = Dropout(dropout_rate)
        self.layer_norm2 = LayerNormalization()
        self.output_layer = Dense(num_items)

    def call(self, inputs, training=False):
        seqs, pos_ids = inputs
        seq_embeds = self.item_embedding(seqs) + self.positional_embedding(pos_ids)
        attention_out = self.attention(seq_embeds, seq_embeds)
        attention_out = self.dropout1(attention_out, training=training)
        attention_out = self.layer_norm1(attention_out + seq_embeds)
        ff_out = self.feed_forward(attention_out)
        ff_out = self.dropout2(ff_out, training=training)
        output = self.layer_norm2(ff_out + attention_out)
        logits = self.output_layer(output)
        return logits

    def compute_mol_scores(self, query_embed, candidate_embeds, component_weights):
        if len(component_weights) > len(candidate_embeds):
            component_weights = component_weights[:len(candidate_embeds)]
        mol_scores = np.sum(
            [w * np.dot(query_embed, candidate_embeds[p].T) for p, w in enumerate(component_weights) if p < len(candidate_embeds)],
            axis=0
        )
        return mol_scores

@tf.function
def train_on_batch(model, optimizer, batch_X, batch_pos, batch_y):
    with tf.GradientTape() as tape:
        logits = model([batch_X, batch_pos], training=True)
        loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(batch_y, logits, from_logits=True))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_mol_sasrec(ratings_path, movies_path, users_path, max_seq_len=50,
                     embed_dim=128, num_heads=4, ff_dim=128,
                     dropout_rate=0.3, batch_size=128, epochs=10):

    print("Loading and preprocessing data...")
    user_sequences, movies_df = preprocess_data(ratings_path, movies_path, users_path)

    print("Extracting metadata embeddings...")
    metadata_embeddings = extract_metadata_embeddings(movies_df, embedding_dim=embed_dim)

    # Encode movie IDs
    movie_encoder = LabelEncoder()
    all_movies = sorted(set([movie for seq in user_sequences['movie_sequence'] for movie, _ in seq]))
    movie_encoder.fit(all_movies)
    user_sequences['movie_sequence'] = user_sequences['movie_sequence'].apply(
        lambda seq: [(movie_encoder.transform([movie])[0], rating) for movie, rating in seq]
    )

    print("Preparing sequences...")
    X, _, pos_ids, y = prepare_sequences(user_sequences, max_seq_len)

    # Split data
    X_train, X_val, pos_train, pos_val, y_train, y_val = train_test_split(
        X, pos_ids, y, test_size=0.2, random_state=42)

    print(f"Training set size: {len(X_train)}, Validation set size: {len(X_val)}")

    # Initialize model
    print("Initializing model...")
    model = MOL_SASRec(len(all_movies), embed_dim, max_seq_len, num_heads, ff_dim, dropout_rate)
    optimizer = Adam(learning_rate=0.001)

    # Training loop
    print("Starting training...")
    for epoch in range(epochs):
        epoch_losses = []

        # Training phase
        for step in range(0, len(X_train), batch_size):
            batch_X = X_train[step:step + batch_size]
            batch_pos = pos_train[step:step + batch_size]
            batch_y = y_train[step:step + batch_size]

            try:
                loss = train_on_batch(model, optimizer, batch_X, batch_pos, batch_y)
                epoch_losses.append(loss.numpy())

                if step % 1000 == 0:
                    print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.numpy():.4f}")

            except Exception as e:
                print(f"Error in batch processing: {e}")
                continue

        # Validation phase
        val_logits = model([X_val, pos_val], training=False)
        val_loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(y_val, val_logits, from_logits=True)
        )

        # Calculate validation metrics
        val_preds = tf.argmax(val_logits, axis=-1)
        val_accuracy = tf.reduce_mean(tf.cast(tf.equal(val_preds, y_val), tf.float32))



        print(f"\nEpoch {epoch + 1}/{epochs}")
        print(f"Average Training Loss: {np.mean(epoch_losses):.4f}")
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print("-" * 50)

    return model, metadata_embeddings, movie_encoder

# Example usage
if __name__ == "__main__":
    # Define paths to MovieLens 1M dataset files
    ratings_path = "/content/drive/My Drive/ratings.dat"
    movies_path = "/content/drive/My Drive/movies.dat"
    users_path = "/content/drive/My Drive/users.dat"

    # Train the model
    model, metadata_embeddings, movie_encoder = train_mol_sasrec(
        ratings_path=ratings_path,
        movies_path=movies_path,
        users_path=users_path,
        max_seq_len=50,
        embed_dim=128,
        num_heads=2,
        ff_dim=128,
        dropout_rate=0.3,
        batch_size=128,
        epochs=10 )


Loading and preprocessing data...
Extracting metadata embeddings...
Preparing sequences...
Training set size: 4832, Validation set size: 1208
Initializing model...
Starting training...
Epoch 1, Step 0, Loss: 8.2505

Epoch 1/10
Average Training Loss: 7.3117
Validation Loss: 6.8207
Validation Accuracy: 0.1020
--------------------------------------------------
Epoch 2, Step 0, Loss: 6.8266

Epoch 2/10
Average Training Loss: 6.6741
Validation Loss: 6.5190
Validation Accuracy: 0.1105
--------------------------------------------------
Epoch 3, Step 0, Loss: 6.3837

Epoch 3/10
Average Training Loss: 6.2094
Validation Loss: 6.0593
Validation Accuracy: 0.1294
--------------------------------------------------
Epoch 4, Step 0, Loss: 5.8642

Epoch 4/10
Average Training Loss: 5.7423
Validation Loss: 5.7708
Validation Accuracy: 0.1362
--------------------------------------------------
Epoch 5, Step 0, Loss: 5.4702

Epoch 5/10
Average Training Loss: 5.4279
Validation Loss: 5.6173
Validation Accuracy