In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time

# 1. Data Preprocessing
class DataPreprocessor:
    def __init__(self):
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        
    def preprocess_data(self, df):
        # Encode user and item IDs
        df['user_id_encoded'] = self.user_encoder.fit_transform(df['user_id'])
        df['item_id_encoded'] = self.item_encoder.fit_transform(df['item_id'])
        
        # Create user features
        user_features = pd.DataFrame({
            'user_id': df['user_id_encoded'].unique()
        })
        
        # Create item features
        item_features = pd.DataFrame({
            'item_id': df['item_id_encoded'].unique()
        })
        
        return df, user_features, item_features

# 2. Two-Tower Model Definition
class TwoTowerModel(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim=64, hidden_units=[32, 16]):
        super(TwoTowerModel, self).__init__()
        
        # User tower
        self.user_tower = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(1,)),
            tf.keras.layers.Embedding(num_users, embedding_dim),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(hidden_units[0], activation='relu'),
            tf.keras.layers.Dense(hidden_units[1], activation='relu'),
            tf.keras.layers.Dense(embedding_dim),
            tf.keras.layers.LayerNormalization(),
        ])
        
        # Item tower
        self.item_tower = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(1,)),
            tf.keras.layers.Embedding(num_items, embedding_dim),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(hidden_units[0], activation='relu'),
            tf.keras.layers.Dense(hidden_units[1], activation='relu'),
            tf.keras.layers.Dense(embedding_dim),
            tf.keras.layers.LayerNormalization(),
        ])
        
    def call(self, inputs):
        user_input, item_input = inputs
        user_embedding = self.user_tower(user_input)
        item_embedding = self.item_tower(item_input)
        
        # Compute dot product similarity
        return tf.reduce_sum(user_embedding * item_embedding, axis=1)
    
    def get_user_embeddings(self, user_ids):
        return self.user_tower(user_ids)
    
    def get_item_embeddings(self, item_ids):
        return self.item_tower(item_ids)

# 3. Training Data Generator
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, df, batch_size=1024, num_negatives=4):
        self.df = df
        self.batch_size = batch_size
        self.num_negatives = num_negatives
        self.num_items = df['item_id_encoded'].nunique()
        self.on_epoch_end()
        
    def __len__(self):
        return int(np.ceil(len(self.df) * (1 + self.num_negatives) / self.batch_size))
    
    def __getitem__(self, idx):
        start_idx = idx * self.batch_size
        end_idx = min((idx + 1) * self.batch_size, len(self.df) * (1 + self.num_negatives))
        
        batch_users = []
        batch_items = []
        batch_labels = []
        
        for i in range(start_idx, end_idx):
            if i < len(self.df):
                # Positive sample
                user = self.df.iloc[i]['user_id_encoded']
                item = self.df.iloc[i]['item_id_encoded']
                batch_users.append(user)
                batch_items.append(item)
                batch_labels.append(1.0)
                
                # Negative samples
                for _ in range(self.num_negatives):
                    neg_item = np.random.randint(0, self.num_items)
                    while neg_item in self.df[self.df['user_id_encoded'] == user]['item_id_encoded'].values:
                        neg_item = np.random.randint(0, self.num_items)
                    batch_users.append(user)
                    batch_items.append(neg_item)
                    batch_labels.append(0.0)
        
        return [np.array(batch_users), np.array(batch_items)], np.array(batch_labels)
    
    def on_epoch_end(self):
        self.df = self.df.sample(frac=1).reset_index(drop=True)

# 4. Training and Evaluation Functions
class ModelTrainer:
    def __init__(self, model, learning_rate=0.001):
        self.model = model
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self.train_loss = tf.keras.metrics.Mean()
        self.train_accuracy = tf.keras.metrics.BinaryAccuracy()
        
    @tf.function
    def train_step(self, x, y):
        with tf.GradientTape() as tape:
            logits = self.model(x)
            loss = self.loss_fn(y, logits)
        
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        
        self.train_loss(loss)
        self.train_accuracy(y, tf.sigmoid(logits))
        
        return loss
    
    def train(self, train_generator, val_generator, epochs=10):
        history = {
            'loss': [],
            'accuracy': [],
            'val_loss': [],
            'val_accuracy': [],
            'time_per_epoch': []
        }
        
        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")
            start_time = time.time()
            
            # Training
            for batch_idx, (x, y) in enumerate(train_generator):
                loss = self.train_step(x, y)
                if batch_idx % 50 == 0:
                    print(f"Batch {batch_idx}, Loss: {loss:.4f}")
            
            # Validation
            val_loss = 0
            val_accuracy = 0
            num_batches = 0
            
            for x_val, y_val in val_generator:
                val_logits = self.model(x_val)
                val_loss += self.loss_fn(y_val, val_logits)
                val_accuracy += tf.keras.metrics.binary_accuracy(y_val, tf.sigmoid(val_logits))
                num_batches += 1
            
            val_loss /= num_batches
            val_accuracy /= num_batches
            
            epoch_time = time.time() - start_time
            
            # Store metrics
            history['loss'].append(self.train_loss.result().numpy())
            history['accuracy'].append(self.train_accuracy.result().numpy())
            history['val_loss'].append(val_loss.numpy())
            history['val_accuracy'].append(val_accuracy.numpy())
            history['time_per_epoch'].append(epoch_time)
            
            print(f"Training Loss: {self.train_loss.result():.4f}")
            print(f"Training Accuracy: {self.train_accuracy.result():.4f}")
            print(f"Validation Loss: {val_loss:.4f}")
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            print(f"Time taken: {epoch_time:.2f}s")
            
            # Reset metrics
            self.train_loss.reset_states()
            self.train_accuracy.reset_states()
        
        return history

# 5. Recommendation Generator
class RecommendationGenerator:
    def __init__(self, model, num_recommendations=5):
        self.model = model
        self.num_recommendations = num_recommendations
    
    def generate_recommendations(self, user_id, item_ids):
        user_embedding = self.model.get_user_embeddings(tf.constant([user_id]))
        item_embeddings = self.model.get_item_embeddings(tf.constant(item_ids))
        
        # Calculate similarities
        similarities = tf.matmul(user_embedding, tf.transpose(item_embeddings))
        similarities = tf.squeeze(similarities)
        
        # Get top k recommendations
        _, indices = tf.nn.top_k(similarities, k=self.num_recommendations)
        
        return indices.numpy(), similarities[indices].numpy()

# 6. Main Training Pipeline
def main():
   df = pd.read_csv('cleaned_dataset.csv')  # Load your data
    # Replace this with your actual data loading code
    #df = pd.DataFrame({
       # 'user_id': [...],  # Your user IDs
       #'item_id': [...],  # Your item IDs
   # })
    
    # Initialize preprocessor
preprocessor = DataPreprocessor()
df_processed, user_features, item_features = preprocessor.preprocess_data(df)
    
    # Split data
train_df, val_df = train_test_split(df_processed, test_size=0.2, random_state=42)
    
    # Create data generators
train_generator = DataGenerator(train_df)
val_generator = DataGenerator(val_df)
    
    # Initialize model
model = TwoTowerModel(
    num_users=len(user_features),
    num_items=len(item_features),
    embedding_dim=64,
    hidden_units=[32, 16]
)
    
    # Initialize trainer
trainer = ModelTrainer(model)
    
    # Train model
history = trainer.train(train_generator, val_generator, epochs=10)
    
    # Initialize recommendation generator
recommender = RecommendationGenerator(model)
    
    # Generate recommendations for a sample user
sample_user_id = 0
all_item_ids = np.arange(len(item_features))
rec_indices, rec_scores = recommender.generate_recommendations(sample_user_id, all_item_ids)
    
    # Print recommendations
print("\nTop 5 recommendations for user", preprocessor.user_encoder.inverse_transform([sample_user_id])[0])
for idx, score in zip(rec_indices, rec_scores):
    item_id = preprocessor.item_encoder.inverse_transform([idx])[0]
    print(f"Item {item_id}: Score {score:.4f}")

if __name__ == "__main__":
    main()


NameError: name 'df' is not defined