## Basic Model building

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from ast import literal_eval

# Load dataset
df = pd.read_csv('merged_data.csv')

  df = pd.read_csv('merged_data.csv')


In [3]:
# Convert string representation of list to actual list for Genres column
df['Genres'] = df['Genres'].apply(literal_eval)

# Step 1: Zero-Index all categorical features
df["UserID"] = df["UserID"] - 1  # Zero-index UserID
df["MovieID"] = df["MovieID"] - 1  # Zero-index MovieID
df["Gender"] = df["Gender"].astype('category').cat.codes  # Zero-index Gender
df["Occupation"] = df["Occupation"].astype('category').cat.codes  # Zero-index Occupation


In [4]:
# Step 2: Split data by user age group or any criteria you want to simulate clients
age_groups = df['Age'].unique()
client_datasets = {}

# Create simulated client datasets based on age groups
for age in age_groups:
    client_data = df[df['Age'] == age]
    client_datasets[f"client_{age}"] = client_data

# Get all unique genres from the dataset
all_genres = set()
for genres in df['Genres']:
    all_genres.update(genres)
all_genres = sorted(all_genres)
num_genres = len(all_genres)

# Create genre mapping
genre_to_idx = {genre: i for i, genre in enumerate(all_genres)}

In [None]:
# Example Federated Recommender Model using PyTorch
class FederatedRecommender(nn.Module):
    def __init__(self, num_users, num_movies, num_genders, num_occupations, num_genres, embedding_dim=10):
        super(FederatedRecommender, self).__init__()
        
        # Embeddings for each feature
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.gender_embedding = nn.Embedding(num_genders, embedding_dim)
        self.occupation_embedding = nn.Embedding(num_occupations, embedding_dim)
        
        # Genre embeddings - we'll use a linear layer
        self.genre_projection = nn.Linear(num_genres, embedding_dim)
        
        # Fully connected layers for rating prediction
        self.fc1 = nn.Linear(embedding_dim * 5, 128)  # 5 features: user, movie, gender, occupation, genres
        self.fc2 = nn.Linear(128, 1)

    def forward(self, user, movie, gender, occupation, genres):
        # Embedding lookup
        user_embedded = self.user_embedding(user)
        movie_embedded = self.movie_embedding(movie)
        gender_embedded = self.gender_embedding(gender)
        occupation_embedded = self.occupation_embedding(occupation)
        
        # Process genres - project binary flags to embedding space
        genre_embedded = self.genre_projection(genres.float())
        
        # Concatenate all embeddings
        all_embeddings = torch.cat([
            user_embedded, 
            movie_embedded, 
            gender_embedded, 
            occupation_embedded,
            genre_embedded
        ], dim=-1)
        
        # Pass through fully connected layers
        x = torch.relu(self.fc1(all_embeddings))
        rating = self.fc2(x)
        return rating.squeeze()


In [51]:
from torch.utils.data import DataLoader, TensorDataset

# Federated learning training function for each client
def train_local_model(client_data, model, epochs=75, batch_size=64, lr=0.001, weight_decay=0.01):
    """
    Train the local model using the client's data.

    Args:
    - client_data: The dataset for the current client (a DataFrame).
    - model: The FederatedRecommender model to train.
    - epochs: The number of epochs to train for.
    - batch_size: The batch size for training.
    - lr: The learning rate.
    - weight_decay: The weight decay (L2 regularization).

    Returns:
    - trained_model: The trained model.
    - final_loss: The final loss after training.
    """

    # Prepare the data
    user = torch.tensor(client_data['UserID'].values, dtype=torch.long)
    movie = torch.tensor(client_data['MovieID'].values, dtype=torch.long)
    gender = torch.tensor(client_data['Gender'].values, dtype=torch.long)
    occupation = torch.tensor(client_data['Occupation'].values, dtype=torch.long)
    
    # Create genre vectors
    genre_vectors = []
    for genres in client_data['Genres']:
        vec = torch.zeros(num_genres)
        for genre in genres:
            if genre in genre_to_idx:  # Check if genre is in the index map
                vec[genre_to_idx[genre]] = 1
        genre_vectors.append(vec)
    genres = torch.stack(genre_vectors)
    
    ratings = torch.tensor(client_data['Rating'].values, dtype=torch.float32)

    # Create a DataLoader to batch the data
    dataset = TensorDataset(user, movie, gender, occupation, genres, ratings)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize the optimizer, loss function, and learning rate scheduler
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)  # Using AdamW optimizer with weight decay
    criterion = nn.MSELoss()  # Mean Squared Error for regression

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # Learning rate scheduler to decay after 10 epochs

    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        
        # Loop through mini-batches
        for batch in data_loader:
            user_batch, movie_batch, gender_batch, occupation_batch, genres_batch, ratings_batch = batch
            
            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            predictions = model(user_batch, movie_batch, gender_batch, occupation_batch, genres_batch)
            
            # Compute the loss
            loss = criterion(predictions.squeeze(), ratings_batch)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Update running loss for the current epoch
            running_loss += loss.item()

        # Step the learning rate scheduler
        scheduler.step()

        avg_loss = running_loss / len(data_loader)
        print(f"Client Training Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.4f}")
        
    return model, avg_loss

In [52]:
# Step 4: Federated Learning Training
local_models = []
validation_losses = []

# Get the maximum values for each categorical feature
num_users = df['UserID'].max() + 1
num_movies = df['MovieID'].max() + 1
num_genders = df['Gender'].max() + 1
num_occupations = df['Occupation'].max() + 1

for age in age_groups:
    age_key = f"client_{int(age)}"
    if age_key not in client_datasets:
        print(f"Skipping {age_key}, no data available.")
        continue
    
    # Initialize the model for this client
    model = FederatedRecommender(
        num_users=num_users, 
        num_movies=num_movies, 
        num_genders=num_genders, 
        num_occupations=num_occupations, 
        num_genres=num_genres
    )
    
    # Train model locally for the current client
    trained_model, val_loss = train_local_model(client_datasets[age_key], model)
    # Save each trained model
    model_save_path = f"client_models/client_model_{int(age)}.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"💾 Saved model for client_{int(age)} to {model_save_path}")
    
    
    local_models.append(trained_model)
    validation_losses.append(val_loss)

# Step 5: Display training completion
print(f"✅ Federated Learning Training Completed!")
print(f"📊 Validation Loss per Client: {validation_losses}")

Client Training Epoch 1/75 - Avg Loss: 1.9037
Client Training Epoch 2/75 - Avg Loss: 1.3371
Client Training Epoch 3/75 - Avg Loss: 1.2720
Client Training Epoch 4/75 - Avg Loss: 1.2141
Client Training Epoch 5/75 - Avg Loss: 1.1651
Client Training Epoch 6/75 - Avg Loss: 1.1165
Client Training Epoch 7/75 - Avg Loss: 1.0727
Client Training Epoch 8/75 - Avg Loss: 1.0371
Client Training Epoch 9/75 - Avg Loss: 1.0041
Client Training Epoch 10/75 - Avg Loss: 0.9709
Client Training Epoch 11/75 - Avg Loss: 0.9323
Client Training Epoch 12/75 - Avg Loss: 0.9168
Client Training Epoch 13/75 - Avg Loss: 0.9001
Client Training Epoch 14/75 - Avg Loss: 0.8910
Client Training Epoch 15/75 - Avg Loss: 0.8801
Client Training Epoch 16/75 - Avg Loss: 0.8677
Client Training Epoch 17/75 - Avg Loss: 0.8562
Client Training Epoch 18/75 - Avg Loss: 0.8471
Client Training Epoch 19/75 - Avg Loss: 0.8364
Client Training Epoch 20/75 - Avg Loss: 0.8264
Client Training Epoch 21/75 - Avg Loss: 0.8070
Client Training Epoch 

### Federated learning model is training well on each client. The loss is steadily decreasing across epochs, which indicates that the model is learning effectively

In [60]:
# Function to prepare input features for a single prediction
def prepare_input(user_id, movie_id, gender_str, occupation_str, genres_list):
    user = torch.tensor([user_id], dtype=torch.long)
    movie = torch.tensor([movie_id], dtype=torch.long)
    
    gender_code = pd.Series([gender_str]).astype('category').cat.codes[0]
    occupation_code = pd.Series([occupation_str]).astype('category').cat.codes[0]
    
    gender = torch.tensor([gender_code], dtype=torch.long)
    occupation = torch.tensor([occupation_code], dtype=torch.long)
    
    genre_vec = torch.zeros(num_genres)
    for genre in genres_list:
        if genre in genre_to_idx:
            genre_vec[genre_to_idx[genre]] = 1
    genres = genre_vec.unsqueeze(0)
    
    return user, movie, gender, occupation, genres

# Recreate the model architecture first
test_model = FederatedRecommender(
    num_users=num_users, 
    num_movies=num_movies, 
    num_genders=num_genders, 
    num_occupations=num_occupations, 
    num_genres=num_genres
)

# Sample input: Modify with actual known values from your dataset
sample_row = df.iloc[15]

sample_user_id = sample_row["UserID"]   # 0-indexed
sample_movie_id = sample_row["MovieID"]  # 0-indexed
sample_gender = sample_row["Gender"]    # Gender column
sample_occupation = sample_row["Occupation"]  # Occupation column
sample_genres = sample_row["Genres"] 

# Extract the user's age (you would need to ensure there's a column for age or birthdate)
user_age = sample_row["Age"]  # Assuming there's an 'Age' column

# Dynamically select the model based on age group
# For example, if the user is between 18-25, load the model for age group 18
age_group = 1 if (user_age // 10) * 10 ==0 else (user_age // 10) * 10 # This groups the ages (e.g., 20, 30, 40, etc.)

model_filename = f"client_models/client_model_{age_group}.pth"
test_model.load_state_dict(torch.load(model_filename))
test_model.eval()

# Prepare input
user, movie, gender, occupation, genres = prepare_input(
    sample_user_id, sample_movie_id, sample_gender, sample_occupation, sample_genres
)

# Predict
with torch.no_grad():
    predicted_rating = test_model(user, movie, gender, occupation, genres)

    # Clamp between 1 and 5, then round to nearest int
    predicted_rating = torch.clamp(predicted_rating, 1.0, 5.0)
    predicted_rating = torch.round(predicted_rating)

    print(f"⭐ Predicted rating: {predicted_rating.item():.0f} for User {sample_user_id}, Movie {sample_movie_id} Age group  {age_group}")


⭐ Predicted rating: 4 for User 0, Movie 2790 Age group  1


In [58]:
print(df.iloc[15])

UserID                        0
MovieID                    2790
Rating                        4
Timestamp             978302188
Gender                        0
Age                           1
Occupation                   10
Zip-code                  48067
Title          Airplane! (1980)
Genres                 [Comedy]
User                          0
Movie                      2586
Action                        0
Adventure                     0
Animation                     0
Children's                    0
Comedy                        1
Crime                         0
Documentary                   0
Drama                         0
Fantasy                       0
Film-Noir                     0
Horror                        0
Musical                       0
Mystery                       0
Romance                       0
Sci-Fi                        0
Thriller                      0
War                           0
Western                       0
Name: 15, dtype: object


In [None]:
import torch

# Initialize the federated model (this is the final model that combines knowledge from all clients)
federated_model = FederatedRecommender(
    num_users=num_users, 
    num_movies=num_movies, 
    num_genders=num_genders, 
    num_occupations=num_occupations, 
    num_genres=num_genres
)

# Aggregation function: Average the weights from all local models
def aggregate_local_models(local_models):
    # Initialize an empty dictionary to hold the averaged weights
    federated_weights = {}

    # Initialize the federated model weights with zeroes (or the first model's weights)
    for name, param in local_models[0].named_parameters():
        federated_weights[name] = torch.zeros_like(param)

    # Iterate through each local model and accumulate its weights
    for model in local_models:
        for name, param in model.named_parameters():
            federated_weights[name] += param.data

    # Average the accumulated weights
    num_models = len(local_models)
    for name, param in federated_weights.items():
        federated_weights[name] /= num_models

    return federated_weights

# Aggregate the local models into the federated model
aggregated_weights = aggregate_local_models(local_models)

# Load the aggregated weights into the federated model
federated_model.load_state_dict(aggregated_weights)

# Save the federated model
torch.save(federated_model.state_dict(), "federated_model.pth")
print("💾 Federated model saved as federated_model.pth")


💾 Federated model saved as federated_model.pth


In [75]:
# Load the federated model for prediction
federated_model.load_state_dict(torch.load("federated_model.pth"))
federated_model.eval()

# Function to recommend movies using the federated model
def recommend_movies_federated(user_id, gender, occupation, genres, num_recommendations=5):
    all_movie_ids = range(num_movies)  # All possible movie IDs
    movie_ratings = []

    # Loop over all movie IDs to get predictions
    for movie_id in all_movie_ids:
        # Prepare the input for the model (ensure correct format for input)
        user_tensor, movie_tensor, gender_tensor, occupation_tensor, genres_tensor = prepare_input(
            user_id, movie_id, gender, occupation, genres
        )
        
        # Predict the rating for the movie using the federated model
        with torch.no_grad():
            predicted_rating = federated_model(user_tensor, movie_tensor, gender_tensor, occupation_tensor, genres_tensor)
            predicted_rating = torch.clamp(predicted_rating, 1.0, 5.0)  # Ensure it's between 1 and 5
            movie_ratings.append((movie_id, predicted_rating.item()))

    # Sort movies by predicted rating (highest to lowest)
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Get the top N recommendations
    top_recommendations = movie_ratings[:num_recommendations]
    
    # Create a list of movie names and predicted ratings to display
    recommendations = []
    for movie_id, rating in top_recommendations:
        # Fetch the movie name from a movie_names dictionary
        movie_name = movie_names.get(movie_id, f"Unknown Movie {movie_id}")  # If no name found, fallback to the movie ID
        recommendations.append(f"{movie_name}")
    
    return recommendations


In [76]:
# Function to get user input
def get_user_input():
    user_id = int(input("Enter User age: "))
    gender = input("Enter Gender (M/F): ")
    occupation = input("Enter Occupation: ")
    genres = input("Enter Preferred Genres (comma separated): ").split(",")
    
    return user_id, gender, occupation, genres

# Get user input
user_id, gender, occupation, genres = get_user_input()  # This should be a function to get user input
recommendations = recommend_movies_federated(user_id, gender, occupation, genres, num_recommendations=5)

# Print recommended movies
for recommendation in recommendations:
    print(recommendation)

Toy Story (1995)
Jumanji (1995)
Grumpier Old Men (1995)
Waiting to Exhale (1995)
Father of the Bride Part II (1995)
