<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/paper_implement/dev/1_mrinal_mtech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install -q transformers datasets sentence-transformers torch accelerate

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

In [3]:
from sentence_transformers import SentenceTransformer
import json
from datasets import load_dataset
from tqdm.auto import tqdm
import numpy as np
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Pick the compute device: GPU when CUDA is available, otherwise CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [5]:
# ==================== Configuration ====================
class Config:
    """Hyperparameters and model identifiers shared by the PPlug demo cells."""

    # --- Models ---
    llm_name: str = "google/flan-t5-base"        # Using base instead of XXL for Colab
    encoder_name: str = "BAAI/bge-base-en-v1.5"  # BGE encoder as per paper

    # --- PPlug dimensions ---
    embedding_dim: int = 768      # BGE base embedding dimension
    llm_hidden_size: int = 768    # T5-base hidden size
    num_personal_tokens: int = 1  # Number of personal embedding tokens

    # --- Optimisation ---
    batch_size: int = 4
    learning_rate: float = 1e-4
    num_epochs: int = 2
    warmup_ratio: float = 0.05    # fraction of total steps used for LR warmup

    # --- Sequence length limits (tokens) ---
    max_input_length: int = 256
    max_encoder_length: int = 512  # NOTE(review): not referenced in the visible cells
    max_output_length: int = 128

    # --- Data ---
    max_histories: int = 20  # Limit user histories for memory efficiency
    sample_size: int = 500   # Reduced dataset size for Colab


In [6]:
# Shared configuration instance; evaluate_pplug() and main() read this module-level name.
config = Config()

In [7]:
# ==================== Data Loading ====================
class LaMP_Dataset:
    """Synthetic stand-in for a LaMP-style dataset (movie tagging).

    No files are downloaded: every sample is generated locally. Each sample is
    a dict with the keys ``'user_id'``, ``'input'``, ``'output'`` and
    ``'histories'`` (a list of ``{'text', 'label'}`` dicts).

    For the real benchmark see https://lamp-benchmark.github.io/download
    """

    # Genre vocabulary used for both the targets and the history labels.
    MOVIE_GENRES = ('Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi',
                    'Romance', 'Thriller', 'Documentary')

    def __init__(self, task="LaMP-2", split="train", sample_size=500, seed=None):
        """
        Args:
            task: Task name; only used in the log message.
            split: Split name; only used in the log message.
            sample_size: Number of synthetic samples to generate.
            seed: Optional integer seed. When given, the dataset is generated
                from a private random stream and is fully reproducible. When
                ``None`` (default), the global ``np.random`` state is used, so
                a prior ``np.random.seed(...)`` call still controls the data.
        """
        print(f"Loading {task} {split} dataset...")

        # For demo: create synthetic data mimicking the LaMP structure.
        # In production, load from: https://huggingface.co/datasets/LaMP/LaMP-2
        self.data = self._create_demo_data(sample_size, seed)

    def _create_demo_data(self, sample_size, seed=None):
        """Build ``sample_size`` synthetic samples whose label is recoverable from the text.

        Args:
            sample_size: Number of samples to create.
            seed: See ``__init__``.

        Returns:
            A list of sample dicts.
        """
        # seed=None keeps the legacy behaviour (global np.random stream);
        # an explicit seed uses an isolated generator with the same draw order.
        rng = np.random if seed is None else np.random.RandomState(seed)
        genres = list(self.MOVIE_GENRES)
        data = []

        for i in range(sample_size):
            # 1. Select a specific genre for this sample (plain str, not np.str_).
            selected_genre = str(rng.choice(genres))

            # 2. The input text contains the genre word, so the target is learnable.
            current_movie = (
                f"Movie: Test Film {i}. Description: An exciting "
                f"{selected_genre.lower()} adventure with classic tropes."
            )

            # 3. History entries mostly share the sample's genre (noise ~30% of the time).
            histories = []
            num_hist = rng.randint(2, 5)  # 2..4 entries; small for faster processing
            for j in range(num_hist):
                if rng.random_sample() > 0.3:
                    hist_genre = selected_genre
                else:
                    hist_genre = str(rng.choice(genres))

                histories.append({
                    'text': f"Movie: Old Film {j}. Description: A typical {hist_genre.lower()} movie.",
                    'label': hist_genre
                })

            data.append({
                'user_id': f"user_{i % 50}",
                'input': current_movie,
                'output': selected_genre,
                'histories': histories
            })

        return data

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample dict at position ``idx``."""
        return self.data[idx]

In [25]:
# ==================== User Behavior Encoder ====================
class UserBehaviorEncoder(nn.Module):
    """Wraps a frozen SentenceTransformer used to embed history texts and the current input."""

    def __init__(self, encoder_name):
        super().__init__()
        self.encoder = SentenceTransformer(encoder_name)
        # The encoder is kept fixed (as per paper): no parameter receives gradients.
        for weight in self.encoder.parameters():
            weight.requires_grad = False

    def _embed(self, texts):
        """Run the sentence encoder on a list of strings and return a tensor."""
        return self.encoder.encode(
            texts,
            convert_to_tensor=True,
            show_progress_bar=False
        )

    def encode_histories(self, histories):
        """Embed a list of history texts with gradient tracking disabled."""
        with torch.no_grad():
            return self._embed(histories)

    def encode_input(self, inputs, trainable=True):
        """Embed the current input text(s).

        Args:
            inputs: List of input strings.
            trainable: When False, the call is wrapped in ``torch.no_grad()``;
                when True, it is not.
                NOTE(review): the encoder parameters are frozen in ``__init__``
                and ``SentenceTransformer.encode`` presumably disables gradients
                internally, so this flag likely has no practical effect - confirm.
        """
        if not trainable:
            with torch.no_grad():
                return self._embed(inputs)
        return self._embed(inputs)


In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

# ==================== Input-aware Personal Aggregator ====================
class PersonalAggregator(nn.Module):
    """Input-aware attention pooling of history embeddings into one personal embedding."""

    def __init__(self, embedding_dim, llm_hidden_size):
        super().__init__()
        # Two-layer MLP mapping encoder space -> LLM hidden space.
        self.projector = nn.Sequential(
            nn.Linear(embedding_dim, llm_hidden_size),
            nn.ReLU(),
            nn.Linear(llm_hidden_size, llm_hidden_size)
        )

    def forward(self, history_embeddings, input_embedding):
        """
        Args:
            history_embeddings: [num_histories, embedding_dim]
            input_embedding: [embedding_dim]
        Returns:
            Tuple ``(personal_embedding, weights)``:
                personal_embedding: [llm_hidden_size]
                weights: [num_histories] attention weights (sum to 1)
        """
        # Work on a detached copy so the history embeddings carry no autograd
        # history and are not inference tensors.
        histories = history_embeddings.clone().detach()

        # Attention weights (Equation 3 in paper):
        #   w_i = exp(x_u^T h_i) / sum_k exp(x_u^T h_k)
        weights = torch.softmax(histories @ input_embedding, dim=0)  # [num_histories]

        # Weighted aggregation (Equation 4 in paper):
        #   P_u = sum_i w_i * Proj(h_i)
        projected = self.projector(histories)  # [num_histories, llm_hidden_size]
        personal_embedding = (weights[:, None] * projected).sum(dim=0)  # [llm_hidden_size]

        return personal_embedding, weights


In [27]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

# ==================== PPlug Model ====================
class PPlugModel(nn.Module):
    """Frozen T5 steered by a learned instruction prefix plus a per-user personal embedding.

    The encoder input is laid out as ``[instruction tokens; personal token; input tokens]``.
    Only ``instruction_embedding`` and ``personal_aggregator`` are trainable.
    """

    def __init__(self, config):
        """
        Args:
            config: Object exposing ``llm_name``, ``encoder_name``, ``embedding_dim``,
                ``llm_hidden_size``, ``num_personal_tokens``, ``max_histories``,
                ``max_input_length`` and ``max_output_length``.
        """
        super().__init__()

        # Load LLM and tokenizer; the LLM is frozen.
        self.llm = T5ForConditionalGeneration.from_pretrained(config.llm_name)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(config.llm_name)
        for param in self.llm.parameters():
            param.requires_grad = False

        # User behavior encoder
        self.behavior_encoder = UserBehaviorEncoder(config.encoder_name)

        # Personal aggregator (trainable)
        self.personal_aggregator = PersonalAggregator(
            config.embedding_dim,
            config.llm_hidden_size
        )

        # Learned instruction prefix with a small random initialisation (scale 0.01).
        self.instruction_embedding = nn.Parameter(
            torch.randn(1, config.num_personal_tokens, config.llm_hidden_size) * 0.01
        )

        self.config = config

    def get_personal_embedding(self, histories, current_input):
        """Build the personal embedding for one user/input pair.

        Args:
            histories: List of dicts with a ``'text'`` key.
            current_input: The current input string.

        Returns:
            ``(personal_embedding, attention_weights)`` as produced by
            ``self.personal_aggregator``.
        """
        # Only the first max_histories entries are encoded, to save memory.
        history_texts = [h['text'] for h in histories[:self.config.max_histories]]

        # Fall back to a placeholder so the aggregator never sees an empty history.
        if not history_texts:
            history_texts = ["Empty history"]

        history_embeddings = self.behavior_encoder.encode_histories(history_texts)
        input_embedding = self.behavior_encoder.encode_input([current_input], trainable=True)[0]

        device = self.instruction_embedding.device
        personal_embedding, attention_weights = self.personal_aggregator(
            history_embeddings.to(device),
            input_embedding.to(device)
        )

        return personal_embedding, attention_weights

    def _prepend_prefix(self, personal_embeds, inputs_embeds, attention_mask):
        """Prepend the instruction and personal tokens to the token embeddings and mask.

        Args:
            personal_embeds: [batch, hidden]
            inputs_embeds: [batch, seq, hidden]
            attention_mask: [batch, seq]

        Returns:
            ``(final_embeds, final_mask)`` with sequence length ``prefix + seq``.
        """
        batch_size = inputs_embeds.size(0)
        instruction_embeds = self.instruction_embedding.expand(batch_size, -1, -1)

        # Layout: [Instruction; Personal; Input]
        final_embeds = torch.cat([
            instruction_embeds,
            personal_embeds.unsqueeze(1),
            inputs_embeds
        ], dim=1)

        # Prefix positions are always attended to. The mask is created directly
        # on the right device with the tokenizer mask's dtype, so the
        # concatenation needs no implicit type promotion.
        num_prefix_tokens = instruction_embeds.size(1) + 1
        prefix_mask = torch.ones(
            batch_size,
            num_prefix_tokens,
            dtype=attention_mask.dtype,
            device=attention_mask.device
        )
        final_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        return final_embeds, final_mask

    def forward(self, batch):
        """Compute the training loss for a collated batch.

        Args:
            batch: Dict with ``'input'`` (list of str), ``'output'`` (list of str)
                and ``'histories'`` (one history list per sample).

        Returns:
            ``(loss, logits)`` from the LLM.
        """
        device = self.instruction_embedding.device

        tokenized_inputs = self.llm_tokenizer(
            batch['input'],
            max_length=self.config.max_input_length,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        # Padding positions in the labels are set to -100.
        labels = self.llm_tokenizer(
            batch['output'],
            max_length=self.config.max_output_length,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).input_ids.to(device)
        labels[labels == self.llm_tokenizer.pad_token_id] = -100

        inputs_embeds = self.llm.encoder.embed_tokens(tokenized_inputs.input_ids)

        # One personal embedding per sample.
        personal_embeds = torch.stack([
            self.get_personal_embedding(histories, text)[0]
            for histories, text in zip(batch['histories'], batch['input'])
        ])  # [batch, hidden]

        final_embeds, final_mask = self._prepend_prefix(
            personal_embeds,
            inputs_embeds,
            tokenized_inputs.attention_mask
        )

        outputs = self.llm(
            inputs_embeds=final_embeds,
            attention_mask=final_mask,
            labels=labels,
            return_dict=True
        )

        return outputs.loss, outputs.logits

    @torch.no_grad()
    def generate(self, input_text, histories, max_length=50):
        """Generate text for a single input, conditioned on the user's histories.

        The whole method runs without gradient tracking, so no autograd graph is
        built through the aggregator or the instruction prefix at inference time.

        Args:
            input_text: The input string.
            histories: List of dicts with a ``'text'`` key.
            max_length: ``max_length`` forwarded to ``self.llm.generate``.

        Returns:
            ``(output_text, attention_weights)``.
        """
        device = self.instruction_embedding.device

        personal_emb, attention_weights = self.get_personal_embedding(histories, input_text)

        tokenized = self.llm_tokenizer(
            input_text,
            max_length=self.config.max_input_length,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        inputs_embeds = self.llm.encoder.embed_tokens(tokenized.input_ids)

        final_embeds, final_mask = self._prepend_prefix(
            personal_emb.unsqueeze(0),
            inputs_embeds,
            tokenized.attention_mask
        )

        output_ids = self.llm.generate(
            inputs_embeds=final_embeds,
            attention_mask=final_mask,
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

        output_text = self.llm_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return output_text, attention_weights

In [28]:
# ==================== Training ====================
def collate_fn(batch):
    """Turn a list of sample dicts into a dict of per-field lists for the DataLoader."""
    fields = ('input', 'histories', 'user_id', 'output')
    return {field: [sample[field] for sample in batch] for field in fields}


In [29]:
def train_pplug(model, train_dataset, config):
    """Train the trainable parameters of a PPlug model.

    Args:
        model: Module whose ``model(batch)`` call returns ``(loss, logits)``.
        train_dataset: Map-style dataset of sample dicts compatible with ``collate_fn``.
        config: Object exposing ``batch_size``, ``learning_rate``, ``num_epochs``
            and ``warmup_ratio``.

    Returns:
        The same ``model`` instance after training.
    """
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )

    # Only parameters that require gradients are optimised and clipped.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=config.learning_rate)

    # Linear warmup followed by linear decay over all training steps.
    num_training_steps = len(train_loader) * config.num_epochs
    num_warmup_steps = int(num_training_steps * config.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    # Training loop
    model.train()

    for epoch in range(config.num_epochs):
        epoch_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}")

        # `step` restarts at 1 every epoch so the running average below divides
        # the per-epoch loss sum by the per-epoch step count.
        for step, batch in enumerate(progress_bar, start=1):
            # Forward pass
            loss, _ = model(batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
            optimizer.step()
            scheduler.step()

            # Update metrics
            step_loss = loss.item()
            epoch_loss += step_loss

            progress_bar.set_postfix({
                'loss': f'{step_loss:.4f}',
                'avg_loss': f'{epoch_loss/step:.4f}'
            })

        # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
        epoch_avg = epoch_loss / max(len(train_loader), 1)
        print(f"\nEpoch {epoch+1} completed. Average loss: {epoch_avg:.4f}")

    return model



In [30]:
# ==================== Evaluation ====================
def evaluate_pplug(model, test_dataset, num_samples=10, max_length=None):
    """Generate predictions for the first samples of a dataset and report exact-match accuracy.

    Predictions and ground truths are compared after ``strip().lower()``.

    Args:
        model: Object with ``eval()`` and ``generate(input_text, histories, max_length=...)``
            returning ``(text, attention_weights)``.
        test_dataset: Indexable dataset of dicts with ``'input'``, ``'output'``
            and ``'histories'``.
        num_samples: Maximum number of samples to evaluate.
        max_length: Generation length limit. When ``None`` (default), it is taken
            from ``model.config.max_output_length`` if the model has a ``config``
            attribute, otherwise from the module-level ``config``.

    Returns:
        ``(predictions, ground_truths, accuracy)`` where the first two are lists
        of normalised strings.
    """
    if max_length is None:
        # Prefer the configuration the model was built with over the notebook global.
        model_config = getattr(model, "config", None)
        max_length = (model_config if model_config is not None else config).max_output_length

    model.eval()
    predictions = []
    ground_truths = []

    print("\n" + "="*50)
    print("EVALUATION EXAMPLES")
    print("="*50)

    # Evaluation never needs gradients; disable tracking for the whole loop.
    with torch.no_grad():
        for i in range(min(num_samples, len(test_dataset))):
            sample = test_dataset[i]

            # Generate prediction
            pred_text, _ = model.generate(
                sample['input'],
                sample['histories'],
                max_length=max_length
            )

            # Normalize text for comparison
            pred_clean = pred_text.strip().lower()
            truth_clean = sample['output'].strip().lower()

            predictions.append(pred_clean)
            ground_truths.append(truth_clean)

            # Print the first few examples only
            if i < 5:
                print(f"\n--- Example {i+1} ---")
                print(f"Input: {sample['input'][:80]}...")
                print(f"Predicted: '{pred_text}' (Cleaned: '{pred_clean}')")
                print(f"Ground Truth: '{sample['output']}' (Cleaned: '{truth_clean}')")
                print(f"Match: {pred_clean == truth_clean}")

    # Calculate accuracy
    accuracy = accuracy_score(ground_truths, predictions)
    print(f"\n{'='*50}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"{'='*50}")

    return predictions, ground_truths, accuracy


In [31]:
# ==================== Main Execution ====================
def main(seed=42):
    """Run the full demo: build datasets, initialise the model, train, evaluate.

    Reads the module-level ``config`` and ``device``.

    Args:
        seed: Integer used to seed NumPy's global generator and PyTorch before any
            data generation or model initialisation, so repeated runs start from
            the same random state. Pass ``None`` to leave the generators unseeded.

    Returns:
        ``(model, predictions, ground_truths)``.
    """
    print("="*60)
    print("PPlug: Personalized LLM Implementation")
    print("Based on: LLMs + Persona-Plug = Personalized LLMs")
    print("="*60)

    # Seed first: dataset synthesis, parameter init and DataLoader shuffling are all random.
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Load data
    print("\n1. Loading datasets...")
    train_dataset = LaMP_Dataset(task="LaMP-2", split="train", sample_size=config.sample_size)
    test_dataset = LaMP_Dataset(task="LaMP-2", split="test", sample_size=100)
    print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

    # Initialize model
    print("\n2. Initializing PPlug model...")
    model = PPlugModel(config).to(device)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)")

    # Train model
    print("\n3. Training PPlug model...")
    model = train_pplug(model, train_dataset, config)

    # Evaluate model
    print("\n4. Evaluating PPlug model...")
    predictions, ground_truths, accuracy = evaluate_pplug(model, test_dataset)

    print("\n" + "="*60)
    print("Training and Evaluation Complete!")
    print("="*60)

    return model, predictions, ground_truths


In [34]:
# Run the whole pipeline and keep the trained model and the evaluation
# predictions / ground truths in the notebook namespace.
model, predictions, ground_truths = main()

PPlug: Personalized LLM Implementation
Based on: LLMs + Persona-Plug = Personalized LLMs

1. Loading datasets...
Loading LaMP-2 train dataset...
Loading LaMP-2 test dataset...
Train size: 500, Test size: 100

2. Initializing PPlug model...
Total parameters: 358,242,048
Trainable parameters: 1,181,952 (0.33%)

3. Training PPlug model...


Epoch 1/2:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 1 completed. Average loss: 4.1349


Epoch 2/2:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 2 completed. Average loss: 3.6757

4. Evaluating PPlug model...

EVALUATION EXAMPLES

--- Example 1 ---
Input: Movie: Test Film 0. Description: An exciting romance adventure with classic trop...
Predicted: 'PG-13' (Cleaned: 'pg-13')
Ground Truth: 'Romance' (Cleaned: 'romance')
Match: False

--- Example 2 ---
Input: Movie: Test Film 1. Description: An exciting drama adventure with classic tropes...
Predicted: 'Thriller' (Cleaned: 'thriller')
Ground Truth: 'Drama' (Cleaned: 'drama')
Match: False

--- Example 3 ---
Input: Movie: Test Film 2. Description: An exciting thriller adventure with classic tro...
Predicted: 'Thriller' (Cleaned: 'thriller')
Ground Truth: 'Thriller' (Cleaned: 'thriller')
Match: True

--- Example 4 ---
Input: Movie: Test Film 3. Description: An exciting action adventure with classic trope...
Predicted: 'Thriller' (Cleaned: 'thriller')
Ground Truth: 'Action' (Cleaned: 'action')
Match: False

--- Example 5 ---
Input: Movie: Test Film 4. Description: An exciting 