<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/paper_implement/dev/2_mrinal_mtech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install -q transformers datasets sentence-transformers torch accelerate

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

In [3]:
from sentence_transformers import SentenceTransformer
import json
from datasets import load_dataset
from tqdm.auto import tqdm
import numpy as np
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Pick the compute device once: the GPU when PyTorch can see one, the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [48]:
# ==================== Configuration ====================
class Config:
    """Central collection of hyperparameters for the PPlug demo.

    All settings are plain class attributes, so they can be read either from
    the class itself or from an instance created with ``Config()``.
    """

    # ---- Pretrained checkpoints (Hugging Face hub identifiers) ----
    llm_name = "google/flan-t5-base"          # generator whose weights are frozen by PPlugModel
    encoder_name = "BAAI/bge-base-en-v1.5"    # text encoder used for histories and inputs

    # ---- Optimisation ----
    batch_size = 8
    learning_rate = 0.005                     # single LR shared by every trainable parameter
    num_epochs = 5
    warmup_ratio = 0.1                        # fraction of all optimizer steps used for LR warm-up
    max_input_length = 256                    # LLM tokenizer truncation length for inputs
    max_encoder_length = 512                  # not referenced by the code visible in this notebook
    max_output_length = 128                   # LLM tokenizer truncation length for targets / generation cap

    # ---- PPlug prefix geometry ----
    embedding_dim = 768                       # input width of the aggregator's projector
    llm_hidden_size = 768                     # width of the prefix vectors concatenated with LLM token embeddings
    num_personal_tokens = 1                   # number of learned instruction-prefix vectors

    # ---- Data ----
    max_histories = 5                         # at most this many history items are encoded per sample
    sample_size = 1_000                       # number of synthetic training samples


In [49]:
# Shared settings object; `main` and `evaluate_pplug` read it as a module-level global.
config = Config()

In [50]:
# ==================== Data Loading ====================
class LaMP_Dataset:
    """Synthetic stand-in for a LaMP movie-tagging dataset.

    No real LaMP data is loaded: every sample is generated locally. ``task``
    and ``split`` are only recorded and echoed in the log line; they do not
    change what is generated.

    Each sample is a dict with keys ``'user_id'``, ``'input'``, ``'output'``
    and ``'histories'`` (a list of ``{'text', 'label'}`` dicts).

    NOTE(review): the generated input sentence contains the target genre word,
    so the label can be read directly from the input; histories are drawn per
    sample, not per ``user_id``. Results on this data say little about
    personalization quality.
    """

    def __init__(self, task="LaMP-2", split="train", sample_size=500, seed=None):
        """
        Build ``sample_size`` synthetic samples.

        Args:
            task: Task label, used for logging only.
            split: Split label, used for logging only.
            sample_size: Number of samples to generate (0 yields an empty dataset).
            seed: Optional integer. When given, generation uses a private
                ``np.random.RandomState(seed)`` so the dataset is reproducible
                and the global NumPy random state is left untouched. When
                ``None`` (default) the global ``np.random`` state is used, as
                before.

        For real data see: https://lamp-benchmark.github.io/download
        """
        print(f"Loading {task} {split} dataset...")

        self.task = task
        self.split = split
        self.data = self._create_demo_data(sample_size, seed=seed)

    def _create_demo_data(self, sample_size, seed=None):
        """Generate the synthetic samples, each with an explicit task instruction.

        Returns:
            list[dict]: one dict per sample (see class docstring for the keys).
        """
        # A private generator makes runs repeatable without reseeding NumPy globally.
        rng = np.random if seed is None else np.random.RandomState(seed)
        movie_genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi', 'Romance']

        data = []
        for i in range(sample_size):
            # `choice` returns np.str_; convert so labels are plain Python strings.
            selected_genre = str(rng.choice(movie_genres))

            # The instruction prefix tells the LLM this is a classification task.
            current_movie = (
                f"Classify the genre of this movie. "
                f"Title: Test Film {i}. "
                f"Description: An exciting {selected_genre.lower()} story."
            )

            # 2-4 history items (randint's upper bound is exclusive).
            histories = []
            for j in range(int(rng.randint(2, 5))):
                # Roughly 70% of history items share the target genre; the rest are random.
                if rng.random_sample() > 0.3:
                    hist_genre = selected_genre
                else:
                    hist_genre = str(rng.choice(movie_genres))

                histories.append({
                    'text': f"Movie: Old Film {j}. Description: A typical {hist_genre.lower()} movie.",
                    'label': hist_genre
                })

            data.append({
                'user_id': f"user_{i % 50}",
                'input': current_movie,
                'output': selected_genre,
                'histories': histories
            })

        return data

    def __len__(self):
        """Number of generated samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample dict at position ``idx``."""
        return self.data[idx]

In [51]:
# ==================== User Behavior Encoder ====================
class UserBehaviorEncoder(nn.Module):
    """
    Two-tower text encoder built on ``AutoModel``.

    - History encoder: parameters frozen, always run without gradients and
      always kept in eval mode.
    - Input encoder: trainable (fine-tuned); gradients flow through it.

    Both towers start from the same checkpoint and share one tokenizer.
    """

    def __init__(self, encoder_name, max_length=512):
        """
        Args:
            encoder_name: Hugging Face checkpoint name loaded for both towers.
            max_length: Tokenizer truncation length used for every encode call.
        """
        super().__init__()
        # Two separate instances so that one can be frozen while the other is tuned.
        self.history_encoder = AutoModel.from_pretrained(encoder_name)
        self.input_encoder = AutoModel.from_pretrained(encoder_name)
        self.tokenizer = AutoTokenizer.from_pretrained(encoder_name)
        self.max_length = max_length

        # Freeze the history tower completely.
        for param in self.history_encoder.parameters():
            param.requires_grad = False
        self.history_encoder.eval()

        # The input tower keeps requires_grad=True (the default).

    def train(self, mode=True):
        """Switch train/eval mode, but keep the frozen history tower in eval mode.

        Without this override a parent ``model.train()`` call would re-enable
        dropout inside the frozen encoder and make its embeddings stochastic.
        """
        super().train(mode)
        self.history_encoder.eval()
        return self

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over non-padding positions.

        Args:
            model_output: Encoder output whose first element holds the
                per-token embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding.

        NOTE(review): the BGE model card describes CLS pooling for its
        checkpoints; confirm that mean pooling is the intended choice here.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp avoids a division by zero for a row whose mask is all zeros.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _embed(self, encoder, texts):
        """Tokenize ``texts``, run ``encoder``, mean-pool and L2-normalize the result."""
        encoded_input = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        ).to(encoder.device)

        model_output = encoder(**encoded_input)
        embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        return torch.nn.functional.normalize(embeddings, p=2, dim=1)

    def encode_histories(self, texts):
        """Encode history texts with the frozen tower; no gradients are recorded."""
        with torch.no_grad():
            return self._embed(self.history_encoder, texts)

    def encode_input(self, texts):
        """Encode input texts with the trainable tower; gradients are recorded."""
        return self._embed(self.input_encoder, texts)


In [52]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

# ==================== Input-aware Personal Aggregator ====================
class PersonalAggregator(nn.Module):
    """Input-aware attention pooling of history embeddings into one LLM-space vector."""

    def __init__(self, embedding_dim, llm_hidden_size):
        super().__init__()
        # Two-layer MLP that maps encoder-space vectors into the LLM hidden space.
        self.projector = nn.Sequential(
            nn.Linear(embedding_dim, llm_hidden_size),
            nn.ReLU(),
            nn.Linear(llm_hidden_size, llm_hidden_size),
        )

    def forward(self, history_embeddings, input_embedding):
        """
        Pool the histories, weighting each one by its similarity to the input.

        Args:
            history_embeddings: [num_histories, embedding_dim]
            input_embedding: [embedding_dim]
        Returns:
            (personal_embedding, weights) where personal_embedding is
            [llm_hidden_size] and weights is [num_histories].
        """
        # Work on a detached copy: no gradient ever reaches the history embeddings.
        frozen_histories = history_embeddings.clone().detach()

        # w_i = softmax_i(h_i . x): dot-product relevance of each history to the input.
        relevance = frozen_histories @ input_embedding
        weights = relevance.softmax(dim=0)

        # P_u = sum_i w_i * Proj(h_i)
        projected = self.projector(frozen_histories)
        personal_embedding = (weights.unsqueeze(1) * projected).sum(dim=0)

        return personal_embedding, weights


In [53]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

In [54]:


# ==================== PPlug Model ====================
class PPlugModel(nn.Module):
    """Frozen T5 model steered by a learned instruction prefix and a per-sample personal embedding.

    The encoder input fed to the LLM is
    ``[instruction tokens | personal token | input token embeddings]``.
    Trainable parts: the instruction embedding, the personal aggregator and the
    input tower of the behavior encoder. The LLM weights are frozen.
    """

    def __init__(self, config):
        """
        Args:
            config: Settings object providing ``llm_name``, ``encoder_name``,
                ``embedding_dim``, ``llm_hidden_size``, ``num_personal_tokens``,
                ``max_histories``, ``max_input_length`` and ``max_output_length``.
        """
        super().__init__()

        # Load the LLM and freeze every one of its parameters.
        self.llm = T5ForConditionalGeneration.from_pretrained(config.llm_name)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(config.llm_name)

        for param in self.llm.parameters():
            param.requires_grad = False

        # User behavior encoder (frozen history tower + trainable input tower).
        self.behavior_encoder = UserBehaviorEncoder(config.encoder_name)

        # Personal aggregator (trainable).
        self.personal_aggregator = PersonalAggregator(
            config.embedding_dim,
            config.llm_hidden_size
        )

        # Learned instruction prefix; small initial scale for stability.
        self.instruction_embedding = nn.Parameter(
            torch.randn(1, config.num_personal_tokens, config.llm_hidden_size) * 0.01
        )

        self.config = config

    def get_personal_embedding(self, histories, current_input):
        """Build the personal embedding for one sample.

        Args:
            histories: List of dicts with a ``'text'`` entry; only the first
                ``config.max_histories`` items are used.
            current_input: The input string of the sample.
        Returns:
            (personal_embedding, attention_weights) as produced by the
            personal aggregator.
        """
        history_texts = [h['text'] for h in histories[:self.config.max_histories]]
        if not history_texts:
            # Placeholder so that a user without history still yields one embedding.
            history_texts = ["Empty"]

        # Frozen path.
        history_embeddings = self.behavior_encoder.encode_histories(history_texts)

        # Trainable path: encode a one-element batch and take its single row.
        input_embedding = self.behavior_encoder.encode_input([current_input])[0]

        # Aggregate on the device that holds this module's own parameters.
        target_device = self.instruction_embedding.device
        personal_embedding, attention_weights = self.personal_aggregator(
            history_embeddings.to(target_device),
            input_embedding.to(target_device)
        )

        return personal_embedding, attention_weights

    def _prepend_prefix(self, input_ids, attention_mask, personal_embeds):
        """Prepend the instruction and personal vectors to the token embeddings and extend the mask.

        Args:
            input_ids: [batch, seq] token ids from the LLM tokenizer.
            attention_mask: [batch, seq] mask belonging to ``input_ids``.
            personal_embeds: [batch, 1, llm_hidden_size] personal vectors.
        Returns:
            (final_embeds, final_mask) with the prefix positions marked as attended.
        """
        batch_size = input_ids.size(0)
        token_embeds = self.llm.encoder.embed_tokens(input_ids)
        instruction_embeds = self.instruction_embedding.expand(batch_size, -1, -1)

        final_embeds = torch.cat([instruction_embeds, personal_embeds, token_embeds], dim=1)

        # Create the prefix mask with the tokenizer mask's dtype and device so the
        # concatenation does not silently promote the attention mask to float.
        num_prefix = instruction_embeds.size(1) + personal_embeds.size(1)
        prefix_mask = torch.ones(
            batch_size, num_prefix,
            dtype=attention_mask.dtype, device=attention_mask.device
        )
        final_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        return final_embeds, final_mask

    def forward(self, batch):
        """Compute the training loss for a collated batch.

        Args:
            batch: Dict with parallel lists under ``'input'``, ``'output'`` and
                ``'histories'``.
        Returns:
            (loss, logits) from the LLM.
        """
        device = self.instruction_embedding.device

        # 1. Tokenize the inputs.
        tokenized_inputs = self.llm_tokenizer(
            batch['input'],
            max_length=self.config.max_input_length,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        # 2. Tokenize the targets; padding positions are excluded from the loss via -100.
        labels = self.llm_tokenizer(
            batch['output'],
            max_length=self.config.max_output_length,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).input_ids.to(device)
        labels[labels == self.llm_tokenizer.pad_token_id] = -100

        # 3. One personal embedding per sample -> [batch, 1, hidden].
        personal_embeds = torch.stack([
            self.get_personal_embedding(histories, text)[0]
            for histories, text in zip(batch['histories'], batch['input'])
        ]).unsqueeze(1)

        # 4. Prefix + mask.
        final_embeds, final_mask = self._prepend_prefix(
            tokenized_inputs.input_ids,
            tokenized_inputs.attention_mask,
            personal_embeds
        )

        # 5. Forward through the frozen LLM.
        outputs = self.llm(
            inputs_embeds=final_embeds,
            attention_mask=final_mask,
            labels=labels,
            return_dict=True
        )

        return outputs.loss, outputs.logits

    @torch.no_grad()
    def generate(self, input_text, histories, max_length=50):
        """Generate a prediction for a single input.

        The whole method runs without gradient tracking, so the trainable input
        encoder does not build an autograd graph at inference time.

        Args:
            input_text: Input string.
            histories: History dicts of the user (see ``get_personal_embedding``).
            max_length: Maximum length passed to ``llm.generate``.
        Returns:
            (decoded_text, attention_weights)
        """
        device = self.instruction_embedding.device
        personal_emb, attention_weights = self.get_personal_embedding(histories, input_text)

        tokenized = self.llm_tokenizer(
            input_text,
            max_length=self.config.max_input_length,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        # [hidden] -> [1, 1, hidden] to match the batch-of-one token embeddings.
        final_embeds, final_mask = self._prepend_prefix(
            tokenized.input_ids,
            tokenized.attention_mask,
            personal_emb.unsqueeze(0).unsqueeze(0)
        )

        output_ids = self.llm.generate(
            inputs_embeds=final_embeds,
            attention_mask=final_mask,
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

        return self.llm_tokenizer.decode(output_ids[0], skip_special_tokens=True), attention_weights


In [55]:
# ==================== Training ====================
def collate_fn(batch):
    """Turn a list of sample dicts into one dict of parallel lists (no tensor conversion)."""
    fields = ('input', 'histories', 'user_id', 'output')
    return {field: [sample[field] for sample in batch] for field in fields}


In [56]:
def train_pplug(model, train_dataset, config):
    """Train the PPlug model in place and return it.

    Args:
        model: Module whose call ``model(batch)`` returns ``(loss, logits)``.
        train_dataset: Dataset of sample dicts accepted by ``collate_fn``.
        config: Settings providing ``batch_size``, ``learning_rate``,
            ``num_epochs`` and ``warmup_ratio``.
    Returns:
        The same ``model`` object after training.
    """
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )

    # Optimize (and clip) only the parameters that are actually trainable.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=config.learning_rate)

    # Linear warm-up followed by linear decay over the full run.
    num_training_steps = len(train_loader) * config.num_epochs
    num_warmup_steps = int(num_training_steps * config.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    model.train()

    for epoch in range(config.num_epochs):
        epoch_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}")

        # `step` counts batches within the current epoch, so the running average
        # below divides the epoch's loss sum by the epoch's own batch count.
        for step, batch in enumerate(progress_bar, start=1):
            loss, _ = model(batch)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
            optimizer.step()
            scheduler.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss

            progress_bar.set_postfix({
                'loss': f'{batch_loss:.4f}',
                'avg_loss': f'{epoch_loss/step:.4f}'
            })

        # max(..., 1) keeps the summary line from dividing by zero on an empty loader.
        print(f"\nEpoch {epoch+1} completed. Average loss: {epoch_loss/max(len(train_loader), 1):.4f}")

    return model



In [57]:
# ==================== Evaluation ====================
def evaluate_pplug(model, test_dataset, num_samples=10):
    """Evaluate the model on the first ``num_samples`` test items by exact text match.

    Predictions and targets are compared after stripping whitespace and
    lower-casing.

    Args:
        model: Object exposing ``eval()`` and
            ``generate(input_text, histories, max_length=...)`` returning
            ``(text, attention_weights)``.
        test_dataset: Indexable dataset of dicts with ``'input'``,
            ``'histories'`` and ``'output'``.
        num_samples: Maximum number of items to evaluate.
    Returns:
        (predictions, ground_truths, accuracy); accuracy is 0.0 when no item
        was evaluated.
    """
    model.eval()
    predictions = []
    ground_truths = []

    # Prefer the settings the model was built with; fall back to the notebook-level
    # `config` only when the model does not carry a usable value.
    max_length = getattr(getattr(model, "config", None), "max_output_length", None)
    if max_length is None:
        max_length = config.max_output_length

    print("\n" + "="*50)
    print("EVALUATION EXAMPLES")
    print("="*50)

    # Inference only: make sure no autograd graph is built anywhere in generate().
    with torch.no_grad():
        for i in range(min(num_samples, len(test_dataset))):
            sample = test_dataset[i]

            pred_text, attention_weights = model.generate(
                sample['input'],
                sample['histories'],
                max_length=max_length
            )

            # Normalize both sides before comparing.
            pred_clean = pred_text.strip().lower()
            truth_clean = sample['output'].strip().lower()

            predictions.append(pred_clean)
            ground_truths.append(truth_clean)

            # Show the first few examples.
            if i < 5:
                print(f"\n--- Example {i+1} ---")
                print(f"Input: {sample['input'][:80]}...")
                print(f"Predicted: '{pred_text}' (Cleaned: '{pred_clean}')")
                print(f"Ground Truth: '{sample['output']}' (Cleaned: '{truth_clean}')")
                print(f"Match: {pred_clean == truth_clean}")

    # Skip the metric call when nothing was evaluated (empty dataset or num_samples <= 0).
    accuracy = accuracy_score(ground_truths, predictions) if predictions else 0.0
    print(f"\n{'='*50}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"{'='*50}")

    return predictions, ground_truths, accuracy


In [58]:
# ==================== Main Execution ====================
def main():
    """Run the demo end to end: build data, build the model, train, evaluate.

    Reads the module-level ``config`` and ``device``.

    Returns:
        (model, predictions, ground_truths)
    """
    banner = "="*60

    print(banner)
    print("PPlug: Personalized LLM Implementation")
    print("Based on: LLMs + Persona-Plug = Personalized LLMs")
    print(banner)

    # Step 1: synthetic train / test sets.
    print("\n1. Loading datasets...")
    train_dataset = LaMP_Dataset(task="LaMP-2", split="train", sample_size=config.sample_size)
    test_dataset = LaMP_Dataset(task="LaMP-2", split="test", sample_size=100)
    print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

    # Step 2: model on the selected device.
    print("\n2. Initializing PPlug model...")
    model = PPlugModel(config).to(device)

    # Report how much of the model is actually trained.
    total_params = 0
    trainable_params = 0
    for p in model.parameters():
        count = p.numel()
        total_params += count
        if p.requires_grad:
            trainable_params += count
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)")

    # Step 3: training.
    print("\n3. Training PPlug model...")
    model = train_pplug(model, train_dataset, config)

    # Step 4: evaluation (the accuracy is printed by evaluate_pplug itself).
    print("\n4. Evaluating PPlug model...")
    predictions, ground_truths, _ = evaluate_pplug(model, test_dataset)

    print("\n" + banner)
    print("Training and Evaluation Complete!")
    print(banner)

    return model, predictions, ground_truths


In [59]:
# Run the whole pipeline and keep the trained model and evaluation results for inspection.
model, predictions, ground_truths = main()

PPlug: Personalized LLM Implementation
Based on: LLMs + Persona-Plug = Personalized LLMs

1. Loading datasets...
Loading LaMP-2 train dataset...
Loading LaMP-2 test dataset...
Train size: 1000, Test size: 100

2. Initializing PPlug model...
Total parameters: 467,724,288
Trainable parameters: 110,664,192 (23.66%)

3. Training PPlug model...


Epoch 1/5:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 1 completed. Average loss: 0.6712


Epoch 2/5:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 2 completed. Average loss: 0.3481


Epoch 3/5:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 3 completed. Average loss: 0.1820


Epoch 4/5:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 4 completed. Average loss: 0.1403


Epoch 5/5:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 5 completed. Average loss: 0.1003

4. Evaluating PPlug model...

EVALUATION EXAMPLES

--- Example 1 ---
Input: Classify the genre of this movie. Title: Test Film 0. Description: An exciting d...
Predicted: 'Drama' (Cleaned: 'drama')
Ground Truth: 'Drama' (Cleaned: 'drama')
Match: True

--- Example 2 ---
Input: Classify the genre of this movie. Title: Test Film 1. Description: An exciting d...
Predicted: 'Drama' (Cleaned: 'drama')
Ground Truth: 'Drama' (Cleaned: 'drama')
Match: True

--- Example 3 ---
Input: Classify the genre of this movie. Title: Test Film 2. Description: An exciting d...
Predicted: 'Drama' (Cleaned: 'drama')
Ground Truth: 'Drama' (Cleaned: 'drama')
Match: True

--- Example 4 ---
Input: Classify the genre of this movie. Title: Test Film 3. Description: An exciting a...
Predicted: 'Action' (Cleaned: 'action')
Ground Truth: 'Action' (Cleaned: 'action')
Match: True

--- Example 5 ---
Input: Classify the genre of this movie. Title: Test Film 4. Description: An exci