In [None]:
# requirements:
# pip install transformers datasets accelerate evaluate

# Install specific version to avoid compatibility issues
!pip install -q --upgrade transformers==4.45.0 huggingface_hub datasets accelerate evaluate scikit-learn

import os
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ["WANDB_DISABLED"] = "true"

import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

import torch
import torch.nn as nn
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

# ============================================================================
# GPU DETECTION AND CONFIGURATION
# ============================================================================
print("=" * 70)
print("GPU DETECTION AND CONFIGURATION")
print("=" * 70)

# Check if we're in Google Colab
try:
    import google.colab
    IN_COLAB = True
    print("‚úÖ Running in Google Colab")

    # Check GPU type in Colab
    gpu_info = !nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
    if gpu_info:
        print(f"‚úÖ GPU: {gpu_info[0]}")
    else:
        print("‚ùå No GPU detected in Colab")

except:
    IN_COLAB = False
    print("‚ùå Not in Google Colab")

# PyTorch GPU detection
if torch.cuda.is_available():
    device = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9

    print(f"‚úÖ PyTorch GPU: {gpu_name}")
    print(f"‚úÖ GPU Memory: {gpu_memory:.1f} GB")
    print(f"‚úÖ CUDA Version: {torch.version.cuda}")
    print(f"‚úÖ Using device: {device}")

    # Enable performance optimizations
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True  # Optimize for fixed input sizes
    print("‚úÖ GPU optimizations enabled: TF32, CuDNN benchmark")

    # Set GPU memory growth to avoid OOM
    torch.cuda.empty_cache()
    print("‚úÖ GPU cache cleared")

else:
    device = torch.device("cpu")
    print("‚ùå No GPU available - using CPU")
    print("‚ö†Ô∏è  Training will be significantly slower!")

print("=" * 70)

# Backbone checkpoint and optimisation hyper-parameters
MODEL_NAME = "roberta-large"
LEARNING_RATE = 2e-5
MAX_LENGTH = 386  # token length used for truncation / padding

# Per-device batch size: larger when a CUDA device is present
if torch.cuda.is_available():
    BATCH_SIZE = 64
else:
    BATCH_SIZE = 8
EPOCHS = 70

# Early stopping settings (the monitored metric is SMAPE)
EARLY_STOPPING_PATIENCE = 7
EARLY_STOPPING_THRESHOLD = 0.1

# Quick-run switch: when True only the first SAMPLE_SIZE rows are used
TEST_MODE = False
SAMPLE_SIZE = 1000

# ============================================================================
# SMAPE Calculation Functions
# ============================================================================
def smape_loss(y_true, y_pred):
    """Return the mean symmetric absolute percentage error as a torch scalar.

    The result is a fraction (not multiplied by 100). A small epsilon in the
    denominator avoids division by zero when both values are zero.
    """
    abs_error = torch.abs(y_pred - y_true)
    scale = torch.abs(y_true) + torch.abs(y_pred) + 1e-8
    return torch.mean(2 * abs_error / scale)

def smape_metric(y_true, y_pred):
    """Return SMAPE, expressed in percent, for two numpy arrays.

    A small epsilon in the denominator avoids division by zero when both the
    target and the prediction are zero.
    """
    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred) + 1e-8
    return 100 * np.mean(numerator / denominator)

# ============================================================================
# Data Loading and Preparation
# ============================================================================
print("\n" + "=" * 70)
print("DATA LOADING AND PREPARATION")
print("=" * 70)

print("Loading datasets...")
# train.csv provides the rows used for training/validation; test.csv provides
# the rows that are scored for the submission file.
train_df = pd.read_csv("/content/drive/MyDrive/Arun_code/Amazon_Ml_2025/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Arun_code/Amazon_Ml_2025/test.csv")

if TEST_MODE:
    print(f"üöÄ TEST MODE: Using first {SAMPLE_SIZE} samples for quick validation")
    # Keep only the leading SAMPLE_SIZE rows of each frame (as independent copies)
    train_df, test_df = (
        frame.head(SAMPLE_SIZE).copy() for frame in (train_df, test_df)
    )

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Summarise the target column and the text length of the input column
print(f"\nData Overview:")
print(f"Price statistics - Min: ${train_df['price'].min():.2f}, Max: ${train_df['price'].max():.2f}, Mean: ${train_df['price'].mean():.2f}")
print(f"Catalog content length - Avg: {train_df['catalog_content'].str.len().mean():.1f} chars")

# Load the fast tokenizer that matches MODEL_NAME
print("\nLoading RoBERTa tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, local_files_only=False)

# Fallback only: register '<s>' as the CLS token if the tokenizer has none set
if not tokenizer.cls_token:
    tokenizer.add_special_tokens({'cls_token': '<s>'})
print("‚úÖ RoBERTa tokenizer loaded successfully!")

# ============================================================================
# Custom Model with Layer Unfreezing - MODIFIED FOR RoBERTa
# ============================================================================
class RobertaRegressionModel(nn.Module):
    """Pretrained transformer encoder with an MLP head that outputs one scalar.

    All encoder parameters are frozen except those of the last
    ``unfreeze_last_n_layers`` transformer layers; the regression head is
    always trainable. The model itself does not compute a loss.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        unfreeze_last_n_layers: Number of final encoder layers left trainable.
            Values <= 0 keep the whole encoder frozen.
    """

    def __init__(self, model_name, unfreeze_last_n_layers=2):
        super().__init__()

        print(f"\nInitializing RoBERTa model on {device}...")

        # Remember which checkpoint was loaded so the summary reports it accurately
        self.model_name = model_name

        # Load pretrained encoder
        self.roberta = AutoModel.from_pretrained(model_name)

        # Freeze all encoder parameters first
        for param in self.roberta.parameters():
            param.requires_grad = False

        # Then re-enable gradients for the last N transformer layers
        if unfreeze_last_n_layers > 0:
            for layer in self.roberta.encoder.layer[-unfreeze_last_n_layers:]:
                for param in layer.parameters():
                    param.requires_grad = True

        # The head consumes the first-token hidden state rather than the
        # encoder's pooler output (the pooler weights are not part of the
        # pretrained checkpoint and would be randomly initialised).
        self.regressor = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(self.roberta.config.hidden_size, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

        # Print model info
        self._print_model_info(unfreeze_last_n_layers)

    def _print_model_info(self, unfreeze_last_n_layers):
        """Print parameter counts and basic configuration of the model."""
        total_params = sum(param.numel() for param in self.parameters())
        trainable_params = sum(
            param.numel() for param in self.parameters() if param.requires_grad
        )

        print(f"\nüìä MODEL ARCHITECTURE:")
        # Report the checkpoint actually loaded instead of a hard-coded name
        print(f"   Model: {self.model_name}")
        print(f"   Device: {device}")
        print(f"   Unfrozen layers: Last {unfreeze_last_n_layers} transformer layers")
        print(f"   Total parameters: {total_params:,}")
        print(f"   Trainable parameters: {trainable_params:,}")
        print(f"   Percentage trainable: {100 * trainable_params / total_params:.2f}%")
        # The trainer in this notebook optimises MSE; SMAPE is only the
        # metric used to pick the best checkpoint.
        print(f"   Loss function: MSE (SMAPE is used as the model-selection metric)")
        print(f"   Hidden size: {self.roberta.config.hidden_size}")

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and regression head.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            labels: Accepted for interface compatibility but not used here;
                the loss is computed by the caller.

        Returns:
            A dict ``{'logits': tensor}`` with the trailing size-1 dimension
            of the head output squeezed away.
        """
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

        # Hidden state of the first token of every sequence
        cls_output = outputs.last_hidden_state[:, 0, :]

        # Regression prediction
        logits = self.regressor(cls_output).squeeze(-1)

        return {'logits': logits}

# Build the regression model
print("\n" + "=" * 70)
print("MODEL INITIALIZATION")
print("=" * 70)

# Only the last two encoder layers (plus the head) are left trainable
model = RobertaRegressionModel(unfreeze_last_n_layers=2, model_name=MODEL_NAME)

# Place the model's parameters on the selected device
model = model.to(device)
print(f"‚úÖ RoBERTa model successfully moved to {device}")

# ============================================================================
# Dataset Preparation with GPU Optimization
# ============================================================================
def tokenize_fn(example):
    """Tokenize the ``catalog_content`` field of a (batched) example.

    Sequences are truncated and padded to exactly ``MAX_LENGTH`` tokens.
    Returns whatever the module-level ``tokenizer`` returns for that input.
    """
    texts = example["catalog_content"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(texts, **encoding_options)

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
train_pd, val_pd = train_test_split(train_df, test_size=0.2, random_state=42)

print(f"\nüìä DATASET SPLIT:")
print(f"   Training samples: {len(train_pd)}")
print(f"   Validation samples: {len(val_pd)}")

# Wrap each pandas frame in a Hugging Face Dataset
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame) for frame in (train_pd, val_pd, test_df)
)

# Tokenize every split in batches of 32 rows
print("Tokenizing datasets...")
hf_train, hf_val, hf_test = (
    split.map(tokenize_fn, batched=True, batch_size=32)
    for split in (hf_train, hf_val, hf_test)
)

# The regression target must be exposed under the name "labels"
hf_train, hf_val = (
    split.rename_column("price", "labels") for split in (hf_train, hf_val)
)

# Expose only the model inputs (and labels, where present) as torch tensors
cols = ["input_ids", "attention_mask", "labels"]
hf_train.set_format(type="torch", columns=cols)
hf_val.set_format(type="torch", columns=cols)
hf_test.set_format(type="torch", columns=["input_ids", "attention_mask"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(f"‚úÖ Dataset preparation complete:")
print(f"   Training samples: {len(hf_train)}")
print(f"   Validation samples: {len(hf_val)}")
print(f"   Test samples: {len(hf_test)}")

# ============================================================================
# Custom Metrics for Regression with SMAPE Focus
# ============================================================================
def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)`` arrays.

    Returns:
        Dict with keys ``smape`` (percent, the primary metric), ``mse``,
        ``mae``, ``r2`` and ``pearson``. ``pearson`` falls back to 0.0 when
        the correlation cannot be computed or is not finite.
    """
    logits, labels = eval_pred
    # Flatten to 1-D so every metric sees one value per sample
    logits = np.asarray(logits).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # Calculate SMAPE (Primary metric)
    smape = smape_metric(labels, logits)

    # Calculate other metrics for monitoring
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    # Pearson correlation: pearsonr raises ValueError on unusable input (for
    # example fewer than two samples) and yields NaN for constant input; both
    # cases are reported as 0.0 instead of swallowing every exception.
    try:
        pearson_corr = float(pearsonr(logits, labels)[0])
    except ValueError:
        pearson_corr = 0.0
    if not np.isfinite(pearson_corr):
        pearson_corr = 0.0

    return {
        "smape": smape,  # PRIMARY METRIC
        "mse": mse,
        "mae": mae,
        "r2": r2,
        "pearson": pearson_corr,
    }

# ============================================================================
# Custom Trainer with SMAPE Loss and GPU Optimization
# ============================================================================
class _GpuMemoryCallback(TrainerCallback):
    """Trainer callback that prints CUDA memory usage at the end of each epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print current and peak allocated GPU memory, if CUDA is available."""
        if state.epoch is not None and torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1e9
            gpu_memory_max = torch.cuda.max_memory_allocated() / 1e9
            print(f"   GPU Memory: {gpu_memory:.1f}GB (Peak: {gpu_memory_max:.1f}GB)")


class RegressionTrainer(Trainer):
    """Trainer that optimises mean squared error on the model's ``logits``.

    ``on_epoch_end`` is an event of the callback API, not a ``Trainer``
    method, so defining it on the trainer meant it was never invoked. The
    per-epoch GPU memory report therefore lives in a callback that is
    registered automatically on construction.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_callback(_GpuMemoryCallback())

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between model predictions and ``inputs["labels"]``.

        Args:
            model: Model returning a mapping with a ``'logits'`` entry.
            inputs: Batch dict; ``"labels"`` is removed before the forward pass.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted (and ignored) so the override also
                works with Trainer versions that pass this argument.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs['logits']

        # Reduce logits to one value per sample
        if logits.dim() > 1:
            logits = logits.squeeze(-1) if logits.size(-1) == 1 else logits[:, 0]

        # Compare in float32 so half-precision logits or float64 labels
        # cannot cause a dtype mismatch
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits.float(), labels.float())
        return (loss, outputs) if return_outputs else loss

# ============================================================================
# Training Arguments with GPU Optimization
# ============================================================================
training_args = TrainingArguments(
    output_dir="./roberta-product-pricing-test" if TEST_MODE else "./roberta-product-pricing",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    # Evaluate and checkpoint once per epoch. `eval_strategy` is the current
    # name of the deprecated `evaluation_strategy` argument. Step-based
    # `eval_steps` / `save_steps` are intentionally not set: they have no
    # effect when both strategies are "epoch".
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Best checkpoint = lowest validation SMAPE
    metric_for_best_model="smape",
    load_best_model_at_end=True,
    greater_is_better=False,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10 if TEST_MODE else 50,
    # The string "none" disables all logging integrations; passing None
    # selects the library default (every installed integration) instead.
    report_to="none",
    remove_unused_columns=False,
    save_safetensors=False,

    # GPU OPTIMIZATIONS
    fp16=torch.cuda.is_available(),  # Mixed precision for GPU
    dataloader_pin_memory=True,      # Faster data transfer to GPU
    dataloader_num_workers=2 if torch.cuda.is_available() else 0,
    dataloader_prefetch_factor=2 if torch.cuda.is_available() else None,

    # Gradient optimizations
    gradient_accumulation_steps=1,
    warmup_steps=100 if TEST_MODE else 500,
)

# ============================================================================
# Initialize Trainer with Early Stopping
# ============================================================================
# Early stopping is configured from the patience / threshold constants above
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=EARLY_STOPPING_THRESHOLD,
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
)

# Wire model, data, metrics and callbacks into the custom trainer
trainer = RegressionTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    compute_metrics=compute_metrics_for_regression,
    callbacks=[early_stopping_callback],
)

# ============================================================================
# Training with GPU Monitoring
# ============================================================================
print("\n" + "=" * 70)
# The conditional is parenthesised so that only the suffix depends on
# TEST_MODE; without the parentheses the whole banner was replaced by an
# empty string whenever TEST_MODE was False.
print("üöÄ STARTING RoBERTa TRAINING" + (" (TEST MODE)" if TEST_MODE else ""))
print("=" * 70)
print(f"üéØ PRIMARY METRIC: SMAPE")
print(f"‚ö° DEVICE: {device}")
print(f"üìä Training samples: {len(hf_train)}")
print(f"üìä Validation samples: {len(hf_val)}")
print(f"üî¢ Batch size: {BATCH_SIZE} (per device) - Reduced for RoBERTa-large")
print(f"üîÑ Epochs: {EPOCHS}")
print(f"üí° Mixed Precision: {training_args.fp16}")
print(f"üìà Early stopping: {EARLY_STOPPING_PATIENCE} epochs")

if torch.cuda.is_available():
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

try:
    # Clear GPU cache before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("‚úÖ GPU cache cleared before training")

    train_results = trainer.train()

    # Save model weights and tokenizer files side by side
    print("\nüíæ Saving model...")
    model_save_path = "/content/drive/MyDrive/Arun_code/Amazon_Ml_2025/roberta-product-pricing-test" if TEST_MODE else "/content/drive/MyDrive/Colab Notebooks/Amazon_Ml_2025/roberta-product-pricing-final"
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"‚úÖ RoBERTa model saved to: {model_save_path}")

except Exception as e:
    # Report the failure (with GPU memory state) and re-raise unchanged
    print(f"\n‚ùå Training error: {e}")
    if torch.cuda.is_available():
        print(f"üíæ GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.1f}GB")
    raise

# ============================================================================
# Prediction and Evaluation
# ============================================================================
print("\n" + "=" * 70)
print("PREDICTION AND EVALUATION")
print("=" * 70)

print("Making predictions...")
test_predictions = trainer.predict(hf_test)
# Flatten to one value per row and floor at 0.01 so every price is positive
predicted_prices = np.maximum(test_predictions.predictions.flatten(), 0.01)

# Build the submission frame: one predicted price per sample_id
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': predicted_prices
})

submission_path = "/content/drive/MyDrive/Arun_code/Amazon_Ml_2025/submission_roberta_test.csv" if TEST_MODE else "/content/drive/MyDrive/Colab Notebooks/Amazon_Ml_2025/submission_roberta.csv"
submission_df.to_csv(submission_path, index=False)
print(f"‚úÖ RoBERTa submission saved: {submission_path}")

# Re-score the validation split with the final model
print("\nüìä FINAL VALIDATION METRICS:")
val_predictions = trainer.predict(hf_val)
val_metrics = compute_metrics_for_regression(
    (val_predictions.predictions, val_predictions.label_ids)
)

print(f"üéØ SMAPE: {val_metrics['smape']:.2f}%")
print(f"üìà MAE: ${val_metrics['mae']:.2f}")
print(f"üìä R¬≤: {val_metrics['r2']:.4f}")
print(f"üîó Pearson: {val_metrics['pearson']:.4f}")

# Report peak and current allocated GPU memory when CUDA is in use
if torch.cuda.is_available():
    print(f"\nüíæ GPU MEMORY SUMMARY:")
    print(f"   Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.1f}GB")
    print(f"   Current memory usage: {torch.cuda.memory_allocated() / 1e9:.1f}GB")

print("\n‚úÖ RoBERTa TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 70)

GPU DETECTION AND CONFIGURATION
‚úÖ Running in Google Colab
‚úÖ GPU: NVIDIA A100-SXM4-40GB, 40960 MiB
‚úÖ PyTorch GPU: NVIDIA A100-SXM4-40GB
‚úÖ GPU Memory: 42.5 GB
‚úÖ CUDA Version: 12.6
‚úÖ Using device: cuda
‚úÖ GPU optimizations enabled: TF32, CuDNN benchmark
‚úÖ GPU cache cleared

DATA LOADING AND PREPARATION
Loading datasets...
Training data shape: (75000, 4)
Test data shape: (75000, 3)

Data Overview:
Price statistics - Min: $0.13, Max: $2796.00, Mean: $23.65
Catalog content length - Avg: 908.9 chars

Loading RoBERTa tokenizer...




‚úÖ RoBERTa tokenizer loaded successfully!

MODEL INITIALIZATION

Initializing RoBERTa model on cuda...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìä MODEL ARCHITECTURE:
   Model: RoBERTa-large
   Device: cuda
   Unfrozen layers: Last 2 transformer layers
   Total parameters: 355,663,873
   Trainable parameters: 25,496,577
   Percentage trainable: 7.17%
   Loss function: SMAPE (competition metric)
   Hidden size: 1024
‚úÖ RoBERTa model successfully moved to cuda

üìä DATASET SPLIT:
   Training samples: 60000
   Validation samples: 15000
Tokenizing datasets...


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/75000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


‚úÖ Dataset preparation complete:
   Training samples: 60000
   Validation samples: 15000
   Test samples: 75000


üéØ PRIMARY METRIC: SMAPE
‚ö° DEVICE: cuda
üìä Training samples: 60000
üìä Validation samples: 15000
üî¢ Batch size: 64 (per device) - Reduced for RoBERTa-large
üîÑ Epochs: 70
üí° Mixed Precision: True
üìà Early stopping: 7 epochs
üéÆ GPU: NVIDIA A100-SXM4-40GB
üíæ GPU Memory: 42.5 GB
‚úÖ GPU cache cleared before training


Epoch,Training Loss,Validation Loss,Smape,Mse,Mae,R2,Pearson
1,1269.8089,1710.490967,76.948967,1710.490967,17.391657,-0.133372,-0.002908
2,760.4525,1415.008301,63.190819,1415.008179,14.833452,0.062415,0.33511
3,691.9344,1265.112183,59.144348,1265.112305,13.701827,0.161736,0.435843
4,656.5656,1182.890137,57.99192,1182.890137,13.230284,0.216216,0.477398
5,898.1529,1139.463501,55.983829,1139.463501,12.488066,0.244991,0.511365
6,653.938,1095.244385,56.484688,1095.244385,12.597808,0.27429,0.525517
7,546.665,1059.977539,57.738258,1059.977539,12.704295,0.297658,0.546925
8,797.6248,1037.53186,54.950768,1037.53186,11.922587,0.312531,0.561369
9,401.4019,1030.128052,54.925694,1030.128052,12.171506,0.317437,0.563478
10,511.3562,1019.370422,53.948837,1019.370483,12.144215,0.324565,0.574141



üíæ Saving model...
‚úÖ RoBERTa model saved to: /content/drive/MyDrive/Colab Notebooks/Amazon_Ml_2025/roberta-product-pricing-final

PREDICTION AND EVALUATION
Making predictions...


‚úÖ RoBERTa submission saved: /content/drive/MyDrive/Colab Notebooks/Amazon_Ml_2025/submission_roberta.csv

üìä FINAL VALIDATION METRICS:


üéØ SMAPE: 49.65%
üìà MAE: $10.62
üìä R¬≤: 0.3616
üîó Pearson: 0.6061

üíæ GPU MEMORY SUMMARY:
   Peak memory usage: 6.3GB
   Current memory usage: 3.6GB

‚úÖ RoBERTa TRAINING COMPLETED SUCCESSFULLY!
