In [None]:
# Install required packages with progress tracking
%pip install transformers datasets torch torchvision torchaudio
%pip install scikit-learn umap-learn hdbscan
%pip install structlog fastapi uvicorn
%pip install google-generativeai
%pip install nltk pandas numpy
%pip install tqdm  # For beautiful progress bars

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
# Stage the project code: copy it from the attached Kaggle dataset into the
# writable working directory, make it importable, then report GPU availability.
import os
import sys
import shutil

# Copy the code from the dataset to working directory
dataset_path = '/kaggle/input/flipkart-grid-cadence'  # Update this path
working_dir = '/kaggle/working'

if os.path.exists(dataset_path):
    # Copy every top-level entry; directories are merged into any existing copy
    # (dirs_exist_ok=True) so the cell can be re-run without failing.
    for item in os.listdir(dataset_path):
        src = os.path.join(dataset_path, item)
        dst = os.path.join(working_dir, item)
        if os.path.isdir(src):
            shutil.copytree(src, dst, dirs_exist_ok=True)
        else:
            shutil.copy2(src, dst)

    print("✅ Code copied successfully")
else:
    print("❌ Dataset not found. Please add the GitHub repo as a dataset.")

# Add to Python path. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if working_dir not in sys.path:
    sys.path.append(working_dir)

# Check GPU availability
import torch
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device 0 is the only device queried; total_memory is in bytes, shown in GB.
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


In [None]:
# Set up environment variables and the output directories used by the run.
import getpass
import os

# SECURITY: the API key must never be written into the notebook source.
# A key that was previously committed here should be treated as leaked and revoked.
# The key is taken from the existing environment if already set (for example,
# populated from a secrets store by an earlier cell); otherwise it is requested
# through a hidden prompt so it never appears in the source or in cell output.
if not os.environ.get('GEMINI_API_KEY'):
    try:
        entered_key = getpass.getpass('Enter GEMINI_API_KEY (input hidden): ').strip()
    except (EOFError, NotImplementedError):
        # No interactive stdin is available (e.g. a non-interactive batch run).
        entered_key = ''
    if entered_key:
        os.environ['GEMINI_API_KEY'] = entered_key
    else:
        print("⚠️ GEMINI_API_KEY is not set; anything that calls the Gemini API will fail until it is provided.")

# Create necessary directories (exist_ok=True keeps the cell safe to re-run)
for output_dir in (
    '/kaggle/working/models',
    '/kaggle/working/processed_data',
    '/kaggle/working/logs',
):
    os.makedirs(output_dir, exist_ok=True)

print("✅ Environment setup complete")


In [None]:
# Import the training entry point, configure structured logging, and create the trainer.
import logging
import sys

from training.train_models import CADENCETrainer
import structlog

# structlog.stdlib.LoggerFactory hands every rendered event to the standard
# `logging` module, whose root logger defaults to the WARNING level and has no
# handler attached. Without this call the logger.info(...) progress messages
# in this notebook would be dropped. basicConfig does nothing when the
# root logger already has handlers, so logging configured by imported project
# code is left untouched.
logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.INFO)

# Configure logging: ISO timestamps, human-readable console rendering,
# output delivered through the standard library logger.
structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.dev.ConsoleRenderer()
    ],
    logger_factory=structlog.stdlib.LoggerFactory(),
    cache_logger_on_first_use=True,
)

logger = structlog.get_logger()
logger.info("🚀 Starting CADENCE training on Kaggle GPU...")

# Initialize trainer (constructed with its default arguments)
trainer = CADENCETrainer()
logger.info("✅ Trainer initialized")


In [None]:
# Build the training datasets with a sample budget chosen for a Kaggle GPU session.
logger.info("📊 Preparing training data (100K samples for comprehensive training)...")

# The budget is named once so it is easy to find and tune.
sample_budget = 100_000  # 100K samples
training_data = trainer.prepare_data(max_samples=sample_budget)
logger.info("✅ Training data prepared")

# Report the size of each prepared artifact.
print(f"Query dataset size: {len(training_data['query_data'])}")
print(f"Catalog dataset size: {len(training_data['catalog_data'])}")
print(f"Vocabulary size: {len(training_data['vocab'])}")


In [None]:
# Run the enhanced CADENCE training on the prepared data and keep the returned result.
logger.info("🧠 Training enhanced CADENCE model (5 epochs with GPU acceleration)...")

# Training options gathered in one place; unpacked as keyword arguments below.
training_options = {
    'epochs': 5,  # More epochs for better training
    'save_name': 'kaggle_enhanced_cadence_production',
}
result = trainer.train_enhanced_models(training_data, **training_options)

logger.info("✅ Enhanced model training completed successfully!")
logger.info("🎉 Model saved as 'kaggle_enhanced_cadence_production'")
