# In [None]:
"""
MODEL TRAINING NOTEBOOK
=======================
This notebook trains the multimodal regression model
"""

# ============================================================================
# CELL 1: SETUP AND IMPORTS
# ============================================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from config import Config
from model_architecture import MultimodalFusionModel, TabularOnlyModel, get_model
from dataset import MultimodalDataset, TabularOnlyDataset, get_train_transforms, get_val_transforms
from trainer import RealEstateTrainer, predict

# Seed the PyTorch and NumPy random number generators from the shared
# configuration value so repeated runs draw the same random numbers.
seed = Config.RANDOM_SEED
torch.manual_seed(seed)
np.random.seed(seed)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Print the active configuration next to the run output.
Config.print_config()

# ============================================================================
# CELL 2: LOAD PROCESSED DATA
# ============================================================================
print("\n" + "="*70)
print("LOADING PROCESSED DATA")
print("="*70)

# Both processed CSVs live in the same 'processed' sub-directory of DATA_DIR.
processed_dir = Config.DATA_DIR / 'processed'
train_df = pd.read_csv(processed_dir / 'train_processed.csv')
test_df = pd.read_csv(processed_dir / 'test_processed.csv')

print(f"\nTrain shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Keep only the rows whose 'image_path' is not null, to report image coverage.
train_has_image = train_df['image_path'].notna()
test_has_image = test_df['image_path'].notna()
train_with_images = train_df.loc[train_has_image]
test_with_images = test_df.loc[test_has_image]

print(f"\nTrain samples with images: {len(train_with_images)}/{len(train_df)}")
print(f"Test samples with images: {len(test_with_images)}/{len(test_df)}")

# ============================================================================
# CELL 3: DEFINE FEATURES
# ============================================================================
print("\n" + "="*70)
print("FEATURE SELECTION")
print("="*70)

# Candidate tabular features, grouped by origin. The groups are concatenated
# in this order to form the full candidate list.
basic_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]
neighborhood_features = ['sqft_living15', 'sqft_lot15']
location_features = ['lat', 'long']
engineered_features = [
    'age', 'years_since_renovation', 'is_renovated',
    'living_lot_ratio', 'above_ground_ratio', 'basement_ratio',
    'bath_bed_ratio', 'rooms_per_sqft',
    'living_vs_neighbors', 'lot_vs_neighbors',
    'overall_quality', 'luxury_score',
    'log_sqft_living', 'log_sqft_lot', 'log_sqft_above',
]
tabular_features = (
    basic_features
    + neighborhood_features
    + location_features
    + engineered_features
)

# Keep only the candidates that are actually present as columns in the
# training frame, preserving the candidate order.
known_columns = set(train_df.columns)
available_features = [name for name in tabular_features if name in known_columns]
print(f"\nUsing {len(available_features)} tabular features:")
for name in available_features:
    print(f"  - {name}")

# Name of the regression target column.
target = 'price'

# ============================================================================
# CELL 4: DATA PREPROCESSING AND SCALING
# ============================================================================
print("\n" + "="*70)
print("DATA PREPROCESSING")
print("="*70)

# Split train into train and validation FIRST. The imputation medians and the
# scaler statistics below are then learned from the training split only, so
# the validation rows stay unseen by preprocessing and the validation metrics
# are not optimistically biased (no data leakage).
train_data, val_data = train_test_split(
    train_df,
    test_size=Config.TEST_SIZE,
    random_state=Config.RANDOM_SEED
)
# Work on independent copies so the column assignments below neither write
# through to train_df nor trigger chained-assignment problems.
train_data = train_data.copy()
val_data = val_data.copy()

# Handle missing values.
# A column is imputed when ANY split has a gap (not only the training split),
# always with the median of the training split. Plain column assignment is
# used instead of `df[col].fillna(..., inplace=True)`, which operates on a
# temporary object under pandas Copy-on-Write and leaves the frame unchanged.
frames_to_impute = (train_data, val_data, test_df)
for col in available_features:
    if not any(frame[col].isnull().any() for frame in frames_to_impute):
        continue
    median_val = train_data[col].median()
    for frame in frames_to_impute:
        frame[col] = frame[col].fillna(median_val)
    print(f"Filled {col} missing values with median: {median_val}")

# Scale features: fit on the training split, then apply the same fitted
# transform to the validation and test data.
# NOTE: train_df itself is left unscaled; downstream cells consume
# train_data / val_data / test_df.
scaler = StandardScaler()
train_data[available_features] = scaler.fit_transform(train_data[available_features])
val_data[available_features] = scaler.transform(val_data[available_features])
test_df[available_features] = scaler.transform(test_df[available_features])

print("\n‚úì Features scaled using StandardScaler")

print(f"\nTrain samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")

# ============================================================================
# CELL 5: CREATE DATASETS AND DATALOADERS
# ============================================================================
print("\n" + "="*70)
print("CREATING DATASETS")
print("="*70)

# Multimodal datasets. The train and validation datasets receive the target
# column; the test dataset is built with is_test=True and no target column.
# The training split uses the train transforms, the other two the val ones.
train_dataset = MultimodalDataset(
    df=train_data,
    tabular_features=available_features,
    target_column=target,
    transform=get_train_transforms(),
    is_test=False,
)
val_dataset = MultimodalDataset(
    df=val_data,
    tabular_features=available_features,
    target_column=target,
    transform=get_val_transforms(),
    is_test=False,
)
test_dataset = MultimodalDataset(
    df=test_df,
    tabular_features=available_features,
    transform=get_val_transforms(),
    is_test=True,
)

print(f"\nMultimodal datasets created:")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")
print(f"  Test: {len(test_dataset)} samples")

# DataLoader options shared by all three loaders; only `shuffle` differs
# (enabled for training, disabled for validation and test).
loader_options = {
    'batch_size': Config.BATCH_SIZE,
    'num_workers': 4,
    'pin_memory': True,
}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print("\n‚úì DataLoaders created")

# ============================================================================
# CELL 6: CREATE BASELINE MODEL (TABULAR ONLY)
# ============================================================================
print("\n" + "="*70)
print("TRAINING BASELINE MODEL (TABULAR ONLY)")
print("="*70)

# Tabular-only datasets over the same train/validation split; both carry the
# target column.
tab_dataset_options = {
    'tabular_features': available_features,
    'target_column': target,
    'is_test': False,
}
train_tab_dataset = TabularOnlyDataset(df=train_data, **tab_dataset_options)
val_tab_dataset = TabularOnlyDataset(df=val_data, **tab_dataset_options)

# Loaders: shuffle only the training data.
tab_loader_options = {'batch_size': Config.BATCH_SIZE, 'num_workers': 4}
train_tab_loader = DataLoader(train_tab_dataset, shuffle=True, **tab_loader_options)
val_tab_loader = DataLoader(val_tab_dataset, shuffle=False, **tab_loader_options)

# Baseline network: one input per available tabular feature.
baseline_model = TabularOnlyModel(
    input_dim=len(available_features),
    hidden_dims=[256, 128, 64],
    dropout=0.3,
)

# Loss, optimizer and LR schedule (halve the LR after 5 epochs without
# improvement of the monitored value, which is minimised).
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# it is still accepted by the installed version.
criterion = nn.MSELoss()
optimizer = optim.Adam(baseline_model.parameters(), lr=Config.LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
    verbose=True,
)

# The trainer is given the model, loaders, optimizer, loss, device, a run
# name and the directory to save into.
baseline_trainer = RealEstateTrainer(
    model=baseline_model,
    train_loader=train_tab_loader,
    val_loader=val_tab_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    model_name='baseline_tabular',
    save_dir=Config.MODEL_SAVE_DIR,
)

# Train, then plot the recorded history to a PNG in the model directory.
baseline_history = baseline_trainer.train(
    epochs=Config.EPOCHS,
    scheduler=scheduler,
    early_stopping_patience=10,
)
baseline_history_plot = Config.MODEL_SAVE_DIR / 'baseline_training_history.png'
baseline_trainer.plot_history(save_path=baseline_history_plot)

# ============================================================================
# CELL 7: CREATE MULTIMODAL MODEL
# ============================================================================
print("\n" + "="*70)
print("TRAINING MULTIMODAL MODEL")
print("="*70)

# Train one multimodal model per fusion strategy and record, per strategy,
# the training history and the trainer's best validation loss.
fusion_strategies = ['late', 'early', 'attention']
multimodal_results = {}

for strategy in fusion_strategies:
    print(f"\n{'='*70}")
    print(f"Training with {strategy.upper()} fusion strategy")
    print(f"{'='*70}")

    # Run name shared by the trainer and the history plot file name.
    run_name = f'multimodal_{strategy}'

    # Fresh model for this strategy.
    multimodal_model = MultimodalFusionModel(
        tabular_input_dim=len(available_features),
        image_embedding_dim=512,
        tabular_hidden_dims=[256, 128, 64],
        fusion_strategy=strategy,
        dropout=0.3,
    )

    # Fresh loss / optimizer / scheduler, configured like the baseline run.
    # NOTE(review): `verbose` is deprecated in recent PyTorch releases —
    # confirm it is still accepted by the installed version.
    criterion = nn.MSELoss()
    optimizer = optim.Adam(multimodal_model.parameters(), lr=Config.LEARNING_RATE)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=5,
        verbose=True,
    )

    trainer = RealEstateTrainer(
        model=multimodal_model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        model_name=run_name,
        save_dir=Config.MODEL_SAVE_DIR,
    )

    history = trainer.train(
        epochs=Config.EPOCHS,
        scheduler=scheduler,
        early_stopping_patience=10,
    )

    # Record the outcome of this strategy for the comparison step.
    multimodal_results[strategy] = dict(
        history=history,
        best_val_loss=trainer.best_val_loss,
    )

    # Save the training curves for this strategy.
    history_plot = Config.MODEL_SAVE_DIR / f'{run_name}_training_history.png'
    trainer.plot_history(save_path=history_plot)

# ============================================================================
# CELL 8: COMPARE MODELS
# ============================================================================
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)

# One (label, best validation loss, history) entry per trained model:
# the tabular baseline first, then each multimodal strategy in training order.
runs = [('Tabular Only', baseline_trainer.best_val_loss, baseline_history)]
runs.extend(
    (f'Multimodal ({name})', outcome['best_val_loss'], outcome['history'])
    for name, outcome in multimodal_results.items()
)

# Table column -> history key; the reported value is the last recorded entry
# of that history series.
final_metric_keys = {
    'Final Val MAE': 'val_mae',
    'Final Val RMSE': 'val_rmse',
    'Final Val MAPE': 'val_mape',
}

comparison_data = {
    'Model': [label for label, _, _ in runs],
    'Best Val Loss': [best_loss for _, best_loss, _ in runs],
}
for column, history_key in final_metric_keys.items():
    comparison_data[column] = [hist[history_key][-1] for _, _, hist in runs]

comparison_df = pd.DataFrame(comparison_data)
print("\n", comparison_df.to_string(index=False))

# Visualize comparison: one bar chart per metric on a 2x2 grid, filled in
# row-major order.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = [column for column in comparison_df.columns if column != 'Model']
for ax, metric in zip(axes.flat, metrics):
    ax.bar(comparison_df['Model'], comparison_df[metric])
    ax.set_title(metric)
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
comparison_plot = Config.MODEL_SAVE_DIR / 'model_comparison.png'
plt.savefig(comparison_plot, dpi=300, bbox_inches='tight')
plt.show()

# The best model is the row with the smallest 'Best Val Loss'.
best_model_idx = comparison_df['Best Val Loss'].idxmin()
best_model_name = comparison_df.at[best_model_idx, 'Model']
best_model_loss = comparison_df.at[best_model_idx, 'Best Val Loss']
print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Best Val Loss: {best_model_loss:.4f}")

# ============================================================================
# CELL 9: GENERATE PREDICTIONS
# ============================================================================
print("\n" + "="*70)
print("GENERATING TEST PREDICTIONS")
print("="*70)

# Select the fusion strategy with the lowest best validation loss among the
# trained multimodal runs instead of assuming a fixed one. Only multimodal
# candidates are considered because test_loader is built from the multimodal
# test dataset.
best_strategy = min(
    multimodal_results,
    key=lambda name: multimodal_results[name]['best_val_loss']
)
print(f"Selected fusion strategy: {best_strategy}")

# Rebuild the architecture with the same hyper-parameters used for training
# so the saved state dict matches the module layout.
best_model = MultimodalFusionModel(
    tabular_input_dim=len(available_features),
    image_embedding_dim=512,
    tabular_hidden_dims=[256, 128, 64],
    fusion_strategy=best_strategy,
    dropout=0.3
).to(device)

# Load checkpoint
# NOTE(review): the file name assumes the trainer saves its best weights as
# '<model_name>_best.pth' under MODEL_SAVE_DIR — confirm against the trainer.
checkpoint_path = Config.MODEL_SAVE_DIR / f'multimodal_{best_strategy}_best.pth'
checkpoint = torch.load(checkpoint_path, map_location=device)
best_model.load_state_dict(checkpoint['model_state_dict'])
# Inference mode: disables dropout (harmless if predict() also does this).
best_model.eval()
print(f"‚úì Loaded best model from {checkpoint_path}")

# Make predictions
predictions, ids = predict(best_model, test_loader, device=device)

# Create submission file, ordered by id
submission_df = pd.DataFrame({
    'id': ids,
    'predicted_price': predictions
})

submission_df = submission_df.sort_values('id')
submission_path = 'predictions.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\n‚úì Predictions saved to {submission_path}")
print(f"\nPrediction statistics:")
print(submission_df['predicted_price'].describe())

# Visualize predictions
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(submission_df['predicted_price'], bins=50, edgecolor='black')
plt.xlabel('Predicted Price')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Prices')

plt.subplot(1, 2, 2)
# Clip at zero before the log transform: an unconstrained regression output
# can be negative, and log1p of a value <= -1 is NaN/-inf, which makes the
# histogram range non-finite and the plot call fail.
log_predictions = np.log1p(submission_df['predicted_price'].clip(lower=0))
plt.hist(log_predictions, bins=50, edgecolor='black', color='orange')
plt.xlabel('Log(Predicted Price)')
plt.ylabel('Frequency')
plt.title('Log Distribution of Predicted Prices')

plt.tight_layout()
plt.savefig('prediction_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n" + "="*70)
print("TRAINING AND PREDICTION COMPLETE!")
print("="*70)
print(f"\nFiles generated:")
print(f"  - {submission_path}")
print(f"  - Model checkpoints in {Config.MODEL_SAVE_DIR}")
print(f"  - Training history plots")