# Crypto Market Prediction - Complete Pipeline

This notebook implements a complete machine learning pipeline for crypto market prediction including:
- Advanced feature engineering
- Multiple model implementations (Ridge, XGBoost, LightGBM)
- Model validation and tuning
- Final predictions and submission

## Instructions:
1. Upload your data files (train.parquet, test.parquet, sample_submission.csv) to Colab
2. Run all cells in order
3. Download the final submission file

In [None]:
# Install required packages
!pip install xgboost lightgbm optuna -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet
import xgboost as xgb
import lightgbm as lgb
import optuna
from scipy.stats import pearsonr, spearmanr
import warnings
warnings.filterwarnings('ignore')

print("All packages installed successfully!")

In [None]:
# Load data
# The three files are read by relative path, so they must sit in the current
# working directory (see the upload step in the instructions above).
print("Loading datasets...")
train_df = pd.read_parquet('train.parquet')
test_df = pd.read_parquet('test.parquet')
sample_sub = pd.read_csv('sample_submission.csv')

# Report the (rows, columns) shape of every loaded frame.
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample submission shape: {sample_sub.shape}")

# Quick data overview
# Summary statistics of the regression target column 'label'.
print("\nTarget statistics:")
print(train_df['label'].describe())

# Number of training columns per dtype.
print("\nData types:")
print(train_df.dtypes.value_counts())

In [None]:
# Advanced Feature Engineering
def create_advanced_features(df, is_train=True):
    """Build the engineered feature frame used by the modeling cells.

    Works on a copy of ``df`` (the input is never modified) and appends, in
    this order: order-book / order-flow ratios, six cyclical time encodings,
    rolling statistics, lagged labels (training only) and two interaction
    terms.

    Args:
        df: Frame containing at least the columns ``bid_qty``, ``ask_qty``,
            ``buy_qty``, ``sell_qty`` and ``volume``.
        is_train: When True, the time encodings are derived from the index if
            it exposes ``hour`` (e.g. a ``DatetimeIndex``) and ``label_lag_*``
            columns are added when a ``label`` column is present. When False,
            the time encodings are constant zeros and no lag columns are made.

    Returns:
        A new DataFrame holding the original columns plus the engineered
        ones. The six time-encoding columns are always present, so train and
        test frames agree on them; ``label_lag_*`` exists only for training
        frames, so consumers must align train/test columns themselves.
    """
    df_new = df.copy()
    eps = 1e-8  # added to denominators to guard against division by zero

    # Market microstructure features
    df_new['bid_ask_spread'] = df_new['ask_qty'] - df_new['bid_qty']
    df_new['bid_ask_ratio'] = df_new['bid_qty'] / (df_new['ask_qty'] + eps)
    df_new['buy_sell_ratio'] = df_new['buy_qty'] / (df_new['sell_qty'] + eps)
    df_new['buy_sell_imbalance'] = df_new['buy_qty'] - df_new['sell_qty']
    df_new['volume_intensity'] = df_new['volume'] / (df_new['buy_qty'] + df_new['sell_qty'] + eps)

    # Order flow features
    df_new['bid_volume_ratio'] = df_new['bid_qty'] / (df_new['volume'] + eps)
    df_new['ask_volume_ratio'] = df_new['ask_qty'] / (df_new['volume'] + eps)
    df_new['net_flow'] = df_new['buy_qty'] - df_new['sell_qty']
    df_new['flow_ratio'] = df_new['net_flow'] / (df_new['volume'] + eps)

    # Time-based features. Real encodings need a datetime-like index and are
    # only used for training data; every other case gets zero placeholders so
    # the output always carries the same six columns.
    time_features = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
    if is_train and hasattr(df_new.index, 'hour'):
        # Read the calendar fields straight from the index instead of writing
        # temporary 'hour'/'day_of_week'/'month' columns, which would clobber
        # (and then drop) input columns that happen to share those names.
        hour = np.asarray(df_new.index.hour)
        day_of_week = np.asarray(df_new.index.dayofweek)
        month = np.asarray(df_new.index.month)

        # Cyclical encoding
        df_new['hour_sin'] = np.sin(2 * np.pi * hour / 24)
        df_new['hour_cos'] = np.cos(2 * np.pi * hour / 24)
        df_new['dow_sin'] = np.sin(2 * np.pi * day_of_week / 7)
        df_new['dow_cos'] = np.cos(2 * np.pi * day_of_week / 7)
        df_new['month_sin'] = np.sin(2 * np.pi * month / 12)
        df_new['month_cos'] = np.cos(2 * np.pi * month / 12)
    else:
        for feat in time_features:
            df_new[feat] = 0.0

    # Rolling window features (the first window-1 rows of each are NaN)
    windows = [3, 5, 10, 20]
    features = ['volume', 'buy_qty', 'sell_qty', 'bid_qty', 'ask_qty']

    for window in windows:
        for feature in features:
            rolling = df_new[feature].rolling(window=window)
            df_new[f'{feature}_ma_{window}'] = rolling.mean()
            df_new[f'{feature}_std_{window}'] = rolling.std()
            df_new[f'{feature}_min_{window}'] = rolling.min()
            df_new[f'{feature}_max_{window}'] = rolling.max()

    # Lag features (only for training data)
    if is_train and 'label' in df_new.columns:
        for lag in [1, 2, 3, 5, 10]:
            df_new[f'label_lag_{lag}'] = df_new['label'].shift(lag)

    # Interaction features
    df_new['volume_x_spread'] = df_new['volume'] * df_new['bid_ask_spread']
    df_new['imbalance_x_volume'] = df_new['buy_sell_imbalance'] * df_new['volume']

    return df_new

# Run the feature engineering on both splits. The test call passes
# is_train=False, which (per create_advanced_features) yields zero-valued time
# encodings and no label-lag columns.
print("Creating advanced features...")
train_enhanced = create_advanced_features(train_df, is_train=True)
test_enhanced = create_advanced_features(test_df, is_train=False)

# Shapes after feature engineering (rows, columns).
print(f"Enhanced training data shape: {train_enhanced.shape}")
print(f"Enhanced test data shape: {test_enhanced.shape}")

In [None]:
# Prepare data for modeling
def prepare_modeling_data(train_df, test_df, n_features=200):
    """Split, select and scale features for model training.

    The last fold of a 5-way ``TimeSeriesSplit`` becomes the train/validation
    split. Feature selection and scaling are fitted on the training fold only
    and then applied to the validation fold and the test frame.

    Args:
        train_df: Engineered training frame; must contain a ``label`` column.
        test_df: Engineered test frame.
        n_features: Upper bound on the number of features kept by the
            mutual-information selector.

    Returns:
        dict with the scaled arrays ``X_train``, ``X_val`` and ``X_test``, the
        targets ``y_train`` and ``y_val``, the list ``selected_features``,
        the fitted ``selector`` and ``scaler``, and ``tscv_splits`` (all
        ``TimeSeriesSplit`` index pairs).

    Raises:
        ValueError: If train and test share no feature columns.
    """
    from functools import partial

    # Features and target
    target_col = 'label'
    candidate_cols = [col for col in train_df.columns if col != target_col]

    # Only columns present in BOTH frames can be used: train-only columns
    # (e.g. the label_lag_* features) would raise a KeyError when the same
    # column list is applied to the test frame below.
    test_columns = set(test_df.columns)
    feature_cols = [col for col in candidate_cols if col in test_columns]
    dropped_cols = [col for col in candidate_cols if col not in test_columns]
    if dropped_cols:
        print(f"Dropping {len(dropped_cols)} train-only columns missing from test data: {dropped_cols}")
    if not feature_cols:
        raise ValueError("train_df and test_df have no feature columns in common")

    # NaNs (e.g. rolling warm-up rows) and infinities are both mapped to 0.
    X = train_df[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
    y = train_df[target_col]

    # Time series split for validation
    tscv = TimeSeriesSplit(n_splits=5)
    splits = list(tscv.split(X))
    train_idx, val_idx = splits[-1]  # Use last split for final validation

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    print(f"Training set: {len(X_train)}, Validation set: {len(X_val)}")

    # Feature selection using mutual information. The estimator is randomized,
    # so the seed is pinned to keep the selected feature set reproducible.
    print(f"Selecting top {n_features} features...")
    selector = SelectKBest(
        score_func=partial(mutual_info_regression, random_state=42),
        k=min(n_features, X_train.shape[1]),
    )
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_val_selected = selector.transform(X_val)

    selected_features = X.columns[selector.get_support()].tolist()
    print(f"Selected {len(selected_features)} features")

    # Prepare test data with the same columns, cleaning and selection
    X_test = test_df[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
    X_test_selected = selector.transform(X_test)

    # Scaling
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_val_scaled = scaler.transform(X_val_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    return {
        'X_train': X_train_scaled,
        'X_val': X_val_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_val': y_val,
        'selected_features': selected_features,
        'selector': selector,
        'scaler': scaler,
        'tscv_splits': splits
    }

# Build the train/validation/test matrices; `data` is the dict returned by
# prepare_modeling_data and is read by the training and ensemble cells below.
print("Preparing modeling data...")
data = prepare_modeling_data(train_enhanced, test_enhanced, n_features=200)
print("Data preparation completed!")

In [None]:
# Model Training and Evaluation with Comprehensive Metrics
def comprehensive_evaluation(y_true, y_pred, model_name, dataset_name):
    """Print and return a set of regression metrics for one model/split.

    Args:
        y_true: Observed target values (any 1-D array-like, e.g. a Series).
        y_pred: Predicted values, same length as ``y_true``.
        model_name: Label used in the printed header.
        dataset_name: Name of the split being scored, used in the header.

    Returns:
        dict with keys ``rmse``, ``mae``, ``r2``, ``pearson_corr``,
        ``pearson_p``, ``spearman_corr``, ``spearman_p``, ``mape`` (percent)
        and ``directional_accuracy`` (percent, NaN for fewer than two rows).
    """
    # Work on plain 1-D float arrays so the arithmetic below is positional:
    # a pandas Series would otherwise align by index, and an (n, 1)
    # prediction would broadcast against an (n,) target.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Basic regression metrics
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Correlation metrics
    pearson_corr, pearson_p = pearsonr(y_true, y_pred)
    spearman_corr, spearman_p = spearmanr(y_true, y_pred)

    # Mean Absolute Percentage Error (MAPE). The denominator is the target's
    # magnitude floored at a small epsilon; adding the epsilon to the signed
    # target instead would divide by ~0 for targets near -1e-8.
    mape = np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8)) * 100

    # Directional accuracy (for time series): share of consecutive steps whose
    # sign of change matches between target and prediction.
    if len(y_true) > 1:
        true_direction = np.sign(np.diff(y_true))
        pred_direction = np.sign(np.diff(y_pred))
        directional_accuracy = np.mean(true_direction == pred_direction) * 100
    else:
        directional_accuracy = np.nan

    # Interpretation
    def interpret_correlation(corr):
        """Map a correlation coefficient to a qualitative strength label."""
        abs_corr = abs(corr)
        for threshold, label in ((0.7, 'Strong'), (0.5, 'Moderate'),
                                 (0.3, 'Weak'), (0.1, 'Very weak')):
            if abs_corr >= threshold:
                return label
        return 'Negligible'

    print(f"{model_name} - {dataset_name} Results:")
    print(f"  RMSE:                 {rmse:.6f}")
    print(f"  MAE:                  {mae:.6f}")
    print(f"  R² Score:             {r2:.6f}")
    print(f"  Pearson Correlation:  {pearson_corr:.6f} ({interpret_correlation(pearson_corr)})")
    print(f"  Spearman Correlation: {spearman_corr:.6f}")
    print(f"  MAPE:                 {mape:.2f}%")
    if not np.isnan(directional_accuracy):
        print(f"  Directional Accuracy: {directional_accuracy:.2f}%")
    print(f"  Statistical Sig:      {'Yes' if pearson_p < 0.05 else 'No'} (p={pearson_p:.6f})")

    return {
        'rmse': rmse, 'mae': mae, 'r2': r2,
        'pearson_corr': pearson_corr, 'pearson_p': pearson_p,
        'spearman_corr': spearman_corr, 'spearman_p': spearman_p,
        'mape': mape, 'directional_accuracy': directional_accuracy
    }

def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model`` and score it on the training and validation splits.

    The model is fitted in place on ``(X_train, y_train)``; both splits are
    then predicted and passed to ``comprehensive_evaluation``, which prints
    the metrics.

    Returns:
        dict with the fitted ``model``, the metric dicts ``train_metrics`` and
        ``val_metrics``, and the raw ``train_predictions`` / ``val_predictions``.
    """
    model.fit(X_train, y_train)

    # Predict both splits up front so all printing happens afterwards.
    fitted_on_train = model.predict(X_train)
    fitted_on_val = model.predict(X_val)

    print(f"\n{'='*60}")
    print(f"COMPREHENSIVE EVALUATION: {model_name}")
    print(f"{'='*60}")

    outcome = {'model': model}
    outcome['train_metrics'] = comprehensive_evaluation(
        y_train, fitted_on_train, model_name, 'Training'
    )
    print()  # blank line between the two metric reports
    outcome['val_metrics'] = comprehensive_evaluation(
        y_val, fitted_on_val, model_name, 'Validation'
    )
    outcome['train_predictions'] = fitted_on_train
    outcome['val_predictions'] = fitted_on_val
    return outcome

# Train multiple models
# `models` maps a short name to the fitted estimator; `results` maps the same
# name to the dict returned by evaluate_model (metrics and predictions).
models = {}
results = {}

print("Training models...\n")

# 1. Ridge Regression
ridge = Ridge(alpha=1.0, random_state=42)
results['Ridge'] = evaluate_model(ridge, data['X_train'], data['y_train'], 
                                 data['X_val'], data['y_val'], 'Ridge Regression')
models['Ridge'] = ridge

print()

# 2. XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
results['XGBoost'] = evaluate_model(xgb_model, data['X_train'], data['y_train'], 
                                   data['X_val'], data['y_val'], 'XGBoost')
models['XGBoost'] = xgb_model

print()

# 3. LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default subsample_freq=0 the subsample=0.8 setting
    # above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
results['LightGBM'] = evaluate_model(lgb_model, data['X_train'], data['y_train'], 
                                    data['X_val'], data['y_val'], 'LightGBM')
models['LightGBM'] = lgb_model

print("\nModel training completed!")

In [None]:
# Model Ensemble and Final Predictions
def create_ensemble_predictions(models, results, X_test):
    """Blend model predictions with weights proportional to 1 / validation RMSE.

    Args:
        models: Mapping of model name to a fitted estimator exposing
            ``predict(X)``.
        results: Mapping of the same names to dicts that contain
            ``['val_metrics']['rmse']`` and ``['val_metrics']['pearson_corr']``.
        X_test: Feature matrix passed to every model's ``predict``.

    Returns:
        Array with the weighted average of the per-model predictions; the
        weights are normalized to sum to 1.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("create_ensemble_predictions requires at least one fitted model")

    # Floor for the RMSE so a perfect validation score (RMSE == 0) produces a
    # very large finite weight instead of a division by zero / NaN weights.
    min_rmse = 1e-12

    predictions = []
    weights = []

    # Weight models by inverse validation RMSE
    for name, model in models.items():
        predictions.append(model.predict(X_test))

        val_metrics = results[name]['val_metrics']
        val_rmse = val_metrics['rmse']
        val_pearson = val_metrics['pearson_corr']
        weight = 1.0 / max(val_rmse, min_rmse)
        weights.append(weight)

        print(f"{name} - Val RMSE: {val_rmse:.6f}, Pearson: {val_pearson:.6f}, Weight: {weight:.4f}")

    # Normalize weights
    weights = np.array(weights)
    weights = weights / weights.sum()

    # Weighted ensemble
    ensemble_pred = np.average(predictions, axis=0, weights=weights)

    print(f"\nFinal ensemble weights: {dict(zip(models.keys(), weights))}")

    return ensemble_pred

# Blend the fitted models' test-set predictions into one array.
print("Creating ensemble predictions...")
final_predictions = create_ensemble_predictions(models, results, data['X_test'])

# Create submission file
# IDs are generated as 1..N in prediction order.
# NOTE(review): sample_sub is loaded earlier but not used here; confirm that
# its ID values and column names match this layout before submitting.
submission = pd.DataFrame({
    'ID': range(1, len(final_predictions) + 1),
    'prediction': final_predictions
})

# Written to the working directory without the DataFrame index column.
submission.to_csv('submission_ensemble.csv', index=False)
print(f"\nSubmission file created with {len(submission)} predictions")
print(f"Prediction statistics:")
print(submission['prediction'].describe())

In [None]:
# Results Summary and Visualization
print("=" * 60)
print("FINAL RESULTS SUMMARY")
print("=" * 60)

# Model comparison: one row per model, one column per (split, metric) pair.
# Each spec is (output column, key of the metrics dict, metric name).
comparison_specs = (
    ('Train_RMSE', 'train_metrics', 'rmse'),
    ('Val_RMSE', 'val_metrics', 'rmse'),
    ('Train_MAE', 'train_metrics', 'mae'),
    ('Val_MAE', 'val_metrics', 'mae'),
    ('Val_Pearson', 'val_metrics', 'pearson_corr'),
    ('Val_R2', 'val_metrics', 'r2'),
    ('Val_Directional', 'val_metrics', 'directional_accuracy'),
)
comparison_df = pd.DataFrame({
    'Model': list(results),
    **{
        column: [entry[split][metric] for entry in results.values()]
        for column, split, metric in comparison_specs
    },
})

print("\nModel Performance Comparison:")
print(comparison_df.round(6))

# Best model analysis: the row with the lowest validation RMSE.
best_model_idx = comparison_df['Val_RMSE'].idxmin()
best_model, best_rmse, best_pearson, best_r2, best_directional = (
    comparison_df.at[best_model_idx, column]
    for column in ('Model', 'Val_RMSE', 'Val_Pearson', 'Val_R2', 'Val_Directional')
)

print(f"\nBest Model: {best_model}")
print(f"  Val RMSE: {best_rmse:.6f}")
print(f"  Pearson Correlation: {best_pearson:.6f}")
print(f"  R² Score: {best_r2:.6f}")
print(f"  Directional Accuracy: {best_directional:.2f}%")

# Visualization: a 2x2 grid of diagnostic plots.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
rmse_ax, hist_ax = axes[0]
scatter_ax, resid_ax = axes[1]

# Top-left: train vs validation RMSE per model.
rmse_table = comparison_df.set_index('Model')[['Train_RMSE', 'Val_RMSE']]
rmse_table.plot(kind='bar', ax=rmse_ax)
rmse_ax.set_title('Model RMSE Comparison')
rmse_ax.set_ylabel('RMSE')
rmse_ax.legend()

# Top-right: distribution of the ensemble's test predictions.
hist_ax.hist(final_predictions, bins=50, alpha=0.7, edgecolor='black')
hist_ax.set_title('Final Predictions Distribution')
hist_ax.set_xlabel('Prediction Value')
hist_ax.set_ylabel('Frequency')

# Bottom-left: actual vs predicted on the validation set for the best model,
# with the identity line as reference.
best_val_pred = results[best_model]['val_predictions']
scatter_ax.scatter(data['y_val'], best_val_pred, alpha=0.5)
identity_line = [data['y_val'].min(), data['y_val'].max()]
scatter_ax.plot(identity_line, identity_line, 'r--')
scatter_ax.set_title(f'{best_model} - Actual vs Predicted\nPearson r = {best_pearson:.4f}')
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')

# Bottom-right: residuals (actual minus predicted) against the prediction.
residuals = data['y_val'] - best_val_pred
resid_ax.scatter(best_val_pred, residuals, alpha=0.5)
resid_ax.axhline(y=0, color='r', linestyle='--')
resid_ax.set_title(f'{best_model} - Residuals')
resid_ax.set_xlabel('Predicted')
resid_ax.set_ylabel('Residuals')

plt.tight_layout()
plt.show()

print("\n" + "=" * 60)
print("COMPREHENSIVE METRICS SUMMARY")
print("=" * 60)

# Create comprehensive summary
# One section per model, showing validation-split metrics only.
print("\nFinal Model Performance Summary:")
for model_name in results.keys():
    val_metrics = results[model_name]['val_metrics']
    print(f"\n{model_name}:")
    print(f"  RMSE: {val_metrics['rmse']:.6f}")
    print(f"  Pearson Correlation: {val_metrics['pearson_corr']:.6f}")
    print(f"  R² Score: {val_metrics['r2']:.6f}")
    print(f"  Directional Accuracy: {val_metrics['directional_accuracy']:.2f}%")

# Performance interpretation
# Quality label for the best (lowest validation RMSE) model, based on the
# signed Pearson correlation.
# NOTE(review): this uses the signed value while the "Linear Relationship"
# line below uses abs(); a strongly negative correlation is rated "Poor" here
# but "Strong" there — confirm which is intended.
print(f"\nPerformance Interpretation:")
if best_pearson > 0.5:
    quality = "Excellent"
elif best_pearson > 0.3:
    quality = "Good"
elif best_pearson > 0.1:
    quality = "Moderate"
else:
    quality = "Poor"

print(f"Best Model ({best_model}):")
print(f"  Overall Quality: {quality}")
print(f"  Linear Relationship: {'Strong' if abs(best_pearson) > 0.7 else 'Moderate' if abs(best_pearson) > 0.3 else 'Weak'}")
print(f"  Variance Explained: {best_r2*100:.2f}%")
print(f"  Direction Prediction: {'Good' if best_directional > 60 else 'Moderate' if best_directional > 50 else 'Random'}")

print("\n" + "=" * 60)
print("CRYPTO MARKET PREDICTION PIPELINE COMPLETED!")
print("=" * 60)
print("\nFiles created:")
print("- submission_ensemble.csv (final submission)")
# NOTE(review): the three "Best ..." figures below all belong to the model
# with the lowest validation RMSE; they are not the per-metric best across
# models.
print("\nKey Achievements:")
print(f"- Best Pearson Correlation: {best_pearson:.6f}")
print(f"- Best RMSE: {best_rmse:.6f}")
print(f"- Best Directional Accuracy: {best_directional:.2f}%")
print("\nNext steps:")
print("1. Download the submission file")
print("2. Submit to Kaggle competition")
print("3. Monitor leaderboard performance")
print("4. Consider further feature engineering if Pearson < 0.3")