In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mlx-2-0-regression/sample_submission.csv
/kaggle/input/mlx-2-0-regression/train.csv
/kaggle/input/mlx-2-0-regression/test.csv


In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, max_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
import lightgbm as lgb
import xgboost as xgb

In [3]:
# Load the competition data and report the raw shapes
train = pd.read_csv('/kaggle/input/mlx-2-0-regression/train.csv')
test = pd.read_csv('/kaggle/input/mlx-2-0-regression/test.csv')

for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape: {frame.shape}")

# Keep the test ids for the submission files; 'id' itself is not a feature
test_ids = test['id'].copy()
test = test.drop(columns=['id'])

# Duplicates are detected after 'id' is removed, i.e. on the remaining columns
train = train.drop(columns=['id']).drop_duplicates()
print(f"Train shape after removing duplicates: {train.shape}")

Train shape: (61609, 62)
Test shape: (41074, 61)
Train shape after removing duplicates: (61515, 61)


In [4]:
# Define Evaluation Functions
def calculate_mape(y_true, y_pred):
    """Return the Mean Absolute Percentage Error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / max(|y_true|, epsilon)) * 100``.
    """
    epsilon = 1e-8
    # Plain float arrays: accepts lists, and pandas inputs are compared by
    # position instead of being aligned on their index labels.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Clamp the *magnitude* of the denominator. Adding epsilon to the signed
    # value would shrink negative targets and hit exactly zero at -epsilon.
    denominator = np.maximum(np.abs(actual), epsilon)
    return np.mean(np.abs(actual - predicted) / denominator) * 100

def calculate_all_metrics(y_true, y_pred):
    """Compute every regression score used in this notebook.

    Returns a dict with the keys ``RMSE``, ``MAE``, ``R2``, ``Median_AE``,
    ``MAPE`` and ``Max_Error`` (inserted in that order).
    """
    return {
        # Root of the mean squared error
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'Median_AE': median_absolute_error(y_true, y_pred),
        # Percentage error comes from the notebook's own helper
        'MAPE': calculate_mape(y_true, y_pred),
        'Max_Error': max_error(y_true, y_pred),
    }

def print_metrics(metrics, model_name="Model"):
    """Print a fixed-layout report of the scores in ``metrics``.

    ``metrics`` must provide the keys ``RMSE``, ``MAE``, ``R2``,
    ``Median_AE``, ``MAPE`` and ``Max_Error``; ``model_name`` is used in the
    report heading.
    """
    report = [
        f"\n{model_name} Performance Metrics:",
        "-" * 40,
        f"RMSE:       {metrics['RMSE']:.4f}",
        f"MAE:        {metrics['MAE']:.4f}",
        f"R² Score:   {metrics['R2']:.4f}",
        f"Median AE:  {metrics['Median_AE']:.4f}",
        f"MAPE:       {metrics['MAPE']:.2f}%",
        f"Max Error:  {metrics['Max_Error']:.4f}",
    ]
    print("\n".join(report))

In [5]:
# Feature Engineering - Categorical Cleaning
def clean_categorical_features(train_df, test_df):
    """Collapse unshared and infrequent category labels in both frames.

    For each known categorical column present in ``train_df``:

    1. labels that do not occur in both frames become ``'Other'``;
    2. labels whose train count is at or below the threshold (2 for
       ``creator_collective``, 5 otherwise) become ``'Rare'`` in both frames.

    Both frames are modified in place and also returned as
    ``(train_df, test_df)``.
    """
    target_columns = (
        'composition_label_0', 'composition_label_1', 'composition_label_2',
        'creator_collective', 'track_identifier',
    )

    for name in target_columns:
        if name in train_df.columns:
            # Step 1: keep only labels seen in both train and test
            shared = set(train_df[name].unique()) & set(test_df[name].unique())

            def keep_shared(label):
                return label if label in shared else 'Other'

            train_df[name] = train_df[name].apply(keep_shared)
            test_df[name] = test_df[name].apply(keep_shared)

            # Step 2: fold low-count labels (counted on train, after step 1)
            min_count = 2 if name == 'creator_collective' else 5
            counts = train_df[name].value_counts()
            infrequent = counts[counts <= min_count].index

            def fold_rare(label):
                return 'Rare' if label in infrequent else label

            train_df[name] = train_df[name].apply(fold_rare)
            test_df[name] = test_df[name].apply(fold_rare)

    return train_df, test_df

In [6]:
# Feature Engineering - Encoding
def encode_features(train_df, test_df):
    """Turn the categorical columns into numbers, in place.

    * The composition/creator/track columns are replaced by the label's
      relative frequency in ``train_df``; test labels with no train frequency
      get 0.
    * The season/lunar/weekday columns are label-encoded with an encoder
      fitted on the string form of the train and test values together.

    Columns missing from ``train_df`` are skipped. Returns
    ``(train_df, test_df)``.
    """
    frequency_columns = ['composition_label_0', 'composition_label_1', 'composition_label_2',
                         'creator_collective', 'track_identifier']
    ordinal_columns = ['season_of_release', 'lunar_phase', 'weekday_of_release']

    # Frequency encoding
    for name in frequency_columns:
        if name not in train_df.columns:
            continue
        share = train_df[name].value_counts(normalize=True)
        train_df[name] = train_df[name].map(share)
        test_df[name] = test_df[name].map(share).fillna(0)

    # Label encoding
    for name in ordinal_columns:
        if name not in train_df.columns:
            continue
        encoder = LabelEncoder()
        all_values = pd.concat([train_df[name], test_df[name]], axis=0).astype(str)
        encoder.fit(all_values)
        train_df[name] = encoder.transform(train_df[name].astype(str))
        test_df[name] = encoder.transform(test_df[name].astype(str))

    return train_df, test_df

In [7]:
# Feature Engineering - Create Additional Features
def create_features(df):
    """Add row-wise mean/std/max/min aggregates for each audio feature family.

    For every prefix below, all columns of ``df`` whose name starts with the
    prefix are aggregated across ``axis=1`` into four new columns named
    ``<prefix>mean``, ``<prefix>std``, ``<prefix>max`` and ``<prefix>min``.
    A prefix with no matching columns is skipped.

    The generated aggregate columns themselves also start with the prefix, so
    they are excluded from the inputs. This keeps the function idempotent:
    calling it again on an already-processed frame (e.g. re-running the cell)
    recomputes the same values instead of aggregating over earlier aggregates.

    ``df`` is modified in place and returned.
    """
    stat_names = ('mean', 'std', 'max', 'min')

    for prefix in ['duration_ms_', 'rhythmic_cohesion_', 'intensity_index_',
                   'organic_texture_', 'beat_frequency_']:
        derived = {f'{prefix}{stat}' for stat in stat_names}
        cols = [col for col in df.columns
                if col.startswith(prefix) and col not in derived]
        if not cols:
            continue

        # Snapshot the source columns once so all four statistics see the
        # same inputs.
        source = df[cols]
        df[f'{prefix}mean'] = source.mean(axis=1)
        df[f'{prefix}std'] = source.std(axis=1)
        df[f'{prefix}max'] = source.max(axis=1)
        df[f'{prefix}min'] = source.min(axis=1)

    return df

In [8]:
# Run the feature-engineering pipeline: clean -> encode -> derive aggregates.
# The first two steps take and return the (train, test) pair together.
paired_steps = (
    ("Cleaning categorical features...", clean_categorical_features),
    ("Encoding features...", encode_features),
)
for message, step in paired_steps:
    print(message)
    train, test = step(train, test)

# The aggregate features are computed independently for each frame
print("Creating additional features...")
train, test = create_features(train), create_features(test)

Cleaning categorical features...
Encoding features...
Creating additional features...


In [9]:
# Final Data Preprocessing
# The timestamp column is dropped when present (replace with better datetime
# features if needed), then every remaining missing value is filled with -1.
train = train.drop(columns=['publication_timestamp'], errors='ignore').fillna(-1)
test = test.drop(columns=['publication_timestamp'], errors='ignore').fillna(-1)

# Split the training frame into target and feature matrix
y = train['target']
X = train.drop(columns=['target'])

print(f"Final feature shape: {X.shape}")

Final feature shape: (61515, 79)


In [10]:
# Cross-Validation Function
def cv_model(model, X, y, test_data, n_splits=5):
    """Run shuffled K-fold cross-validation for one estimator.

    The same ``model`` object is refitted on each fold. LightGBM and XGBoost
    estimators (recognised by 'lgb' / 'xgb' in the type's string form) are
    fitted with the held-out fold as ``eval_set``; LightGBM additionally uses
    early stopping with a patience of 50 rounds. Anything else gets a plain
    ``fit``.

    Parameters
    ----------
    model : estimator exposing ``fit`` / ``predict``
    X, y : training features and target, indexed positionally via ``.iloc``
    test_data : features to predict with every fold's model
    n_splits : number of folds (default 5)

    Returns
    -------
    (oof_preds, test_preds, avg_metrics)
        Out-of-fold predictions for ``X``, the fold-averaged predictions for
        ``test_data``, and the mean of each per-fold metric.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test_data))
    per_fold = []

    # The estimator's type does not change between folds, so inspect it once
    type_text = str(type(model)).lower()

    for fold_no, (fit_rows, hold_rows) in enumerate(splitter.split(X), start=1):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_hold, y_hold = X.iloc[hold_rows], y.iloc[hold_rows]

        # Fit with library-specific arguments
        if hasattr(model, 'fit'):
            if 'lgb' in type_text:
                model.fit(
                    X_fit, y_fit,
                    eval_set=[(X_hold, y_hold)],
                    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
                )
            elif 'xgb' in type_text:
                model.fit(
                    X_fit, y_fit,
                    eval_set=[(X_hold, y_hold)],
                    verbose=False,
                )
            else:
                model.fit(X_fit, y_fit)

        # Predict the held-out rows and the full test set with this fold's fit
        hold_pred = model.predict(X_hold)
        fold_test_pred = model.predict(test_data)

        oof_preds[hold_rows] = hold_pred
        test_preds += fold_test_pred / n_splits

        scores = calculate_all_metrics(y_hold, hold_pred)
        per_fold.append(scores)

        print(f"Fold {fold_no} - RMSE: {scores['RMSE']:.4f}, MAE: {scores['MAE']:.4f}, R²: {scores['R2']:.4f}")

    # Mean of every metric over the folds
    avg_metrics = {
        name: np.mean([scores[name] for scores in per_fold])
        for name in per_fold[0]
    }

    print("\nAverage CV Metrics:")
    print(f"RMSE: {avg_metrics['RMSE']:.4f}")
    print(f"MAE: {avg_metrics['MAE']:.4f}")
    print(f"R²: {avg_metrics['R2']:.4f}")

    return oof_preds, test_preds, avg_metrics

In [11]:
# Result containers, each keyed by a short model name such as 'lgb' or 'xgb'
models = dict()            # model name -> average CV RMSE
oof_predictions = dict()   # model name -> out-of-fold predictions
test_predictions = dict()  # model name -> averaged test predictions
model_metrics = dict()     # model name -> dict of averaged CV metrics

In [12]:
# Train LightGBM Model
lgb_banner = "=" * 50
print("\n" + lgb_banner)
print("Training LightGBM...")
print(lgb_banner)

lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42,
    verbose=-1,
)
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Cross-validate, then file the results under the 'lgb' key
oof_lgb, test_lgb, metrics_lgb = cv_model(lgb_model, X, y, test)
models['lgb'] = metrics_lgb['RMSE']
oof_predictions['lgb'] = oof_lgb
test_predictions['lgb'] = test_lgb
model_metrics['lgb'] = metrics_lgb


Training LightGBM...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 99.7481
Fold 1 - RMSE: 9.9874, MAE: 6.6409, R²: 0.7872
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 102.664
Fold 2 - RMSE: 10.1323, MAE: 6.8018, R²: 0.7817
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's l2: 95.8367
Fold 3 - RMSE: 9.7896, MAE: 6.5731, R²: 0.7932
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 99.4317
Fold 4 - RMSE: 9.9715, MAE: 6.6310, R²: 0.7855
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 99.1716
Fold 5 - RMSE: 9.9585, MAE: 6.6518, R²: 0.7846

Average CV Metrics:
RMSE: 9.9679
MAE: 6.6597
R²: 0.7864

In [13]:
# Train XGBoost Model
xgb_banner = "=" * 50
print("\n" + xgb_banner)
print("Training XGBoost...")
print(xgb_banner)

xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Cross-validate, then file the results under the 'xgb' key
oof_xgb, test_xgb, metrics_xgb = cv_model(xgb_model, X, y, test)
models['xgb'] = metrics_xgb['RMSE']
oof_predictions['xgb'] = oof_xgb
test_predictions['xgb'] = test_xgb
model_metrics['xgb'] = metrics_xgb


Training XGBoost...
Fold 1 - RMSE: 9.6869, MAE: 6.2150, R²: 0.7998
Fold 2 - RMSE: 9.7830, MAE: 6.3201, R²: 0.7965
Fold 3 - RMSE: 9.4397, MAE: 6.0566, R²: 0.8077
Fold 4 - RMSE: 9.6631, MAE: 6.2160, R²: 0.7986
Fold 5 - RMSE: 9.6926, MAE: 6.2251, R²: 0.7959

Average CV Metrics:
RMSE: 9.6531
MAE: 6.2066
R²: 0.7997


In [14]:
# Create Ensemble Model
ensemble_banner = "=" * 50
print("\n" + ensemble_banner)
print("Creating Ensemble...")
print(ensemble_banner)

# Inverse-RMSE weighting: each model's weight is proportional to 1 / (CV RMSE),
# normalised so the weights sum to 1
inverse_scores = {name: 1/score for name, score in models.items()}
total_inv_score = sum(inverse_scores.values())

weights = {}
for model_name, score in models.items():
    weights[model_name] = inverse_scores[model_name] / total_inv_score
    print(f"{model_name} weight: {weights[model_name]:.3f} (CV RMSE: {score:.4f})")

# Blend the out-of-fold and test predictions with those weights
ensemble_oof = np.zeros(len(y))
ensemble_test = np.zeros(len(test))
for model_name, weight in weights.items():
    ensemble_oof += weight * oof_predictions[model_name]
    ensemble_test += weight * test_predictions[model_name]

# Score the blended out-of-fold predictions against the full target
ensemble_metrics = calculate_all_metrics(y, ensemble_oof)


Creating Ensemble...
lgb weight: 0.492 (CV RMSE: 9.9679)
xgb weight: 0.508 (CV RMSE: 9.6531)


In [15]:
# Display Detailed Results
detail_rule = "=" * 60
print("\n" + detail_rule)
print("DETAILED MODEL COMPARISON")
print(detail_rule)

# One report per individual model, followed by the ensemble
for model_name in model_metrics:
    print_metrics(model_metrics[model_name], f"{model_name.upper()} Model")

print_metrics(ensemble_metrics, "ENSEMBLE Model")


DETAILED MODEL COMPARISON

LGB Model Performance Metrics:
----------------------------------------
RMSE:       9.9679
MAE:        6.6597
R² Score:   0.7864
Median AE:  3.6366
MAPE:       52.24%
Max Error:  48.0398

XGB Model Performance Metrics:
----------------------------------------
RMSE:       9.6531
MAE:        6.2066
R² Score:   0.7997
Median AE:  3.0230
MAPE:       51.01%
Max Error:  46.7391

ENSEMBLE Model Performance Metrics:
----------------------------------------
RMSE:       9.7382
MAE:        6.3651
R² Score:   0.7962
Median AE:  3.2661
MAPE:       51.47%
Max Error:  48.8148


In [16]:
# Display Summary Comparison
summary_rule = "=" * 60
print("\n" + summary_rule)
print("SUMMARY COMPARISON")
print(summary_rule)
print(f"{'Model':<12} {'RMSE':<8} {'MAE':<8} {'R²':<8} {'MAPE':<8}")
print("-" * 50)

# Individual models first (upper-cased names), ensemble as the last row
summary_rows = [(name.upper(), scores) for name, scores in model_metrics.items()]
summary_rows.append(('ENSEMBLE', ensemble_metrics))

for label, scores in summary_rows:
    print(f"{label:<12} {scores['RMSE']:<8.4f} {scores['MAE']:<8.4f} {scores['R2']:<8.4f} {scores['MAPE']:<8.2f}")


SUMMARY COMPARISON
Model        RMSE     MAE      R²       MAPE    
--------------------------------------------------
LGB          9.9679   6.6597   0.7864   52.24   
XGB          9.6531   6.2066   0.7997   51.01   
ENSEMBLE     9.7382   6.3651   0.7962   51.47   


In [17]:
# Create and Save Submissions
print("\nCreating submission files...")

# Every submission shares the same 'id' column taken from the test set
submission_template = pd.DataFrame({'id': test_ids})

# One file per individual model; predictions are rounded to the nearest integer
for model_name, preds in test_predictions.items():
    out_name = f'{model_name}_submission.csv'
    submission = submission_template.assign(target=np.round(preds).astype(int))
    submission.to_csv(out_name, index=False)
    print(f"Saved {out_name}")

# Ensemble submission, rounded the same way
submission = submission_template.assign(target=np.round(ensemble_test).astype(int))
submission.to_csv('ensemble_submission.csv', index=False)
print("Saved ensemble_submission.csv")


Creating submission files...
Saved lgb_submission.csv
Saved xgb_submission.csv
Saved ensemble_submission.csv


In [18]:
# Final Summary
# `models` maps model name -> average CV RMSE, so the best model has the smallest value
best_name = min(models, key=models.get)
print(f"\nBest single model: {best_name} (RMSE: {models[best_name]:.4f})")
print(f"Ensemble RMSE: {ensemble_metrics['RMSE']:.4f}")
print(f"Ensemble R²: {ensemble_metrics['R2']:.4f}")


Best single model: xgb (RMSE: 9.6531)
Ensemble RMSE: 9.7382
Ensemble R²: 0.7962
