# AIDS Kaggle Competition 1

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Load the data
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# Explore the data
print("Train data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("\nTrain columns:", train_df.columns.tolist())
print("\nMissing values in train:", train_df.isnull().sum())
print("\nMissing values in test:", test_df.isnull().sum())

Train data shape: (30000, 15)
Test data shape: (10000, 14)

Train columns: ['id', 'song_duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode', 'speechiness', 'tempo', 'time_signature', 'audio_valence', 'song_popularity']

Missing values in train: id                     0
song_duration_ms    3067
acousticness        3024
danceability        2967
energy              2935
instrumentalness    2966
key                 3074
liveness            3022
loudness            2977
audio_mode             0
speechiness            0
tempo                  0
time_signature         0
audio_valence          0
song_popularity        0
dtype: int64

Missing values in test: id                     0
song_duration_ms    1034
acousticness         968
danceability        1061
energy              1040
instrumentalness    1019
key                  991
liveness            1066
loudness             980
audio_mode             0
speechiness            0

In [29]:
# Separate features and target
X_train = train_df.drop('song_popularity', axis=1)
y_train = train_df['song_popularity']
X_test = test_df

In [30]:
# Handle missing values with advanced imputation
def advanced_imputation(X_train, X_test):
    # Identify numerical and categorical columns
    numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    
    # Use different imputation strategies for different columns
    # For most numerical features, use median imputation
    median_imputer = SimpleImputer(strategy='median')
    X_train_imputed = X_train.copy()
    X_test_imputed = X_test.copy()
    
    # Apply median imputation to all numerical columns
    X_train_imputed[numerical_cols] = median_imputer.fit_transform(X_train[numerical_cols])
    X_test_imputed[numerical_cols] = median_imputer.transform(X_test[numerical_cols])
    
    return X_train_imputed, X_test_imputed

In [31]:
# Apply imputation
X_train_imputed, X_test_imputed = advanced_imputation(X_train, X_test)

In [32]:
# Feature engineering
def create_features(df):
    df = df.copy()
    
    # Create interaction features
    df['energy_danceability'] = df['energy'] * df['danceability']
    df['valence_energy'] = df['audio_valence'] * df['energy']
    df['acousticness_energy'] = df['acousticness'] * df['energy']
    df['loudness_energy'] = df['loudness'] * df['energy']
    
    # Create ratio features
    df['speech_to_instrument'] = df['speechiness'] / (df['instrumentalness'] + 0.001)
    df['energy_to_acoustic'] = df['energy'] / (df['acousticness'] + 0.001)
    
    # Create polynomial features for important columns
    df['danceability_sq'] = df['danceability'] ** 2
    df['energy_sq'] = df['energy'] ** 2
    df['loudness_sq'] = df['loudness'] ** 2
    
    # Duration in minutes
    df['duration_min'] = df['song_duration_ms'] / 60000
    
    # Binning for certain features
    df['tempo_binned'] = pd.cut(df['tempo'], bins=5, labels=False)
    df['loudness_binned'] = pd.cut(df['loudness'], bins=5, labels=False)
    
    return df

In [33]:
# Apply feature engineering
X_train_fe = create_features(X_train_imputed)
X_test_fe = create_features(X_test_imputed)

# Scale the features
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

In [34]:
# Convert back to DataFrame to maintain column names
X_train_final = pd.DataFrame(X_train_scaled, columns=X_train_fe.columns, index=X_train_fe.index)
X_test_final = pd.DataFrame(X_test_scaled, columns=X_test_fe.columns, index=X_test_fe.index)

# Split training data for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_final, y_train, test_size=0.2, random_state=42
)

In [35]:
models = {
    'xgb': XGBClassifier(random_state=42, n_jobs=1, n_estimators=100, max_depth=5),
    'lgbm': LGBMClassifier(random_state=42, n_jobs=1, n_estimators=100, max_depth=5, verbose=-1),
    'catboost': CatBoostClassifier(random_state=42, verbose=0, iterations=100, depth=5),
    'rf': RandomForestClassifier(random_state=42, n_jobs=1, n_estimators=100, max_depth=5),
    'gbr': GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=5),
    'logreg': LogisticRegression(random_state=42, max_iter=1000),
    'ridge': RidgeClassifier(random_state=42)
}


In [36]:
print("Training and evaluating individual models with cross-validation...")
model_performance = {}

best_xgb, best_lgbm, best_rf, best_gbr, best_catboost = None, None, None, None, None

for name, model in models.items():
    # Use cross-validation for more reliable performance estimation
    cv_scores = cross_val_score(model, X_train_split, y_train_split, 
                               cv=3, scoring='accuracy', n_jobs=-1)
    cv_accuracy = cv_scores.mean()
    cv_std = cv_scores.std()
    
    # Also train on full split for comparison
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val_split)
    y_pred_proba = model.predict_proba(X_val_split)[:, 1] if hasattr(model, 'predict_proba') else None
    
    val_accuracy = accuracy_score(y_val_split, y_pred)
    val_precision = precision_score(y_val_split, y_pred)
    val_recall = recall_score(y_val_split, y_pred)
    val_f1 = f1_score(y_val_split, y_pred)
    
    model_performance[name] = {
        'cv_accuracy': cv_accuracy,
        'cv_std': cv_std,
        'val_accuracy': val_accuracy,
        'val_precision': val_precision,
        'val_recall': val_recall,
        'val_f1': val_f1
    }
    
    print(f"{name.upper():<10} - CV Acc: {cv_accuracy:.4f} (±{cv_std:.4f}), "
          f"Val Acc: {val_accuracy:.4f}, F1: {val_f1:.4f}")


Training and evaluating individual models with cross-validation...
XGB        - CV Acc: 0.6103 (±0.0008), Val Acc: 0.6192, F1: 0.2471
LGBM       - CV Acc: 0.6302 (±0.0046), Val Acc: 0.6287, F1: 0.1045
CATBOOST   - CV Acc: 0.6238 (±0.0047), Val Acc: 0.6250, F1: 0.1567
RF         - CV Acc: 0.6363 (±0.0003), Val Acc: 0.6325, F1: 0.0000
GBR        - CV Acc: 0.6310 (±0.0019), Val Acc: 0.6303, F1: 0.0969
LOGREG     - CV Acc: 0.6352 (±0.0003), Val Acc: 0.6322, F1: 0.0081
RIDGE      - CV Acc: 0.6353 (±0.0005), Val Acc: 0.6315, F1: 0.0045


In [38]:
# Select top 2-3 models for hyperparameter tuning based on performance
# Use 'cv_accuracy' instead of 'cv_mse' since we're doing classification now
top_models = sorted(model_performance.items(), key=lambda x: x[1]['cv_accuracy'], reverse=True)[:3]
top_model_names = [name for name, _ in top_models]
print(f"\nTop models selected for tuning: {top_model_names}")


Top models selected for tuning: ['rf', 'ridge', 'logreg']


In [45]:
if 'xgb' in top_model_names:
    print("\nTuning XGBoost...")
    xgb_param_grid = {
        'n_estimators': [100, 150],
        'max_depth': [4, 5, 6],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 0.9]
    }
    
    xgb_grid = GridSearchCV(
        XGBClassifier(random_state=42, n_jobs=1),
        xgb_param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    xgb_grid.fit(X_train_split, y_train_split)
    best_xgb = xgb_grid.best_estimator_
    print(f"Best XGBoost params: {xgb_grid.best_params_}")
    print(f"Best XGBoost score: {-xgb_grid.best_score_:.4f}")

if 'lgbm' in top_model_names:
    print("\nTuning LightGBM...")
    lgbm_param_grid = {
        'n_estimators': [100, 150],
        'max_depth': [4, 5, 6],
        'learning_rate': [0.05, 0.1],
        'num_leaves': [31, 40, 50],
        'subsample': [0.8, 0.9]
    }
    
    lgbm_grid = GridSearchCV(
        LGBMClassifier(random_state=42, n_jobs=1, verbose=-1),
        lgbm_param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    lgbm_grid.fit(X_train_split, y_train_split)
    best_lgbm = lgbm_grid.best_estimator_
    print(f"Best LightGBM params: {lgbm_grid.best_params_}")
    print(f"Best LightGBM score: {-lgbm_grid.best_score_:.4f}")

# For other top models, add tuning blocks
if 'rf' in top_model_names:
    print("\nTuning Random Forest...")
    rf_param_grid = {
        'n_estimators': [100, 150, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }
    
    rf_grid = GridSearchCV(
        RandomForestClassifier(random_state=42, n_jobs=1),
        rf_param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    rf_grid.fit(X_train_split, y_train_split)
    best_rf = rf_grid.best_estimator_
    print(f"Best Random Forest params: {rf_grid.best_params_}")
    print(f"Best Random Forest score: {rf_grid.best_score_:.4f}")

if 'gbr' in top_model_names:
    print("\nTuning Gradient Boosting...")
    gbr_param_grid = {
        'n_estimators': [100, 150, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'subsample': [0.8, 0.9, 1.0]
    }
    
    gbr_grid = GridSearchCV(
        GradientBoostingClassifier(random_state=42),
        gbr_param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    gbr_grid.fit(X_train_split, y_train_split)
    best_gbr = gbr_grid.best_estimator_
    print(f"Best Gradient Boosting params: {gbr_grid.best_params_}")
    print(f"Best Gradient Boosting score: {gbr_grid.best_score_:.4f}")

if 'ridge' in top_model_names:
    print("\nTuning Ridge Classifier...")
    ridge_param_grid = {
        'alpha': [0.1, 1.0, 10.0, 100.0],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr']
    }
    
    ridge_grid = GridSearchCV(
        RidgeClassifier(random_state=42),
        ridge_param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    ridge_grid.fit(X_train_split, y_train_split)
    best_ridge = ridge_grid.best_estimator_
    print(f"Best Ridge Classifier params: {ridge_grid.best_params_}")
    print(f"Best Ridge Classifier score: {ridge_grid.best_score_:.4f}")

if 'catboost' in top_model_names:
    print("\nTuning CatBoost...")
    catboost_param_grid = {
        'iterations': [100, 150, 200],
        'depth': [4, 6, 8],
        'learning_rate': [0.03, 0.05, 0.1],
        'l2_leaf_reg': [1, 3, 5]
    }
    
    catboost_grid = GridSearchCV(
        CatBoostClassifier(random_state=42, verbose=0),
        catboost_param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    catboost_grid.fit(X_train_split, y_train_split)
    best_catboost = catboost_grid.best_estimator_
    print(f"Best CatBoost params: {catboost_grid.best_params_}")
    print(f"Best CatBoost score: {catboost_grid.best_score_:.4f}")




Tuning Random Forest...
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Random Forest params: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Random Forest score: 0.6370

Tuning Ridge Classifier...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Ridge Classifier params: {'alpha': 100.0, 'solver': 'auto'}
Best Ridge Classifier score: 0.6358


In [46]:
# Create a stacking ensemble with only the available models
print("\nCreating stacking ensemble...")
estimators = []

# Only add models that were actually tuned/available
if best_xgb is not None:
    estimators.append(('xgb', best_xgb))
if best_lgbm is not None:
    estimators.append(('lgbm', best_lgbm))
if 'rf' in top_model_names:
    estimators.append(('rf', best_rf if 'best_rf' in locals() else models['rf']))
if 'gbr' in top_model_names:
    estimators.append(('gbr', best_gbr if 'best_gbr' in locals() else models['gbr']))
if 'ridge' in top_model_names:
    estimators.append(('ridge', models['ridge']))
if 'lasso' in top_model_names:
    estimators.append(('lasso', models['lasso']))
if 'catboost' in top_model_names:
    estimators.append(('catboost', best_catboost if 'best_catboost' in locals() else models['catboost']))

if len(estimators) < 2:
    estimators.extend([
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
        ('gbr', GradientBoostingClassifier(n_estimators=100, random_state=42))
    ])

stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),  # Changed to LogisticRegression
    cv=5,
    n_jobs=-1
)

# Train the stacking ensemble
stacking_classifier.fit(X_train_split, y_train_split)

# Evaluate the stacking ensemble
y_pred_stack = stacking_classifier.predict(X_val_split)
accuracy_stack = accuracy_score(y_val_split, y_pred_stack)
f1_stack = f1_score(y_val_split, y_pred_stack)
print(f"Stacking Ensemble - Accuracy: {accuracy_stack:.4f}, F1: {f1_stack:.4f}")

# Train final model on all training data
print("\nTraining final model on all data...")
final_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

final_model.fit(X_train_final, y_train)


Creating stacking ensemble...
Stacking Ensemble - Accuracy: 0.6315, F1: 0.0116

Training final model on all data...


0,1,2
,estimators,"[('rf', ...), ('ridge', ...)]"
,final_estimator,LogisticRegression()
,cv,5
,stack_method,'auto'
,n_jobs,-1
,passthrough,False
,verbose,0

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,class_weight,
,solver,'auto'
,positive,False
,random_state,42

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [50]:
# Make predictions on test data
print("Making predictions on test data...")
test_predictions = final_model.predict(X_test_final)

# Create submission file
submission_df = pd.DataFrame({
    'id': test_df['id'] if 'id' in test_df.columns else range(len(test_predictions)),
    'song_popularity': test_predictions
})

# Ensure predictions are within reasonable bounds (0-100)
submission_df['song_popularity'] = submission_df['song_popularity'].clip(0, 100)

Making predictions on test data...


In [51]:
submission_df.to_csv('song_popularity_predictions.csv', index=False)
print("Predictions saved to 'song_popularity_predictions.csv'")

Predictions saved to 'song_popularity_predictions.csv'


In [49]:
# Update feature importance and evaluation for classification
if best_xgb is not None:
    best_xgb.fit(X_train_final, y_train)
    feature_importance = pd.DataFrame({
        'feature': X_train_final.columns,
        'importance': best_xgb.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nTop 10 most important features:")
    print(feature_importance.head(10))

# Cross-validation score for final evaluation
cv_scores = cross_val_score(final_model, X_train_final, y_train, 
                           cv=3, scoring='accuracy', n_jobs=-1)
cv_accuracy = cv_scores.mean()
print(f"\nCross-validated Accuracy: {cv_accuracy:.4f}")


Cross-validated Accuracy: 0.5854
