## 1. Import Required Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error
)

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

# Utilities
import warnings
# Suppress every warning for the rest of the session so cell output stays compact.
# NOTE(review): this is a blanket filter, so it hides all warning categories.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# The same seed is applied to NumPy's global RNG and to TensorFlow's global seed.
np.random.seed(42)
tf.random.set_seed(42)

print("All libraries imported successfully!")

## 2. Load and Explore the Data

In [None]:
# Read the raw CSV; header=None means the first line is treated as data and
# pandas labels the columns with integers
df = pd.read_csv('midterm-regresi-dataset.csv', header=None)

# Report the dimensions, then let the notebook render a preview of the top rows
print("Dataset shape:", df.shape)
print(f"\n{'=' * 50}")
print("First few rows:")
df.head()

In [None]:
# Column 0 is the regression target (release year); all remaining columns are
# the audio-derived predictors
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

# Shapes first, then the same information broken out per axis
print(f"Target shape: {y.shape}", f"Features shape: {X.shape}", sep="\n")
print(f"\nNumber of features: {X.shape[1]}", f"Number of samples: {X.shape[0]}", sep="\n")

In [None]:
# Summarize the target: central tendency, spread, and extremes
print("Target Variable (Release Year) Statistics:")
print(f"Mean: {np.mean(y):.2f}")
print(f"Median: {np.median(y):.2f}")
print(f"Std Dev: {np.std(y):.2f}")
print(f"Min: {np.min(y):.0f}")
print(f"Max: {np.max(y):.0f}")
print(f"Range: {np.max(y) - np.min(y):.0f} years")

In [None]:
# Three side-by-side views of the target distribution
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Draw each panel's artist first: histogram, box plot, violin plot
axes[0].hist(y, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
axes[1].boxplot(y, vert=True)
parts = axes[2].violinplot([y], vert=True, showmeans=True, showmedians=True)

# Then apply the per-panel title and the shared grid styling in one pass
panel_titles = (
    'Distribution of Release Years',
    'Box Plot of Release Years',
    'Violin Plot of Release Years',
)
for panel, panel_title in zip(axes, panel_titles):
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3)

# Axis labels: only the histogram gets an x label
axes[0].set_xlabel('Year')
axes[0].set_ylabel('Frequency')
axes[1].set_ylabel('Year')
axes[2].set_ylabel('Year')

plt.tight_layout()
plt.show()

In [None]:
# Count NaN entries once per array and reuse the totals below
nan_in_target = np.isnan(y).sum()
nan_in_features = np.isnan(X).sum()

print("Missing Values in Dataset:")
print(f"Target: {nan_in_target}")
print(f"Features: {nan_in_features}")

# The percentage line is printed only when at least one feature value is missing
if nan_in_features > 0:
    print(f"\nPercentage of missing values: {(nan_in_features / X.size * 100):.2f}%")

## 3. Data Preprocessing

In [None]:
# Wrap the raw arrays in labelled pandas objects; columns are named
# feature_1 .. feature_N in their original order
feature_names = [f'feature_{position}' for position in range(1, X.shape[1] + 1)]
X_df = pd.DataFrame(data=X, columns=feature_names)
y_series = pd.Series(data=y, name='year')

print("Features DataFrame:")
X_df.head()

In [None]:
# Statistical summary of features
# describe() is the last expression in the cell, so the notebook renders its
# result (pandas' per-column summary table) as the cell output.
print("Feature Statistics:")
X_df.describe()

In [None]:
# Detect outliers using the IQR method (detection only; no values are modified)
def detect_outliers_iqr(data, threshold=1.5):
    """
    Flag values that fall outside the interquartile-range fences.

    The fences sit ``threshold`` IQRs below the 25th percentile and above the
    75th percentile; values strictly outside them are marked as outliers.

    Args:
        data: Numeric array-like to inspect.
        threshold: Multiplier applied to the IQR when placing the fences.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is
        a boolean mask aligned with ``data``.
    """
    first_quartile, third_quartile = np.percentile(data, [25, 75])
    spread = third_quartile - first_quartile

    fence_low = first_quartile - threshold * spread
    fence_high = third_quartile + threshold * spread

    # Strict comparisons: values exactly on a fence are not outliers
    mask = np.logical_or(data < fence_low, data > fence_high)
    return mask, fence_low, fence_high

# Apply the IQR rule to the target and report how many years fall outside the fences
outliers_y, lower_y, upper_y = detect_outliers_iqr(y)
outlier_count_y = outliers_y.sum()
outlier_share_y = outlier_count_y/len(y)*100
print(f"Outliers in target variable: {outlier_count_y} ({outlier_share_y:.2f}%)")
print(f"Target range without outliers: [{lower_y:.0f}, {upper_y:.0f}]")

# Flagged rows are deliberately left in the dataset
print("\nNote: Keeping all data points as they represent valid release years.")

In [None]:
# Histogram grid for the leading features (at most 12 panels in a 3x4 layout)
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 10))
axes = axes.ravel()

# Slicing the flat axes array by the feature count caps the loop at
# min(12, number of features)
for feature_pos, panel in enumerate(axes[:X.shape[1]]):
    panel.hist(X[:, feature_pos], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    panel.set_title(f'Feature {feature_pos + 1}', fontsize=10)
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.suptitle('Distribution of First 12 Features', fontsize=14, fontweight='bold', y=1.01)
plt.show()

## 4. Train-Test Split and Feature Scaling

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Show the span of years covered by each partition
print(f"\nTraining set year range: [{y_train.min():.0f}, {y_train.max():.0f}]")
print(f"Test set year range: [{y_test.min():.0f}, {y_test.max():.0f}]")

In [None]:
# Standardize the features: the scaler learns its statistics from the training
# rows only and is then reused unchanged for the test rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")
print(f"Scaled training features shape: {X_train_scaled.shape}")
print(f"Scaled test features shape: {X_test_scaled.shape}")

# Sanity check over the whole scaled training matrix
print(f"\nMean of scaled features (should be ~0): {X_train_scaled.mean():.6f}")
print(f"Std of scaled features (should be ~1): {X_train_scaled.std():.6f}")

## 5. Machine Learning Models

In [None]:
# Function to evaluate regression models
def evaluate_regression_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Fit ``model`` on the training split, score it on both splits, and print a report.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.
        model_name: Label used in the printed report and in the returned dict.

    Returns:
        dict with the fitted ``model``, its ``model_name``, the metrics
        ``{train,test}_{mse,rmse,mae,r2}``, and the predictions
        ``y_train_pred`` / ``y_test_pred``.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Training {model_name}...")
    print(rule)

    model.fit(X_train, y_train)

    # Predict and score each split; RMSE is derived from the MSE of the same split
    splits = {'train': (X_train, y_train), 'test': (X_test, y_test)}
    predictions = {}
    scores = {}
    for split, (features, target) in splits.items():
        predicted = model.predict(features)
        mse = mean_squared_error(target, predicted)
        predictions[split] = predicted
        scores[split] = {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'mae': mean_absolute_error(target, predicted),
            'r2': r2_score(target, predicted),
        }

    print(f"\n{model_name} Results:")
    for heading, split in (("Training Set", 'train'), ("Test Set", 'test')):
        split_scores = scores[split]
        print(f"\n{heading}:")
        print(f"  MSE:  {split_scores['mse']:.4f}")
        print(f"  RMSE: {split_scores['rmse']:.4f}")
        print(f"  MAE:  {split_scores['mae']:.4f}")
        print(f"  R²:   {split_scores['r2']:.4f}")

    # Flatten into '<split>_<metric>' keys, metric by metric with train before test
    results = {'model': model, 'model_name': model_name}
    for metric in ('mse', 'rmse', 'mae', 'r2'):
        for split in ('train', 'test'):
            results[f'{split}_{metric}'] = scores[split][metric]
    for split in ('train', 'test'):
        results[f'y_{split}_pred'] = predictions[split]
    return results

### 5.1 Linear Regression

In [None]:
# Ordinary least squares baseline, trained on the standardized features
lr_model = LinearRegression()
lr_results = evaluate_regression_model(
    model=lr_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="Linear Regression",
)

### 5.2 Ridge Regression

In [None]:
# Linear model with an L2 penalty (alpha=1.0)
ridge_model = Ridge(random_state=42, alpha=1.0)
ridge_results = evaluate_regression_model(
    model=ridge_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="Ridge Regression",
)

### 5.3 Lasso Regression

In [None]:
# Linear model with an L1 penalty (alpha=0.1); max_iter is set to 10000
lasso_model = Lasso(max_iter=10000, random_state=42, alpha=0.1)
lasso_results = evaluate_regression_model(
    model=lasso_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="Lasso Regression",
)

### 5.4 Random Forest

In [None]:
# Bagged tree ensemble: 100 trees, depth capped at 15, all cores used for fitting
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
rf_results = evaluate_regression_model(
    model=rf_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="Random Forest",
)

### 5.5 XGBoost

In [None]:
# Gradient-boosted trees via XGBoost: 100 rounds, depth 6, learning rate 0.1
xgb_model = XGBRegressor(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
xgb_results = evaluate_regression_model(
    model=xgb_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="XGBoost",
)

### 5.6 LightGBM

In [None]:
# Gradient-boosted trees via LightGBM with the same budget as the XGBoost
# model; verbose=-1 is passed to reduce LightGBM's log output
lgbm_model = LGBMRegressor(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)
lgbm_results = evaluate_regression_model(
    model=lgbm_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="LightGBM",
)

## 6. Deep Learning Model

In [None]:
# Build Neural Network for Regression
def build_regression_nn(input_dim):
    """
    Assemble and compile the fully connected regression network.

    The stack is Dense(256) -> Dense(128) -> Dense(64) -> Dense(32) -> Dense(1).
    Every hidden layer uses ReLU and is followed by dropout; the first three
    are also batch-normalized. The single output unit has no activation.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled Keras ``Sequential`` model (Adam with learning rate 0.001,
        MSE loss, MAE and RMSE metrics).
    """
    # (units, dropout rate, whether batch normalization follows the dense layer)
    hidden_spec = (
        (256, 0.3, True),
        (128, 0.3, True),
        (64, 0.2, True),
        (32, 0.2, False),
    )

    network = models.Sequential()
    for position, (units, drop_rate, with_batch_norm) in enumerate(hidden_spec):
        # Only the first layer declares the input shape
        extra = {'input_shape': (input_dim,)} if position == 0 else {}
        network.add(layers.Dense(units, activation='relu', **extra))
        if with_batch_norm:
            network.add(layers.BatchNormalization())
        network.add(layers.Dropout(drop_rate))

    # Output layer for regression (no activation)
    network.add(layers.Dense(1))

    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    rmse_metric = keras.metrics.RootMeanSquaredError(name='rmse')
    network.compile(optimizer=optimizer, loss='mse', metrics=['mae', rmse_metric])

    return network

# Create the model
# The input width is taken from the scaled training matrix so the network
# matches the number of feature columns.
nn_model = build_regression_nn(X_train_scaled.shape[1])
nn_model.summary()

In [None]:
# Define callbacks
# Both callbacks monitor `val_loss`. That loss is computed on a validation
# slice carved out of the training data (see `validation_split` below), never
# on the held-out test set.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=7,
    min_lr=1e-7,
    verbose=1
)

# Train the model
# The test set must not be used as validation data: early stopping (with
# restore_best_weights) and the learning-rate schedule would then be tuned on
# it, and the test metrics reported later would be optimistically biased.
# Instead, 10% of the training rows are held out for validation. Keras takes
# that slice from the end of the arrays; the rows were already shuffled by
# train_test_split, so the slice is a random sample.
print("Training Neural Network...")
history = nn_model.fit(
    X_train_scaled, y_train,
    validation_split=0.1,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
# One panel per tracked quantity, each showing the training and validation curves
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# (history key, panel title, short name used in the legend and on the y axis)
curve_specs = (
    ('loss', 'Model Loss (MSE)', 'Loss'),
    ('mae', 'Mean Absolute Error', 'MAE'),
    ('rmse', 'Root Mean Squared Error', 'RMSE'),
)

for panel, (history_key, panel_title, short_name) in zip(axes, curve_specs):
    panel.plot(history.history[history_key], label=f'Train {short_name}')
    panel.plot(history.history[f'val_{history_key}'], label=f'Val {short_name}')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(short_name)
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Score the trained network on both splits and package the numbers in the same
# dict layout that evaluate_regression_model returns
print(f"\n{'=' * 60}")
print("Evaluating Neural Network...")
print("=" * 60)

# predict() returns one column per output unit; collapse it to a 1-D vector
y_train_pred_nn = nn_model.predict(X_train_scaled).reshape(-1)
y_test_pred_nn = nn_model.predict(X_test_scaled).reshape(-1)

# Training-split metrics
train_mse_nn = mean_squared_error(y_train, y_train_pred_nn)
train_rmse_nn = np.sqrt(train_mse_nn)
train_mae_nn = mean_absolute_error(y_train, y_train_pred_nn)
train_r2_nn = r2_score(y_train, y_train_pred_nn)

# Test-split metrics
test_mse_nn = mean_squared_error(y_test, y_test_pred_nn)
test_rmse_nn = np.sqrt(test_mse_nn)
test_mae_nn = mean_absolute_error(y_test, y_test_pred_nn)
test_r2_nn = r2_score(y_test, y_test_pred_nn)

# Store results
nn_results = {
    'model': nn_model,
    'model_name': 'Neural Network',
    'train_mse': train_mse_nn,
    'test_mse': test_mse_nn,
    'train_rmse': train_rmse_nn,
    'test_rmse': test_rmse_nn,
    'train_mae': train_mae_nn,
    'test_mae': test_mae_nn,
    'train_r2': train_r2_nn,
    'test_r2': test_r2_nn,
    'y_train_pred': y_train_pred_nn,
    'y_test_pred': y_test_pred_nn
}

# Print the report from the stored values, training split first
print("\nNeural Network Results:")
for heading, split in (("Training Set", 'train'), ("Test Set", 'test')):
    print(f"\n{heading}:")
    print(f"  MSE:  {nn_results[split + '_mse']:.4f}")
    print(f"  RMSE: {nn_results[split + '_rmse']:.4f}")
    print(f"  MAE:  {nn_results[split + '_mae']:.4f}")
    print(f"  R²:   {nn_results[split + '_r2']:.4f}")

## 7. Model Comparison

In [None]:
# Gather every model's result dict in the order the models were trained
all_results = [lr_results, ridge_results, lasso_results, rf_results, xgb_results, lgbm_results, nn_results]

# Display column -> key inside each result dict
summary_columns = {
    'Model': 'model_name',
    'Train RMSE': 'train_rmse',
    'Test RMSE': 'test_rmse',
    'Train MAE': 'train_mae',
    'Test MAE': 'test_mae',
    'Train R²': 'train_r2',
    'Test R²': 'test_r2',
}
comparison_df = pd.DataFrame(
    [
        {column: result[key] for column, key in summary_columns.items()}
        for result in all_results
    ]
)

print(f"\n{'=' * 100}")
print("MODEL COMPARISON SUMMARY")
print("=" * 100)
print(comparison_df.to_string(index=False))
print(f"\n{'=' * 100}")

# The winner is the row with the highest test-set R²; its index label also
# positions the matching entry in all_results
best_model_idx = comparison_df['Test R²'].idxmax()
best_row = comparison_df.loc[best_model_idx]
best_model_name = best_row['Model']
print(f"\nBest Model (by Test R²): {best_model_name}")
print(f"Test R²: {best_row['Test R²']:.4f}")
print(f"Test RMSE: {best_row['Test RMSE']:.4f} years")
print(f"Test MAE: {best_row['Test MAE']:.4f} years")

In [None]:
# Bar charts per metric: training values on the top row, test values below
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
metrics = [('Train RMSE', 'Test RMSE'), ('Train MAE', 'Test MAE'), ('Train R²', 'Test R²')]
row_colors = ('steelblue', 'salmon')

for col, metric_pair in enumerate(metrics):
    # Each pair is (train metric, test metric), which maps onto rows 0 and 1
    for row, (metric, bar_color) in enumerate(zip(metric_pair, row_colors)):
        panel = axes[row, col]
        panel.bar(comparison_df['Model'], comparison_df[metric], color=bar_color)
        panel.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
        panel.set_ylabel(metric)
        panel.tick_params(axis='x', rotation=45)
        panel.grid(True, alpha=0.3)

        # Annotate each bar with its value
        for bar_pos, bar_value in enumerate(comparison_df[metric]):
            panel.text(bar_pos, bar_value, f'{bar_value:.2f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 8. Prediction Analysis

In [None]:
# Pull the winning model's test predictions and derive its residuals
best_result = all_results[best_model_idx]
y_test_pred_best = best_result['y_test_pred']
residuals = y_test - y_test_pred_best

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
scatter_ax, residual_ax = axes

# Left panel: actual vs predicted, with the identity line as the reference
actual_span = [y_test.min(), y_test.max()]
scatter_ax.scatter(y_test, y_test_pred_best, alpha=0.5, color='steelblue')
scatter_ax.plot(actual_span, actual_span, 'r--', lw=2)
scatter_ax.set_xlabel('Actual Year', fontsize=12)
scatter_ax.set_ylabel('Predicted Year', fontsize=12)
scatter_ax.set_title(f'Actual vs Predicted ({best_model_name})', fontsize=14, fontweight='bold')
scatter_ax.grid(True, alpha=0.3)

# Right panel: residuals against the prediction, with a zero reference line
residual_ax.scatter(y_test_pred_best, residuals, alpha=0.5, color='salmon')
residual_ax.axhline(y=0, color='r', linestyle='--', lw=2)
residual_ax.set_xlabel('Predicted Year', fontsize=12)
residual_ax.set_ylabel('Residuals', fontsize=12)
residual_ax.set_title(f'Residual Plot ({best_model_name})', fontsize=14, fontweight='bold')
residual_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary of the residuals
print(f"\nResidual Analysis for {best_model_name}:")
print(f"Mean Residual: {residuals.mean():.4f}")
print(f"Std Residual: {residuals.std():.4f}")
print(f"Min Residual: {residuals.min():.4f}")
print(f"Max Residual: {residuals.max():.4f}")

In [None]:
# Shape of the residual distribution: histogram plus a normal Q-Q plot
from scipy import stats

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
hist_ax, qq_ax = axes

# Histogram with a vertical marker at zero error
hist_ax.hist(residuals, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='r', linestyle='--', lw=2)
hist_ax.set_xlabel('Residual (years)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of Residuals', fontsize=14, fontweight='bold')
hist_ax.grid(True, alpha=0.3)

# probplot draws the residual quantiles against normal quantiles on the given axes
stats.probplot(residuals, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot (Normality Check)', fontsize=14, fontweight='bold')
qq_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Compare predictions across all models
# The grid is sized from the number of results rather than assuming exactly
# seven, so adding or removing a model neither raises an IndexError nor leaves
# empty framed panels. With seven models this is still a 2x4 grid at 20x10.
n_cols = 4
n_rows = max(1, -(-len(all_results) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.flatten()

# The identity line spans the same range in every panel
actual_span = [y_test.min(), y_test.max()]

for idx, result in enumerate(all_results):
    y_pred = result['y_test_pred']
    axes[idx].scatter(y_test, y_pred, alpha=0.5)
    axes[idx].plot(actual_span, actual_span, 'r--', lw=2)
    axes[idx].set_xlabel('Actual Year')
    axes[idx].set_ylabel('Predicted Year')
    axes[idx].set_title(f"{result['model_name']}\n(R²={result['test_r2']:.3f}, RMSE={result['test_rmse']:.2f})")
    axes[idx].grid(True, alpha=0.3)

# Hide every panel that did not receive a model
for unused_ax in axes[len(all_results):]:
    unused_ax.axis('off')

plt.suptitle('Actual vs Predicted - All Models', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## 9. Feature Importance Analysis

In [None]:
# Rank features by the Random Forest's importances, if that model was evaluated
evaluated_names = [r['model_name'] for r in all_results]
if 'Random Forest' in evaluated_names:
    rf_idx = evaluated_names.index('Random Forest')
    rf_model_trained = all_results[rf_idx]['model']

    # One row per feature, most important first
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': rf_model_trained.feature_importances_
    }).sort_values('importance', ascending=False)
    top_20 = feature_importance.head(20)
    bar_positions = range(len(top_20))

    # Horizontal bars with the y axis inverted so the top-ranked feature is at the top
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_20['importance'], color='steelblue')
    plt.yticks(bar_positions, top_20['feature'])
    plt.xlabel('Importance', fontsize=12)
    plt.title('Top 20 Most Important Features (Random Forest)', fontsize=14, fontweight='bold')
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

    print("Top 20 Most Important Features:")
    print(top_20)

## 10. Summary and Conclusions

In [None]:
# Final text report: dataset facts, models tried, the winner, and a ranking
print(f"\n{'=' * 100}")
print("SONG YEAR PREDICTION PROJECT SUMMARY")
print("=" * 100)

print("\n1. Dataset Statistics:")
print(f"   - Total samples: {len(y)}")
print(f"   - Number of features: {X.shape[1]}")
print(f"   - Year range: {y.min():.0f} - {y.max():.0f}")
print(f"   - Training set: {len(y_train)} samples")
print(f"   - Test set: {len(y_test)} samples")

print("\n2. Models Evaluated:")
for position, result in enumerate(all_results, start=1):
    print(f"   {position}. {result['model_name']}")

# Look up the winner's test metrics once and reuse them
best_test_r2 = comparison_df.loc[best_model_idx, 'Test R²']
best_test_rmse = comparison_df.loc[best_model_idx, 'Test RMSE']
best_test_mae = comparison_df.loc[best_model_idx, 'Test MAE']

print(f"\n3. Best Model: {best_model_name}")
print(f"   - Test R²: {best_test_r2:.4f}")
print(f"   - Test RMSE: {best_test_rmse:.4f} years")
print(f"   - Test MAE: {best_test_mae:.4f} years")
print(f"   - Interpretation: On average, predictions are off by ~{best_test_mae:.1f} years")

print("\n4. Key Techniques Used:")
for technique in (
    "Feature scaling (StandardScaler)",
    "Multiple regression algorithms (Linear, Ridge, Lasso, RF, XGBoost, LightGBM)",
    "Deep Learning (Neural Network with batch normalization and dropout)",
    "Comprehensive evaluation metrics (MSE, RMSE, MAE, R²)",
    "Residual analysis and visualization",
):
    print(f"   - {technique}")

# Rank from best to worst test R²; the index is reset, so rank equals row position + 1
print("\n5. Model Performance Ranking (by Test R²):")
ranked_df = comparison_df.sort_values('Test R²', ascending=False).reset_index(drop=True)
for rank, (_, row) in enumerate(ranked_df.iterrows(), start=1):
    print(f"   {rank}. {row['Model']:20s} - R²: {row['Test R²']:.4f}, RMSE: {row['Test RMSE']:.2f}")

print(f"\n{'=' * 100}")
print("PROJECT COMPLETED SUCCESSFULLY!")
print("=" * 100)