In [None]:
import pandas as pd
import numpy as np

# Load both workbooks in one pass: `df` holds the rows that are modelled below,
# `song_df` the per-song statistics that are later joined onto them by 'name'.
df, song_df = (
    pd.read_excel(workbook)
    for workbook in ("hop1_miss1_len10.xlsx", "song_stats.xlsx")
)

In [6]:
# Per-song statistics keyed by song name, ready to be joined onto the rows of df.
song_stats = song_df.set_index('name')

# Build the modelling frame in one chain: left-join the song statistics (every
# row of df is kept; unmatched names get NaN statistics), then derive the
# engineered features and the regression target.
df_processed = (
    df.copy()
    .merge(song_stats, how='left', left_on='name', right_index=True)
    .assign(
        # 1. Velocity relative to the song-level mean velocity.
        relative_velocity=lambda d: d['velocity'].div(d['velocity_mean']),
        # 2. Mean note relative to the song-level mean note.
        relative_note_mean=lambda d: d['note_mean'].div(d['note_mean_song']),
        # 3. Duration per event relative to the song-level duration per event
        #    (raw ratio; folded in the next step).
        relative_duration_per_event=lambda d: (
            d['duration'].div(d['length']).div(d['duration_per_event'])
        ),
    )
    .assign(
        # Fold the ratio: every value below 1 is replaced by its reciprocal, so
        # "half as long" and "twice as long" map to the same magnitude.
        relative_duration_per_event=lambda d: d['relative_duration_per_event'].mask(
            d['relative_duration_per_event'] < 1,
            1 / d['relative_duration_per_event'],
        ),
        # 4. Entropy divided by log2(number of unique notes); NaN results
        #    (e.g. 0 / 0 when log2(1) == 0) are set to 0.
        #    NOTE(review): presumably note_entropy is measured in bits - confirm.
        normed_note_entropy=lambda d: (
            d['note_entropy'].div(np.log2(d['note_unique'])).fillna(0)
        ),
        # 5. Note changes per unit of length.
        normed_note_change=lambda d: d['note_change'].div(d['length']),
        # Regression target.
        # NOTE(review): presumably true positives minus false positives - confirm.
        target=lambda d: d['tp'].sub(d['fp']),
    )
)



In [7]:
# Candidate predictors for the subset search. Order matters only in that
# itertools.combinations enumerates subsets in this order.
features = [
    # columns used as they come out of the merged frame
    'length',
    'misses',
    'error',
    'velocity',
    'duration',
    'note_mean',
    'note_std',
    'note_entropy',
    'note_unique',
    'note_change',
    # features engineered in the preprocessing cell
    'relative_velocity',
    'relative_note_mean',
    'relative_duration_per_event',
    'normed_note_entropy',
    'normed_note_change',
]
len(features)

15

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from itertools import combinations
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and numerical warnings from pandas / numpy /
# scikit-learn. Consider narrowing it to the specific warning that is noisy.
warnings.filterwarnings('ignore')

# Prepare data
X = df_processed[features]
y = df_processed['target']

# Keep only rows whose features and target are all finite.
# The engineered ratio features are plain divisions, so a zero denominator
# yields +/-inf rather than NaN. An isna()-only mask lets those rows through,
# and LinearRegression.fit then fails on the infinite input. Dropping them here
# keeps the cleaning step in one place; on data without infinities the result
# is identical to the NaN-only mask.
non_finite = [np.inf, -np.inf]
invalid_X = (X.isna() | X.isin(non_finite)).any(axis=1)
invalid_y = y.isna() | y.isin(non_finite)
mask = ~(invalid_X | invalid_y)
X_clean = X[mask]
y_clean = y[mask]

print(f"Data shape: {X_clean.shape}")
print(f"Target shape: {y_clean.shape}")


Data shape: (893495, 15)
Target shape: (893495,)


In [10]:
def evaluate_feature_combination(feature_combo, X_data, y_data, n_splits=5, random_state=42):
    """
    Evaluate a combination of features using linear regression.

    A fresh LinearRegression is fitted on each training fold of a shuffled
    K-fold split and scored on the held-out fold.

    Parameters
    ----------
    feature_combo : str or iterable of str
        Column name(s) of ``X_data`` to use as predictors. A single string is
        treated as one column name (not as a sequence of characters).
    X_data : pandas.DataFrame
        Feature table; must contain every column in ``feature_combo``.
    y_data : pandas.Series
        Target values, positionally aligned with ``X_data`` (same length and
        row order).
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated calls use identical folds and
        different feature subsets are compared on the same splits.

    Returns
    -------
    dict
        ``'features'`` (the ``feature_combo`` argument, unchanged) plus the
        mean and population standard deviation (``np.std`` default) across
        folds of MAE, RMSE and R-squared, under the keys ``'mae_mean'``,
        ``'mae_std'``, ``'rmse_mean'``, ``'rmse_std'``, ``'r2_mean'``,
        ``'r2_std'``.

    Raises
    ------
    ValueError
        If ``feature_combo`` is empty, or if ``X_data`` and ``y_data`` have
        different lengths.
    KeyError
        If a requested column is missing from ``X_data``.
    """
    # list('length') would yield single characters, so wrap a bare string.
    if isinstance(feature_combo, str):
        columns = [feature_combo]
    else:
        columns = list(feature_combo)

    if not columns:
        raise ValueError("feature_combo must contain at least one feature")

    # Rows are paired positionally (iloc) below, so a length mismatch would
    # silently pair features with the wrong targets.
    if len(X_data) != len(y_data):
        raise ValueError(
            f"X_data and y_data must have the same length "
            f"(got {len(X_data)} and {len(y_data)})"
        )

    X_subset = X_data[columns]

    # Shuffled K-fold with a fixed seed (defaults: 5 folds, seed 42)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    mae_scores = []
    rmse_scores = []
    r2_scores = []

    for train_idx, test_idx in kf.split(X_subset):
        X_train, X_test = X_subset.iloc[train_idx], X_subset.iloc[test_idx]
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[test_idx]

        # Train a fresh model on this fold
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Score on the held-out fold
        y_pred = model.predict(X_test)
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        r2_scores.append(r2_score(y_test, y_pred))

    return {
        'features': feature_combo,
        'mae_mean': np.mean(mae_scores),
        'mae_std': np.std(mae_scores),
        'rmse_mean': np.mean(rmse_scores),
        'rmse_std': np.std(rmse_scores),
        'r2_mean': np.mean(r2_scores),
        'r2_std': np.std(r2_scores)
    }


In [18]:
%%time
# Test all combinations of features up to 8
n_combs = [1, 2, 3, 4, 5, 6, 7, 8]
results = []

print("Testing all feature combinations...")
print(f"Total combinations to test: {
    sum(len(list(combinations(features, n))) for n in n_combs)
}")

# Test all combinations
for n_features in n_combs:
    print(f"\nTesting {n_features}-feature combinations...")
    combo_list = list(combinations(features, n_features))
    
    for i, feature_combo in enumerate(combo_list):
        if i % 50 == 0:
            print(f"Progress: {i}/{len(combo_list)}")
        
        result = evaluate_feature_combination(feature_combo, X_clean, y_clean)
        result['n_features'] = n_features
        results.append(result)

print(f"\nCompleted! Evaluated {len(results)} combinations.")


Testing all feature combinations...
Total combinations to test: 22818

Testing 1-feature combinations...
Progress: 0/15

Testing 2-feature combinations...
Progress: 0/105
Progress: 50/105
Progress: 100/105

Testing 3-feature combinations...
Progress: 0/455
Progress: 50/455
Progress: 100/455
Progress: 150/455
Progress: 200/455
Progress: 250/455
Progress: 300/455
Progress: 350/455
Progress: 400/455
Progress: 450/455

Testing 4-feature combinations...
Progress: 0/1365
Progress: 50/1365
Progress: 100/1365
Progress: 150/1365
Progress: 200/1365
Progress: 250/1365
Progress: 300/1365
Progress: 350/1365
Progress: 400/1365
Progress: 450/1365
Progress: 500/1365
Progress: 550/1365
Progress: 600/1365
Progress: 650/1365
Progress: 700/1365
Progress: 750/1365
Progress: 800/1365
Progress: 850/1365
Progress: 900/1365
Progress: 950/1365
Progress: 1000/1365
Progress: 1050/1365
Progress: 1100/1365
Progress: 1150/1365
Progress: 1200/1365
Progress: 1250/1365
Progress: 1300/1365
Progress: 1350/1365

Testing 5

In [19]:
# One row per evaluated feature combination
results_df = pd.DataFrame(results)

rule = "=" * 80
print(rule)
print("BEST COMBINATIONS BY METRIC")
print(rule)

# Overall winners: lowest mean MAE / RMSE, highest mean R² across all subsets
best_mae = results_df.loc[results_df['mae_mean'].idxmin()]
best_rmse = results_df.loc[results_df['rmse_mean'].idxmin()]
best_r2 = results_df.loc[results_df['r2_mean'].idxmax()]

# Column prefix -> display label, in the order the metrics are reported
metric_labels = {'mae': 'MAE', 'rmse': 'RMSE', 'r2': 'R²'}

for metric, winner in (('mae', best_mae), ('rmse', best_rmse), ('r2', best_r2)):
    # Headline: the metric this row wins on, then its feature subset
    print(f"\nBest {metric_labels[metric]}: {winner[metric + '_mean']:.4f} (±{winner[metric + '_std']:.4f})")
    print(f"Features ({winner['n_features']}): {list(winner['features'])}")
    # Followed by the same row's two other metrics
    for other, label in metric_labels.items():
        if other == metric:
            continue
        print(f"{label}: {winner[other + '_mean']:.4f} (±{winner[other + '_std']:.4f})")


BEST COMBINATIONS BY METRIC

Best MAE: 2.3861 (±0.0013)
Features (6): ['length', 'misses', 'error', 'velocity', 'duration', 'relative_duration_per_event']
RMSE: 4.7266 (±0.0104)
R²: 0.9826 (±0.0001)

Best RMSE: 4.5660 (±0.0115)
Features (8): ['length', 'misses', 'error', 'velocity', 'note_unique', 'relative_velocity', 'relative_note_mean', 'normed_note_change']
MAE: 2.4802 (±0.0026)
R²: 0.9837 (±0.0001)

Best R²: 0.9837 (±0.0001)
Features (8): ['length', 'misses', 'error', 'velocity', 'note_unique', 'relative_velocity', 'relative_note_mean', 'normed_note_change']
MAE: 2.4802 (±0.0026)
RMSE: 4.5660 (±0.0115)


In [21]:
# Leaderboard per subset size: for each number of features, report the winning
# combination under each metric plus the spread of scores at that size.
rule = "=" * 100
print(rule)
print("BEST COMBINATIONS BY NUMBER OF FEATURES")
print(rule)

# Column prefix -> display label, in the order the metrics are reported
metric_labels = {'mae': 'MAE', 'rmse': 'RMSE', 'r2': 'R²'}

# groupby iterates the subset sizes in ascending order and keeps the original
# row labels, so idxmin / idxmax can be used with .loc on each group.
for n_feat, subset in results_df.groupby('n_features'):
    print(f"\n{n_feat}-FEATURE COMBINATIONS:")
    print("-" * 50)

    # Winners within this subset size (lower is better for MAE/RMSE, higher for R²)
    best_mae = subset.loc[subset['mae_mean'].idxmin()]
    best_rmse = subset.loc[subset['rmse_mean'].idxmin()]
    best_r2 = subset.loc[subset['r2_mean'].idxmax()]

    for metric, winner in (('mae', best_mae), ('rmse', best_rmse), ('r2', best_r2)):
        print(f"Best {metric_labels[metric]}: {winner[metric + '_mean']:.4f} (±{winner[metric + '_std']:.4f})")
        print(f"  Features: {list(winner['features'])}")
        # The winner's two remaining metrics on a single line
        others = ", ".join(
            f"{label}: {winner[other + '_mean']:.4f}"
            for other, label in metric_labels.items()
            if other != metric
        )
        print(f"  {others}")

    # Summary statistics for this n_features
    print(f"Summary for {n_feat} features:")
    for metric, label in metric_labels.items():
        fold_means = subset[metric + '_mean']
        print(f"  {label} range: {fold_means.min():.4f} - {fold_means.max():.4f}")
    print(f"  Total combinations tested: {len(subset)}")


BEST COMBINATIONS BY NUMBER OF FEATURES

1-FEATURE COMBINATIONS:
--------------------------------------------------
Best MAE: 2.7748 (±0.0048)
  Features: ['length']
  RMSE: 5.0867, R²: 0.9798
Best RMSE: 5.0867 (±0.0117)
  Features: ['length']
  MAE: 2.7748, R²: 0.9798
Best R²: 0.9798 (±0.0000)
  Features: ['length']
  MAE: 2.7748, RMSE: 5.0867
Summary for 1 features:
  MAE range: 2.7748 - 26.5295
  RMSE range: 5.0867 - 35.7827
  R² range: 0.0011 - 0.9798
  Total combinations tested: 15

2-FEATURE COMBINATIONS:
--------------------------------------------------
Best MAE: 2.4687 (±0.0040)
  Features: ['length', 'error']
  RMSE: 4.8474, R²: 0.9817
Best RMSE: 4.8474 (±0.0082)
  Features: ['length', 'error']
  MAE: 2.4687, R²: 0.9817
Best R²: 0.9817 (±0.0000)
  Features: ['length', 'error']
  MAE: 2.4687, RMSE: 4.8474
Summary for 2 features:
  MAE range: 2.4687 - 26.5068
  RMSE range: 4.8474 - 35.7557
  R² range: 0.0026 - 0.9817
  Total combinations tested: 105

3-FEATURE COMBINATIONS:
---

In [22]:
# Re-evaluate the compact 4-feature model on its own. In the recorded run its
# MAE (2.3877) is within about 0.002 of the best subset found by the search
# (2.3861), with far fewer predictors.
chosen_features = ['length', 'misses', 'error', 'velocity']
result = evaluate_feature_combination(
    feature_combo=chosen_features,
    X_data=X_clean,
    y_data=y_clean,
)
result

{'features': ['length', 'misses', 'error', 'velocity'],
 'mae_mean': np.float64(2.387722698647582),
 'mae_std': np.float64(0.0009481396608698162),
 'rmse_mean': np.float64(4.730797783113689),
 'rmse_std': np.float64(0.010353603383326887),
 'r2_mean': np.float64(0.9825403096978667),
 'r2_std': np.float64(5.9213547320988035e-05)}