# 5. Feature Engineering & Feature Selection

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import spearmanr
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the Stage 3 cleaned dataset (Google Drive mount in Colab).
stage3_csv_path = '/content/drive/MyDrive/Colab Notebooks/Spotify_cleaned_stage3.csv'

# Load it into the working dataframe used by every later cell, then preview it.
df = pd.read_csv(stage3_csv_path)
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity_class,release_year,release_month
0,6f807x0ima9a1j3VPbc7VN,I Dont Care with Justin Bieber Loud Luxury Remix,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Dont Care with Justin Bieber Loud Luxury Remix,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,60‚Äì80,2019,6
1,0r7CVbZTWZgbTCYdfa2P31,Memories Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories Dillon Francis Remix,2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,60‚Äì80,2019,12
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time Don Diablo Remix,2019-05-07,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,60‚Äì80,2019,5
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,40‚Äì60,2019,7
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved Future Humans Remix,2019-05-03,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,60‚Äì80,2019,5


In [None]:
# Inspect column names, non-null counts and dtypes before adding engineered features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29729 entries, 0 to 29728
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  29729 non-null  object 
 1   track_name                29719 non-null  object 
 2   track_artist              29721 non-null  object 
 3   track_popularity          29729 non-null  int64  
 4   track_album_id            29729 non-null  object 
 5   track_album_name          29715 non-null  object 
 6   track_album_release_date  29729 non-null  object 
 7   playlist_name             29729 non-null  object 
 8   playlist_id               29729 non-null  object 
 9   playlist_genre            29729 non-null  object 
 10  playlist_subgenre         29729 non-null  object 
 11  danceability              29729 non-null  float64
 12  energy                    29729 non-null  float64
 13  key                       29729 non-null  int64  
 14  loudne

* Feature engineering and feature ranking for the regression target `track_popularity`

In [None]:
TARGET_REG = "track_popularity"       # regression target for this stage
# popularity_class is left in the dataframe for Stage 6 (classification).

REFERENCE_YEAR = 2025  # fixed anchor year for release_age (kept constant so re-runs are reproducible)
TOP_K = 60             # size of the shortlist handed to Stage 6; adjust as you like

# --------------------------------------------------------------
# Feature enrichment (domain-based, pure transformations)
# --------------------------------------------------------------
# Base numeric features (adapt to your columns)
num_base = ['danceability','energy','loudness','acousticness',
    'instrumentalness','liveness','speechiness',
    'valence','tempo','duration_ms']

# Engineered features are written onto df itself so they are still present
# when df is inspected / saved later in the notebook.
df['dance_energy']      = df['danceability'] * df['energy']
df['valence_energy']    = df['valence'] * df['energy']
df['acoustic_instru']   = df['acousticness'] + df['instrumentalness']
df['energy_diff']       = df['energy'] - df['acousticness']
# |loudness| + 1 keeps the denominator >= 1, so the ratio can never divide by zero.
df['speech_loud_ratio'] = df['speechiness'] / (df['loudness'].abs() + 1)
df['log_duration']      = np.log1p(df['duration_ms'])

engineered_feats = ['dance_energy','valence_energy','acoustic_instru',
    'energy_diff','speech_loud_ratio','log_duration']

# Context features (simple examples)
if 'release_year' in df.columns:
    # Coerce to numeric first; unparseable values become NaN instead of raising.
    df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
    df['release_age']  = REFERENCE_YEAR - df['release_year']

num_feats = num_base + engineered_feats + (['release_age'] if 'release_age' in df.columns else [])

# Categorical columns to one-hot encode (moderate cardinality only)
cat_feats = [c for c in ['playlist_genre','playlist_subgenre','release_year','release_month'] if c in df.columns]

# Year/month are encoded as categories, so store them as strings.
# Going through the nullable 'Int64' dtype avoids labels such as "2019.0".
for c in ['release_year','release_month']:
    if c in cat_feats and c in df.columns:
        df[c] = df[c].astype('Int64').astype(str)

# --------------------------------------------------------------
# Build preprocessing (one-hot encode categoricals, scale numerics) - no splitting yet
# --------------------------------------------------------------
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_feats),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False), cat_feats),
    ],
    remainder="drop",                  # every column not listed above is discarded
    verbose_feature_names_out=False    # keep plain names (no "num__" / "cat__" prefixes)
)

pipe = Pipeline([("prep", preprocess)])

# The transformer is fitted on the whole dataset here; this is for exploratory
# ranking only (see the closing messages: Stage 6 re-fits on the training split).
X_all = df[num_feats + cat_feats].copy()
y_all = pd.to_numeric(df[TARGET_REG], errors="coerce")

# Keep rows with a valid target
mask = y_all.notna()
X_all = X_all.loc[mask]
y_all = y_all.loc[mask]

X_prepared = pipe.fit_transform(X_all)

# Final feature names, in the same order as the columns of X_prepared
feature_names = list(pipe.named_steps["prep"].get_feature_names_out())
assert len(feature_names) == X_prepared.shape[1], "feature names and matrix columns are out of sync"

print("‚úÖ Preprocessing complete.")
print("X_prepared shape:", X_prepared.shape)
print("Number of features:", len(feature_names))

# --------------------------------------------------------------
# Filter/Rank features (no model selection/tuning here)
#    - Mutual Information (nonlinear associations)
#    - Spearman rank correlation (monotonic associations)
# --------------------------------------------------------------
# With a dense matrix, mutual_info_regression treats every column as continuous
# unless told otherwise. The one-hot dummies are discrete, so flag them
# explicitly: everything that is not one of the scaled numeric inputs is a dummy.
numeric_names = set(num_feats)
is_discrete = np.array([name not in numeric_names for name in feature_names], dtype=bool)

mi = mutual_info_regression(X_prepared, y_all, discrete_features=is_discrete, random_state=42)
mi = pd.Series(mi, index=feature_names, name="mutual_info")

# Spearman per feature (robust to non-normality).
# A constant column has no defined rank correlation, so it is recorded as NaN
# explicitly; any other failure is allowed to surface instead of being hidden.
y_values = y_all.to_numpy()
spearman_vals = []
for i in range(len(feature_names)):
    column = X_prepared[:, i]
    if np.ptp(column) == 0:
        spearman_vals.append(np.nan)
        continue
    rho, _ = spearmanr(column, y_values)
    spearman_vals.append(rho)

spearman_s = pd.Series(spearman_vals, index=feature_names, name="spearman_r")

# Rank primarily by mutual information, breaking ties with |Spearman r|.
rank_tbl = pd.concat([mi, spearman_s], axis=1)
rank_tbl["abs_spearman"] = rank_tbl["spearman_r"].abs()
rank_tbl = rank_tbl.sort_values(["mutual_info","abs_spearman"], ascending=False)

display(rank_tbl.head(25))
rank_tbl.to_csv("stage5_feature_ranking_mi_spearman.csv", index=True)
print("üíæ Saved: stage5_feature_ranking_mi_spearman.csv")

# --------------------------------------------------------------
# Produce a shortlist of features for Stage 6
#    (final selection will be re-fit on training data in Stage 6)
# --------------------------------------------------------------
shortlist = rank_tbl.head(TOP_K).index.tolist()
print(f"üìã Shortlist prepared (TOP_K={TOP_K}).")

# Save artifacts for Stage 6
pd.DataFrame({"feature": feature_names}).to_csv("stage5_all_features.csv", index=False)
pd.DataFrame({"feature": shortlist}).to_csv("stage5_shortlist_features.csv", index=False)


print("‚úÖ Stage 5 finished. No model selection, no cross-validation, no tuning performed.")
print("‚û°Ô∏è Next: Stage 6 will use train/test split, re-fit the preprocesser on train only,")
print("         and evaluate models (RF/XGB/Linear) with CV + hyperparameter tuning.")


‚úÖ Preprocessing complete.
X_prepared shape: (29729, 130)
Number of features: 130


Unnamed: 0,mutual_info,spearman_r,abs_spearman
duration_ms,0.366226,-0.101018,0.101018
log_duration,0.366134,-0.101018,0.101018
speech_loud_ratio,0.364581,0.027049,0.027049
tempo,0.36046,-0.020317,0.020317
dance_energy,0.355429,-0.0444,0.0444
valence_energy,0.349048,-0.005158,0.005158
loudness,0.333051,0.053705,0.053705
acoustic_instru,0.33143,-0.016539,0.016539
energy_diff,0.321669,-0.139457,0.139457
acousticness,0.292234,0.124156,0.124156


üíæ Saved: stage5_feature_ranking_mi_spearman.csv
üìã Shortlist prepared (TOP_K=60).
‚úÖ Stage 5 finished. No model selection, no cross-validation, no tuning performed.
‚û°Ô∏è Next: Stage 6 will use train/test split, re-fit the preprocesser on train only,
         and evaluate models (RF/XGB/Linear) with CV + hyperparameter tuning.


In [None]:
# Re-check the schema now that the engineered feature columns have been added to df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29729 entries, 0 to 29728
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  29729 non-null  object 
 1   track_name                29719 non-null  object 
 2   track_artist              29721 non-null  object 
 3   track_popularity          29729 non-null  int64  
 4   track_album_id            29729 non-null  object 
 5   track_album_name          29715 non-null  object 
 6   track_album_release_date  29729 non-null  object 
 7   playlist_name             29729 non-null  object 
 8   playlist_id               29729 non-null  object 
 9   playlist_genre            29729 non-null  object 
 10  playlist_subgenre         29729 non-null  object 
 11  danceability              29729 non-null  float64
 12  energy                    29729 non-null  float64
 13  key                       29729 non-null  int64  
 14  loudne

In [None]:
# Persist the enriched dataframe (original columns plus engineered features)
# without the index column.
stage5_csv_path = 'Spotify_cleaned_stage5.csv'
df.to_csv(stage5_csv_path, index=False)

# ✅ Continue Feature Engineering & Feature Selection, and then on to Model Selection and Fine Tuning