# Feature Selection stage

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [74]:
df_spotify_2 = pd.read_pickle(r'C:\Users\kagan\Dropbox\קורס DS בר-אילן\שיעורים\Projects\Project 3 - ML preparation and model\Spotify_data\spotify_df_after_eda.pkl')

In [76]:

# Assuming X and y are already defined from your modeling DataFrame (df_model)
# and that X has only numeric/boolean features.
X = df_spotify_2.drop('track_popularity', axis=1)
X = X.select_dtypes(include=['number', 'bool'])
y = df_spotify_2['track_popularity']

In [83]:

y = y.astype(float)

In [80]:
print(X.dtypes)

danceability                 float64
energy                       float64
key                            int64
loudness                     float64
mode                           int64
speechiness                  float64
acousticness                 float64
instrumentalness             float64
liveness                     float64
valence                      float64
tempo                        float64
duration_ms                    int64
track_album_release_year       int64
track_album_release_month      int64
song_age                       int64
decade                         int64
genre_edm                    boolean
genre_latin                  boolean
genre_pop                    boolean
genre_r&b                    boolean
genre_rap                    boolean
genre_rock                   boolean
dtype: object



# Feature Selection using multiple regressors 


In [94]:

# Lasso Regression
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Ridge Regression
ridge = Ridge(alpha=0.01).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# LinearSVR (using dual=True to avoid parameter conflicts)
svm = LinearSVR(C=0.01, dual=True, max_iter=10000).fit(X, y)
svm_selected = (np.abs(svm.coef_) > 0).astype(int)

# Gradient Boosting Regressor
gb = GradientBoostingRegressor().fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

# Random Forest Regressor
rf = RandomForestRegressor().fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Create a DataFrame to store the feature selection results
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected, 
    'SVR': svm_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Sum the selections for each feature (i.e., count how many models selected each feature)
selection_df['Sum'] = selection_df[['Lasso', 'SVR', 'GradientBoost', 'RandomForest', 'Ridge']].sum(axis=1)

print("Feature selection summary:")
print(selection_df)

Feature selection summary:
                      Feature  Lasso  SVR  GradientBoost  RandomForest  Ridge  \
0                danceability      1    1              1             1      1   
1                      energy      1    1              1             1      1   
2                         key      1    1              1             1      1   
3                    loudness      1    1              1             1      1   
4                        mode      1    1              1             1      1   
5                 speechiness      1    1              1             1      1   
6                acousticness      1    1              1             1      1   
7            instrumentalness      1    1              1             1      1   
8                    liveness      1    1              1             1      1   
9                     valence      1    1              1             1      1   
10                      tempo      1    1              1             1      1   
1

# --- Final Feature Selection ---

 Selected variables - recommended by 4 or more models 

In [105]:

# Select variables that were selected by at least 4 out of 5 models
threshold = 4
final_var = selection_df[selection_df['Sum'] >= threshold]['Feature'].tolist()

# Create the final modeling DataFrame with the selected features and the target variable
# (Make sure to include the target variable 'track_popularity' even if it wasn't part of X)
final_features = final_var + ['track_popularity']
df_model_final = df_spotify_2[final_features].copy()

# Output the result to verify
print("\nFinal modeling DataFrame info:")
df_model_final.info()


Final modeling DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26420 entries, 0 to 26419
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   danceability               26420 non-null  float64
 1   energy                     26420 non-null  float64
 2   key                        26420 non-null  int64  
 3   loudness                   26420 non-null  float64
 4   mode                       26420 non-null  int64  
 5   speechiness                26420 non-null  float64
 6   acousticness               26420 non-null  float64
 7   instrumentalness           26420 non-null  float64
 8   liveness                   26420 non-null  float64
 9   valence                    26420 non-null  float64
 10  tempo                      26420 non-null  float64
 11  duration_ms                26420 non-null  int64  
 12  track_album_release_year   26420 non-null  int64  
 13  track_album_re

In [110]:
# save to pickle for the next step
df_model_final.to_pickle(r'C:\Users\kagan\Dropbox\קורס DS בר-אילן\שיעורים\Projects\Project 3 - ML preparation and model\Spotify_data\df_spotify_after_feature_selection.pkl')
