### W23P1 STAT 857 - Feature Selection

In [None]:
pip install lightgbm

In [7]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [8]:
## Reading the data
data = pd.read_csv('Data/W23P1_train_final.csv')

## Defining the input and target variables
X = data.drop(columns = ['fare_amount'])
Y = data['fare_amount']

### RFE with LightGBM

In [4]:
## Running RFECV
RFE_results = list()

for i in tqdm(range(0, 10)):
    
    auto_feature_selection = RFECV(estimator = LGBMRegressor(objective = 'rmse'), step = 1, min_features_to_select = 2, cv = 5).fit(X, Y)
    
    ## Extracting and storing features to be selected
    RFE_results.append(auto_feature_selection.support_)

## Changing to data-frame
RFE_results = pd.DataFrame(RFE_results)
RFE_results.columns = X.columns

## Computing the percentage of time features are flagged as important
RFE_results = 100 * RFE_results.apply(np.sum, axis = 0) / RFE_results.shape[0]
RFE_results

100%|██████████| 10/10 [12:25<00:00, 74.56s/it]


pickup_longitude         100.0
pickup_latitude          100.0
dropoff_longitude        100.0
dropoff_latitude         100.0
distance                 100.0
haversine                100.0
duration                 100.0
passenger_count          100.0
pickup_day               100.0
holiday                  100.0
Monday                   100.0
Tuesday                  100.0
Wednesday                  0.0
Thursday                 100.0
Friday                   100.0
Saturday                 100.0
weekend                  100.0
pickup_hour              100.0
rush_hour                100.0
overnight                100.0
airport                  100.0
LGA                      100.0
JFK                      100.0
EWR                      100.0
pickup_LGA                 0.0
dropoff_LGA                0.0
pickup_JFK               100.0
dropoff_JFK              100.0
pickup_EWR                 0.0
dropoff_EWR                0.0
pickup_airport           100.0
dropoff_airport          100.0
change_b

### RFE with Random Forest

In [18]:
def flat_list(my_list):
    
    ## Defining list to store results
    out_list = list()
    for i in my_list:
        out_list += i
    return out_list

def RF_RFE_rep_cross_val(X, Y, numb_folds, max_features, numb_reps):
    
    ## Defining list to store results
    RFE_rep_results = list()
    for i in range(2, max_features):
        RFE_rep_results.append(RF_rep_cross_val(X, Y, numb_folds, i, numb_reps))
        print('Features -->', i) ## Sanity check
    return RFE_rep_results

def RF_rep_cross_val(X, Y, numb_folds, numb_features, numb_reps):
    
    ## Defining the list to store results
    rep_results = list()
    for i in range(0, numb_reps):
        rep_results.append(RF_cross_val(X, Y, numb_folds, numb_features))
    return flat_list(rep_results)

def RF_cross_val(X, Y, numb_folds, numb_features):
    
    ## Defining list to store results
    results = list()
    
    ## Defining the number of folds
    kf = KFold(n_splits = numb_folds, shuffle = True)
    
    for train_index, test_index in kf.split(X):
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        
        ## Running RFE with i features
        RF_rfe = RFE(estimator = RandomForestRegressor(n_estimators = 100, max_depth = 3), 
                     n_features_to_select = numb_features).fit(X_train, Y_train)
        
        ## Variables to be considered
        to_select = X_train.columns[RF_rfe.support_]
        to_select_list.append(RF_rfe.support_)
        
        ## Building the Random Forest model
        X_train_md = X_train[to_select]
        X_test_md = X_test[to_select]
        
        RF_md = RandomForestRegressor(n_estimators = 100, max_depth = 3).fit(X_train_md, Y_train)
        
        ## Predicting on the test data-frame and storing RMSE
        results.append(mean_squared_error(Y_test, RF_md.predict(X_test_md), squared = False))

    return results

##--------------------------------------------------

## Defining list to store results
to_select_list = list()

## Running RFE to estimate number of features to be selected
RFE_numb_features = RF_RFE_rep_cross_val(X, Y, numb_folds = 3, max_features = 13, numb_reps = 1)

Features --> 2
Features --> 3
Features --> 4
Features --> 5
Features --> 6
Features --> 7
Features --> 8
Features --> 9
Features --> 10
Features --> 11
Features --> 12


In [19]:
## Identifying features
features = pd.DataFrame(to_select_list)
features.columns = X.columns
feature_selections = 100 * features.apply(np.sum, axis = 0) / features.shape[0]
feature_selections = pd.DataFrame(feature_selections).reset_index(drop = False)

## Model performance given the number of variables
feature_performance = pd.DataFrame(RFE_numb_features)
feature_performance.columns = [['Split_1', 'Split_2', 'Split_3']]
feature_performance['Mean'] = feature_performance.apply(np.mean, axis = 1)
feature_performance['Num_features'] = feature_performance.index + 2

In [23]:
feature_selections.sort_values(0, ascending = False)

Unnamed: 0,index,0
4,distance,100.0
5,haversine,100.0
2,dropoff_longitude,90.909091
6,duration,54.545455
0,pickup_longitude,51.515152
29,dropoff_EWR,51.515152
23,EWR,51.515152
31,dropoff_airport,36.363636
30,pickup_airport,33.333333
20,airport,24.242424


In [21]:
feature_performance

Unnamed: 0,Split_1,Split_2,Split_3,Mean,Num_features
0,4.156233,4.332403,4.14406,4.210899,2
1,3.854412,4.091507,4.456117,4.134012,3
2,4.033476,4.155102,4.13837,4.108983,4
3,4.238669,4.124631,4.1331,4.165467,5
4,4.293622,3.946469,4.119363,4.119818,6
5,4.168646,3.779943,4.413123,4.120571,7
6,4.147297,4.134957,4.058322,4.113525,8
7,3.971296,3.899632,4.6351,4.168676,9
8,3.838219,4.522384,3.996892,4.119165,10
9,3.925245,4.669388,3.775928,4.12352,11
