# Feature Selection Using Lasso

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Custom functions
from sample_panel.merge_datasets import merge_bank_macro_datasets
from supervised_learning.cross_validation import PanelDataSplit
from supervised_learning.cross_validation import search_best_model
from supervised_learning.cross_validation import Lasso_chosen_features
from supervised_learning.cross_validation import find_Lasso_coef

from supervised_learning.estimate_errors import estimate_median_relative_error
from supervised_learning.estimate_errors import estimate_mean_relative_error
from supervised_learning.estimate_errors import estimate_errors

## Preparing Data

In [3]:
# Read the bank panel from a CSV file located in the working directory
bank_panel_file = 'df_response_vars.csv'
bank_data = pd.read_csv(bank_panel_file)

In [4]:
# Read the macroeconomic indicators from a CSV file located in the working directory
macro_data = pd.read_csv('macro_features.csv')
macro_columns = macro_data.columns

# Lagged factors are not used in the model: keep only the columns
# whose name does not contain the '_lag' marker
new_macro_columns = [name for name in macro_columns if '_lag' not in name]
macro_data = macro_data.loc[:, new_macro_columns]

In [7]:
# Merge the bank panel with the macroeconomic indicators into one table.
# NOTE(review): the join keys and join type are decided inside
# merge_bank_macro_datasets — confirm them there.
data_set = merge_bank_macro_datasets(bank_data, macro_data)

In [8]:
# Drop the rows where the lagged response ('Provision_Lag1') is NaN (the lag leaves
# missing values) and renumber the index from 0.
# The result is rebound to data_set instead of mutating the frame in place, so the
# object returned by the merge step is left untouched.
data_set = (
    data_set
    .dropna(subset=['Provision_Lag1'])
    .reset_index(drop=True)
)

## Parameters

In [9]:
# Name of the column that holds the response variable
y_col = "Provision for Loan Lease Losses as % of Aver. Assets"

In [10]:
# Candidate factor sets.
# With Lasso there is no need to enumerate every possible combination of factors:
# the regression selects the structure itself by leaving non-zero coefficients only
# on the factors it keeps. Only broad candidate sets are therefore listed here.
# The two candidates differ only in their last factor
# ('Market Volatility Index' vs 'Market Volatility Index change').
_shared_factors = [
    'Provision_Lag1',
    'Real GDP growth_ema3',
    'BBB corporate yield',
    '3-month Treasury rate change',
    'Dow Jones Total Stock Market Index change',
]

models = {
    'model1': _shared_factors + ['Market Volatility Index'],
    'model2': _shared_factors + ['Market Volatility Index change'],
}

## Additional features

In [11]:
# Fixed effects (individual intercept for each bank): one dummy column per value of
# 'IDRSSD', named 'IDRSSD1_<value>'; the first level is dropped (drop_first=True).
# The dummies are built straight from 'IDRSSD', and any dummy columns left over from a
# previous run of this cell are removed before concatenating, so re-running the cell
# does not append duplicate 'IDRSSD1_*' columns to data_set.
bank_dummies = pd.get_dummies(data_set['IDRSSD'], prefix='IDRSSD1', drop_first=True)
fixed_effects_features = list(bank_dummies.columns)
data_set = pd.concat(
    [data_set.drop(columns=fixed_effects_features, errors='ignore'), bank_dummies],
    axis=1,
)

## Train data set

In [12]:
# Hold-out split by report date: rows dated after 2021-12-31 form the test set and
# must not take part in feature selection; rows up to and including that date are
# used for training.
# NOTE(review): the cutoff is compared as a string literal — confirm that
# 'Report Date' holds ISO-formatted strings or datetimes.
train_end_date = '2021-12-31'
report_dates = data_set['Report Date']
data_set_train = data_set.loc[report_dates <= train_end_date].copy()
data_set_test = data_set.loc[report_dates > train_end_date].copy()

### Removing outliers from the train set

In [13]:
# Trim extreme response values in the train set: keep the rows whose response lies
# between the 0.5th and the 99th percentile (both bounds inclusive), then renumber
# the index from 0.
# NOTE(review): the bounds are asymmetric (0.5 vs 99) — confirm this is intended.
response_train = data_set_train[y_col]
lower_limit = np.percentile(response_train, 0.5)
upper_limit = np.percentile(response_train, 99)

data_set_train = (
    data_set_train[response_train.between(lower_limit, upper_limit)]
    .reset_index(drop=True)
)

## Pipeline

In [14]:
# Lasso needs scaled inputs: the penalty acts on the size of the coefficients, and
# those sizes depend on the units of each feature. Standardizing first brings the
# coefficients to a similar magnitude so that the regularization treats all features
# evenly instead of penalizing some of them only because of their scale.
lasso_model = Lasso(max_iter=10000)
scaler = StandardScaler()

pipeline = Pipeline(
    steps=[
        ("scaler", scaler),
        ("lasso", lasso_model),
    ]
)

## Defining cross validator and GridSearchCV

In [15]:
# Project-specific cross-validation splitter for panel data (modelled on
# sklearn.model_selection.TimeSeriesSplit); it receives the report dates of the
# train set as its date axis.
panel_cv = PanelDataSplit(
    n_splits=5,
    test_size=4,
    date_axis=data_set_train['Report Date'],
)

In [16]:
# Regularization strengths to try for the 'lasso' step of the pipeline
lasso_alphas = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.75, 1]
param_grid = {'lasso__alpha': lasso_alphas}

# Grid search over the alphas, scored by R^2 with the panel cross-validator
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=panel_cv,
)

##  Model Selection

In [None]:
# Evaluate the candidate models with the configured grid search.
# NOTE(review): the selection logic and the exact meaning of each returned object
# are defined in search_best_model — see supervised_learning.cross_validation.
(
    best_model_name,
    best_score,
    best_model,
    models_results,
    estimators,
) = search_best_model(data_set_train, models, search, y_col, fixed_effects_features)

In [None]:
# Standard error of the mean of the cross-validation R^2:
# the 'Cross-Validation R^2 std' column divided by sqrt(number of CV splits)
n_cv_splits = panel_cv.get_n_splits()
sem_column = 'Cross-Validation R^2 Standard Error of the Mean'
models_results[sem_column] = models_results['Cross-Validation R^2 std'] / n_cv_splits ** 0.5

models_results

##  Features chosen with Lasso

### Model1

In [None]:
# Factors reported by Lasso_chosen_features for model1.
# NOTE(review): assumes estimators[0] is the estimator fitted for 'model1' —
# confirm the order returned by search_best_model.
Lasso_chosen_features(
    estimators[0],
    models['model1'],
    y_col,
    fixed_effects_features,
    data_set_train,
)

### Model2

In [None]:
# Factors reported by Lasso_chosen_features for model2.
# NOTE(review): assumes estimators[1] is the estimator fitted for 'model2' —
# confirm the order returned by search_best_model.
Lasso_chosen_features(
    estimators[1],
    models['model2'],
    y_col,
    fixed_effects_features,
    data_set_train,
)

##   The chosen model's performance on the test dataset

In [31]:
# Display the name of the model returned as best by search_best_model
best_model_name

'model2'

In [32]:
# Display the estimator object returned as best_model by search_best_model
best_model

In [40]:
# Refit the selected pipeline on the whole train set.
# The factor list is looked up with best_model_name instead of a hard-coded key, so
# the refit always uses the factors of the model that was actually selected
# (the selected model in the recorded run is 'model2', so results are unchanged).
model_factors_all = models[best_model_name] + fixed_effects_features
X_train = data_set_train[model_factors_all]
y_train = data_set_train[y_col]
best_model.fit(X_train, y_train)

# In-sample predictions, used for the train-sample error measures
y_pred = best_model.predict(X_train)

### Train sample

In [41]:
# Error measures for the in-sample (train) predictions.
# NOTE(review): lower_limit / upper_limit are the outlier bounds computed on the
# train set; how estimate_errors uses them is defined in
# supervised_learning.estimate_errors — confirm there.
estimate_errors(y_train, y_pred, lower_limit, upper_limit)

Unnamed: 0,measure
R squared,0.649312
RMSE,0.509759
"median relative error, %",43.474426


### Test sample

In [57]:
# Test-set feature matrix: the same columns, in the same order, as used for fitting.
# X_test is defined here before any use; the earlier stale cell called
# X_test.dropna() before the variable existed, which fails on a fresh kernel.
X_test = data_set_test[model_factors_all]

In [96]:
# Observed response values of the test set
y_test = data_set_test.loc[:, y_col]

In [97]:
# Number of missing values in each column of X_test
missing_per_column = X_test.isnull().sum()
print(missing_per_column)

# Whether X_test contains any NaN at all
has_missing_values = X_test.isnull().values.any()
print("Czy są NaN w X_test?", has_missing_values)

Provision_Lag1                                  0
Real GDP growth_ema3                            0
BBB corporate yield                          1218
3-month Treasury rate change                 1218
Dow Jones Total Stock Market Index change    1218
                                             ... 
IDRSSD1_4041421                                 0
IDRSSD1_4114567                                 0
IDRSSD1_4160667                                 0
IDRSSD1_4262534                                 0
IDRSSD1_5278251                                 0
Length: 295, dtype: int64
Czy są NaN w X_test? True


In [98]:
# Drop the test rows that have missing factor values, and keep y_test aligned with
# the rows that remain: without this, y_test would still contain the dropped rows and
# could not be compared row by row with predictions made on X_test.
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

In [106]:
# Predictions for the test set.
# NOTE: this rebinds y_pred, which held the train-set predictions up to this point.
y_pred = best_model.predict(X_test)