## Machine Learning Model Building Pipeline: Feature Selection

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
# Show every column when a DataFrame is displayed (None removes the limit).
# Call the option API through the public `pd.set_option` entry point rather
# than the incidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [2]:
# Read the train and test sets from the Data/ folder.
train_csv, test_csv = 'Data/xtrain.csv', 'Data/xtest.csv'
X_train = pd.read_csv(train_csv)
X_test = pd.read_csv(test_csv)

# Preview the first rows of the training set.
X_train.head()

Unnamed: 0,id_mutation,id_parcelle,id_bien,date_mutation,adresse_nom_voie,nom_commune,valeur_fonciere,nature_mutation,code_departement,nombre_lots,code_type_local,type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude,code_type_local_na,surface_reelle_bati_na,nombre_pieces_principales_na,surface_terrain_na,longitude_na,latitude_na
0,2017-1381514,95018000AV0057,95018000AV0057-95,2017-05-16,RUE DE ST QUENTIN,Argenteuil,12.354493,0.666667,0.95,0.0,0.0,1.0,0.331789,0.033333,0.317739,0.808165,0.984716,0.0,0.0,0.0,0.0,0.0,0.0
1,2017-131542,132098460A0288,132098460A0288-13,2017-04-07,RUE ANTOINE FORTUNE MARION,Marseille 9e Arrondissement,13.075272,0.666667,0.875,0.0,0.0,1.0,0.381939,0.055556,0.405742,0.860498,0.940072,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-1162525,83038000AB0022,83038000AB0022-83,2017-05-22,SAINTE ANNE,Châteaudouble,11.652687,0.666667,0.85,0.0,0.0,0.0,0.349591,0.044444,0.49723,0.871099,0.943037,1.0,1.0,1.0,0.0,0.0,0.0
3,2019-173403,44109000NY0325,44109000NY0325-44,2019-03-29,RUE FELIX LEMOINE,Nantes,9.510445,0.666667,0.575,0.00303,0.666667,0.25,0.349591,0.0,0.416154,,0.971868,0.0,1.0,0.0,1.0,0.0,0.0
4,2017-242501,22011000AB0237,22011000AB0237-22,2017-04-27,LE BOURG,Boqueho,8.006368,0.666667,0.125,0.0,0.0,0.0,0.349591,0.044444,0.227757,,0.981227,1.0,1.0,1.0,0.0,0.0,0.0


In [3]:
# Columns removed from the feature matrices: the id_* columns, the date,
# the street/commune names, the target itself, the coordinates and the
# *_na columns. The list is defined once so that the train and test sets
# always lose exactly the same columns.
NON_FEATURE_COLS = [
    'id_mutation', 'id_parcelle', 'id_bien', 'date_mutation',
    'adresse_nom_voie', 'nom_commune', 'valeur_fonciere',
    'longitude', 'latitude',
    'code_type_local_na', 'surface_reelle_bati_na',
    'nombre_pieces_principales_na', 'surface_terrain_na',
    'longitude_na', 'latitude_na',
]

# capture the target before it is removed from the feature matrices
y_train = X_train['valeur_fonciere']
y_test = X_test['valeur_fonciere']

# drop the non-feature columns; reassigning (instead of inplace=True)
# leaves the originally loaded DataFrame objects unmodified
X_train = X_train.drop(columns=NON_FEATURE_COLS)
X_test = X_test.drop(columns=NON_FEATURE_COLS)

### Feature Selection

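Let's go ahead and select a subset of the most predictive features. With its default settings the Lasso solver is deterministic (`random_state` is only used when `selection='random'`), but we still set the seed so that the selection stays reproducible if that option is ever changed.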

In [4]:
# Base estimator for the selection: a Lasso regression with a fixed
# random_state so that repeated runs give the same result.
lasso = Lasso(alpha=0.005, random_state=123)

# Wrap the estimator in SelectFromModel and fit it on the training data.
sel_ = SelectFromModel(lasso)
sel_.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=123))

In [5]:
# this command lets us visualise which features were kept:
# it returns a boolean mask with one entry per column of X_train,
# where True marks a kept feature
sel_.get_support()

array([ True,  True, False,  True,  True,  True, False,  True])

In [6]:
# Summarise the outcome of the selection.

# names of the columns flagged True by the fitted selector
support_mask = sel_.get_support()
selected_feat = X_train.columns[support_mask]

# counts reported below
n_total = X_train.shape[1]
n_selected = len(selected_feat)
n_zero_coef = np.sum(sel_.estimator_.coef_ == 0)

# print one line per statistic
for template, value in (
    ('total features: {}', n_total),
    ('selected features: {}', n_selected),
    ('features with coefficients shrank to zero: {}', n_zero_coef),
):
    print(template.format(value))

total features: 8
selected features: 6
features with coefficients shrank to zero: 2


In [7]:
# display the names of the selected features
# (the columns of X_train flagged True by sel_.get_support())
selected_feat

Index(['nature_mutation', 'code_departement', 'code_type_local', 'type_local',
       'surface_reelle_bati', 'surface_terrain'],
      dtype='object')

### Identify the selected variables

In [8]:
# Alternative way of identifying the selected features: keep every column
# whose fitted Lasso coefficient is different from zero.
# NOTE(review): SelectFromModel documents a default threshold of 1e-5 for
# L1-penalised estimators, so this list could differ from get_support()
# if a coefficient is tiny but not exactly zero -- confirm they agree.
lasso_coefs = sel_.estimator_.coef_
nonzero_mask = (lasso_coefs != 0).ravel().tolist()
selected_feats = X_train.columns[nonzero_mask]
selected_feats

Index(['nature_mutation', 'code_departement', 'code_type_local', 'type_local',
       'surface_reelle_bati', 'surface_terrain'],
      dtype='object')

In [9]:
# Persist the selected feature names to a CSV file (one name per row,
# without the index column).
selected_series = pd.Series(selected_feats)
selected_series.to_csv('selected_features.csv', index=False)

That is all for this notebook. In the next video, we will go ahead and build the final model using the selected features. See you then!