# Machine Learning Pipeline - feature selection

# Reproducibility: Setting the seed
To ensure reproducibility between runs of the same notebook, and also between the research and production environments, it is extremely important that we set the seed for every step that includes some element of randomness.

In [2]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# show all the columns of a dataframe when it is displayed
# (None removes the column limit). Use the documented top-level
# `pd.set_option` rather than the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [3]:
# read the training and test feature tables from disk
X_train, X_test = (pd.read_csv(path) for path in ('xtrain.csv', 'xtest.csv'))

# peek at the first rows of the training features
X_train.head()

Unnamed: 0,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,Pmin,PDweek,Dmonth,Dday,Dhour,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,0.123735,0.439752,0.690362,0.564855,0.743494,0.137613,0.909091,0.633333,0.782609,0.135593,0.166667,0.909091,0.633333,0.782609,0.440678,0.166667,0.47028,0.0,0.27027,0.238636,0.0,0.0,0.257256,0.151316
1,0.27063,0.536782,0.072524,0.390582,0.245192,0.248401,0.181818,0.5,0.043478,0.322034,0.666667,0.181818,0.5,0.086957,0.186441,0.666667,0.437063,0.0,0.283784,0.829545,0.0,0.0,0.282322,0.046053
2,0.041725,0.513858,0.536762,0.509323,0.570664,0.040074,0.727273,0.933333,0.565217,0.830508,0.833333,0.727273,0.933333,0.608696,0.016949,0.833333,0.746503,0.0,0.189189,0.340909,0.78125,0.0,0.693931,0.046053
3,0.072366,0.544863,0.312979,0.476247,0.32805,0.069557,0.818182,0.4,1.0,0.372881,0.833333,0.818182,0.4,1.0,0.559322,0.833333,0.503497,0.0,0.040541,0.590909,0.0,0.0,0.283641,0.059211
4,0.042026,0.138733,0.353218,0.163031,0.349159,0.024294,0.818182,1.0,0.347826,0.915254,0.333333,0.818182,1.0,0.391304,0.0,0.333333,0.375874,0.0,0.243243,0.647727,0.042614,0.0,0.208443,0.049342


In [6]:
# read the training and test targets from disk
y_train, y_test = (pd.read_csv(path) for path in ('ytrain.csv', 'ytest.csv'))

# peek at the first rows of the training target
y_train.head()

Unnamed: 0,Duration
0,2.833213
1,3.931826
2,2.484907
3,2.302585
4,1.609438


In [7]:
# (rows, columns) of the test features and of the test target
(X_test.shape, y_test.shape)

((96012, 24), (96012, 1))

In [8]:
# wrap a Lasso regressor in a feature selector; random_state is pinned
# so that any randomised step inside the estimator is repeatable
sel_ = SelectFromModel(estimator=Lasso(alpha=0.001, random_state=42))

# fit the Lasso on the training data; the selector then keeps the
# features according to the fitted coefficients
sel_.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.001, random_state=42))

In [9]:
# number of features kept by the selector (count of True entries in the mask)
np.sum(sel_.get_support())

16

In [10]:
# features that were selected

In [11]:
# boolean mask with one entry per input column: True marks a selected feature
sel_.get_support(indices=False)

array([ True, False,  True, False, False,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True, False, False, False])

In [12]:
# index the column labels with the selector's boolean mask to get
# the names of the selected features
selected_feats = X_train.columns[sel_.get_support()]

# summary: how many features went in, how many were kept, and how many
# coefficients of the fitted Lasso are exactly zero
print('total features: {}'.format(len(X_train.columns)))
print('selected features: {}'.format(len(selected_feats)))
print('features with coefficients shrank to zero: {}'.format(
    (sel_.estimator_.coef_ == 0).sum()))

total features: 24
selected features: 16
features with coefficients shrank to zero: 8


In [13]:
# display the names of the selected features
selected_feats

Index(['Distance', 'PLatd', 'Haversine', 'Pmonth', 'Pday', 'Phour', 'Pmin',
       'PDweek', 'Dday', 'Dhour', 'Dmin', 'DDweek', 'Temp', 'Wind', 'Humid',
       'Solar'],
      dtype='object')

In [14]:
# Persist the names of the selected features, one name per line.
# `header` is passed explicitly because the default of Series.to_csv is
# not the same across pandas versions (no header row before 1.0, a header
# row from 1.0 onwards), so relying on it makes the file layout depend on
# the installed pandas. With header=False the file contains only the
# feature names; read it back with
# pd.read_csv('selected_features.csv', header=None).
pd.Series(selected_feats).to_csv(
    'selected_features.csv', index=False, header=False)

  """Entry point for launching an IPython kernel.
