# Energy Dataset Feature Selection

In [81]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [82]:
# Read the transformed energy dataset from the Part3_Feature_Engineering folder
df_loaded = pd.read_csv("../Part3_Feature_Engineering/energydata_complete_transformed.csv")
df = df_loaded  # alias of the loaded frame (same object, not a copy)

# 'Appliances' is the prediction target; every remaining column is a predictor
y = df['Appliances']
X = df.drop(columns=['Appliances'])

# Empty frame indexed by predictor name; one column per selection method is added to it
all_feature = list(X.columns)
feature_df = pd.DataFrame(index=all_feature)

## Recursive Feature Elimination
> [Source](https://machinelearningmastery.com/feature-selection-in-python-with-scikit-learn/):
>The Recursive Feature Elimination (RFE) method is a feature selection approach. It works by recursively removing attributes and building a model on those attributes that remain. It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

>This recipe shows the use of RFE to select 3 attributes.

In [None]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import (LinearRegression, Ridge, Lasso)
from sklearn.ensemble import RandomForestRegressor

# Number of features each RFE run keeps (these receive rank 1)
N_FEATURES_TO_SELECT = 3
# Fixed seed so the Random Forest ranking is reproducible across kernel restarts
RFE_RANDOM_STATE = 42

# Base regressors used by RFE to score the remaining features at each step
model_lr = LinearRegression()
model_lasso = Lasso()
# n_jobs=-1 builds the trees in parallel: RFE refits the forest once per
# eliminated feature, which is by far the slowest part of this cell.
model_rf = RandomForestRegressor(
    n_estimators=500,
    random_state=RFE_RANDOM_STATE,
    n_jobs=-1,
)

# Fit one RFE selector per base regressor (RFE.fit returns the fitted selector)
rfe_lr = RFE(model_lr, n_features_to_select=N_FEATURES_TO_SELECT).fit(X, y)
rfe_lasso = RFE(model_lasso, n_features_to_select=N_FEATURES_TO_SELECT).fit(X, y)
rfe_rf = RFE(model_rf, n_features_to_select=N_FEATURES_TO_SELECT).fit(X, y)

# Summarize the selection of the attributes.
# ranking_ follows the column order of X, which is also the index order of
# feature_df; rank 1 marks a selected feature.
rank_columns = {
    'RFE_LR_Rank': rfe_lr,
    'RFE_Lasso_Rank': rfe_lasso,
    'RFE_RF_Rank': rfe_rf,
}
for rank_column, selector in rank_columns.items():
    feature_df[rank_column] = selector.ranking_

feature_df

## Feature Importance
>[Source](https://machinelearningmastery.com/feature-selection-in-python-with-scikit-learn/):
>Methods that use ensembles of decision trees (like Random Forest or Extra Trees) can also compute the relative importance of each attribute. These importance values can be used to inform a feature selection process.

>This recipe shows the construction of an Extra Trees ensemble and displays the relative feature importance.

In [None]:
# Feature Importance
from sklearn import metrics
# ExtraTreesClassifier is still imported so any later cell that refers to it keeps working
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

# Fit an Extra Trees model to the data.
# The target 'Appliances' is modelled as a regression target everywhere else in
# this notebook, so a regressor is used here: a classifier would treat every
# distinct target value as its own class (and rejects a continuous target).
# random_state makes the importances reproducible across runs.
model = ExtraTreesRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Display the relative importance of each attribute
# (column name kept as 'Feature_Imp_ETC' because later cells sort on it)
feature_df['Feature_Imp_ETC'] = model.feature_importances_
feature_df

Because the Random Forest model gave the best predictions, its RFE ranking is used for feature selection:

In [None]:
# Sort the dataframe as per Random Forest Rank (rank 1 = selected by RFE)
feature_rank_sort_df = feature_df.sort_values('RFE_RF_Rank')
# Expose the feature names (the index) as a regular column so seaborn can plot them
feature_rank_sort_df['Feature'] = feature_rank_sort_df.index

# Plot the ranking of the features.
# sns.factorplot and its `size` argument were renamed to catplot / `height`
# in seaborn 0.9 and have since been removed.
sns.catplot(x='RFE_RF_Rank', y='Feature', data=feature_rank_sort_df, kind="bar", height=14, aspect=1.9, palette='coolwarm');

In [None]:
# Order the features from highest to lowest Extra Trees importance and expose
# the feature names (the index) as a regular 'Feature' column
feature_imp_sort_df = (
    feature_df
    .sort_values('Feature_Imp_ETC', ascending=False)
    .assign(Feature=lambda ranked: ranked.index)
)
feature_imp_sort_df

# Plot the ranking of the features (currently disabled)
# sns.factorplot(x='RFE_LR_Rank', y='Feature', data = feature_rank_sort_df, kind="bar", size=14, aspect=1.9, palette='coolwarm');