#  We will preform a feature selection but instead of trusting a single model to tell us which features are important we could have multiple models each cast their vote on whether we should keep a feature or not. We could then combine the votes to make a decision

In [104]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression

In [91]:
# pd.set_option("display.max_columns",100)
# pd.set_option("display.max_rows",6070)
df1=pd.read_csv('ANSUR_II_FEMALE.csv')
df2=pd.read_csv("ANSUR_II_MALE.csv")
anser_df_before=pd.concat([df1,df2])
anser_df_before.shape

(6068, 99)

# "Reducing df rows for my computaional power is kinda week"

In [92]:
t=anser_df_before.Gender
r=anser_df_before.drop("Gender",axis=1)
r_train,r_test,t_train,t_test=train_test_split(r,t,test_size=0.8,random_state=1,stratify=t)
anser_df=r_train.merge(t_train,left_on=r_train.index,right_on=t_train.index)
anser_df.drop("key_0",axis=1,inplace=True)

## the dataset have 99 features


In [93]:
print(anser_df.shape)


(1387, 99)


## first droping the non-numeric columns


In [94]:
to_drop=['Branch','Component','Gender','BMI_class','Height_class']
anser_df.drop(to_drop,axis=1,inplace=True)
anser_df.shape

(1387, 94)

## secound we train the models one by one. We'll be predicting BMI in the ANSUR dataset

In [95]:
from sklearn import preprocessing
from sklearn import utils

y=anser_df.BMI
X=anser_df.drop(["BMI"],axis=1)


lab_enc = preprocessing.LabelEncoder()
encoded = lab_enc.fit_transform(y)
utils.multiclass.type_of_target(y.astype('int'))
utils.multiclass.type_of_target(encoded)


X_train,X_test,y_train,y_test=train_test_split(X,encoded,test_size=0.3)

### We will use LassoCV() we'll get an R squared of 99% and when we create a mask that tells us whether a feature has a coefficient different from 0 we can see that this is the case for 38 out of 93 features. We'll put this lcv_mask to the side for a moment and move on to the next model.

In [99]:
# Create and fit the LassoCV model on the training set
lcv = LassoCV()
lcv.fit(X_train,y_train)
# Calculate R squared on the test set
r_squared = lcv.score(X_test,y_test)
print('The model explains {0:.1%} of the test set variance'.format(r_squared))

# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_!=0 
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))

The model explains 95.0% of the test set variance
41 features out of 93 selected


### The second model to train is a random forest regressor model. by wrapping a Recursive Feature Selector or RFE, around the model to have it select the same number of features as the LassoCV() regressor did

In [98]:
rfe_rf=RFE(RandomForestClassifier(),verbose=1,step=10,n_features_to_select=41)

rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to gb_mask
rf_mask = rfe_rf.support_

Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
Fitting estimator with 43 features.
The model can explain 17.0% of the variance in the test set


### The third model to train is a Gradient Boosting Regressor model. by wrapping a Recursive Feature Selector or RFE, around the model to have it select the same number of features as the LassoCV() regressor did

In [100]:
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=41, step=10, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to gb_mask
gb_mask = rfe_gb.support_

Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
Fitting estimator with 43 features.
The model can explain 97.0% of the variance in the test set


## Combining 3 feature selectors
### I'll combine the votes of the 3 models i built, to decide which features are important into a meta mask. I'll then use this mask to reduce dimensionality and see how a simple linear regressor performs on the reduced dataset.

In [106]:
from sklearn.preprocessing import StandardScaler
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes >= 3

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]
# Plug the reduced dataset into a linear regression pipeline
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm=LinearRegression()
scaler=StandardScaler()
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print('The model can explain {0:.1%} of the variance in the test set using {1:} features.'.format(r_squared, len(lm.coef_)))

The model can explain 93.8% of the variance in the test set using 10 features.


# As you can see the simple linear model preformed very well on the dataset with only 10 features