# Data Exploration & Transformation  -  Feature Selection

Using the Random Forest algorithm to identify the relevant features for model building.

In [525]:
# Listing all numerical columns.
mod_numerical_cols = ['x0_CO', 'x0_GA', 'x0_IL', 'x0_MA', 'x0_MD', 'x0_NC', 'x0_NY', 
                      'x0_OH', 'x0_OR', 'x0_Rare', 'x0_TX', 'x0_VA', 'x0_WA', 'x1_Doctorate', 
                      'x1_Master', 'x1_Other', 'x2_Entry Level', 'x2_Mid-Senior level', 'x3_language_python',
                      'x4_language_r', 'x5_language_sql', 'x6_language_bash_shell', 'x7_language_java', 
                      'x8_language_javascript', 'x9_language_html_css', 'x10_platform_aws', 'x11_platform_gcp', 
                      'x12_platform_azure', 'x13_avg_age']

# Convert all object datatype to numeric datatype.
data[mod_numerical_cols] = data[mod_numerical_cols].apply(lambda x: x.astype('int'))

# Separate train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data[mod_numerical_cols],
    data['compensation'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((406, 29), (174, 29))

In [526]:
# Train random forest for regression and select features. SelectFrom model will select those features which importance
# is greater than the mean importance of all the features.

sel_ = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=10))
sel_.fit(X_train, y_train)

In [527]:
# Count the selected features.
selected_feat = X_train.columns[(sel_.get_support())]
len(selected_feat)

15

In [528]:
# Comparing amount of selected features with the amount of features which importance is above themean importance.

print('total features: {}'.format((X_train.shape[1])))

print('selected features: {}'.format(len(selected_feat)))

print(
    'features with coefficients greater than the mean coefficient: {}'.format(
        np.sum(sel_.estimator_.feature_importances_ >
               sel_.estimator_.feature_importances_.mean())))

total features: 29
selected features: 15
features with coefficients greater than the mean coefficient: 15


In [529]:
# Bind the selected column labels to a more descriptive name and display them.
# This is a plain alias: `selected_features` refers to the same Index object
# as `selected_feat`, not a copy.
selected_features = selected_feat
selected_features

Index(['x0_Rare', 'x0_WA', 'x1_Doctorate', 'x1_Master', 'x2_Entry Level',
       'x2_Mid-Senior level', 'x3_language_python', 'x4_language_r',
       'x5_language_sql', 'x6_language_bash_shell', 'x7_language_java',
       'x9_language_html_css', 'x10_platform_aws', 'x11_platform_gcp',
       'x12_platform_azure'],
      dtype='object')

In [530]:
ml_data = pd.concat([data[selected_features], data['compensation']], axis = 1)

# Analysis on Feature Selection