##### The main aim of this notebook is to help myself review Sklearn and Feature engineering techniques, and also address the importance of feature selection.

##### 4/24/2019 
Many thanks to the blog "Feature Selection Techniques in Machine Learning with Python" , "https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e", which give me the intuition of the improtance of feature engineering and feature selection

This notbook is based on the project "https://www.kaggle.com/iabhishekofficial/mobile-price-classification"

In [68]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import Binarizer,OneHotEncoder,StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest

from sklearn.metrics import accuracy_score

In [5]:
train_path = os.path.join('mobile-price-classification/train.csv')
test_path = os.path.join('mobile-price-classification/test.csv')
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [14]:
training_x = train_data.drop(axis=1,columns='price_range')
training_y = train_data.loc[:,'price_range']


In [26]:
## feel the dataset
training_x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [28]:
training_x.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,9.9165,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,6.064315,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,0.0,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,5.0,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,10.0,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,15.0,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,20.0,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0


## Training a RandomForrest without feature engineering  and feature selection

In [21]:
model = RandomForestClassifier()
model.fit(training_x,training_y)
train_acc = accuracy_score(y_pred=model.predict(training_x),y_true=training_y)
print('Raw acc on training dataset: ', train_acc)

Raw acc on training dataset:  0.996


#### Notice that it is overfitting, we split the training set and check the average performance on cross-validation set

In [25]:
cross_validation_score = cross_val_score(estimator=model,X=training_x,y=training_y,cv=5)
print('The avergae acc on 5-fold cross_validation: ', cross_validation_score)
print('The mean acc on 5-fold cross_validation:', cross_validation_score.mean())

The avergae acc on 5-fold cross_validation:  [0.83   0.8025 0.82   0.775  0.8175]
The mean acc on 5-fold cross_validation: 0.8089999999999999


## Let's do some feature engineering, but still without feature selection
  * Standarlization for all columns except 'blue', 'dual_sim', 'four_g','three_g','touch_screen','wifi', becasue they are all binary data.

In [39]:
training_x2 = training_x.copy()
for column in training_x.columns:
    if column in ('blue','dual_sim', 'four_g','three_g','touch_screen','wifi'):
        continue
    else:
        current_standarlier = StandardScaler()
        transformedColumn = current_standarlier.fit_transform(np.array(training_x2.loc[:,column]).reshape(-1,1))
        training_x2[column] = pd.Series(transformedColumn.reshape(-1,))



In [42]:
#feel training_data2
training_x2.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,-0.902597,0,0.830779,0,-0.762495,0,-1.380644,0.34074,1.349249,-1.101971,-1.30575,-1.408949,-1.146784,0.391703,-0.784983,0.283103,1.462493,0,0,1
1,-0.495139,1,-1.253064,1,-0.99289,1,1.155024,0.687548,-0.120059,-0.664768,-0.645989,0.585778,1.704465,0.467317,1.114266,-0.635317,-0.734267,1,1,0
2,-1.537686,1,-1.253064,1,-0.532099,1,0.493546,1.381165,0.134244,0.209639,-0.645989,1.392684,1.074968,0.441498,-0.310171,-0.864922,-0.36814,1,1,0
3,-1.419319,1,1.198517,0,-0.99289,0,-1.215274,1.034357,-0.261339,0.646842,-0.151168,1.28675,1.236971,0.594569,0.876859,0.512708,-0.002014,1,0,0
4,1.325906,1,-0.395011,0,2.002254,1,0.658915,0.34074,0.02122,-1.101971,0.673534,1.268718,-0.091452,-0.657666,-1.022389,-0.864922,0.73024,1,1,0


#### Let's re-train the same model on training_x2, which has been standardlized, and check the accuracy on cross-validation

In [44]:
model2 = RandomForestClassifier()
model2.fit(training_x2,training_y)
cross_acc2 = cross_val_score(model2,X=training_x2,y=training_y,cv=5)
print('The cross validation accuracy after standardlization is:', cross_acc2)
print('The average acc on cross validation is: ', cross_acc2.mean())

The cross validation accuracy after standardlization is: [0.85   0.7625 0.7825 0.8125 0.7775]
The average acc on cross validation is:  0.7969999999999999


#### Ops, standilization seem doesn't help, how about the most popular gboost algorithm?

In [51]:
model_gboost1 = GradientBoostingClassifier()
#model_gboost1.fit(training_x,training_y)
acc_gb_cross_val = cross_val_score(model_gboost1,training_x,training_y,cv=5)
print('Gboosting acc on cross validation:', acc_gb_cross_val)
print('The average acc on cross validation for gboosting:', acc_gb_cross_val.mean())

Gboosting acc on cross validation: [0.895  0.9175 0.9075 0.91   0.8925]
The average acc on cross validation for gboosting: 0.9045


In [54]:
# using standarlized data
model_gboost2 = GradientBoostingClassifier()
acc_gb_cross_val2 = cross_val_score(model_gboost2,training_x2,training_y,cv=5)
print('Gboosting acc on cross validation:', acc_gb_cross_val2)
print('The average acc on cross validation for gboosting:', acc_gb_cross_val2.mean())

Gboosting acc on cross validation: [0.895  0.9175 0.9075 0.9075 0.8925]
The average acc on cross validation for gboosting: 0.9039999999999999


#### From the experiment above, we find that :
   * The gradientboost classifier helps a lot
   * The standardlization doesn't help too much, sometimes it hurts

## Let us just tune some parameters

In [58]:
#grid search
num_estimators = np.arange(100,500,5)
best_model = 0
best_Acc_cross = 0
for number in num_estimators:
    current_model = GradientBoostingClassifier(n_estimators=number)
    current_acc = cross_val_score(current_model,training_x,training_y,cv=5).mean()
    if current_acc > best_Acc_cross:
        best_model = current_model
        best_Acc_cross = current_acc
        print('Estimator number: ',number,' best Cross_val_acc: ',current_acc)

Estimator number:  100  best Cross_val_acc:  0.9045
Estimator number:  140  best Cross_val_acc:  0.905
Estimator number:  165  best Cross_val_acc:  0.907
Estimator number:  285  best Cross_val_acc:  0.9075
Estimator number:  305  best Cross_val_acc:  0.908
Estimator number:  310  best Cross_val_acc:  0.9080000000000001
Estimator number:  325  best Cross_val_acc:  0.9085000000000001
Estimator number:  445  best Cross_val_acc:  0.909


##### We will use Estimator number = 445 because it can get good performance 

#### let's do some feature selection:
    * chi-square
    * Tree based selection

In [62]:
chi_feature_selector = SelectKBest(score_func=chi2,k=6)
training_x3 = chi_feature_selector.fit_transform(training_x,training_y)

In [67]:
model_xg3 = GradientBoostingClassifier(n_estimators=445)
cross_val_acc3 = cross_val_score(model_xg3,training_x3,training_y,cv=5).mean()
print('After feature selection（chi2,k=6）, the acc on cross_val_dataset is:', cross_val_acc3)

After feature selection（chi2,k=6）, the acc on cross_val_dataset is: 0.915


##### Let's try Tree-based feautre selection

In [74]:
model_xg4 = GradientBoostingClassifier(n_estimators=445)
treeSelector = ExtraTreesClassifier()
#treeSelector.fit(training_x,training_y)

training_x4 = SelectFromModel(treeSelector).fit_transform(training_x,training_y)


In [76]:
cross_val_acc4 = cross_val_score(model_xg4,training_x4,training_y,cv=5)
print('After feature selection [tree-based], the acc on cross validation set is: ',cross_val_acc4.mean())

After feature selection [tree-based], the acc on cross validation set is:  0.8734999999999999


#### In conclusion, we can see that the best combination until now is :
    * model: gboosting(n_estimator=445)
    * Standarlization: No
    * Feature selection: Yes, (Chi2, k=6)
    

### Now let us generate our final classifier

In [77]:
model_final = GradientBoostingClassifier(n_estimators=445)
feature_selector_final = SelectKBest(chi2,k=6)
training_x_final = feature_selector_final.fit_transform(training_x,training_y)
model_final.fit(training_x_final,training_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=445,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)