In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import RFE, SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    import warnings
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
mobile_train = pd.read_csv("data/mobile_price_train.csv")
mobile_test = pd.read_csv("data/mobile_price_test.csv")

<IPython.core.display.Javascript object>

In [5]:

mobile_train.head()
# mobile_train.info()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


<IPython.core.display.Javascript object>

In [6]:
mobile_train["n_cores"].value_counts()

4    274
7    259
8    256
2    247
5    246
3    246
1    242
6    230
Name: n_cores, dtype: int64

<IPython.core.display.Javascript object>

In [7]:
mobile_train

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,869,18,10,19,1,1,1,0


<IPython.core.display.Javascript object>

In [8]:
mobile_test["id"]

0         1
1         2
2         3
3         4
4         5
       ... 
995     996
996     997
997     998
998     999
999    1000
Name: id, Length: 1000, dtype: int64

<IPython.core.display.Javascript object>

In [9]:
X = mobile_train.iloc[:, 0:20]
y = mobile_train.iloc[:, 20]

# X_test = mobile_test.iloc[:, 1:21]
# y_test = mobile_test.iloc[:, 20]

<IPython.core.display.Javascript object>

In [10]:
bin_cols = ["blue", "dual_sim", "four_g", "three_g", "touch_screen", "wifi"]

num_cols = list(X.drop(columns=bin_cols).columns)
num_cols

['battery_power',
 'clock_speed',
 'fc',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time']

<IPython.core.display.Javascript object>

In [11]:
# preprocessing = ColumnTransformer(
#     [
#         # Should only use one of these
#         # Comment out or delete one of the below 2 lines
#         # ('OneHotEncoder', OneHotEncoder(drop=drop_cats), cat_cols),
#         # ('leaveoneoutencoder', LeaveOneOutEncoder(), cat_cols),
#         # Scale numeric columns (not needed for all models but can't hurt)
#         ("scaler", StandardScaler(), num_cols)
#         # bin_cols we'll leave untouch
#     ],
#     remainder="passthrough",
# )


# pipeline = Pipeline(
#     [
#         ("preprocessing", preprocessing),
#         # Choose your model and put it here
#         ("model", estimator),
#     ]
# )

<IPython.core.display.Javascript object>

In [12]:
mobile_train.drop(columns=bin_cols)

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,price_range
0,842,2.2,1,7,0.6,188,2,2,20,756,2549,9,7,19,1
1,1021,0.5,0,53,0.7,136,3,6,905,1988,2631,17,3,7,2
2,563,0.5,2,41,0.9,145,5,6,1263,1716,2603,11,2,9,2
3,615,2.5,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,2
4,1821,1.2,13,44,0.6,141,2,14,1208,1212,1411,8,2,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,0.5,0,2,0.8,106,6,14,1222,1890,668,13,4,19,0
1996,1965,2.6,0,39,0.2,187,4,3,915,1965,2032,11,10,16,2
1997,1911,0.9,1,36,0.7,108,8,3,868,1632,3057,9,1,5,3
1998,1512,0.9,4,46,0.1,145,5,5,336,670,869,18,10,19,0


<IPython.core.display.Javascript object>

In [13]:
mobile_train["price_range"].value_counts()

3    500
2    500
1    500
0    500
Name: price_range, dtype: int64

<IPython.core.display.Javascript object>

In [14]:
print_vif(X)

VIF results
-------------------------------
const            73.921098
battery_power     1.009945
blue              1.011342
clock_speed       1.006025
dual_sim          1.011555
fc                1.718987
four_g            1.528509
int_memory        1.009274
m_dep             1.006385
mobile_wt         1.004548
n_cores           1.008442
pc                1.720785
px_height         1.369052
px_width          1.362399
ram               1.008331
sc_h              1.356109
sc_w              1.353648
talk_time         1.010502
three_g           1.527367
touch_screen      1.006278
wifi              1.009100
dtype: float64
-------------------------------



<IPython.core.display.Javascript object>

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

<IPython.core.display.Javascript object>

In [16]:
estimator = RandomForestClassifier()
estimator.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

<IPython.core.display.Javascript object>

In [17]:
estimator.score(X_train, y_train)

1.0

<IPython.core.display.Javascript object>

In [18]:
estimator.score(X_test, y_test)

0.865

<IPython.core.display.Javascript object>

In [19]:
rfe_selector = RFE(estimator, n_features_to_select=3, verbose=1)
rfe_selector.fit(X_train, y_train)

Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=3, step=1, verbose=1)

<IPython.core.display.Javascript object>

In [20]:
random_forest_best_cols = rfe_selector.support_

X_train.iloc[:, random_forest_best_cols]

Unnamed: 0,battery_power,px_width,ram
1847,1685,1429,881
894,1497,1933,1329
1913,1969,756,298
518,1606,1924,3454
326,1144,724,3252
...,...,...,...
1482,520,519,2753
1471,714,1857,1164
867,1498,1076,3358
1876,864,1436,493


<IPython.core.display.Javascript object>

In [21]:
estimator.fit(X_train.iloc[:, random_forest_best_cols], y_train)
estimator.score(X_test.iloc[:, random_forest_best_cols], y_test)

0.8725

<IPython.core.display.Javascript object>

In [22]:
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

X_test[num_cols] = scaler.fit_transform(X_test[num_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

<IPython.core.display.Javascript object>

In [23]:
X_train

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1847,1.010549,0,-1.113286,0,2.300617,0,-0.874245,0.334949,-0.094046,1.068839,0.847433,-1.183543,0.422114,-1.172587,-1.512214,-0.644144,1.114253,0,1,1
894,0.582934,1,-0.991006,0,0.182673,0,-0.001976,0.680257,-1.363271,-0.690114,-0.145168,2.880910,1.584377,-0.755659,-1.512214,-1.104453,-0.727868,1,1,0
1913,1.656521,0,-0.379604,1,1.123981,0,1.688044,-1.046284,-0.235071,-0.250376,0.185699,-0.573875,-1.129875,-1.715152,-0.551220,-1.104453,-0.175232,1,1,0
518,0.830860,0,-0.135043,0,-0.287982,0,1.360943,-1.391593,0.385439,-0.250376,-0.972335,1.090292,1.563622,1.221958,0.169525,0.506627,1.666890,0,0,1
326,-0.219981,1,-0.012763,1,-0.993963,0,-0.601661,-0.355668,1.372614,-1.569591,0.185699,-1.095480,-1.203669,1.033968,0.890271,-1.334607,-0.543656,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,-1.639299,0,0.965480,0,-0.993963,1,-0.056493,1.025566,0.131594,0.629101,0.681999,-0.858387,-1.676415,0.569577,0.169525,0.276473,1.114253,1,0,1
1471,-1.198037,1,0.720919,1,0.182673,1,-1.146828,-1.391593,1.598254,-0.690114,-0.641468,2.497045,1.409115,-0.909215,0.650022,1.427245,1.482677,1,1,0
867,0.585208,1,-0.991006,0,-0.287982,1,-1.310379,-1.391593,0.836719,1.068839,-0.972335,-0.670971,-0.391931,1.132616,-1.271966,-0.644144,1.482677,1,0,0
1876,-0.856855,0,0.109517,1,-0.993963,1,1.142876,-0.700976,-1.476091,0.629101,-1.468635,-0.862903,0.438257,-1.533677,1.370768,0.736782,-0.175232,1,1,0


<IPython.core.display.Javascript object>

In [24]:
estimator = LogisticRegression(max_iter=5000)

# est = Pipeline([("model", estimator)])

log_selector = RFE(estimator, n_features_to_select=3, verbose=1)
log_selector.fit(X_train, y_train)

log_reg_best_cols = log_selector.support_

X_train.iloc[:, random_forest_best_cols]

Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.


Unnamed: 0,battery_power,px_width,ram
1847,1.010549,0.422114,-1.172587
894,0.582934,1.584377,-0.755659
1913,1.656521,-1.129875,-1.715152
518,0.830860,1.563622,1.221958
326,-0.219981,-1.203669,1.033968
...,...,...,...
1482,-1.639299,-1.676415,0.569577
1471,-1.198037,1.409115,-0.909215
867,0.585208,-0.391931,1.132616
1876,-0.856855,0.438257,-1.533677


<IPython.core.display.Javascript object>

In [25]:
estimator.fit(X_train, y_train)
print(estimator.score(X_train, y_train))
print(estimator.score(X_test, y_test))

0.979375
0.9275


<IPython.core.display.Javascript object>

In [26]:
estimator.fit(X_train.iloc[:, log_reg_best_cols], y_train)
estimator.score(X_test.iloc[:, log_reg_best_cols], y_test)

0.9

<IPython.core.display.Javascript object>