# import the dataset

In [3]:
import pandas as pd

df = pd.read_csv("../your_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Cholesterol,triglyceride,age,hearing(right),hemoglobin,LDL,dental caries,systolic,hearing(left),eyesight(left),smoking
0,0,172,300,55,1,16.5,75,0,135,1,0.5,1
1,1,194,55,70,2,16.2,126,1,146,2,0.6,0
2,2,178,197,20,1,17.4,93,0,118,1,0.4,1
3,3,180,203,35,1,15.9,102,1,131,1,1.5,0
4,4,155,87,30,1,15.4,93,0,121,1,1.5,1


# check if there null element  

In [5]:
df.isnull().sum()

Unnamed: 0        0
Cholesterol       0
triglyceride      0
age               0
hearing(right)    0
hemoglobin        0
LDL               0
dental caries     0
systolic          0
hearing(left)     0
eyesight(left)    0
smoking           0
dtype: int64

# print some details about the dataset

In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Cholesterol,triglyceride,age,hearing(right),hemoglobin,LDL,dental caries,systolic,hearing(left),eyesight(left),smoking
count,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0
mean,79627.5,195.796165,127.616046,44.306626,1.023421,14.796965,114.607682,0.197996,122.503648,1.023974,1.005798,0.437365
std,45973.391572,28.396959,66.188989,11.842286,0.151238,1.431213,28.158931,0.39849,12.729315,0.152969,0.402113,0.496063
min,0.0,77.0,8.0,20.0,1.0,4.9,1.0,0.0,77.0,1.0,0.1,0.0
25%,39813.75,175.0,77.0,40.0,1.0,13.8,95.0,0.0,114.0,1.0,0.8,0.0
50%,79627.5,196.0,115.0,40.0,1.0,15.0,114.0,0.0,121.0,1.0,1.0,0.0
75%,119441.25,217.0,165.0,55.0,1.0,15.8,133.0,0.0,130.0,1.0,1.2,1.0
max,159255.0,393.0,766.0,85.0,2.0,21.0,1860.0,1.0,213.0,2.0,9.9,1.0


# Print the ratio between two classes

In [7]:
df.smoking.value_counts()

smoking
0    89603
1    69653
Name: count, dtype: int64

# drop the target from the dataset

In [8]:
X = df.drop("smoking",axis="columns")
y = df.smoking
y

0         1
1         0
2         1
3         0
4         1
         ..
159251    0
159252    0
159253    0
159254    1
159255    0
Name: smoking, Length: 159256, dtype: int64

# scale the data

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled[:3]

array([[-1.73203993, -0.83798548,  2.60442859,  0.90298511, -0.15486487,
         1.1899277 , -1.40658059, -0.49686626,  0.98170175, -0.15672539,
        -1.2578561 ],
       [-1.73201818, -0.06325221, -1.09710504,  2.16963644,  6.45724233,
         0.98031465,  0.40457337,  2.01261403,  1.84585151,  6.38058705,
        -1.00916876],
       [-1.73199643, -0.62669459,  1.04827363, -2.05253466, -0.15486487,
         1.81876686, -0.76734978, -0.49686626, -0.35380241, -0.15672539,
        -1.50654343]])

# split dataset into training and testing data

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, random_state=10)

In [11]:
X_train.shape

(119442, 11)

In [12]:
X_test.shape

(39814, 11)

In [13]:
y_train.value_counts()

smoking
0    67202
1    52240
Name: count, dtype: int64

# Train using stand alone model

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
scores

array([0.61478086, 0.66318169, 0.6647515 , 0.66402939, 0.53737716])

In [15]:
scores.mean()

0.6288241199280307

# Train using Bagging

In [16]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model.fit(X_train, y_train)
bag_model.oob_score_



0.7314345037758913

In [17]:
bag_model.score(X_test, y_test)

0.7371025267493846

In [18]:
bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model, X, y, cv=5)
scores



array([0.70020721, 0.73451383, 0.73963141, 0.73878371, 0.67809488])

In [19]:
scores.mean()

0.7182462076091279

# Train using Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(n_estimators=50), X, y, cv=5)
scores.mean()

0.7268675609529716

<h1>GRIDSEARCH</h1>

<h3>GRIDSERACH FOR BaggingClassifier</h3>

In [31]:
from sklearn.model_selection import GridSearchCV
result = GridSearchCV(
        BaggingClassifier(),
        {
            'base_estimator': [DecisionTreeClassifier()],
            'n_estimators': [10, 50, 100],
            'max_samples': [0.3, 0.5, 0.8, 0.9],
            'oob_score': [True, False],
            'random_state': [0]
        },
    cv=5,
    return_train_score=False
)

result.fit(X,y)
result.cv_results_

  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


In [None]:
df = pd.DataFrame(result.cv_results_)
df

In [None]:
df [['param_n_estimators','params','mean_test_score']]

<h3>Random GridSearch for Bagging</h3>

In [None]:
from sklearn.model_selection import GridSearchCV
result = RandomizedSearchCV(
        BaggingClassifier(),
        {
            'base_estimator': [DecisionTreeClassifier()],
            'n_estimators': [10, 50, 100],
            'max_samples': [0.3, 0.5, 0.8, 0.9],
            'oob_score': [True, False],
            'random_state': [0]
        },
    cv=5,
    return_train_score=False
)

result.fit(X,y)
pd.DataFrame(result.cv_results_)[['param_n_estimators','params','mean_test_score']]

<h3>GRIDSERACH FOR RandomForestClassifier</h3>

In [21]:
from sklearn.model_selection import GridSearchCV
result = GridSearchCV(
    RandomForestClassifier(), {
        'n_estimators': [1,5,10,50]
    },
    cv = 5,return_train_score = False,n_iter =2
)

result.fit(X,y)
result.cv_results_


{'mean_fit_time': array([ 1.25783868,  5.54809279, 11.11732516, 55.32987885]),
 'std_fit_time': array([0.20563113, 0.4522177 , 0.61842406, 0.94903341]),
 'mean_score_time': array([0.04030766, 0.12421875, 0.216886  , 1.11028538]),
 'std_score_time': array([0.00273861, 0.03086734, 0.05636742, 0.19060589]),
 'param_n_estimators': masked_array(data=[1, 5, 10, 50],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 1},
  {'n_estimators': 5},
  {'n_estimators': 10},
  {'n_estimators': 50}],
 'split0_test_score': array([0.62680522, 0.6714492 , 0.66969107, 0.70987693]),
 'split1_test_score': array([0.66045022, 0.70820382, 0.71209695, 0.73693134]),
 'split2_test_score': array([0.66352705, 0.70763869, 0.71542495, 0.73586387]),
 'split3_test_score': array([0.66321309, 0.71134344, 0.71686917, 0.7396942 ]),
 'split4_test_score': array([0.5296223 , 0.65439076, 0.66880161, 0.72211234]),
 'mean_test_score': array([0.6287235

In [22]:
df = pd.DataFrame(result.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.257839,0.205631,0.040308,0.002739,1,{'n_estimators': 1},0.626805,0.66045,0.663527,0.663213,0.529622,0.628724,0.051443,4
1,5.548093,0.452218,0.124219,0.030867,5,{'n_estimators': 5},0.671449,0.708204,0.707639,0.711343,0.654391,0.690605,0.023274,3
2,11.117325,0.618424,0.216886,0.056367,10,{'n_estimators': 10},0.669691,0.712097,0.715425,0.716869,0.668802,0.696577,0.022371,2
3,55.329879,0.949033,1.110285,0.190606,50,{'n_estimators': 50},0.709877,0.736931,0.735864,0.739694,0.722112,0.728896,0.011291,1


<h3>picking columns from tables to see score</h3>

In [25]:
df [['param_n_estimators','params','mean_test_score']]

Unnamed: 0,param_n_estimators,params,mean_test_score
0,1,{'n_estimators': 1},0.628724
1,5,{'n_estimators': 5},0.690605
2,10,{'n_estimators': 10},0.696577
3,50,{'n_estimators': 50},0.728896


<h2>getting best score</h2>

In [27]:
result.best_score_

0.7288957340307631

<h3>getting best parameters</h3>

In [29]:
result.best_params_

{'n_estimators': 50}

<h2>RANDOMIZED GridSearch</h2>

In [30]:
from sklearn.model_selection import RandomizedSearchCV
result = RandomizedSearchCV(
    RandomForestClassifier(), {
        'n_estimators': [1,5,10,50]
    },
    cv = 5,return_train_score = False,n_iter =2  #give number of combinations of features
)

result.fit(X,y)
pd.DataFrame(result.cv_results_)[['param_n_estimators','params','mean_test_score']]


Unnamed: 0,param_n_estimators,params,mean_test_score
0,5,{'n_estimators': 5},0.683058
1,50,{'n_estimators': 50},0.729134
