In [48]:
import pandas as pd
import numpy as np

In [49]:
# Load the diabetes CSV from the relative data/ folder and preview the first 20 rows.
df = pd.read_csv(filepath_or_buffer='data/diabetes.csv')
df.head(n=20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [50]:
# Columns in which a value of 0 is treated as "missing" rather than a real measurement.
# Pregnancies and Outcome are deliberately left out, as in the original cell.
zero_as_missing_cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Assign the replaced columns back explicitly instead of calling
# `df[col].replace(..., inplace=True)`: the in-place call operates on a column
# selection, which emits a FutureWarning in pandas 2.x and has no effect on `df`
# once Copy-on-Write is active. This form is also idempotent on re-run.
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

In [51]:
# Per-column count of missing (NaN) entries.
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [52]:
# Convert the frame to a plain array, then split it positionally:
# every column except the last becomes the feature matrix X, the last column is the target y.
data = df.to_numpy()
X, y = data[:, :-1], data[:, -1]
pd.DataFrame(data=X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,6.0,148.0,72.0,35.0,,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,,26.6,0.351,31.0
2,8.0,183.0,64.0,,,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0
764,2.0,122.0,70.0,27.0,,36.8,0.340,27.0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0
766,1.0,126.0,60.0,,,30.1,0.349,47.0


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [54]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

Feature columns by index (0–7): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age

Mean imputation – Glucose, BloodPressure, SkinThickness, BMI (columns 1, 2, 3, 5)

Median imputation – Insulin, DiabetesPedigreeFunction, Age (columns 4, 6, 7)

In [55]:
# One imputer per fill strategy; both treat NaN as the missing-value marker.
mean_imputer, median_imputer = (
    SimpleImputer(missing_values=np.nan, strategy=fill_strategy)
    for fill_strategy in ('mean', 'median')
)
#min_max_scaler = MinMaxScaler(feature_range=(0,1)) skipped for now

In [56]:
# Positional column groups for each imputation strategy.
mean_cols = (1, 2, 3, 5)
median_cols = (4, 6, 7)

# Fit each imputer on the training rows only, then reuse the fitted statistics
# on the test rows so no test information is used when computing the fill values.
X_train[:, mean_cols] = mean_imputer.fit_transform(X_train[:, mean_cols])
X_train[:, median_cols] = median_imputer.fit_transform(X_train[:, median_cols])

X_test[:, mean_cols] = mean_imputer.transform(X_test[:, mean_cols])
X_test[:, median_cols] = median_imputer.transform(X_test[:, median_cols])

print(X_train)

[[  9.    145.     80.    ...  37.9     0.637  40.   ]
 [ 10.    129.     62.    ...  41.2     0.441  38.   ]
 [  7.    102.     74.    ...  37.2     0.204  45.   ]
 ...
 [ 13.    126.     90.    ...  43.4     0.583  42.   ]
 [  4.    171.     72.    ...  43.6     0.479  26.   ]
 [  9.    102.     76.    ...  32.9     0.665  46.   ]]


In [57]:
# Tabular view of the training features after imputation.
pd.DataFrame(data=X_train)

Unnamed: 0,0,1,2,3,4,5,6,7
0,9.0,145.0,80.0,46.000000,130.0,37.9,0.637,40.0
1,10.0,129.0,62.0,36.000000,125.0,41.2,0.441,38.0
2,7.0,102.0,74.0,40.000000,105.0,37.2,0.204,45.0
3,8.0,120.0,78.0,28.770686,125.0,25.0,0.409,64.0
4,2.0,120.0,76.0,37.000000,105.0,39.7,0.215,29.0
...,...,...,...,...,...,...,...,...
609,2.0,157.0,74.0,35.000000,440.0,39.4,0.134,30.0
610,7.0,187.0,50.0,33.000000,392.0,33.9,0.826,34.0
611,13.0,126.0,90.0,28.770686,125.0,43.4,0.583,42.0
612,4.0,171.0,72.0,28.770686,125.0,43.6,0.479,26.0


In [58]:
# Random forest baseline: 100 trees, depth capped at 10, seeded for repeatability.
rfc = RandomForestClassifier(
    max_depth=10,
    n_estimators=100,
    random_state=42,
)
rfc.fit(X=X_train, y=y_train)

In [59]:
# Random forest predictions for the held-out test rows.
y_pred_rand = rfc.predict(X=X_test)
y_pred_rand

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
       0.])

In [60]:
# Gradient boosting baseline on the imputed diabetes features.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded fit is not guaranteed to reproduce the same trees
# (and therefore the same accuracy) across runs. The seed matches the one used
# for the random forest this model is compared against.
gb = GradientBoostingClassifier(learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

In [61]:
# Gradient boosting predictions for the held-out test rows.
y_pred_grad = gb.predict(X=X_test)
y_pred_grad

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0.,
       1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
       0.])

In [62]:
# Test-set accuracy of both classifiers on the diabetes data.
acc_rand = accuracy_score(y_true=y_test, y_pred=y_pred_rand)
acc_grad = accuracy_score(y_true=y_test, y_pred=y_pred_grad)

for caption, score in (
    ('Accuracy using random forest:', acc_rand),
    ('Accuracy using gradient boosting:', acc_grad),
):
    print(caption, score)

Accuracy using random forest: 0.7987012987012987
Accuracy using gradient boosting: 0.7857142857142857


Dataset 2: Breast Cancer (scikit-learn `load_breast_cancer`)

In [63]:
from sklearn.datasets import load_breast_cancer

In [64]:
# Load the bundled breast-cancer dataset and build a frame with named feature
# columns plus a trailing 'Target' column holding the class labels.
dataset2 = load_breast_cancer()
df2 = pd.DataFrame(
    data=dataset2.data,
    columns=dataset2.feature_names,
).assign(Target=dataset2.target)
df2

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [66]:
# Feature matrix and label vector taken directly from the loaded dataset object.
X2, y2 = dataset2.data, dataset2.target
pd.DataFrame(data=X2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [67]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    random_state=1,
    test_size=0.2,
)

In [68]:
# Random forest for dataset 2: 100 trees, depth capped at 10, seeded for repeatability.
rfc2 = RandomForestClassifier(
    max_depth=10,
    n_estimators=100,
    random_state=42,
)
rfc2.fit(X=X2_train, y=y2_train)

In [69]:
# Random forest predictions for the held-out rows of dataset 2.
y2_pred_rand = rfc2.predict(X=X2_test)
y2_pred_rand

array([1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1])

In [70]:
# Gradient boosting model for dataset 2.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded fit is not guaranteed to reproduce the same trees
# (and therefore the same metrics) across runs. The seed matches the one used
# for the random forest this model is compared against.
gb2 = GradientBoostingClassifier(learning_rate=0.1, random_state=42)
gb2.fit(X2_train, y2_train)

In [71]:
# Gradient boosting predictions for the held-out rows of dataset 2.
y2_pred_grad = gb2.predict(X=X2_test)
y2_pred_grad

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1])

In [72]:
# Test-set accuracy of both classifiers on dataset 2.
acc2_rand = accuracy_score(y_true=y2_test, y_pred=y2_pred_rand)
acc2_grad = accuracy_score(y_true=y2_test, y_pred=y2_pred_grad)

for caption, score in (
    ('Accuracy using random forest:', acc2_rand),
    ('Accuracy using gradient boosting:', acc2_grad),
):
    print(caption, score)

Accuracy using random forest: 0.956140350877193
Accuracy using gradient boosting: 0.9649122807017544


In [73]:
# Both reports are built with the same options; output_dict=True returns nested dicts
# (per-class and averaged metrics) instead of a formatted string.
report_options = dict(digits=2, output_dict=True, zero_division='warn')
cr1 = classification_report(y2_test, y2_pred_rand, **report_options)
cr2 = classification_report(y2_test, y2_pred_grad, **report_options)

In [74]:
# Classification report (dict form) for the random forest predictions on the dataset 2 test split.
cr1

{'0': {'precision': 1.0,
  'recall': 0.8809523809523809,
  'f1-score': 0.9367088607594937,
  'support': 42},
 '1': {'precision': 0.935064935064935,
  'recall': 1.0,
  'f1-score': 0.9664429530201343,
  'support': 72},
 'accuracy': 0.956140350877193,
 'macro avg': {'precision': 0.9675324675324675,
  'recall': 0.9404761904761905,
  'f1-score': 0.951575906889814,
  'support': 114},
 'weighted avg': {'precision': 0.9589883800410116,
  'recall': 0.956140350877193,
  'f1-score': 0.9554882874504246,
  'support': 114}}

In [75]:
# Classification report (dict form) for the gradient boosting predictions on the dataset 2 test split.
cr2

{'0': {'precision': 1.0,
  'recall': 0.9047619047619048,
  'f1-score': 0.9500000000000001,
  'support': 42},
 '1': {'precision': 0.9473684210526315,
  'recall': 1.0,
  'f1-score': 0.972972972972973,
  'support': 72},
 'accuracy': 0.9649122807017544,
 'macro avg': {'precision': 0.9736842105263157,
  'recall': 0.9523809523809523,
  'f1-score': 0.9614864864864865,
  'support': 114},
 'weighted avg': {'precision': 0.966759002770083,
  'recall': 0.9649122807017544,
  'f1-score': 0.9645092460881936,
  'support': 114}}