In [103]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [92]:
# Load the diabetes table from the CSV in the working directory and preview
# its first five rows (the notebook renders the last expression of the cell).
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [93]:
# Per-column count of missing (NaN) entries; isna() is the canonical name of isnull().
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [94]:
# Count, for every column, how many rows hold an exact 0.
# A single vectorised comparison over the whole frame replaces the
# per-column boolean filtering; the printed report is unchanged.
for col, zero in data.eq(0).sum().items():
    print("Total number of 0 values in",col,"=",zero)

Total number of 0 values in Pregnancies = 111
Total number of 0 values in Glucose = 5
Total number of 0 values in BloodPressure = 35
Total number of 0 values in SkinThickness = 227
Total number of 0 values in Insulin = 374
Total number of 0 values in BMI = 11
Total number of 0 values in DiabetesPedigreeFunction = 0
Total number of 0 values in Age = 0
Total number of 0 values in Outcome = 500


In [95]:
# Drop the SkinThickness and Insulin columns because they contain too many
# zero values (227 and 374 rows respectively, per the zero-count report).
#
# The frame is rebound rather than mutated with inplace=True, and
# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
data = data.drop(columns=['SkinThickness', 'Insulin'], errors='ignore')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,33.6,0.627,50,1
1,1,85,66,26.6,0.351,31,0
2,8,183,64,23.3,0.672,32,1
3,1,89,66,28.1,0.167,21,0
4,0,137,40,43.1,2.288,33,1


In [96]:
# Distinct labels present in the target column, in order of first appearance.
data.loc[:, 'Outcome'].unique()

array([1, 0], dtype=int64)

In [97]:
# Separate the target column from the predictors:
#   y - the 'Outcome' labels
#   X - every remaining column of `data`
y = data.loc[:, 'Outcome']
X = data.drop(columns=['Outcome'])

In [88]:
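# NOTE: disabled experiment, kept for reference - mean-imputation of zero values.
# As written it would treat every 0 in X as missing, including columns where 0
# may be a legitimate value (e.g. Pregnancies); restrict the imputer to the
# affected columns before re-enabling it.
#from sklearn.impute import SimpleImputer
#fill_values = SimpleImputer(missing_values=0, strategy="mean")
#X = fill_values.fit_transform(X)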

In [98]:
# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Fitting the scaler on the full matrix (as was done before)
# leaks test-set statistics into training and inflates the reported accuracy.
# test_size and random_state are unchanged, so the partition is the same.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

# Learn the scaling parameters from the training rows only, then apply the
# same (already fitted) transformation to the held-out rows.
# NOTE: X itself is intentionally left unscaled, which keeps this cell
# idempotent when re-run.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [99]:
# Search space for the random-forest hyper-parameters.
# 'auto' is deliberately not listed under max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, and for classifiers it was just an
# alias of 'sqrt', which is already part of the grid.
# .tolist() yields plain Python ints instead of numpy scalars.
random_search = {'criterion': ['entropy', 'gini'],
               'max_depth': np.linspace(10, 1200, 10, dtype=int).tolist() + [None],
               'max_features': ['sqrt', 'log2', None],
               'min_samples_leaf': [4, 6, 8, 12],
               'min_samples_split': [5, 7, 10, 14],
               'n_estimators': np.linspace(151, 1200, 10, dtype=int).tolist()}

# Seed the forest as well as the search: without random_state on the
# estimator, bootstrap sampling and feature selection differ on every run,
# so the selected model and its accuracy are not reproducible.
clf = RandomForestClassifier(random_state=101)

# Evaluate 80 randomly drawn configurations with 4-fold cross-validation on
# the training split, using every available core.
model = RandomizedSearchCV(estimator=clf, param_distributions=random_search, n_iter=80,
                           cv=4, verbose=5, random_state=101, n_jobs=-1)
model.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
best_model = model.best_estimator_

# Accuracy on the held-out test split (a single-split estimate).
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

Fitting 4 folds for each of 80 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  6.3min finished


0.8181818181818182


In [101]:
# Persist the fitted scaler alongside the model: the forest was trained on
# standardised features, so raw inputs must go through the same scaler
# before predict() at inference time.
scaler_filename = 'diabetes_scaler.pkl'
joblib.dump(scaler, scaler_filename)

# Persist the tuned classifier. Kept as the last expression so the cell
# still displays the list of files written for the model.
# NOTE: joblib/pickle files must only be loaded from trusted sources.
filename = 'diabetes_model.pkl'
joblib.dump(best_model, filename)

['diabetes_model.pkl']