In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
# Read the dataset from the notebook's working directory and show it.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
762,9,89,62,0,0,22.5,0.142,33,0
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


In [11]:
# Target: the Outcome column. Features: every remaining column of `data`.
y = data['Outcome']
X = data.drop(columns='Outcome')

In [13]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Inspect the held-out labels (index values are the original row positions in `data`).
y_test

667    1
324    0
623    0
689    1
521    0
      ..
355    1
533    0
344    0
296    1
720    0
Name: Outcome, Length: 154, dtype: int64

In [17]:
# Inspect the training labels (index values are the original row positions in `data`).
y_train

60     0
554    0
346    0
294    0
231    1
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 613, dtype: int64

In [19]:
# Scaling and the classifier are bundled into one estimator so they are always
# fitted and applied together. The step names ('scaler', 'model') are what the
# `model__...` keys of the hyperparameter grid refer to.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed: a random forest draws bootstrap samples and feature subsets at
    # random, so without it the selected hyperparameters, the saved model and the
    # reported scores change from run to run. 42 matches the train/test split seed.
    ('model', RandomForestClassifier(random_state=42)),
])

In [21]:
# Hyperparameter grid for the search. Each key is `<pipeline step>__<parameter>`,
# so all three entries target the 'model' step. 3 * 4 * 3 = 36 combinations.
param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
)

In [23]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# n_jobs=-1 runs the fits in parallel on all available cores; verbose=1 prints
# a summary of how many fits will be performed.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [25]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
grid_search.fit(X=X_train, y=y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [26]:
# Pipeline configured with the best-scoring parameter combination. No `refit`
# argument was passed to GridSearchCV, so its default (refit=True) applies and
# this estimator has been refit on all of X_train.
best_model = grid_search.best_estimator_

In [27]:
# Winning parameter combination (keys carry the `model__` pipeline-step prefix).
best_params = grid_search.best_params_
print("Best Hyperparameters:{}".format(best_params))

Best Hyperparameters:{'model__max_depth': 20, 'model__min_samples_split': 10, 'model__n_estimators': 200}


In [28]:
# Predict a class label for every held-out row with the tuned pipeline.
y_pred = best_model.predict(X_test)

In [29]:
# Inspect the predicted labels (one 0/1 value per row of X_test).
y_pred

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
      dtype=int64)

In [30]:
# Share of held-out rows whose predicted label equals the true Outcome.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Best Hyperparameters: {}".format(grid_search.best_params_))
print("Test Accuracy:{:.4f}".format(accuracy))

Best Hyperparameters: {'model__max_depth': 20, 'model__min_samples_split': 10, 'model__n_estimators': 200}
Test Accuracy:0.7532


In [31]:
import pickle

In [32]:
# Persist the tuned pipeline to disk so it can be reloaded without repeating
# the grid search. Pickle files execute code when loaded, so only unpickle
# this file from a trusted location.
with open('models.pkl', mode='wb') as file:
    pickle.dump(best_model, file)

In [34]:
# ADDING EVALUATION METRICS

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score



In [40]:
# Predictions of the tuned pipeline on the held-out rows.
y_pred = best_model.predict(X_test)

# Outcome is a 0/1 class label, so the model is judged with classification
# metrics: the confusion matrix shows where the errors fall (rows = true class,
# columns = predicted class), and the report gives per-class precision, recall
# and F1, which plain accuracy and regression-style errors do not reveal.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


                     

In [44]:
# NOTE(review): labels and predictions are both 0/1, so the absolute error is 1
# exactly on misclassified rows. This value is therefore the test
# misclassification rate (1 - accuracy), not a regression-style error magnitude.
mae=mean_absolute_error(y_test,y_pred)

In [57]:
# Display the mean absolute error computed on the test split.
mae

0.24675324675324675

In [59]:
# NOTE(review): with 0/1 labels and 0/1 predictions every error is 0 or 1, and
# squaring leaves those unchanged, so this is numerically identical to the
# mean absolute error and carries no extra information.
mse=mean_squared_error(y_test,y_pred)

In [61]:
# Display the mean squared error computed on the test split.
mse

0.24675324675324675

In [63]:
# NOTE(review): R^2 compares the squared error of the predictions with that of
# always predicting the mean of y_test. It is a regression score; for hard 0/1
# class predictions it is not a standard measure of classifier quality.
r2=r2_score(y_test,y_pred)

In [65]:
# Display R^2. A negative value means the squared error is larger than that of
# always predicting the mean label.
r2

-0.1140300780506378