# Explore here

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
main_df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv")

main_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
X = main_df.drop(columns=['Outcome'])
y = main_df['Outcome']

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
gbc_model = GradientBoostingClassifier()

gbc_model.fit(X_train, y_train)

train_preds = gbc_model.predict(X_train)
test_preds = gbc_model.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [8]:
accuracy_score(y_train, train_preds)

0.9375

In [9]:
accuracy_score(y_test, test_preds)

0.7395833333333334

## Let's use Python to our advantage and automate scoring

In [38]:
def scoring_maker(model, X_train , X_test , y_train , y_test):
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    print(f"Training accuracy: {accuracy_score(y_train, train_preds)}")
    print("\nTRAINING CLASSIFICATION REPORT:")
    print(classification_report(y_train, train_preds))
    print("\nTRAINING CONFUSION MATRIX:")
    print(confusion_matrix(y_train, train_preds))
    
    print(f"\nTesting accuracy {accuracy_score(y_test, test_preds)}")
    print("\nTESTING CLASSIFICATION REPORT:")
    print(classification_report(y_test, test_preds))
    print("\nTESTING CONFUSION MATRIX:")
    print(confusion_matrix(y_test, test_preds))

In [39]:
scoring_maker(gbc_model, X_train, X_test, y_train, y_test)

Training accuracy: 0.9375

TRAINING CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       374
           1       0.93      0.89      0.91       202

    accuracy                           0.94       576
   macro avg       0.94      0.93      0.93       576
weighted avg       0.94      0.94      0.94       576


TRAINING CONFUSION MATRIX:
[[361  13]
 [ 23 179]]

Testing accuracy 0.7395833333333334

TESTING CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       126
           1       0.63      0.58      0.60        66

    accuracy                           0.74       192
   macro avg       0.71      0.70      0.70       192
weighted avg       0.73      0.74      0.74       192


TESTING CONFUSION MATRIX:
[[104  22]
 [ 28  38]]


### Try a GridSearch

In [17]:
from sklearn.model_selection import GridSearchCV

In [43]:
hyperparameters = {
    "learning_rate": [0.01, 0.1, 0.5],
    "n_estimators": [20, 50, 100, 200],
    "max_depth" : [2, 5, 8, 10]
}

In [44]:
grid_model = GridSearchCV(GradientBoostingClassifier(random_state=25), param_grid=hyperparameters,
                          cv = 5, scoring="recall")

grid_model.fit(X_train, y_train)

In [48]:
revised_gbc_model = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, n_estimators=150)

revised_gbc_model.fit(X_train, y_train)

In [49]:
scoring_maker(revised_gbc_model, X_train, X_test, y_train, y_test)

Training accuracy: 0.8611111111111112

TRAINING CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.86      0.93      0.90       374
           1       0.85      0.73      0.79       202

    accuracy                           0.86       576
   macro avg       0.86      0.83      0.84       576
weighted avg       0.86      0.86      0.86       576


TRAINING CONFUSION MATRIX:
[[349  25]
 [ 55 147]]

Testing accuracy 0.765625

TESTING CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       126
           1       0.71      0.55      0.62        66

    accuracy                           0.77       192
   macro avg       0.75      0.71      0.72       192
weighted avg       0.76      0.77      0.76       192


TESTING CONFUSION MATRIX:
[[111  15]
 [ 30  36]]


### This works for any sklearn model!

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
nneighbors = KNeighborsClassifier()

nneighbors.fit(X_train, y_train)

scoring_maker(nneighbors, X_train, X_test, y_train, y_test)

Training accuracy: 0.7760416666666666

TRAINING CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       374
           1       0.70      0.64      0.67       202

    accuracy                           0.78       576
   macro avg       0.76      0.74      0.75       576
weighted avg       0.77      0.78      0.77       576


TRAINING CONFUSION MATRIX:
[[318  56]
 [ 73 129]]

Testing accuracy 0.734375

TESTING CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       126
           1       0.65      0.50      0.56        66

    accuracy                           0.73       192
   macro avg       0.71      0.68      0.69       192
weighted avg       0.73      0.73      0.72       192


TESTING CONFUSION MATRIX:
[[108  18]
 [ 33  33]]


In [50]:
147 / (55 + 147)

0.7277227722772277