# Reading Data

In [1]:
import pandas as pd

# Load the diabetes dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')

# Preview the first five rows as the cell's rich output.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Split the frame into the label column and the feature columns.
# Neither statement modifies `data`: `.loc` selects the label Series and
# `drop` returns a new frame without that column.
data_output = data.loc[:, 'Outcome']
data_input = data.drop('Outcome', axis=1)

In [3]:
# Preview the feature frame; 'Outcome' should no longer be among the columns.
data_input.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
# Preview the label Series that was split off from the features.
data_output.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [5]:
# List the distinct label values to confirm this is a binary classification target.
data_output.unique()

array([1, 0], dtype=int64)

# Splitting the Dataset into Train, Validation, and Test Sets

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows as the final test set. The remaining rows (X, y) are
# divided into training and validation data in the next step. The fixed
# random_state makes the shuffle reproducible.
X, X_test, y, y_test = train_test_split(data_input, data_output, test_size=0.2, random_state=1)

In [9]:
# Divide the non-test rows: 33% become the validation set, the rest the training set.
# The fixed random_state makes the shuffle reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=1)

In [10]:
# Report the number of rows that ended up in each split.
print('Train size =', len(X_train))
print('Val size =', len(X_val))
print('Test size =', len(X_test))

Train size = 411
Val size = 203
Test size = 154


# Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then reuse that fitted transform for
# the validation and test splits so their statistics never influence preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

In [12]:
import pickle
# Persist the fitted scaler so the same transform can be applied again later.
with open('scaler.pickle', mode='wb') as scaler_file:
    pickle.dump(obj=scaler, file=scaler_file)

In [13]:
from sklearn.naive_bayes import GaussianNB

In [21]:
# Train a Gaussian Naive Bayes classifier on the scaled training features.
# The fit call is left as the last expression so the cell displays the estimator.
gaussian_nb = GaussianNB()
gaussian_nb.fit(X=X_train_scaled, y=y_train)

GaussianNB()

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
# Predict labels for the training split and the validation split, in that order.
y_pred_train, y_pred_val = (
    gaussian_nb.predict(features) for features in (X_train_scaled, X_val_scaled)
)

In [24]:
# Score the predictions against the true labels for each split.
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)

In [26]:
# Show training and validation accuracy together; a large gap between them
# would point to overfitting.
print('Training accuracy =', acc_train)
print('Validation accuracy =', acc_val)

Training accuracy = 0.7591240875912408
Validation accuracy = 0.7487684729064039


In [27]:
# Predict labels for the held-out test split.
y_pred_test = gaussian_nb.predict(X=X_test_scaled)

In [28]:
# Report accuracy on the held-out test split.
print('Test accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred_test))

Test accuracy: 0.7922077922077922


In [29]:
import pickle

# Serialize the trained classifier to disk so it can be reloaded without retraining.
with open('saved-model.pickle', mode='wb') as model_file:
    pickle.dump(obj=gaussian_nb, file=model_file)

In [30]:
# Read the classifier back from disk. Unpickling can execute arbitrary code, so
# only load pickle files from a trusted source (this one is written by this notebook).
with open('saved-model.pickle', mode='rb') as model_file:
    loaded_model = pickle.load(file=model_file)

In [32]:
# Display the reloaded object to check that it deserialized as an estimator.
loaded_model

GaussianNB()