In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

import tensorflow as tf
import tensorflow_hub as hub

# Load Data

In [None]:
path = '../input/docspot/datasets_228_482_diabetes.csv'


def raw_data():
    """Read the CSV file at ``path`` and return it as a new DataFrame."""
    return pd.read_csv(path)


# Calling raw_data() again anywhere in the notebook restores the untouched
# dataset, so there is no need to re-run every cell to reset `df`.
df = raw_data()
df.sample(10)

In [None]:
# Name of the label column; the cells below use it to select the target from the DataFrame.
TARGET = 'Outcome'

In [None]:
# Count the missing (NaN) values in each column.
df.isna().sum()

# Data Visualization

We will plot each column.

In [None]:
# Number of rows for each distinct value of the target column.
df[TARGET].value_counts()

In [None]:
def display_histograms(df, target=None):    # we will use this later again.
    """Draw one overlaid histogram per feature, split by the target class.

    For every column except the target, rows whose target equals 1 are drawn
    in red ("Diabetes") and rows whose target equals 0 in blue ("Normal").
    Both histograms use 15 bins and ``density=True``, so each one is
    normalized to unit area and the classes can be compared even though they
    contain different numbers of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target : str, optional
        Name of the target column. Defaults to the notebook-level ``TARGET``.

    Raises
    ------
    KeyError
        If the target column is not present in ``df``.
    """
    target = TARGET if target is None else target

    # Build the class masks once instead of re-filtering the frame per column.
    is_diabetes = df[target] == 1
    is_normal = df[target] == 0

    # Select the features by name rather than by position, so the target is
    # excluded wherever it sits in the frame.
    for label in df.columns.drop(target):
        fig, ax = plt.subplots()
        ax.hist(df.loc[is_diabetes, label], color='red', label='Diabetes', alpha=0.5, density=True, bins=15)
        ax.hist(df.loc[is_normal, label], color='blue', label='Normal', alpha=0.5, density=True, bins=15)
        # density=True plots a probability density, not a probability.
        ax.set(title=label, xlabel=label, ylabel='Probability density')
        ax.legend()
        plt.show()
        plt.close(fig)  # release the figure so repeated calls do not accumulate open figures

display_histograms(df)

## Prepare data

In [None]:
# Feature matrix: every column except the last one. Label vector: the TARGET column.
X = df.loc[:, df.columns[:-1]].to_numpy()
y = df.loc[:, TARGET].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 40% of the rows, then cut that hold-out in half:
# the result is a 60% train / 20% test / 20% validation partition.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42
)


# Build model

In [None]:
# Small fully connected binary classifier: two 16-unit ReLU hidden layers
# followed by a single sigmoid output unit.
hidden_layers = [tf.keras.layers.Dense(16, activation='relu') for _ in range(2)]
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
model = tf.keras.Sequential(hidden_layers + [output_layer])

In [None]:
# Training configuration: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Baseline: evaluate the model on the test split before any call to fit().
model.evaluate(X_test, y_test)

In [None]:
# Baseline: evaluate the model on the validation split before any call to fit().
model.evaluate(X_val, y_val)

Pretty bad, because we have not trained our model yet.

In [None]:
# Train on the unscaled features for 20 epochs with mini-batches of 16 rows,
# passing the validation split so Keras reports validation metrics as well.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=16,
)

# Normalization

We will scale our data to improve performance, because it is hard for the model to draw inferences from raw, unscaled features such as BMI.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column, then rebuild a DataFrame that carries the
# original feature names plus the (unscaled) target column.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_transformed = pd.DataFrame(X_scaled, columns=df.columns[:-1]).assign(**{TARGET: y})

In [None]:
# Draw the per-feature histograms again, this time for the standardized features.
display_histograms(df_transformed)

# Oversampling for imbalanced data

Since the target labels are imbalanced, we need to oversample the minority class.

In [None]:
# Class counts before oversampling.
df_transformed[TARGET].value_counts()

In [None]:
!pip install -U imbalanced-learn

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Fixed random_state so the same rows are duplicated on every run.
sampler = RandomOverSampler(random_state=42)
X_scaled_sampled, y_sampled = sampler.fit_resample(X_scaled, y)

# NOTE(review): these arrays contain duplicated minority-class rows. They are
# fine for inspecting the class balance, but when a model is evaluated the
# resampling should be applied to the training portion only; otherwise copies
# of the same row can end up in both the training and the evaluation data.
df_transformed = pd.DataFrame(X_scaled_sampled, columns=df.columns[:-1])
df_transformed[TARGET] = y_sampled

In [None]:
# Class counts after oversampling.
df_transformed[TARGET].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first. Splitting the already-oversampled arrays puts
# copies of the same minority-class row into train, validation and test at the
# same time, which leaks training data into the evaluation sets and inflates
# the reported accuracy.
X_train_raw, X_tmp, y_train_raw, y_tmp = train_test_split(X, y, test_size=0.4, random_state=42)
X_test_raw, X_val_raw, y_test, y_val = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# validation and test rows, so no information from the evaluation sets reaches training.
split_scaler = StandardScaler().fit(X_train_raw)
X_val = split_scaler.transform(X_val_raw)
X_test = split_scaler.transform(X_test_raw)

# Balance the classes in the training set only; validation and test keep the
# class ratio of the real data.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(
    split_scaler.transform(X_train_raw), y_train_raw
)


In [None]:
# Same architecture as the first model, rebuilt so training starts from newly initialized weights.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Learning rate raised to 0.01 (the first model was compiled with 0.001).
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=['accuracy'],
)

# 20 epochs, mini-batches of 16 rows, validation metrics reported from (X_val, y_val).
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=16,
)

# Evaluate final model

In [None]:
# Final check: evaluate the trained model on the test split.
model.evaluate(X_test, y_test)

Accuracy is much better now. The validation accuracy suggests that there is no overfitting after 20 epochs.