In [None]:
# Predict whether a person will be diagnosed with diabetes or not.
# We have a dataset of 768 people who were or were not diagnosed with diabetes.

In [23]:

#  Imports necessary libraries: pandas for data manipulation, numpy for numerical operations,
#  and scikit-learn for machine learning tasks.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Preprocessing in the context of machine learning refers to the transformations applied to your data
# before feeding it into an algorithm. It's a crucial step to clean, format, and organize the raw data, 
# making it suitable for model training and improving the performance of the model.

In [24]:
# StandardScaler is a specific preprocessing technique provided by the scikit-learn library.
# It standardizes features by removing the mean and scaling to unit variance.
# The standard score of a sample x is calculated as: z = (x - u) / s, where u is the mean
# of the training samples or zero if with_mean=False, and s is the standard deviation of
# the training samples or one if with_std=False.
# This process helps to ensure that all features have a similar scale, which is important
# for many machine learning algorithms that are sensitive to the scale of the input features,
# such as K-Nearest Neighbors (KNN), Support Vector Machines (SVM), and algorithms that use
# gradient descent.

In [25]:
# Load the diabetes records from the CSV file in the working directory,
# then show the number of rows as the cell's output.
dataset = pd.read_csv('diabetes.csv')
dataset.shape[0]

768

In [26]:

# Preview the first five rows to check the column names and typical values.
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [27]:
# Columns in which a 0 is treated as a missing measurement and gets replaced
zero_not_accepted = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Insulin',
]


In [28]:
# Mean-impute the zero placeholders in each column listed in zero_not_accepted.
# NOTE(review): the mean is computed over the full dataset, and this cell runs
# before the train/test split, so test rows contribute to the fill value.
for column in zero_not_accepted:
    # Mark zeros as missing so they are left out of the mean
    dataset[column]=dataset[column].replace(0, np.nan)
    # int() truncates the mean, so the fill value is always a whole number
    mean = int(dataset[column].mean(skipna=True))
    # Fill the missing entries with that truncated mean
    dataset[column] = dataset[column].replace(np.nan, mean)

# 1. `dataset[column] = dataset[column].replace(0, np.nan)`: It replaces all occurrences of the value 0 in the current column with `np.nan` (Not a Number).
    # This is done because in the context of this dataset and these specific columns (Glucose, BloodPressure, etc.), a value of 0 likely represents missing data rather than an actual measurement of zero.
    # Replacing 0 with `np.nan` allows Pandas and NumPy functions to correctly handle these as missing values.

    
# 2. `mean = int(dataset[column].mean(skipna=True))`: It calculates the mean of the values in the
    # current column, ignoring the `np.nan` values that were just introduced.
    # The `skipna=True` argument ensures that the missing values are excluded from the mean calculation. The result is then cast to an integer using `int()`, which truncates the fractional part. This integer mean will be used to fill the missing values.

    
# 3. `dataset[column] = dataset[column].replace(np.nan, mean)`: It replaces all the `np.nan` values (which were originally 0s) in the
    # current column with the calculated integer mean. This is a common technique for handling missing data, known as mean imputation. By replacing missing values with the mean, you are essentially filling in the gaps with a central tendency of the existing data.

# In essence, this code snippet is performing data cleaning and imputation on specific columns of the dataset. It assumes that 0 in these columns indicates missing data and replaces these missing values with the mean of the non-missing values in that column.
# This is often done before training a machine learning model because many models cannot handle missing values, and imputing them allows the model to use all the available data.

In [29]:
# Split the dataset into features and target, then into train and test sets.
# The target is selected by its column name and every other column is used as
# a feature, so the split does not depend on hard-coded column positions.
X = dataset.drop(columns='Outcome')
y = dataset['Outcome']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [30]:
# Sanity-check the split: one size per line, in the order
# X_train, y_train, X_test, y_test.
print(len(X_train), len(y_train), len(X_test), len(y_test), sep='\n')

614
614
154
154


In [31]:
# Feature scaling: fit the scaler on the training rows only, then apply that
# same fitted transformation to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)


In [32]:
# Define the model: a K-NN classifier with k = 11 neighbours and the
# Euclidean distance metric.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

In [33]:
# Train the classifier on the training features and their labels.
classifier.fit(X=X_train, y=y_train)

In [34]:
# Predict a class label for every row of the test set; the trailing
# expression displays the predictions as the cell output.
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [35]:
# Evaluate the model: print the confusion matrix of true vs. predicted labels,
# followed by the F1 score on the next line.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm, f1_score(y_true=y_test, y_pred=y_pred), sep='\n')

[[94 13]
 [15 32]]
0.6956521739130435


In [36]:
# Report the accuracy of the predictions against the true test labels.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8181818181818182
