In [1]:
import pandas as pd

In [2]:
diabetes = pd.read_csv('data/diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Normalize Data

Here we normalize data by rescaling the numeric values dataframe so that they range between 0 and 1.

The formula we use will be:

scaled value = $\frac{\text{value} - \text{min}}{\text{max} - \text{min}}$

In [3]:
def dataset_minmax(dataset):
    minmax = list()
    for i in dataset.columns:
        min_val = min(dataset[i])
        max_val = max(dataset[i])
        minmax.append([min_val, max_val])
    return minmax

def normalize_dataset(dataset, minmax):
    for i in range(len(dataset.columns)):
        dataset.iloc[:,i] = (dataset.iloc[:,i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

minmax = dataset_minmax(diabetes)
normalize_dataset(diabetes, minmax)

1      0.058824
2      0.470588
3      0.058824
4      0.000000
         ...   
763    0.588235
764    0.117647
765    0.294118
766    0.058824
767    0.058824
Name: Pregnancies, Length: 768, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dataset.iloc[:,i] = (dataset.iloc[:,i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
1      0.427136
2      0.919598
3      0.447236
4      0.688442
         ...   
763    0.507538
764    0.613065
765    0.608040
766    0.633166
767    0.467337
Name: Glucose, Length: 768, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dataset.iloc[:,i] = (dataset.iloc[:,i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
1      0.540984
2      0.524590
3      0.540984
4      0.327869
         ...   
763    0.622951
764    0.573770
765    0.590164
766    0.491803
767    0.573770
Name: BloodPressure, Length: 768, dtype: float64' has dtype incompatible wit

In [4]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0


# Standardize Data

When we standardize data, we're centering the distribution of the data on the value 0 and the standard deviation to the value 1. Together, the mean and the standard deviation can be used to summarize a normal distribution.

mean = $\frac{\sum_{i=1}^{N} \text{values}_i}{\text{count(values)}}$

standard deviation = $\sqrt{\frac{\sum_{i=1}^{N} (value_i - \text{mean})^2}{\text{count(values)} - 1}}$

standardized value_i = $\frac{value_i - \text{mean}}{\text{stdev}}$


In [5]:
diabetes = pd.read_csv('data/diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
def column_means(dataset):
    # get number of columns in dataset
    num_cols = len(dataset.columns)

    # set means for each column in the dataset to 0
    means = [0 for i in range(num_cols)]

    for i in range(num_cols):
        # get column total for each row in each column
        col_values = dataset.iloc[:, i] 
        
        # get the mean for each column
        means[i] = col_values.mean()

    return means

In [7]:
from math import sqrt

def column_stdevs(dataset, means):
    # get number of columns in dataset
    num_cols = len(dataset.columns)

    # set std dev for each column in the dataset to 0
    stdevs = [0 for i in range(num_cols)]
    for i in range(num_cols):
        variance = ((dataset.iloc[:, i] - means[i]) ** 2).sum()
        stdevs[i] = sqrt(variance / (len(dataset) - 1))

    return stdevs

In [8]:
# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    for i in range(len(dataset.columns)):
        dataset.iloc[:, i] = (dataset.iloc[:, i] - means[i]) / stdevs[i]

In [9]:
means = column_means(diabetes)
stdevs = column_stdevs(diabetes, means)
standardize_dataset(diabetes, means, stdevs)

1     -0.844335
2      1.233077
3     -0.844335
4     -1.141108
         ...   
763    1.826623
764   -0.547562
765    0.342757
766   -0.844335
767   -0.844335
Name: Pregnancies, Length: 768, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dataset.iloc[:, i] = (dataset.iloc[:, i] - means[i]) / stdevs[i]
1     -1.122665
2      1.942458
3     -0.997558
4      0.503727
         ...   
763   -0.622237
764    0.034575
765    0.003299
766    0.159683
767   -0.872451
Name: Glucose, Length: 768, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dataset.iloc[:, i] = (dataset.iloc[:, i] - means[i]) / stdevs[i]
1     -0.160441
2     -0.263769
3     -0.160441
4     -1.503707
         ...   
763    0.356200
764    0.046215
765    0.149543
766   -0.470426
767    0.046215
Name: BloodPressure, Length: 768, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compati

In [10]:
# standardized data
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639530,0.847771,0.149543,0.906679,-0.692439,0.203880,0.468187,1.425067,1.365006
1,-0.844335,-1.122665,-0.160441,0.530556,-0.692439,-0.683976,-0.364823,-0.190548,-0.731643
2,1.233077,1.942458,-0.263769,-1.287373,-0.692439,-1.102537,0.604004,-0.105515,1.365006
3,-0.844335,-0.997558,-0.160441,0.154433,0.123221,-0.493721,-0.920163,-1.040871,-0.731643
4,-1.141108,0.503727,-1.503707,0.906679,0.765337,1.408828,5.481337,-0.020483,1.365006
...,...,...,...,...,...,...,...,...,...
763,1.826623,-0.622237,0.356200,1.721613,0.869464,0.115094,-0.908090,2.530487,-0.731643
764,-0.547562,0.034575,0.046215,0.405181,-0.692439,0.609757,-0.398023,-0.530677,-0.731643
765,0.342757,0.003299,0.149543,0.154433,0.279412,-0.734711,-0.684747,-0.275580,-0.731643
766,-0.844335,0.159683,-0.470426,-1.287373,-0.692439,-0.240048,-0.370859,1.169970,1.365006
