In [3]:
import pandas as pd
import numpy as np 
import matplotlib as plt
import seaborn as sns
from csv import reader
from math import sqrt


In [11]:
df = pd.read_csv("../data/pima-indians-diabetes.data.csv")
df.shape

(767, 9)

## Normalizing Data Using python 

scaleed value = (Value - Min)/(Max -Min)

In [2]:
# Using python to load a csv file 
def load_csv(filename):
    dataset = list()
    with open(filename, "r") as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())


# Find the min and max values for each column
def dataset_minmax(dataset):
    # create an empty list that will contain the min and the max values
    minmax = list()
    # loop the length of the first row in the dataset
    for i in range(len(dataset[0])):
        # get the column values from the individual row
        col_values = [row[i] for row in dataset]

        # get the min value 
        min_value = min(col_values)
        # get the max value 
        max_value = max(col_values)

        minmax.append([min_value, max_value])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = round((row[i] - minmax[i][0])/(minmax[i][1] - minmax[i][0]),4)


# Loaded data
filename = "../data/pima-indians-diabetes.data.csv"
new_dataset = load_csv(filename)

# converting  string column to float
for col_number in range(len(new_dataset[0])):
    str_column_to_float(new_dataset, col_number)
print(new_dataset)

# Contrive small dataset
# dataset = [[50, 30], [20, 90]]
# print(f'Actual Data : {dataset}')

# Calculate min and max for each column
minmax = dataset_minmax(new_dataset)
print(f'Expected Output: {minmax}')

# Normalize columns
normalized_columns = normalize_dataset(new_dataset, minmax)
print(f'Normalized Columns : {new_dataset}')


[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0], [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0], [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0], [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0], [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0], [5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, 0.0], [3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, 1.0], [10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0, 0.0], [2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0], [8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.232, 54.0, 1.0], [4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0, 0.0], [10.0, 168.0, 74.0, 0.0, 0.0, 38.0, 0.537, 34.0, 1.0], [10.0, 139.0, 80.0, 0.0, 0.0, 27.1, 1.441, 57.0, 0.0], [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.398, 59.0, 1.0], [5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.587, 51.0, 1.0], [7.0, 100.0, 0.0, 0.0, 0.0, 30.0, 0.484, 32.0, 1.0], [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0], [7.0, 107.0, 74.0, 0.0, 0.0, 29.6, 0.254, 31.0, 1.0], [1.0, 103

# Standardizing Data Using python
The formula to calculate the mean is:
$$
\bar{x} = \frac{1}{n} \sum_{i=1}^n x_i
$$

The formula to calculate the std is:

$$
\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}
$$

The formula to calculate the standardized valuei is:

$$
standardized - value_i = \frac{value_i - mean}{stdev}
$$

In [13]:
# calculate column mean
def column_means(dataset):
    """Calculate the mean of each column in a dataset."""
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means


# calculate column standard deviations
def column_stdevs(dataset, means):
    """Calculate the std of each column in a dataset."""
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i]-means[i], 2) for row in dataset]
        stdevs[i] = sum(variance)
    stdevs = [sqrt(x/(float(len(dataset)-1))) for x in stdevs]
    return stdevs


# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = round((row[i] - means[i]) / stdevs[i], 4)


# Standardize dataset
# dataset = [[50, 30], [20, 90], [30, 50]]
# print(dataset)

filename = "../data/pima-indians-diabetes.data.csv"
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),len(dataset[0])))

# convert string columns to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print(dataset[0])

# Estimate mean and standard deviation
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)
print(means)
print(stdevs)

# standardize dataset
standardize_dataset(dataset, means, stdevs)
print(dataset[0])



Loaded data file ../data/pima-indians-diabetes.data.csv with 768 rows and 9 columns
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[3.8450520833333335, 120.89453125, 69.10546875, 20.536458333333332, 79.79947916666667, 31.992578124999977, 0.4718763020833327, 33.240885416666664, 0.3489583333333333]
[3.3695780626988623, 31.97261819513622, 19.355807170644777, 15.952217567727677, 115.24400235133837, 7.8841603203754405, 0.33132859501277484, 11.76023154067868, 0.4769513772427971]
[0.6395, 0.8478, 0.1495, 0.9067, -0.6924, 0.2039, 0.4682, 1.4251, 1.365]


# Algorithm Evaluation Methods

The goal of resampling methods is to make the best use of your training data in order to accurately estimate the performance of a model on new unseen data. Accurate estimates of performance can then be used to help you choose which set of model parameters to use or which model to select.

Once you have chosen a model, you can train for final model on the entire training dataset and start using it to make predictions. There are two common resampling methods that you can use:

1. A train and test split of your data.  
2. k-fold cross-validation.



# Train and Test Split


The train and test split is the easiest resampling method. As such, it is the most widely used. The train and test split involves separating a dataset into two parts:
1. Training Dataset : The training dataset is used by the machine learning algorithm to train the model.
2. Test Dataset : The test dataset is held back and is used to evaluate the performance of the model.



In [39]:
# Example of Splitting a Contrived Dataset into Train and Test
from random import seed
from random import randrange


# Split a dataset into a train and test set
def train_test_split(dataset, split=0.60):
    train = list()
    train_size = split * len(dataset)
    dataset_copy = list(dataset)

    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy


# test train/test split
seed(1)
# dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
filename = "../data/pima-indians-diabetes.data.csv"
dataset = load_csv(filename)
train, test = train_test_split(dataset)

print(f"Records for the training: {len(train)}")
print(train)
print(f"Records for the testing: {len(test)}")
print(test)


Records for the training: 461
[['0', '93', '60', '25', '92', '28.7', '0.532', '22', '0'], ['8', '100', '76', '0', '0', '38.7', '0.190', '42', '0'], ['7', '114', '66', '0', '0', '32.8', '0.258', '42', '1'], ['3', '142', '80', '15', '0', '32.4', '0.200', '63', '0'], ['6', '111', '64', '39', '0', '34.2', '0.260', '24', '0'], ['0', '139', '62', '17', '210', '22.1', '0.207', '21', '0'], ['10', '115', '98', '0', '0', '24.0', '1.022', '34', '0'], ['4', '99', '72', '17', '0', '25.6', '0.294', '28', '0'], ['6', '195', '70', '0', '0', '30.9', '0.328', '31', '1'], ['1', '131', '64', '14', '415', '23.7', '0.389', '21', '0'], ['6', '125', '68', '30', '120', '30.0', '0.464', '32', '0'], ['1', '71', '48', '18', '76', '20.4', '0.323', '22', '0'], ['2', '84', '50', '23', '76', '30.4', '0.968', '21', '0'], ['5', '117', '92', '0', '0', '34.1', '0.337', '38', '0'], ['0', '101', '62', '0', '0', '21.9', '0.336', '25', '0'], ['0', '91', '68', '32', '210', '39.9', '0.381', '25', '0'], ['2', '94', '76', '18', 

# k-fold Cross-Validation Split


In [32]:
def cross_validation_split(dataset, folds=3):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


# test cross validation split
seed(1)
# dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
filename = "../data/pima-indians-diabetes.data.csv"
dataset = load_csv(filename)
folds = cross_validation_split(dataset)
print(folds)

[[['0', '93', '60', '25', '92', '28.7', '0.532', '22', '0'], ['8', '100', '76', '0', '0', '38.7', '0.190', '42', '0'], ['7', '114', '66', '0', '0', '32.8', '0.258', '42', '1'], ['3', '142', '80', '15', '0', '32.4', '0.200', '63', '0'], ['6', '111', '64', '39', '0', '34.2', '0.260', '24', '0'], ['0', '139', '62', '17', '210', '22.1', '0.207', '21', '0'], ['10', '115', '98', '0', '0', '24.0', '1.022', '34', '0'], ['4', '99', '72', '17', '0', '25.6', '0.294', '28', '0'], ['6', '195', '70', '0', '0', '30.9', '0.328', '31', '1'], ['1', '131', '64', '14', '415', '23.7', '0.389', '21', '0'], ['6', '125', '68', '30', '120', '30.0', '0.464', '32', '0'], ['1', '71', '48', '18', '76', '20.4', '0.323', '22', '0'], ['2', '84', '50', '23', '76', '30.4', '0.968', '21', '0'], ['5', '117', '92', '0', '0', '34.1', '0.337', '38', '0'], ['0', '101', '62', '0', '0', '21.9', '0.336', '25', '0'], ['0', '91', '68', '32', '210', '39.9', '0.381', '25', '0'], ['2', '94', '76', '18', '66', '31.6', '0.649', '23', 

In [31]:
256 * 3

768