# Loading data from a csv file, and pre-processing data for further analysis


<div style="text-align: justify"> A <a href="https://en.wikipedia.org/wiki/Comma-separated_values">comma separated values</a> (CSV) file contains different values separated by a delimiter, which acts as a database table or an intermediate form of a database table. In other words, a CSV file file is a set of database rows and columns stored in a text file such that the rows are separated by a new line while the columns are separated by a semicolon or a comma. A CSV file is primarily used to transport data between two databases of different formats through a computer program.</div>

In [49]:
import csv
import random
from math import sqrt

In [51]:
## Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file: 
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row) # adding row to the dataset
    return dataset

In [52]:
## Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

In [53]:
## Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [54]:
# Load iris dataset
filename = 'iris.csv'
dataset = load_csv(filename)

print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
print('First row of the dataset: ', dataset[0]) # print first line of the dataset
print('--------------------------------------')

# convert string columns to float 
for i in range(len(dataset[0])-1): # loop on all columns
    str_column_to_float(dataset, i)
# convert class column to int
lookup = str_column_to_int(dataset, 4)

print('First row of modified dataset: ', dataset[0]) # print first line of updated dataset
print(lookup) # print lookup dictionary containing the classification of the species and their corresponding number

Loaded data file iris.csv with 150 rows and 5 columns
First row of the dataset:  ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
--------------------------------------
First row of modified dataset:  [5.1, 3.5, 1.4, 0.2, 0]
{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}


In [44]:
##### Normalize Data ###########

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        colvalues = [row[i] for row in dataset]
        min_value = min(colvalues) 
        max_value = max(colvalues)
        minmax.append([min_value, max_value])
    return minmax

# Normalize the dataset except last row for classification values
def Normalize_Dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)-1):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

In [45]:
# Calculate min and max for each column
minmax = dataset_minmax(dataset)
# Normalize columns
Normalize_Dataset(dataset, minmax)
print('First row of normalized dataset: ', dataset[0])

First row of normalized dataset:  [0.22222222222222202, 0.6249999999999999, 0.06779661016949144, 0.041666666666666644, -1.2206555615733707]


In [55]:
#### Standardize Data ######

# Calculate column means
def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means

# Calculate column standard deviations
def column_stdevs(dataset, means):
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i]-means[i], 2) for row in dataset]
        stdevs[i] = sum(variance)
        stdevs = [sqrt(x/(float(len(dataset)-1))) for x in stdevs]
    return stdevs

# Standardize the dataset
def Standardize_Dataset(dataset, means, stdevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stdevs[i]

In [75]:
# Estimate mean and standard deviation
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)
# standardize dataset
Standardize_Dataset(dataset, means, stdevs)
print(dataset[0])

[-9.179297646015436e+22, 98315463418529.08, -4160058.767277357, -194.9783686428172, -1.2206555615733707]
