In [108]:
import numpy as np
import pandas as pd

# Location of the raw CSV file, given relative to the current working directory.
filename = 'sd-data_raw.csv'
data = pd.read_csv(filename)

# Keep NumPy versions of both the header labels and the cell values of the frame.
attribute_names = data.columns.to_numpy()  # attribute (column) names
raw_data = data.to_numpy()  # cell values as a NumPy array
print("These are the features:", attribute_names)

These are the features: ['sex' 'age' 'height' 'weight' 'waistline' 'sight_left' 'sight_right'
 'hear_left' 'hear_right' 'SBP' 'DBP' 'BLDS' 'tot_chole' 'HDL_chole'
 'LDL_chole' 'triglyceride' 'hemoglobin' 'urine_protein'
 'serum_creatinine' 'SGOT_AST' 'SGOT_ALT' 'gamma_GTP' 'SMK_stat_type_cd'
 'DRK_YN']


In [109]:
# Work on a fresh NumPy copy so that raw_data itself is left untouched.
data = np.array(raw_data)

# Recode the text labels as numbers, applied in this order:
#   'Male' -> 0, 'Female' -> 1   (gender attribute)
#   'Y'    -> 1, 'N'      -> 0   (drink yes/no)
# Every cell of the matrix equal to a label is replaced, whichever column it sits in.
for label, code in (('Male', 0), ('Female', 1), ('Y', 1), ('N', 0)):
    data[data == label] = code

In [110]:
# Apply 1-out-of-K (one-hot) encoding to some of the discrete variables.
# Category counts noted by the author: hear_left and hear_right K=2,
# urine_protein K=6 and smoking state (SMK_stat_type_cd) K=3.
# Before encoding, each key maps to the name of the column to encode;
# after this cell has run, each key maps to that column's encoding matrix.
one_out_of_K_cols_dict = {'encoding1':'hear_left', 'encoding2':'hear_right', 'encoding3':'urine_protein', 'encoding4':'SMK_stat_type_cd'}

# Guard against re-running this cell on an already-encoded matrix: column positions
# are looked up in attribute_names, which describes the raw (un-encoded) layout.
if data.shape[1] != len(attribute_names):
    raise RuntimeError("data no longer matches attribute_names; re-run the cells above before encoding again")

# Resolve every column to encode by NAME, so each encoding ends up under the key
# that names it regardless of the order of the dict or of the CSV columns.
column_positions = {name: position for position, name in enumerate(attribute_names)}
missing_columns = [name for name in one_out_of_K_cols_dict.values() if name not in column_positions]
if missing_columns:
    raise KeyError("columns to encode not found in attribute_names: {}".format(missing_columns))

# Indices of the columns that need encoding, in the same order as the dict keys
indices_list = [column_positions[name] for name in one_out_of_K_cols_dict.values()]

# Build the encoding matrix for every key in one_out_of_K_cols_dict
row_count = data.shape[0]
for key, column_index in zip(list(one_out_of_K_cols_dict), indices_list):
    category_codes = data[:, column_index].astype(int)
    # The codes run from 1..n rather than 0..n-1, so allocate one extra column
    # (index 0) that no row selects ...
    K = int(category_codes.max()) + 1
    encoding = np.zeros((row_count, K))
    encoding[np.arange(row_count), category_codes] = 1
    # ... and drop that first column again.
    encoding = encoding[:, 1:]
    # Store the encoding as the value in the dict (replaces the column name)
    one_out_of_K_cols_dict[key] = encoding

# Delete the old (un-encoded) data columns in a single pass
data = np.delete(data, indices_list, axis=1)

# Prepend the encoded columns in a single concatenation. The blocks are placed in
# reverse key order, so the last key (encoding for 'SMK_stat_type_cd') supplies the
# first columns of the data matrix, followed by 'urine_protein', 'hear_right' and
# 'hear_left', and finally the remaining original columns.
# NOTE: attribute_names is not updated here and still lists the raw columns.
encoded_blocks = [one_out_of_K_cols_dict[key].astype(data.dtype) for key in reversed(list(one_out_of_K_cols_dict))]
data = np.hstack(encoded_blocks + [data])

In [111]:
# Print small floats in plain decimal notation instead of scientific notation.
np.set_printoptions(suppress=True)

# To get an overview of where min-max scaling methods should be applied, collect the
# smallest and largest value of every attribute (column of data):
# row 0 holds the minima, row 1 holds the maxima.
column_minima = [min(column) for column in data.T]
column_maxima = [max(column) for column in data.T]
min_max_matrix = np.array([column_minima, column_maxima], dtype=float)

min_max_matrix

array([[   0. ,    0. ,    0. ,    0. ,    0. ,    0. ,    0. ,    0. ,
           0. ,    0. ,    0. ,    0. ,    0. ,    0. ,   20. ,  130. ,
          25. ,    8. ,    0.1,    0.1,   67. ,   32. ,   25. ,   30. ,
           1. ,    1. ,    1. ,    1. ,    0.1,    1. ,    1. ,    1. ,
           0. ],
       [   1. ,    1. ,    1. ,    1. ,    1. ,    1. ,    1. ,    1. ,
           1. ,    1. ,    1. ,    1. ,    1. ,    1. ,   85. ,  190. ,
         140. ,  999. ,    9.9,    9.9,  273. ,  185. ,  852. , 2344. ,
        8110. , 5119. , 9490. ,   25. ,   98. , 9999. , 7210. ,  999. ,
           1. ]])