In [2]:
import numpy as np
import pandas as pd

# Read the raw CSV export into a pandas DataFrame.
filename = 'sd-data_raw.csv'
data = pd.read_csv(filename)

# Column labels as a NumPy array.
attribute_names = np.asarray(data.columns)
# Cell values as a NumPy array (the DataFrame -> ndarray hand-off).
raw_data = data.to_numpy()
print("These are the features:",attribute_names)

These are the features: ['sex' 'age' 'height' 'weight' 'waistline' 'sight_left' 'sight_right'
 'hear_left' 'hear_right' 'SBP' 'DBP' 'BLDS' 'tot_chole' 'HDL_chole'
 'LDL_chole' 'triglyceride' 'hemoglobin' 'urine_protein'
 'serum_creatinine' 'SGOT_AST' 'SGOT_ALT' 'gamma_GTP' 'SMK_stat_type_cd'
 'DRK_YN']


In [3]:
# Work on a fresh NumPy copy of raw_data (np.array copies by default).
data = np.array(raw_data)

# Swap the text labels for numeric codes, wherever they occur in the array:
#   gender:       'Male' -> 0, 'Female' -> 1
#   drink yes/no: 'Y'    -> 1, 'N'      -> 0
for text_label, numeric_code in (('Male', 0), ('Female', 1), ('Y', 1), ('N', 0)):
    data[data == text_label] = numeric_code

In [4]:
# 'sight_left' (column 5) and 'sight_right' (column 6) range from 0.1 to 2.5 depending on how good the sight is.
# The value 9.9 is a code for "blind"; it is replaced by 0, which is then the worst possible sight.
for sight_col in (5, 6):
    is_blind = data[:, sight_col] == 9.9
    data[is_blind, sight_col] = 0

In [5]:
# Collect the outlier rows: SGOT_AST (column 19) above 3000, or waistline (column 4)
# equal to 999 or below 40.
sgot_ast = data[:, 19]
waistline = data[:, 4]
is_outlier = (sgot_ast > 3000) | (waistline == 999) | (waistline < 40)
indices = np.flatnonzero(is_outlier).tolist()
# Remove the flagged rows
data = np.delete(data, indices, axis=0)

In [6]:
# 1-out-of-K (one-hot) encoding of some of the discrete attributes:
#   hear_left and hear_right (K=2) and SMK_stat_type_cd, the smoking state (K=3).
# urine_protein is NOT encoded here: the dict below has no entry for it, so it stays a single column.
one_out_of_K_cols_dict = {'encoding1':'hear_left', 'encoding2':'hear_right', 'encoding4':'SMK_stat_type_cd'}

# Positions (in attribute_names order) of the columns that need encoding
indices_list = [position for position, attribute in enumerate(attribute_names)
                if attribute in one_out_of_K_cols_dict.values()]

# Build one indicator matrix per column and store it in the dict in place of the column name.
# Dict keys and indices_list are paired by position, so both must list the columns in the same order.
for dict_key, column_index in zip(list(one_out_of_K_cols_dict), indices_list):
    column = data[:, column_index]
    row_count = column.size
    # The codes are assumed to run from 1 to n, so allocate max+1 slots and let each code
    # index its slot directly ...
    K = int(column.max()) + 1
    encoding = np.zeros((row_count, K))
    encoding[np.arange(row_count), column.astype(int)] = 1
    # ... then drop slot 0, which no code uses.
    one_out_of_K_cols_dict[dict_key] = encoding[:, 1:]

# Remove the original (un-encoded) columns in a single call
data = np.delete(data, indices_list, axis=1)

# Prepend the indicator blocks one dict entry at a time. Each block lands in front of the
# previous one, so the leading columns end up in the reverse of the dict order:
# SMK_stat_type_cd first, then hear_right, then hear_left, then the remaining attributes.
for indicator_block in one_out_of_K_cols_dict.values():
    data = np.concatenate((indicator_block.astype(data.dtype), data), axis=1)

In [7]:
# Standardize the data: subtract the column mean and divide by the column standard deviation.
# The loop starts at column 8, i.e. after column 7 (sex), and skips column 27 (drink Y/N);
# both of those are binary. Columns 0-6 are never touched.
for column_index in range(8, data.shape[1]):
    if column_index == 27:
        continue
    column = data[:, column_index]
    data[:, column_index] = (column - np.mean(column)) / np.std(column)

In [11]:
# Column labels for the cleaned matrix, in the order the columns actually appear in `data`.
# The 1-out-of-K blocks are prepended one after another (hear_left first, then hear_right, then
# SMK_stat_type_cd), and every prepended block lands in front of the previous one. The leading
# columns are therefore: smoking state, hearing RIGHT, hearing LEFT -- so the HearingR labels
# have to come before the HearingL labels.
new_atts = ['SM1','SM2','SM3','HearingR1', 'HearingR2', 'HearingL1', 'HearingL2','sex', 'age', 'height', 'weight', 'waistline', 'sight_left', 'sight_right',
 'SBP' ,'DBP', 'BLDS' ,'tot_chole', 'HDL_chole',
 'LDL_chole', 'triglyceride', 'hemoglobin', 'urine_protein',
 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT' ,'gamma_GTP',
 'DRK_YN']
len(new_atts)

28

In [12]:
# Wrap the cleaned matrix in a DataFrame and write it to CSV, using new_atts as the header row
# and leaving out the row index.
exportdata = pd.DataFrame(data)
exportdata.to_csv(path_or_buf='sd-data_cleanedv1.csv', header=new_atts, index=False)