In [22]:
import numpy as np
import pandas as pd

# Read the Titanic passenger table from the CSV file next to the notebook.
filename = 'titanic.csv'
data = pd.read_csv(filename)

# Column labels and the table contents as NumPy arrays, used by the cells below.
attribute_names = data.columns.values
raw_data = data.values
print("These are the features:", attribute_names)


These are the features: ['Survived' 'Pclass' 'Name' 'Sex' 'Age' 'Siblings/Spouses Aboard'
 'Parents/Children Aboard' 'Fare']


In [23]:
# Work on a fresh copy of raw_data so this cell can be re-run safely.
data = np.array(raw_data)

# For the gender attribute we replace 'male' with the number 0 and 'female' with 1:
data[data == 'male'] = 0
data[data == 'female'] = 1

# Normalize the ratio attributes: column 4 ('Age') and column 7 ('Fare').
# NOTE: the centred values are divided by 3*std, not 1*std.
# Each column is centred on its OWN mean; the previous version subtracted the
# mean of column 6 ('Parents/Children Aboard') from the 'Fare' column.
for ratio_col in (4, 7):
    # The array has object dtype (it still holds the name strings), so cast the
    # column to float before computing the statistics.
    ratio_values = data[:, ratio_col].astype(float)
    data[:, ratio_col] = (ratio_values - ratio_values.mean()) / (3 * ratio_values.std())

In [24]:
# Applying 1-out-of-K (one-hot) encoding to the name and Pclass attributes.
# For the name only the title in front of it is used: 'Mr', 'Mrs', 'Miss' and
# 'Master' are assigned a number each. All other titles share one common number
# (about 1 percent of the dataset -- TODO: this figure still needs fact-checking).

# Extract the title (the text before the first '.') for all data points
titel_list = [name.split('.')[0] for name in raw_data[:,2]]

# Assign the correct number to each title in titel_list
titel_to_number_dict = {'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3, 'Don':4, 'Rev':4, 'Dr':4, 'Mme':4, 'Ms':4, 'Major':4, 'Lady':4, 'Sir':4, 'Mlle':4, 'Col':4, 'Capt':4, 'the Countess':4, 'Jonkheer': 4}
# Number shared by all the uncommon titles (the largest number in the mapping).
rare_titel_number = max(titel_to_number_dict.values())
# Titles missing from the mapping fall into the shared "other" group instead of
# raising a KeyError.
titel_list = [titel_to_number_dict.get(titel, rare_titel_number) for titel in titel_list]

# 1-out-of-K
titel_list_arr = np.array(titel_list)
# The width comes from the mapping, not from the titles that happen to occur, so
# the number of title columns (and the column indices used later) stays fixed.
K = rare_titel_number + 1
titel_encoding = np.zeros((titel_list_arr.size, K))
titel_encoding[np.arange(titel_list_arr.size), titel_list_arr] = 1

# Same process as above, but for Pclass
class_list = list(raw_data[:,1])
class_list_arr = np.array(class_list)
# One column per value 0..max(Pclass). The previous version took this width from
# the title array, which only matched by coincidence.
K2 = class_list_arr.max() + 1
class_encoding = np.zeros((class_list_arr.size, K2))
class_encoding[np.arange(class_list_arr.size), class_list_arr] = 1
# Drop column 0 (presumably never set because Pclass values start at 1 -- confirm).
class_encoding = class_encoding[:,1:]

In [25]:
# Swap the raw Pclass and name columns for their 1-out-of-K encodings.

# For class: keep column 0, splice the class encoding in where column 1 was,
# then append every column that followed it.
data = np.concatenate([data[:, :1], class_encoding, data[:, 2:]], axis=1)

# For name (title): the column at index 4 is replaced by the title encoding.
# (Index 4 is the name column assuming class_encoding has three columns.)
data = np.concatenate([data[:, :4], titel_encoding, data[:, 5:]], axis=1)

In [26]:
# 'Siblings/Spouses Aboard' (column 11) and 'Parents/Children Aboard' (column 12)
# span a large interval compared to the rest of the data, as the min/max show:
tuple(extreme(data[:, col]) for col in (11, 12) for extreme in (np.min, np.max))

(0, 8, 0, 6)

In [27]:
# To eliminate this bias we compress both count columns with log10(x + 1);
# the +1 keeps a count of zero at zero.
# NOTE: this overwrites `data` in place, so re-running the cell applies the
# transform a second time.
for count_col in (11, 12):
    # Cast the object-dtype column to float so log10 runs on the whole column
    # at once instead of row by row.
    counts = data[:, count_col].astype(float)
    data[:, count_col] = np.log10(counts + 1)

np.min(data[:,11]), np.max(data[:,11]), np.min(data[:,12]), np.max(data[:,12])

(0.0, 0.9542425094393249, 0.0, 0.8450980400142568)

In [28]:
#exportdata = pd.DataFrame(data)
#exportdata.to_csv('titanicdata_cleanedv2.csv', index=False)