In [1]:
"""
VARIABLE DESCRIPTIONS:
survival        Survival
                (0 = No; 1 = Yes)
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
name            Name
sex             Sex
age             Age
sibsp           Number of Siblings/Spouses Aboard
parch           Number of Parents/Children Aboard
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)

SPECIAL NOTES:
Pclass is a proxy for socio-economic status (SES)
 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

Age is in Years; Fractional if Age less than One (1)
 If the Age is Estimated, it is in the form xx.5
"""

import csv as csv
import numpy as np

# Load the training set into a 2-D numpy array of strings (one row per
# passenger). The file is opened in a `with` block so the handle is closed as
# soon as reading is done; 'rb' is the mode the Python 2 csv module expects.
with open('./data/train.csv','rb') as train_file:
    csv_file_object = csv.reader(train_file)

    #read in header and data
    header = next(csv_file_object)
    data = np.array(list(csv_file_object))

# Print every column name next to its index; later cells address columns by
# these numeric positions.
for i, column_name in enumerate(header):
    print(str(i) + "   " + column_name)

0   PassengerId
1   Survived
2   Pclass
3   Name
4   Sex
5   Age
6   SibSp
7   Parch
8   Ticket
9   Fare
10   Cabin
11   Embarked


In [8]:
fare_ceiling = 40

#run through data, if fare >= 40, replace with fare 39 (ceiling)
#note: `data` holds strings, so the replacement value is stored as text
#(it shows up as '39.0' in the printout below)
data[ data[0::,9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0

print(data[:10:,9])

#create fare brackets of size 10
fare_bracket_size = 10
#floor division keeps this an integer: it is used as an array dimension and
#as a loop bound further down, where a float would be rejected
number_of_price_brackets = fare_ceiling // fare_bracket_size

print(number_of_price_brackets)

#create number of classes (given in data: 1st, 2nd, 3rd)
number_of_classes = len(np.unique(data[0::,2]))

#in order to avoid hardcoding, see how many values "sex" can take on
number_of_sexes = len(np.unique(data[0::,4]))

print(number_of_sexes)

['7.25' '39.0' '7.925' '39.0' '8.05' '8.4583' '39.0' '21.075' '11.1333'
 '30.0708']
4
2


In [9]:
#allocate the zero-filled table of bins, indexed as
#[sex (female/male), passenger class (1st/2nd/3rd), price bracket]
table_shape = (number_of_sexes, number_of_classes, number_of_price_brackets)
survival_table = np.zeros(table_shape, dtype=float)
print(survival_table)

[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


In [22]:
#fill every (sex, class, price bracket) bin with the fraction of training
#passengers in that bin who survived

#convert the string columns once, instead of on every loop iteration
sex_column = data[0::,4]
class_column = data[0::,2].astype(float)
fare_column = data[0::,9].astype(float)
survived_column = data[0::,1].astype(float)

#first axis of survival_table: 0 = female, 1 = male
for sex_index, sex in enumerate(("female", "male")):
    for i in range(number_of_classes):
        for j in range(number_of_price_brackets):
            #passengers of this sex, in class i+1, whose fare lies in the
            #half-open bracket [j*size, (j+1)*size)
            in_bin = ((sex_column == sex) &
                      (class_column == i+1) &
                      (fare_column >= j*fare_bracket_size) &
                      (fare_column < (j+1)*fare_bracket_size))
            if in_bin.any():
                survival_table[sex_index,i,j] = np.mean(survived_column[in_bin])
            else:
                #an empty bracket has no mean (it would be nan); record 0
                survival_table[sex_index,i,j] = 0

print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]


In [23]:
#turn the survival fractions into a hard prediction, in place:
#a fraction of 0.5 or more is taken to mean "survived" (1), anything lower 0
predicted_dead = survival_table < 0.5
predicted_alive = survival_table >= 0.5
survival_table[predicted_dead] = 0
survival_table[predicted_alive] = 1

print(survival_table)

[[[ 0.  0.  1.  1.]
  [ 0.  1.  1.  1.]
  [ 1.  1.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


In [27]:
# Predict survival for every passenger in the test set by looking up their
# (sex, class, fare bracket) bin in survival_table, and write the result as
# a PassengerId,Survived CSV.
# Both files are opened in a `with` block so they are closed (and the output
# flushed to disk) when the cell finishes; 'rb'/'wb' are the modes the
# Python 2 csv module expects.
with open('./data/test.csv','rb') as test_file, \
        open("./out/genderclassmodel.csv","wb") as prediction_file:
    test_file_object = csv.reader(test_file)
    header = next(test_file_object)

    print(header)

    p = csv.writer(prediction_file)

    p.writerow(["PassengerId","Survived"])

    for row in test_file_object:
        # Array indices must be integers, so convert the class once up front.
        pclass = int(float(row[1]))
        try:
            fare = float(row[8])
        except ValueError:
            # Fare is missing or not a number: fall back to a bracket chosen
            # from the passenger class (1st -> 2, 2nd -> 1, 3rd -> 0).
            bin_fare = 3 - pclass
        else:
            if fare >= fare_ceiling:
                # Same rule as the training data, where every fare
                # >= fare_ceiling was capped into the top bracket.
                bin_fare = int(number_of_price_brackets) - 1
            else:
                # Bracket j covers [j*size, (j+1)*size).
                bin_fare = int(fare // fare_bracket_size)
        # First axis of survival_table: 0 = female, 1 = everyone else.
        sex_index = 0 if row[3] == "female" else 1
        p.writerow([row[0], "%d" %int(survival_table[sex_index, pclass-1, bin_fare])])
        


['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
