In [2]:
# Kaggle Titanic tutorial with R inspired by Trevor Stephens
# Tutorial part 1

In [3]:
# Load the Titanic training and test sets from csv files in the working directory
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")

In [4]:
# Show the column names, types and first few values of the training set
str(object = train)

'data.frame':	891 obs. of  12 variables:
 $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
 $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
 $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
 $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
 $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
 $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
 $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
 $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
 $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
 $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...


In [5]:
# Count passengers by outcome (0 = died, 1 = survived)
table(train[["Survived"]])


  0   1 
549 342 

In [6]:
# Share of passengers who died (0) and who survived (1)
prop.table(table(train[["Survived"]]))


        0         1 
0.6161616 0.3838384 

In [7]:
# For the first submission, let's just say that everyone died

# Add a Survived column to the test set, one 0 per row.
# The length comes from the data rather than a hard-coded row count, so the
# cell keeps working if test.csv has a different number of rows.
test$Survived <- rep(0, nrow(test))

# Create a submit dataframe which contains the PassengerId and Survived variables
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)

# Write the submit dataframe to a csv file (without row names)
write.csv(submit, file = "theyallperished.csv", row.names = FALSE)

In [8]:
# Tutorial part 2 - a closer look at sex and age

# Count of female and male passengers
table(train[["Sex"]])


female   male 
   314    577 

In [9]:
# Share of female and male passengers
prop.table(table(train[["Sex"]]))


  female     male 
0.352413 0.647587 

In [10]:
# Proportion who died and survived within each sex
# (margin = 1 normalises each row, so every sex sums to 1)
prop.table(table(train[["Sex"]], train[["Survived"]]), margin = 1)

        
                 0         1
  female 0.2579618 0.7420382
  male   0.8110919 0.1889081

In [12]:
# We notice that 74.2% of women survived, whereas only 18.9% of men survived
# Let's make a submission in which all women survived and all men died

# Start from the all-perished baseline so this cell does not depend on an
# earlier cell having created test$Survived, then mark every woman as a survivor
test$Survived <- 0
test$Survived[test$Sex == "female"] <- 1

# Write the new dataframe to a csv file (without row names)
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)
write.csv(submit, file = "onlywomensurvived.csv", row.names = FALSE)

In [13]:
# Check whether the Age variable has something to offer:
# five-number summary, mean and count of missing values
summary(train[["Age"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.42   20.12   28.00   29.70   38.00   80.00     177 

In [14]:
# Flag passengers under 18 in a new Child variable: 1 = child, 0 = otherwise.
# Passengers whose Age is missing are flagged 0.
train$Child <- as.numeric(!is.na(train$Age) & train$Age < 18)

In [15]:
# Survivors in each Child x Sex group (sum of the 0/1 Survived flag)
aggregate(Survived ~ Child + Sex, data = train, FUN = sum)

Child,Sex,Survived
0,female,195
1,female,38
0,male,86
1,male,23


In [17]:
# Passengers in each Child x Sex group (number of Survived entries per group)
aggregate(Survived ~ Child + Sex, data = train, FUN = length)

Child,Sex,Survived
0,female,259
1,female,55
0,male,519
1,male,58


In [18]:
# Survival rate in each Child x Sex group: survivors divided by group size
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = function(flags) sum(flags) / length(flags)
)

Child,Sex,Survived
0,female,0.7528958
1,female,0.6909091
0,male,0.1657033
1,male,0.3965517


In [19]:
# Bin Fare into four labelled ranges: "<10", "10-20", "20-30" and "30+".
# Each range includes its lower bound and excludes its upper bound.
train$Fare2 <- local({
  bin_labels <- c("<10", "10-20", "20-30", "30+")
  # findInterval() returns 0..3 for the cut points 10, 20, 30; shift to 1..4
  bin_index <- findInterval(train$Fare, c(10, 20, 30)) + 1
  # A missing fare matches none of the ranges and stays in the default "30+" bin
  bin_index[is.na(bin_index)] <- 4
  bin_labels[bin_index]
})

In [21]:
# Survival rate in each fare bin x class x sex group: survivors divided by group size
aggregate(
  Survived ~ Fare2 + Pclass + Sex,
  data = train,
  FUN = function(flags) sum(flags) / length(flags)
)

Fare2,Pclass,Sex,Survived
20-30,1,female,0.8333333
30+,1,female,0.9772727
10-20,2,female,0.9142857
20-30,2,female,0.9
30+,2,female,1.0
<10,3,female,0.59375
10-20,3,female,0.5813953
20-30,3,female,0.3333333
30+,3,female,0.125
<10,1,male,0.0


In [26]:
# Survivors in each fare bin x class x sex group (sum of the 0/1 Survived flag)
aggregate(Survived ~ Fare2 + Pclass + Sex, data = train, FUN = sum)

Fare2,Pclass,Sex,Survived
20-30,1,female,5
30+,1,female,86
10-20,2,female,32
20-30,2,female,27
30+,2,female,11
<10,3,female,38
10-20,3,female,25
20-30,3,female,7
30+,3,female,2
<10,1,male,0


In [27]:
# Passengers in each fare bin x class x sex group (number of Survived entries per group)
aggregate(Survived ~ Fare2 + Pclass + Sex, data = train, FUN = length)

Fare2,Pclass,Sex,Survived
20-30,1,female,6
30+,1,female,88
10-20,2,female,35
20-30,2,female,30
30+,2,female,11
<10,3,female,64
10-20,3,female,43
20-30,3,female,21
30+,3,female,16
<10,1,male,6


In [31]:
# Create a new submit file in which all women survive except third-class women
# who paid a fare of 20 or more (the groups with low survival rates in the
# training-set table by Fare2, Pclass and Sex)
test$Survived <- 0
test$Survived[test$Sex == "female"] <- 1
# Rows where Fare is NA are left unchanged: R does not replace elements
# selected by an NA subscript when the assigned value has length one
test$Survived[test$Sex == "female" & test$Pclass == 3 & test$Fare >= 20] <- 0

# Write the submission to a csv file (without row names)
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)
write.csv(submit, file = "mostwomensurvived.csv", row.names = FALSE)