Skip to content

Commit

Permalink
Update, enhancements
Browse files Browse the repository at this point in the history
  • Loading branch information
ColinLeverger committed Oct 9, 2016
1 parent df120df commit c327ed3
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 56 deletions.
6 changes: 0 additions & 6 deletions DQR_cat

This file was deleted.

6 changes: 6 additions & 0 deletions DQR_cat.csv
@@ -0,0 +1,6 @@
"","n.non.miss","n.miss","n.miss.percent","n.unique","cat_1","freq_1","cat_2","freq_2","cat_3","freq_3","cat_4","freq_4","cat_5","freq_5","cat_6","freq_6","cat_7","freq_7","cat_8","freq_8","cat_9","freq_9","cat_10","freq_10"
"Name",1309,0,0,1307,"Connolly, Miss. Kate",2,"Kelly, Mr. James",2,"Abbing, Mr. Anthony","1","Abbott, Master. Eugene Joseph","1","Abbott, Mr. Rossmore Edward","1","Abbott, Mrs. Stanton (Rosa Hunt)","1","Abelseth, Miss. Karen Marie","1","Abelseth, Mr. Olaus Jorgensen","1","Abelson, Mr. Samuel","1","Abelson, Mrs. Samuel (Hannah Wizosky)","1"
"Sex",1309,0,0,2,"male",843,"female",466,"","","","","","","","","","","","","","","",""
"Ticket",1309,0,0,929,"CA. 2343",11,"1601",8,"CA 2144","8","3101295","7","347077","7","347082","7","PC 17608","7","S.O.C. 14879","7","113781","6","19950","6"
"Cabin",295,1014,77.46,187,"C23 C25 C27",6,"B57 B59 B63 B66",5,"G6","5","B96 B98","4","C22 C26","4","C78","4","D","4","F2","4","F33","4","F4","4"
"Embarked",1307,2,0.15,4,"S",914,"C",270,"Q","123","","","","","","","","","","","","","",""
8 changes: 0 additions & 8 deletions DQR_cont

This file was deleted.

8 changes: 8 additions & 0 deletions DQR_cont.csv
@@ -0,0 +1,8 @@
"","non-missing","missing","missing percent","unique","mean","min","p1","p5","p10","p25","p50","p75","p90","p95","p99","max"
"PassengerId",1309,0,0,1309,655,1,14.08,66.4,131.8,328,655,982,1178.2,1243.6,1295.92,1309
"Survived",891,418,31.93,3,0.38,0,0,0,0,0,0,1,1,1,1,1
"Pclass",1309,0,0,3,2.29,1,1,1,1,2,3,3,3,3,3,3
"Age",1046,263,20.09,99,29.88,0.17,0.92,5,14,21,28,39,50,57,65,80
"SibSp",1309,0,0,7,0.5,0,0,0,0,0,0,1,1,2,5,8
"Parch",1309,0,0,8,0.39,0,0,0,0,0,0,0,2,2,4,9
"Fare",1308,1,0.08,282,33.3,0,0,7.22,7.57,7.9,14.45,31.27,78.05,133.65,262.38,512.33
131 changes: 89 additions & 42 deletions exploring.R
Expand Up @@ -19,98 +19,90 @@ setwd("/Users/colinleverger/Downloads/titanic-ml/")
# Load data
missing.types <- c("NA", "")
train.data <- read.csv("data/train.csv", na.strings = missing.types)

# Rename some cols for better comprehension
colnames(train.data)
colnames(train.data)[colnames(train.data) == "SibSp"] <-
"SiblingsSpouses"
colnames(train.data)[colnames(train.data) == "ParCh"] <-
"ParentsChildren"
colnames(train.data)[colnames(train.data) == "Pclass"] <-
"PassengerClass"
colnames(train.data)
test.data <- read.csv("data/test.csv", na.string = missing.types)
total.data <- bind_rows(train.data, test.data)

#### DQR ####
checkDataQuality(train.data, out.file.num = "DQR_cont", out.file.cat = "DQR_cat")
dqr.cont <- read.csv("DQR_cont")
dqr.cat <- read.csv("DQR_cat")

typeof(train.data)

#### Exploring the data ####
checkDataQuality(total.data,
out.file.num = "DQR_cont.csv",
out.file.cat = "DQR_cat.csv")
dqr.cont <- read.csv("DQR_cont.csv")
dqr.cat <- read.csv("DQR_cat.csv")

#### Explore the data ####
# Explore difference between men's and women's death
survived <- train.data %>%
group_by(Sex) %>%
summarise(Survived = sum(Survived),
Died = n() - sum(Survived))

# Exploring difference between men's and women's death
survived.long <- melt(survived, id.vars = "Sex")
ggplot(survived.long, aes(x = variable, y = value, fill = factor(Sex))) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_discrete(name = "Gender") +
xlab("People") + ylab("Population")

# Exploring survival rates
# Explore survival rates
barplot(
table(train.data$Survived),
names.arg = c("Perished", "Survived"),
main = "Survived",
col = "black"
)

# Exploring passenger classes
barplot(table(train.data$PassengerClass),
# Explore passenger classes
barplot(table(train.data$Pclass),
main = "Passenger Classes",
col = "red")

# Exploring gender repartition
# Explore gender repartition
barplot(table(train.data$Sex),
main = "Sex (gender)",
col = "blue")

# Exploring age repartition
# Explore age repartition
hist(train.data$Age,
main = "Age",
xlab = NULL,
col = "brown")
d <- density(train.data[!is.na(train.data$Age), ]$Age)
plot(d, main = "Age density", xlab = NULL, col = "brown")

# Exploring fare paid by passengers
# Explore fare paid by passengers
hist(train.data$Fare,
main = "Fare",
xlab = NULL,
col = "red")

# Exploring Siblings and spouses repartition
barplot(table(train.data$SiblingsSpouses),
# Explore Siblings and spouses repartition
barplot(table(train.data$SibSp),
main = "Siblings & Spouses",
col = "orange")

# Exploring parents and kid repartition
# Explore parents and kid repartition
barplot(table(train.data$Parch),
main = "Parch (parents and kid)",
col = "white")

# Exploring boarding location
# Explore boarding location
barplot(
table(train.data$Embarked),
names.arg = c("Cherbourg", "Queenstown", "Southampton"),
main = "Embarked",
col = "yellow"
)

# Exploring passenger Fate by Traveling Class
# Explore passenger Fate by Traveling Class
mosaicplot(
train.data$PassengerClass ~ train.data$Survived,
train.data$Pclass ~ train.data$Survived,
main = "Passenger Fate by Traveling Class",
shade = FALSE,
color = TRUE,
xlab = "Passenger Class",
ylab = "Survived"
)

# Exploring passenger Fate by Embarked places
# Explore passenger Fate by Embarked places
mosaicplot(
train.data$Embarked ~ train.data$Survived,
main = "Passenger Fate by Embarked places",
Expand All @@ -120,22 +112,77 @@ mosaicplot(
ylab = "Survived"
)

#### Treating the data ####
# Computing median and mean
mean_age <- train.data[!is.na(train.data$Age), ] %>%
# Explore passenger Travelling Class by Age
boxplot(
Age ~ Pclass,
data = train.data,
main = "Passenger Travelling Class by Age",
xlab = "Passenger Class",
ylab = "Age"
)

# Per-gender age statistics (sum, head count, mean and median of Age),
# computed only over the rows where Age is known
mean.age <- summarise(
  group_by(train.data[!is.na(train.data$Age), ], Sex),
  Sum = sum(Age),
  "Population" = n(),
  Mean = mean(Age),
  Mediane = median(Age)
)

# Replacing missing ages
train.data[is.na(train.data$Age) & train.data$Sex == "male", ]$Age <-
mean_age[mean_age$Sex == 'male', ]$Mediane
# Extract the honorific ("Mr", "Mrs", ...) into a new categorical column:
# the first alternative strips everything up to and including ", ", the
# second strips everything from the "." that ends the honorific onwards.
train.data$Title <- gsub('(.*, )|(\\..*)', '', train.data$Name)
# Remove the honorific from Name so that "Surname, Given names" remains.
# The dot is escaped so that only a literal "." closes the honorific, and the
# character class allows spaces so that multi-word honorifics such as
# "the Countess" are removed as a whole instead of only their first word.
train.data$Name <-
  gsub('(, [a-zA-Z ]{1,20}\\. )', ', ', train.data$Name)

train.data[is.na(train.data$Age) & train.data$Sex == "female", ]$Age <-
mean_age[mean_age$Sex == 'female', ]$Mediane

# Displaying repartition of the titles
barplot(
table(train.data$Title),
main = "Title",
col = "blue"
)

# Linking it to the sex...
table(train.data$Sex, train.data$Title)

# Treatment of the rare titles: these are pooled into one 'RareTitle' level
rare.title <- c(
  'Dona', 'Lady', 'the Countess', 'Capt', 'Col', 'Don',
  'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'
)

# TODO entire dataset
# Fold the alternate spellings into 'Miss' / 'Mrs' first, then pool the
# rare titles listed above.
train.data$Title <- replace(
  train.data$Title,
  train.data$Title %in% c('Mlle', 'Ms'),
  'Miss'
)
train.data$Title <- replace(
  train.data$Title,
  train.data$Title %in% 'Mme',
  'Mrs'
)
train.data$Title <- replace(
  train.data$Title,
  train.data$Title %in% rare.title,
  'RareTitle'
)

# Explore result again
barplot(
table(train.data$Title),
main = "Titles",
col = "blue"
)
table(train.data$Sex, train.data$Title)

# Explore families
# Family size = siblings/spouses + parents/children + the passenger itself
train.data$FamilySize <- with(train.data, SibSp + Parch + 1)
barplot(table(train.data$FamilySize),
        main = "Family size repartition",
        col = "red")

# Survival counts for each family size, bars side by side
ggplot(train.data, aes(x = FamilySize, fill = factor(Survived))) +
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = 1:11) +
  labs(x = 'Family Size')

# TODO entire dataset
# Discretise the family size: exactly 1 -> 'singleton', strictly between
# 1 and 5 -> 'small', 5 and above -> 'big'; any other value stays NA.
train.data$FamilySizeD <- ifelse(
  train.data$FamilySize == 1,
  'singleton',
  ifelse(
    train.data$FamilySize >= 5,
    'big',
    ifelse(train.data$FamilySize > 1, 'small', NA_character_)
  )
)

#### Replace missing values ####
# Dummy method: replace each missing age by the median age of the passenger's
# gender, taken from the `Mediane` column of mean.age.
# The Age column is indexed directly instead of through
# `train.data[rows, ]$Age <-`: the row-subset form stops with "replacement has
# 1 row, data has 0" when no row matches (e.g. when this section is re-run
# after the ages were already filled in). which() also drops NA entries of
# the mask, which a data frame row assignment would reject.
for (gender in c("male", "female")) {
  missing.age <- which(is.na(train.data$Age) & train.data$Sex == gender)
  train.data$Age[missing.age] <- mean.age$Mediane[mean.age$Sex == gender]
}

0 comments on commit c327ed3

Please sign in to comment.