# Data Investigation

## Configuration

In [None]:
# Set the kernel option that selects the plot output format to SVG.
options(jupyter.plot_mimetypes = "image/svg+xml")
# Attach the tidyverse; the cells below rely on read_csv, the dplyr verbs,
# the %>% pipe and ggplot.
library(tidyverse)

In [None]:
#' Round values to the nearest multiple of a step size.
#'
#' @param x Numeric vector to round.
#' @param divisor Step size; every result is a multiple of this value.
#' @return `x` divided by `divisor`, passed through `round()`, then scaled
#'   back up by `divisor`.
round_by <- function(x, divisor) {
    steps <- round(x / divisor)
    steps * divisor
}

## Data Import

In [None]:
# Read train_processed.csv (relative path, so resolved against the working
# directory) into training_data, the table every later cell works from.
training_data <- read_csv('train_processed.csv')

In [None]:
# Row count of the training set as a plain integer (the passenger count,
# assuming one row per passenger -- confirm against the CSV).
#
# pull() extracts the count column by name. first() on a data frame is
# version-dependent: since dplyr 1.1 it returns the first *row* (a 1x1 tibble)
# rather than the first column, so it no longer yields a scalar.
total_passengers <- training_data %>%
    summarize(TotalPassengers = n()) %>%
    pull(TotalPassengers)
total_passengers

In [None]:
# Display the table itself for a quick visual check of its columns and rows.
training_data

## Exploratory plots

Is passenger class an important factor in the survival rate?

In [None]:
# Survival by passenger class, with each bar split by sex.
#
# Each (Pclass, Sex) survivor count is divided by the total number of
# passengers in that class, so the stacked segments of one bar add up to the
# class's overall survival rate. sum(Survived) counts survivors only if
# Survived is coded 0/1 -- TODO confirm against the CSV.
training_data %>%
    group_by(Pclass, Sex) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: number of passengers per class, survivors or not.
        training_data %>%
            group_by(Pclass) %>%
            summarize(TotalPassengersByPclass = n()),
        by = "Pclass"
    ) %>%
    ggplot(aes(
        x = Pclass,
        y = TotalSurvived / TotalPassengersByPclass,
        fill = Sex
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        title = "Sex and Passenger class are factors to survival",
        subtitle = "Females and first class passengers are more likely to survive",
        x = "Passenger class",
        y = "Survivor percentage",
        caption = "Survivor percentage is grouped by passenger class"
    )

Is sex an important factor in the survival rate?

In [None]:
# Survival by sex, with each bar split by passenger class.
#
# Each (Sex, Pclass) survivor count is divided by the total number of
# passengers of that sex, so the stacked segments of one bar add up to that
# sex's overall survival rate. sum(Survived) counts survivors only if
# Survived is coded 0/1 -- TODO confirm against the CSV.
training_data %>%
    group_by(Sex, Pclass) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: number of passengers per sex, survivors or not.
        training_data %>%
            group_by(Sex) %>%
            summarize(TotalPassengersBySex = n()),
        by = "Sex"
    ) %>%
    ggplot(aes(
        x = Sex,
        y = TotalSurvived / TotalPassengersBySex,
        # factor() gives one legend entry per class instead of a colour
        # gradient; presumably Pclass is read as a number -- verify. It is a
        # no-op for the legend if Pclass is already categorical.
        fill = factor(Pclass)
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        title = "Sex is a factor to survival",
        subtitle = "Females are likely to survive regardless of class",
        x = "Sex",
        y = "Survivor percentage",
        fill = "Passenger class",
        caption = "Survivor percentage is grouped by sex"
    )

In [None]:
# Survival by sex, with each bar split by age rounded to the nearest 10.
#
# Each (Sex, Age) survivor count is divided by the total number of passengers
# of that sex, so the stacked segments of one bar add up to that sex's overall
# survival rate. sum(Survived) counts survivors only if Survived is coded
# 0/1 -- TODO confirm against the CSV.
training_data %>%
    mutate(Age = round_by(Age, 10)) %>%
    group_by(Sex, Age) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: number of passengers per sex, survivors or not.
        training_data %>%
            group_by(Sex) %>%
            summarize(TotalPassengersBySex = n()),
        by = "Sex"
    ) %>%
    ggplot(aes(
        x = Sex,
        y = TotalSurvived / TotalPassengersBySex,
        fill = Age
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        title = "Sex and age are factors to survival",
        x = "Sex",
        y = "Survivor percentage",
        caption = "Survivor percentage is grouped by sex"
    )

Is age an important factor in the survival rate?

In [None]:
# Survival by age (rounded to the nearest 5), with each bar split by sex.
#
# Each (Age, Sex) survivor count is divided by the total number of passengers
# in that age bucket, so the stacked segments of one bar add up to the
# bucket's overall survival rate. sum(Survived) counts survivors only if
# Survived is coded 0/1 -- TODO confirm against the CSV.
training_data %>%
    mutate(Age = round_by(Age, 5)) %>%
    group_by(Age, Sex) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: passengers per age bucket. The same rounding is applied
        # on this side so the join keys line up.
        training_data %>%
            mutate(Age = round_by(Age, 5)) %>%
            group_by(Age) %>%
            summarize(TotalPassengersByAge = n()),
        by = "Age"
    ) %>%
    ggplot(aes(
        x = Age,
        y = TotalSurvived / TotalPassengersByAge,
        fill = Sex
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        title = "Age is a factor to survival",
        x = "Age",
        y = "Survivor percentage",
        # The fill aesthetic is Sex, so the legend is titled accordingly.
        fill = "Sex",
        caption = "Survivor percentage is grouped by age"
    )

In [None]:
# Is the starting point (port of embarkation) an important factor in the survival rate?

In [None]:
# Survival by embarked location, with each bar split by family size.
#
# Each (Embarked, FamilySize) survivor count is divided by the total number of
# passengers for that embarked location, so the stacked segments of one bar
# add up to the location's overall survival rate. sum(Survived) counts
# survivors only if Survived is coded 0/1 -- TODO confirm against the CSV.
training_data %>%
    group_by(Embarked, FamilySize) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: passengers per embarked location, survivors or not.
        training_data %>%
            group_by(Embarked) %>%
            summarize(TotalPassengersByEmbarked = n()),
        by = "Embarked"
    ) %>%
    ggplot(aes(
        x = Embarked,
        y = TotalSurvived / TotalPassengersByEmbarked,
        fill = FamilySize
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        title = "Embarked location is a factor to survival",
        subtitle = "Those from C are more likely to survive than those from Q and S",
        x = "Embarked location",
        y = "Survivor percentage",
        fill = "Family size",
        caption = "Survivor percentage is grouped by embarked location"
    )

In [None]:
# Survival rate for each passenger title.
#
# Survivors and passenger totals share the same grouping key here, so both are
# computed in one summarize() instead of joining the table to itself.
# sum(Survived) counts survivors only if Survived is coded 0/1 -- TODO confirm
# against the CSV.
training_data %>%
    group_by(NameTitle) %>%
    summarize(
        TotalSurvived = sum(Survived),
        TotalPassengersByNameTitle = n()
    ) %>%
    ggplot(aes(
        x = NameTitle,
        y = TotalSurvived / TotalPassengersByNameTitle
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        x = "Passenger's title (Mr., Mrs., etc.)",
        y = "Survivor percentage",
        caption = "Survivor percentage is grouped by title"
    )

In [None]:
# Survival by deck, with each bar split by passenger class.
#
# Each (Deck, Pclass) survivor count is divided by the total number of
# passengers on that deck, so the stacked segments of one bar add up to the
# deck's overall survival rate. sum(Survived) counts survivors only if
# Survived is coded 0/1 -- TODO confirm against the CSV.
training_data %>%
    group_by(Deck, Pclass) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: number of passengers per deck, survivors or not.
        training_data %>%
            group_by(Deck) %>%
            summarize(TotalPassengersByDeck = n()),
        by = "Deck"
    ) %>%
    ggplot(aes(
        x = Deck,
        y = TotalSurvived / TotalPassengersByDeck,
        # factor() gives one legend entry per class instead of a colour
        # gradient; presumably Pclass is read as a number -- verify. It is a
        # no-op for the legend if Pclass is already categorical.
        fill = factor(Pclass)
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        title = "Deck has marginal impact on survival",
        subtitle = "Most survival rates are near 75%, irrespective of deck",
        x = "Deck",
        y = "Survivor percentage",
        fill = "Passenger class",
        caption = "Survivor percentage is grouped by deck"
    )

In [None]:
# Survival by fare per person (rounded to the nearest 20), with each bar split
# by passenger class.
#
# Each (FarePerPerson, Pclass) survivor count is divided by the total number
# of passengers in that fare bucket, so the stacked segments of one bar add up
# to the bucket's overall survival rate. sum(Survived) counts survivors only
# if Survived is coded 0/1 -- TODO confirm against the CSV.
training_data %>%
    mutate(FarePerPerson = round_by(FarePerPerson, 20)) %>%
    group_by(FarePerPerson, Pclass) %>%
    summarize(TotalSurvived = sum(Survived)) %>%
    # summarize() only drops the last grouping level; clear the rest.
    ungroup() %>%
    left_join(
        # Denominator: passengers per fare bucket. The same rounding is
        # applied on this side so the join keys line up.
        training_data %>%
            mutate(FarePerPerson = round_by(FarePerPerson, 20)) %>%
            group_by(FarePerPerson) %>%
            summarize(TotalPassengersByFarePerPerson = n()),
        by = "FarePerPerson"
    ) %>%
    ggplot(aes(
        x = FarePerPerson,
        y = TotalSurvived / TotalPassengersByFarePerPerson,
        # factor() gives one legend entry per class instead of a colour
        # gradient; presumably Pclass is read as a number -- verify. It is a
        # no-op for the legend if Pclass is already categorical.
        fill = factor(Pclass)
    )) +
    geom_col() +
    # Fix the y axis to the full 0-1 range so the charts are comparable.
    coord_cartesian(ylim = c(0, 1)) +
    labs(
        x = "Fare per family member",
        y = "Survivor percentage",
        fill = "Passenger class",
        caption = "Survivor percentage is grouped by fare per family member"
    )

In [None]:
# Violin plot of age for each sex, with ages first rounded to the nearest 5
# by round_by.
ggplot(
    data = mutate(training_data, Age = round_by(Age, 5)),
    mapping = aes(x = Sex, y = Age)
) +
    geom_violin()

In [None]:
# Family size against passenger class, drawn as jittered points.
ggplot(training_data, aes(x = FamilySize, y = Pclass)) +
    geom_jitter()

In [None]:
# Minimum, quartiles, median and maximum of Fare within each passenger class.
summarize(
    group_by(training_data, Pclass),
    min = min(Fare),
    lower_quartile = quantile(Fare, 0.25),
    median = median(Fare),
    upper_quartile = quantile(Fare, 0.75),
    max = max(Fare)
)