# Preliminary Data Analysis

In [105]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [106]:
training_set_location = "train.csv"
train_raw = pd.read_csv(training_set_location)

test_set_location = "test.csv"
test_raw = pd.read_csv(test_set_location)

train_raw.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [107]:
train_raw.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

We explain the columns of the dataset

- Survived if passenger survives
- Ticket class = 1, 2, 3
- Sex = male/female
- Age = age in years, 0.5 if estimate
- Sibsp = siblings/spouses, Parch = parents/children
- ticket = ticket number
- Passenger fare
- Cabin number
- Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

## Missing values
We analyze the missing values in the training data.

In [108]:
train_raw.notna().all() # the following code indicates that only Age, Cabin, and Embarked have missing values

PassengerId     True
Survived        True
Pclass          True
Name            True
Sex             True
Age            False
SibSp           True
Parch           True
Ticket          True
Fare            True
Cabin          False
Embarked       False
dtype: bool

In [109]:
print(f"Number of passengers without known age: {len(train_df.loc[train_df["Age"].isna()])} of {len(train_df)}")
train_raw.loc[train_raw["Age"].isna()]

Number of passengers without known age: 0 of 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


The following code deals with the columns with NaNs

We now specify how we deal with missing values:
- We drop the cabin number, temporarily. This column isn't clean, some passengers have multiple.
- We impute the Age with the average age. We add a binary column, AgeKnown, if the age is exactly known.
- We create a new category for Embarked, M = missing

In [None]:
average_age = train_raw["Age"].mean(skipna=True)

def clean_dataframe(df: pd.DataFrame, average_age) -> pd.DataFrame:
    """Takes raw Titanic dataframes. Imputes NaNs for Age and Embarked, removes Cabin, and adds AgeKnown.
    Args:
        df (pd.DataFrame): Raw DataFrame
    Returns:
        pd.DataFrame: Clean DataFrame
    """
    df_clean = df.copy()
    df_clean["AgeKnown"] = ((df["Age"] % 1.0) == 0.0)|(df_clean["Age"] < 1.0)
    df_clean["Age"] = df_clean["Age"].fillna(value=average_age)
    df_clean["Embarked"] = df_clean["Embarked"].fillna(value="M")
    df_clean = df_clean.drop(columns="Cabin")

    return df_clean

In [111]:
train_clean = clean_dataframe(train_raw, average_age)
train_clean

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,AgeKnown
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,S,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C,True
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,S,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,S,True
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,S,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,S,True
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,S,True
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,S,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C,True


## Derived features
In this section, we explain certain derived features

- Add .5 category, as this indicates uncertainty in the age