# Data preparation and cleaning

In [29]:
# Imports
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt

## Detecting problems

In [25]:
# Load the processed Titanic passenger file into the working frame `dft`.
titanic_csv_path = "data/titanic_processed.csv"
dft = pd.read_csv(titanic_csv_path)
# Bare last expression so the notebook renders the summary statistics table.
dft.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1063.0,1063.0,1063.0,863.0,1063.0,1063.0,1063.0
mean,445.687676,0.394167,2.301976,30.746419,0.521167,0.398871,32.455538
std,256.715401,0.488901,0.834782,17.174277,1.100109,0.808895,50.118943
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,222.0,0.0,2.0,20.0,0.0,0.0,7.925
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.5
75%,665.5,1.0,3.0,39.0,1.0,1.0,31.275
max,891.0,1.0,3.0,107.0,8.0,6.0,512.3292


### Empty values

The first and most obvious thing to check is whether the data contains empty (missing) values, which show up as `NaN`.


In [None]:
# Tally the missing entries of every column in one pass, then report them
# one line per column (name left-aligned to 12 chars, count padded to 4).
missing_per_column = dft.isna().sum()
for column_name, missing_count in missing_per_column.items():
    print(f'Column {column_name: <12} has {missing_count: 4d} NaN values')


Column PassengerId  has    0 NaN values
Column Survived     has    0 NaN values
Column Pclass       has    0 NaN values
Column Name         has    0 NaN values
Column Sex          has    0 NaN values
Column Age          has  205 NaN values
Column SibSp        has    0 NaN values
Column Parch        has    0 NaN values
Column Ticket       has    0 NaN values
Column Fare         has    0 NaN values
Column Cabin        has  826 NaN values
Column Embarked     has    2 NaN values


### Duplicates

Records have an unpleasant tendency to be duplicated. Luckily, pandas can help detect this: `DataFrame.duplicated` flags every row that is an exact copy of an earlier row.


In [24]:
# Count rows that are exact copies of an earlier row. keep='first' leaves the
# first occurrence unflagged, so only the surplus copies are counted.
# The outer literal uses double quotes: reusing the same quote character inside
# an f-string replacement field is a SyntaxError before Python 3.12.
print(f"Number of duplicates is {dft.duplicated(keep='first').sum()}")

Number of duplicates is 172


### Nonsensical values

Let's check each column to see if the values make sense.

For numerical columns, we do this by checking the range of values; for text columns, we list the unique values.

This data set has three columns for which this is not feasible: we likely have no way of determining whether a value in the "Name" column is plausible or not, and the same is true for the "Ticket" column, which holds the ticket number, and for the "Cabin" column, which holds the cabin number.

In [35]:
# Free-text / identifier columns whose values cannot be sanity-checked by
# looking at a range or a list of distinct values.
columns_to_skip = ['Name', 'Ticket', 'Cabin']

# Keep the original column order, minus the skipped columns.
columns_to_check = [name for name in dft.columns if name not in columns_to_skip]

for column_name in columns_to_check:
    values = dft[column_name]
    if not is_numeric_dtype(values):
        # Non-numeric column: list every distinct value so typos stand out.
        print(f'Column {column_name} has values {values.unique()} ')
        continue
    # Numeric column: the min/max range is enough to spot implausible values.
    print(f'Column "{column_name}" has range {values.min()} - {values.max()}')

Column "PassengerId" has range 1 - 891
Column "Survived" has range 0 - 1
Column "Pclass" has range 1 - 3
Column Sex has values ['female' 'male' 'fmale' 'caprio'] 
Column "Age" has range 0.42 - 107.0
Column "SibSp" has range 0 - 8
Column "Parch" has range 0 - 6
Column "Fare" has range 0.0 - 512.3292
Column Embarked has values ['S' 'Q' 'C' nan] 


## Fixing problems

### Duplicates