In [None]:
# | label: setup
# | echo: false

import difflib as dl
import re
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import plotly.express as px
from ydata_profiling import ProfileReport

In [None]:
from itables import init_notebook_mode  # Display dataframes in a friendly manner

init_notebook_mode(all_interactive=True)

In [None]:
# | label: data_source
# | echo: false

DATA_URL = "https://hbiostat.org/data/repo/titanic3.csv"  # hopefully this is a "definitive" source

## Did a male octogenarian really survive the sinking of the RMS Titanic?

### Or: Is there a long-standing error in an oft-used dataset?

As it’s not necessarily a word we use often, let me paraphrase: did an 80-year-old guy really manage to make it out of the freezing waters to safety following the infamous maritime disaster?

The short answer is NO. However, read on and let me explain how this article came to be as part of my Data Science travels – in a [Kaggle](http://www.kaggle.com) warm-up “competition” specifically.

### Source data - "1999 Original" (`titanic3`)

The source data has moved a few times:

- TODO

The latest incarnation can be found here:

- https://hbiostat.org/data/
- https://hbiostat.org/data/repo/titanic
- https://hbiostat.org/data/repo/titanic3.csv

and has been replicated countless times including the Kaggle version.


Is it possible to find the actual date of birth for each passenger? These data sources only have age (it seems).

These data sets were downloaded from https://www.kaggle.com/c/titanic/data

#### DuckDB

In [None]:
table_name = "titanic"

In [None]:
load_data_sql = f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM read_csv_auto('{DATA_URL}')"

In [None]:
load_data_sql

In [None]:
def load_data_duckdb(sql: str) -> "duckdb.DuckDBPyConnection":
    """Open a fresh DuckDB connection and run a single load statement on it.

    Args:
        sql: The SQL statement to execute on the new connection.

    Returns:
        The open connection on which ``sql`` was executed. The caller owns it
        and is responsible for closing it.

    Raises:
        Exception: Whatever ``con.sql`` raises is propagated unchanged; the
            connection is closed first so a failed load does not leak it.
    """
    con = duckdb.connect()
    try:
        con.sql(sql)
    except Exception:
        # The caller never receives the handle on failure, so release it here.
        con.close()
        raise
    return con

In [None]:
con = load_data_duckdb(load_data_sql)
# Query through `table_name` so this stays in sync with the table that
# `load_data_sql` creates, instead of hard-coding the name a second time.
df = con.sql(f"SELECT * FROM {table_name}").df()

In [None]:
df.info()

In [None]:
# Column-aliasing example; reuse `table_name` rather than hard-coding the table.
con.sql(f"SELECT pclass as passenger_class FROM {table_name}").df()

In [None]:
# con.close()

#### Pandas (comparison)

In [None]:
pd_df = pd.read_csv(DATA_URL)

In [None]:
df.equals(pd_df)

Check local version of file in `seeds` directory

In [None]:
seeds_df = pd.read_csv("../seeds/titanic3.csv")

In [None]:
# The local seeds copy may carry the column as `home_dest` (the name may need changing
# from `home.dest` to `home_dest` to get some tools to load the file); map it back to
# `home.dest` here. Reassigning instead of using `inplace=True` avoids mutating the
# frame in place across cells.
seeds_df = seeds_df.rename(columns={"home_dest": "home.dest"})

In [None]:
seeds_df.equals(df)  # with this rename all good

In [None]:
seeds_df.info()

In [None]:
# Position-by-position comparison of column names (zip stops at the shorter frame).
[seed_col == duck_col for seed_col, duck_col in zip(seeds_df.columns, df.columns)]

In [None]:
# Position-by-position comparison of column dtypes (zip stops at the shorter frame).
[seed_dtype == duck_dtype for seed_dtype, duck_dtype in zip(seeds_df.dtypes, df.dtypes)]

In [None]:
# Stack both frames, then keep only the rows that occur exactly once in the stack:
# any row that appears more than once (e.g. once per frame) is dropped entirely.
concatenated_df = pd.concat([df, seeds_df])
differences = concatenated_df[~concatenated_df.duplicated(keep=False)]

In [None]:
len(differences) == 0

### SDF

https://www.sdf.com

In [None]:
# QUESTION: How can I get the data out of SDF to reconcile?

### Profiling

In [None]:
profile = ProfileReport(df)

In [None]:
profile.to_notebook_iframe()

Now put the age data from the other dataset(s) on this plot too.

In [None]:
# df_wikipedia = pd.read_html('https://en.m.wikipedia.org/wiki/Passengers_of_the_RMS_Titanic', header=0)

# Not working

In [None]:
facts_tables = pd.read_html("http://www.titanicfacts.net/titanic-passenger-list.html", header=0)

In [None]:
facts_tables[0].info()

In [None]:
facts_tables[1].info()

In [None]:
facts_tables[2].info()

So the "Titanic Facts" site has the data spread across 3 tables with 324 (1st class), 284 (2nd class) and 709 (3rd class) passengers respectively (1317 in total). There is age data for all 1317 passengers. Contrast this with the 1309 passengers in the Kaggle dataset, of whom only 1046 have age values [the Kaggle dataset does not claim to be complete -- in fact, somewhat disappointingly, there does not appear to be a reference for the data].

In [None]:
# Stack the three per-class tables into one frame. An outer merge on every shared
# column was previously used for this, but a merge re-sorts the rows and collapses
# (or multiplies) rows that are identical across tables; concatenation keeps every
# row exactly once, in its original order.
facts = pd.concat([facts_tables[0], facts_tables[1], facts_tables[2]], ignore_index=True)

In [None]:
facts.to_csv("facts.csv")

In [None]:
facts.info()

In [None]:
facts.head()