In [2]:

import pandas as pd
import seaborn as sns

df = sns.load_dataset("titanic")  # comes with Seaborn
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Renames columns into something better...

df = df.rename(columns={
    "sibsp": "siblings_spouses",
    "parch": "parents_children"
})
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'siblings_spouses',
       'parents_children', 'fare', 'embarked', 'class', 'who', 'adult_male',
       'deck', 'embark_town', 'alive', 'alone'],
      dtype='object')

In [7]:

# Let's see what type these data points are ...
df.dtypes

# set explicit types (to keep things tidy and memory-friendly)
df = df.astype({
    "survived": "int8",
    "pclass": "int8",
    "age": "float32",
    "fare": "float32"
})

# make categoricals expilicit
for col in ["sex", "embarked", "class", "who", "adult_male"]:
    if col in df.columns:
        df[col] = df[col].astype("category")

df.dtypes

survived                int8
pclass                  int8
sex                 category
age                  float32
siblings_spouses       int64
parents_children       int64
fare                 float32
embarked            category
class               category
who                 category
adult_male          category
deck                category
embark_town           object
alive                 object
alone                   bool
dtype: object

In [9]:
# Handling nulls
# See where the holes are...
df.isna().sum()

# simple, defensive fixes:
df["age"] = df["age"].fillna(df["age"].median())  # numeric -> median
df["embarked"] = df["embarked"].fillna(df["embarked"].mode()[0])  # categorical -> mode
df = df.dropna(subset=["fare"])  # drop if missing fare (we aren't here but this is how you would)

# See where the holes are now...
df.isna().sum()

survived              0
pclass                0
sex                   0
age                   0
siblings_spouses      0
parents_children      0
fare                  0
embarked              0
class                 0
who                   0
adult_male            0
deck                688
embark_town           2
alive                 0
alone                 0
dtype: int64

In [10]:
# Feature Engineering
# family size = self + siblings/spouses + parent/children
df["family_size"] = df["siblings_spouses"] + df["parents_children"] + 1

# age bands (ordinal cateforical)
df["age_group"] = pd.cut(
   df["age"],
   bins=[0, 12, 18, 35, 55, 120],
   labels=["child", "teen", "young_adult", "adult", "senior"],
   right=True 
)

# fare per person
df["fare_per_person"] = (df["fare"] / df["family_size"]).round(2)
df.head(3)

Unnamed: 0,survived,pclass,sex,age,siblings_spouses,parents_children,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,age_group,fare_per_person
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2,young_adult,3.62
1,1,1,female,38.0,1,0,71.283302,C,First,woman,False,C,Cherbourg,yes,False,2,adult,35.64
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1,young_adult,7.93


In [None]:
# duplicates (across a sensible subset of cols)
dupes = df.duplicated(subset=[
    "pclass", "sex", "age", "fare", "siblings_spouses", "parents_children"]).sum()
print("Duplicate rows: ", dupes) 
print("In this case due to us using median in age I have decided that these " \
"duplicates aren't really duplicates, so have decided to leave them in.")

# invalid ranges / values
invalid = {
    "negative_fare": int((df["fare"] < 0).sum()),
    "negative_age": int((df["age"] < 0).sum()),
    "bad_pclass": int((~df["pclass"].isin([1, 2, 3])).sum()),
    "non_binary_sex": int((~df["sex"].isin(["male", "female"])).sum())
}

# light assertions (will raise if something's off)
assert invalid["negative_fare"] == 0, "Found negative fares"
assert invalid["negative_age"] == 0, "Found negative ages"
assert invalid["bad_pclass"] == 0, "pclass must be 1, 2, 3"
assert invalid["non_binary_sex"] == 0, "sex must be male/female"



Duplicate rows:  136


{'negative_fare': 0, 'negative_age': 0, 'bad_pclass': 0, 'non_binary_sex': 0}

In [None]:
# Final tidy table + export
clean = df[[
    "survived", "pclass", "sex", "age", "fare", "family_size", "age_group", "fare_per_person",
    # keep the one-hots for embarked 
    *[c for c in df.columns if c.startswith("embarked_")]
]]
clean.head()

clean.to_csv("titanic_clean.csv", index=False)
print("Saved titanic_clean.csv with", len(clean), "rows.")

Saved titanic_clean_indexed.csv with 891 rows.
