In [None]:
import seaborn as sns
import pandas as pd

# Pull the Titanic passenger table from seaborn's example datasets
dataset_name = "titanic"
df = sns.load_dataset(dataset_name)
print(df)

In [None]:
# Preview the first rows (head() shows 5 by default)
print(df.head(), end="\n\n")

# Preview the last rows (tail() shows 5 by default)
print(df.tail(), end="\n\n")

# Report the (rows, columns) shape
print(f"Shape = {df.shape}", end="\n\n")

# Column names, dtypes and non-null counts.
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print()

# Summary statistics
print(df.describe())

In [None]:
# Count missing values per column
print(df.isnull().sum(), end="\n\n")

# Fill missing ages with the median age
df['age'] = df['age'].fillna(df['age'].median())
print(df['age'], end="\n\n")

# Fill missing embarked codes with the most frequent value (first mode)
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
print(df['embarked'], end="\n\n")

# Drop the deck column (presumably because it is mostly missing -- confirm
# against the null counts printed above).
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=['deck'], errors='ignore')

# info() prints its own report and returns None, so it is not wrapped in print()
df.info()
print()

In [None]:
# How many rows are exact repeats of an earlier row?
n_duplicates_before = df.duplicated().sum()
print(n_duplicates_before)

# Discard the repeated rows in place, then confirm that none remain
df.drop_duplicates(inplace=True)
n_duplicates_after = df.duplicated().sum()
print(n_duplicates_after)

In [None]:
# Columns whose labels are normalised to lower-case, whitespace-trimmed strings
cols_to_clean = ['sex', 'embarked', 'class', 'who', 'alive']

# Pass 1: rewrite every listed column as cleaned text
for col in cols_to_clean:
    as_text = df[col].astype(str)
    df[col] = as_text.str.lower().str.strip()

# Pass 2: show the distinct labels left in each column, one array per line
print(*(df[col].unique() for col in cols_to_clean), sep="\n")

In [None]:
# Convert categorical columns into the pandas 'category' dtype
for cat_col in ('pclass', 'sex', 'embarked'):
    df[cat_col] = df[cat_col].astype('category')

# Convert alive ('yes'/'no') into a real boolean.
# astype(bool) must not be applied to the strings directly: every non-empty
# string is truthy, so both 'yes' and 'no' would become True.
# The dtype guard makes the cell safe to re-run after the column is already bool.
if not pd.api.types.is_bool_dtype(df['alive']):
    alive_text = df['alive'].astype(str).str.strip().str.lower()
    alive_flags = alive_text.map({'yes': True, 'no': False})

    # map() yields NaN for any label outside the mapping; fail loudly instead
    # of silently coercing unknown labels.
    unmapped = alive_flags.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected values in 'alive': {df.loc[unmapped, 'alive'].unique()}"
        )

    df['alive'] = alive_flags.astype(bool)

df.info()

In [None]:
# Inspect the spread of fare and age to spot extreme values
for numeric_col in ('fare', 'age'):
    print(df[numeric_col].describe(), end="\n\n")

# Cap fares at the 99th percentile; the original fare column is kept and the
# capped values go into a new fare_capped column
upper_limit = df['fare'].quantile(q=0.99)
fare_capped = df['fare'].clip(upper=upper_limit)
df['fare_capped'] = fare_capped
print(df['fare_capped'].describe())

In [None]:
# Rename the abbreviated columns to descriptive names.
# The counts refer to relatives travelling on the ship, so the names use
# "aboard" (the previous "abroad" spelling was a typo).
df = df.rename(columns={
    'sibsp': 'siblings_spouses_aboard',
    'parch': 'parents_children_aboard'
})

# info() prints its own report and returns None, so it is not wrapped in print()
df.info()

In [None]:
# Pre-final check

# Remaining missing values per column
print(df.isnull().sum(), end="\n\n")

# Column dtypes after the conversions
print(df.dtypes, end="\n\n")

# Number of fully duplicated rows still present
print(df.duplicated().sum(), end="\n\n")

# info() prints its own report and returns None; calling it directly avoids
# the stray "None" that print(df.info()) would emit.
df.info()
print()

print(df.head())

In [None]:
# Final changes

# Optional: drop the embarked column (left disabled)
# df.drop(columns="embarked", inplace=True)

# Fill missing values in embark_town with the mode.
# fillna() acts on the real missing values. The previous approach converted
# the column to text and replaced the literal string 'nan', which only works
# when the installed pandas stringifies NaN as 'nan'.
mode_value = df['embark_town'].mode()[0]
df['embark_town'] = df['embark_town'].fillna(mode_value).astype('str')

# Standardize the embark_town column (lower-case, trimmed) and store it as a category
df['embark_town'] = df['embark_town'].str.lower().str.strip()
df['embark_town'] = df['embark_town'].astype('category')

# Reset index so row labels are contiguous again (dropping duplicate rows
# earlier leaves gaps in the index)
df = df.reset_index(drop=True)

In [None]:
# Final check

# Missing values per column (expected to be zero for the imputed columns)
print(df.isnull().sum(), end="\n\n")

# Final column dtypes
print(df.dtypes, end="\n\n")

# info() prints its own report and returns None; calling it directly avoids
# the stray "None" that print(df.info()) would emit.
df.info()
print()

print(df.head(), end="\n\n")

In [None]:
# Write the cleaned table to CSV, leaving the row index out of the file
output_path = "titanic_cleaned.csv"
df.to_csv(output_path, index=False)