In [1]:
import pandas as pd

# Read the raw Titanic passenger CSV (expected next to this notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Titanic Dataset.csv")

# Confirm the load, then report the frame's dimensions and its column names.
print("✓ Data loaded successfully!")
print(f"Original shape: {df.shape}")
print(f"\nOriginal columns: {list(df.columns)}")

✓ Data loaded successfully!
Original shape: (1309, 14)

Original columns: ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest']


In [2]:
# df.shape is a (rows, columns) tuple; unpack it once and report each part.
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")

Rows: 1309
Columns: 14


In [3]:
# Preview the first five passenger rows; as the cell's last expression the
# result is rendered as a table.
df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Count the NaN entries in every column of the raw data.
missing_per_column = df.isna().sum()
print("Missing values in original data:")
print(missing_per_column)

Missing values in original data:
pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [5]:
# The columns with the most missing values are body (1188), cabin (1014), and boat (823), followed by home.dest (564) and age (263).

In [6]:
# Keep only the target ('survived') and the columns used as model features.
feature_columns = ['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

# .copy() makes the result an independent DataFrame. A plain df[[...]] selection
# is still tracked by pandas as derived from the original frame, so the later
# column assignments and in-place edits on it can raise SettingWithCopyWarning.
df = df[feature_columns].copy()

print("✓ Columns selected!")
print(f"New shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

✓ Columns selected!
New shape: (1309, 8)

Columns: ['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']


In [9]:
# Names are unique identifiers, so the column is basically high-cardinality. Unless I extract titles like "Mr." or "Mrs.", the name itself doesn't help the model predict survival.

In [8]:
# It has too many missing values. Dropping the rows would lose too much data, and filling them is bad because I don't have a good reference point/info.

In [10]:
# This causes Data Leakage. If a passenger has a lifeboat number, it implies they survived. The model would just "cheat" by looking at this column rather than learning patterns from age or class and then everything is messed up.

In [11]:
# Re-count NaN entries per column now that only the selected columns remain.
missing_after_selection = df.isna().sum()
print("Missing values after feature selection:")
print(missing_after_selection)

Missing values after feature selection:
pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64


In [None]:
# 263 missing values in 'age'.

In [12]:
# 1 missing value in 'fare'.

In [13]:
# 2 missing values in 'embarked'.

In [14]:
# Median of the recorded ages; missing ages are skipped by Series.median by default.
age_column = df.loc[:, 'age']
median_age = age_column.median()
print(f"Median age: {median_age}")

Median age: 28.0


In [17]:
# Fill the missing ages with the median computed in the previous cell.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['age']: the in-place form acts on an
# intermediate Series, which pandas 2.x flags with a FutureWarning and which
# leaves df unchanged once Copy-on-Write is enabled. Assigning back is also
# how the 'fare' column is filled in this notebook.
df['age'] = df['age'].fillna(median_age)

print("✓ Missing ages filled with median!")
print(f"Missing ages now: {df['age'].isnull().sum()}")

✓ Missing ages filled with median!
Missing ages now: 0


In [18]:
# Compare the two measures of centre for age, each shown to two decimals.
# Note: the missing ages were already filled with the median in the previous
# cell, so both statistics include those imputed values.
age_values = df['age']
print(f"Median age: {age_values.median():.2f}")
print(f"Mean age: {age_values.mean():.2f}")

Median age: 28.00
Mean age: 29.50


In [None]:
# The mean (29.50) is slightly larger than the median (28.00) because age is right-skewed: a few much older passengers pull the mean up, while the median stays in the middle.

In [23]:
# Median ticket fare over the passengers that have a fare recorded.
median_fare = df['fare'].median()
print(f"Median fare: ${median_fare:.2f}")

# Build the filled column first, then assign it back over the original.
fare_filled = df['fare'].fillna(median_fare)
df['fare'] = fare_filled

print("✓ Missing fare filled!")
print(f"Missing fares now: {df['fare'].isna().sum()}")

Median fare: $14.45
✓ Missing fare filled!
Missing fares now: 0


In [24]:
# Record the current row count so the next cell can report how many rows it drops.
rows_before = df.shape[0]
print(f"Rows before dropping: {rows_before}")

Rows before dropping: 1309


In [25]:
# Drop the rows whose 'embarked' value is missing.
# df is rebound to the result instead of using dropna(..., inplace=True):
# in-place edits on a frame that was produced by selecting columns from
# another frame can raise SettingWithCopyWarning, and rebinding keeps the
# step a plain expression that behaves the same on every pandas version.
df = df.dropna(subset=['embarked'])

# rows_before was recorded in the previous cell, before any rows were removed.
rows_after = len(df)
rows_dropped = rows_before - rows_after

print("✓ Rows with missing embarked dropped!")
print(f"Rows after dropping: {rows_after}")
print(f"Rows dropped: {rows_dropped}")
print(f"Missing embarked now: {df['embarked'].isnull().sum()}")

✓ Rows with missing embarked dropped!
Rows after dropping: 1307
Rows dropped: 2
Missing embarked now: 0


In [None]:
# A small amount: about 0.15% (2 of 1,309 rows). Since we only lost 2 rows out of 1300+, it is better to drop them than to guess where those passengers embarked.

In [26]:
# Final verification: per-column NaN counts, then their grand total.
remaining_missing = df.isna().sum()
print("Final missing value check:")
print(remaining_missing)
print(f"\nTotal missing values: {remaining_missing.sum()}")

Final missing value check:
pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

Total missing values: 0


In [28]:
# No. The total number of missing values is 0, as it should be after cleaning.

In [29]:
# Summarise the numeric columns of the cleaned data (count, mean, std, min,
# quartiles, max). The table is the cell's last expression so it is rendered.
print("Summary statistics after cleaning:")
summary_stats = df.describe()
summary_stats

Summary statistics after cleaning:


Unnamed: 0,pclass,survived,age,sibsp,parch,fare
count,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0
mean,2.296863,0.381025,29.471821,0.499617,0.385616,33.209595
std,0.836942,0.485825,12.881592,1.042273,0.866092,51.748768
min,1.0,0.0,0.17,0.0,0.0,0.0
25%,2.0,0.0,22.0,0.0,0.0,7.8958
50%,3.0,0.0,28.0,0.0,0.0,14.4542
75%,3.0,1.0,35.0,1.0,0.0,31.275
max,3.0,1.0,80.0,8.0,9.0,512.3292


In [30]:
# The mean age is about 29 years and the mean fare is about $33.

In [31]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="Titanic_Cleaned.csv", index=False)
print("✓ Cleaned data saved to 'Titanic_Cleaned.csv'")

✓ Cleaned data saved to 'Titanic_Cleaned.csv'


In [None]:
# Machine learning algorithms are math equations. Most algorithms can't handle "NaN" values and will crash or throw errors. Even if they don't crash, missing data can lead to biased or inaccurate predictions.

In [None]:
# Fill when an important column has a lot of missing data, or when so many rows have missing values that dropping them would lose too much data.
# Drop when only a small percentage of rows are missing data like the 'embarked' part, or when a column is missing so much data it is useless like 'cabin'.

In [None]:
# The median is more robust to outliers. If there were a few 80-year-olds on the ship, they would pull the mean higher, potentially making it inaccurate for the average person. The median represents the true "middle" of the crowd.

In [None]:
# The code would likely fail/error out. If the library handles it automatically, it might make bad assumptions like assuming missing values are 0, leading to a model that makes bad predictions.

In [None]:
# A user skips an optional question on a survey; a weather sensor battery dies and stops recording temperature for an hour; a clerical error where a doctor forgets to write down a patient's weight.
# so could be a small deal, or a huge deal just depends.