Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme for every figure drawn after this point.
sns.set_theme(style="whitegrid")

# 1. Load Dataset from URL
# The CSV is fetched over the network each time this cell runs, so a fresh
# kernel needs internet access to reproduce the notebook.
url = (
    "https://raw.githubusercontent.com/"
    "datasciencedojo/datasets/master/titanic.csv"
)
df = pd.read_csv(url)

print("---  Titanic Dataset Loaded ---")
# Preview the first rows with the notebook's rich table renderer.
display(df.head())

Data Quality Check

In [None]:
# 2. Structure & Missing Values
# df.info() prints its own summary (column names, dtypes, non-null counts).
print("--- Data Info ---")
df.info()

# Count nulls per column, then show only the columns that actually have some.
missing_values = df.isna().sum()
has_missing = missing_values.gt(0)
print("\n--- Missing Values Count ---")
print(missing_values.loc[has_missing])

Data Cleaning Strategy

In [None]:
# 3. Handling Missing Data
#
# Every step below rebinds `df` to the frame returned by pandas instead of
# passing inplace=True. Assigning back is the pattern this notebook already
# recommends for the Age fill, so the whole cell now follows it consistently.

# A. Drop the 'Cabin' column (the author notes > 75% of its values are missing).
# The membership check keeps the cell re-runnable once the column is gone.
if "Cabin" in df.columns:
    df = df.drop(columns=["Cabin"])
    print(" Dropped 'Cabin' column.")

# B. Fill missing 'Age' values with the mean of the observed ages.
# Series.mean() skips NaN by default, so the mean is taken over known ages only.
average_age = df["Age"].mean()
df = df.assign(Age=df["Age"].fillna(average_age))
print(f" Filled missing ages with mean: {average_age:.1f}")

# C. Drop any remaining rows that still contain a missing value in ANY column
# (the author expects this to be a handful of rows, e.g. missing 'Embarked').
df = df.dropna()

# 4. Remove Duplicates
# Count fully identical rows first so the message reports what was removed.
duplicates = df.duplicated().sum()
df = df.drop_duplicates()
print(f" Removed {duplicates} duplicate rows.")

Analysis

In [None]:
# 5. Analysis Questions

# Q1: What is the survival rate by Gender?
# The mean of `Survived` within each `Sex` group, scaled by 100 to read as a percentage.
survival_by_gender = df.groupby("Sex")["Survived"].mean().mul(100)
print("--- Survival Rate by Gender ---", survival_by_gender, sep="\n")

# Q2: Comparison (Average Age of Survivors vs Non-Survivors)
# NOTE: if the cleaning cell has run, `Age` includes mean-imputed values,
# which pulls both group averages toward the overall mean.
age_survival = df["Age"].groupby(df["Survived"]).mean()
print("\n--- Average Age by Survival (0=Died, 1=Survived) ---", age_survival, sep="\n")

Visualization

In [None]:
# 6. Visualization
#
# Two side-by-side panels on one figure, drawn through explicit Axes objects so
# each seaborn call states which panel it targets.
fig, (ax_count, ax_rate) = plt.subplots(1, 2, figsize=(10, 5))

# seaborn 0.13 deprecates `palette=` without `hue=` (FutureWarning, slated for
# removal). Mapping `hue` to the same variable as `x` keeps the per-bar colours;
# dodge=False keeps each bar centred on its category.

# Plot 1: Survival Count (number of passengers per `Survived` value)
sns.countplot(
    data=df,
    x="Survived",
    hue="Survived",
    palette="pastel",
    dodge=False,
    ax=ax_count,
)
ax_count.set_title("Total Survivors (0=No, 1=Yes)")

# Plot 2: Survival by Gender (barplot shows the mean of `Survived` per `Sex`)
sns.barplot(
    data=df,
    x="Sex",
    y="Survived",
    hue="Sex",
    palette="muted",
    dodge=False,
    ax=ax_rate,
)
ax_rate.set_title("Survival Rate by Gender")

# Some seaborn versions add a legend whenever `hue` is set; it would only
# repeat the x tick labels here, so remove it when present.
for panel in (ax_count, ax_rate):
    legend = panel.get_legend()
    if legend is not None:
        legend.remove()

fig.tight_layout()
plt.show()

Save

In [None]:
# 7. Save Cleaned Data
# The filename lives in one variable so the written path and the confirmation
# message cannot drift apart. The path is relative to the working directory.
output_path = "cleaned_titanic_data.csv"

# index=False leaves the DataFrame's row index out of the CSV.
df.to_csv(output_path, index=False)
print(f"Dataset saved successfully as '{output_path}'")