# Titanic Dataset Cleaning & Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from scipy import stats

In [None]:
# Load the dataset from the CSV file in the working directory
df = pd.read_csv("train.csv")

# Print a quick overview: first rows, column dtypes/non-null counts, and
# the number of missing values per column.
print("===== Initial Dataset Overview =====")
print(df.head(), "\n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" after the summary.
df.info()
print()
print("Missing values before cleaning:\n", df.isnull().sum(), "\n")

In [None]:
# Remove the columns this notebook treats as irrelevant or non-informative.
# The frame is modified in place, so `df` keeps referring to the same object.
columns_to_drop = ['Cabin', 'Ticket', 'Name']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Fill missing values
# Age -> fill with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x
# and does not modify `df` at all once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Embarked → fill with mode (most frequent)
# mode() returns a Series (there can be ties); [0] takes its first entry.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x
# and does not modify `df` at all once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Encode categorical columns
# Use one LabelEncoder per column: refitting a single shared encoder on the
# second column overwrites the classes learned for the first, so the first
# column's mapping could no longer be inspected or inverse-transformed.
# LabelEncoder assigns integer codes in sorted order of the labels
# (so for the labels 'female'/'male': female=0, male=1).
sex_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])

embarked_encoder = LabelEncoder()
df['Embarked'] = embarked_encoder.fit_transform(df['Embarked'])

# Keep the original name bound to the encoder that was fitted last
# (Embarked), so any later cell that reads `label_encoder` still works.
label_encoder = embarked_encoder

In [None]:
# Handle outliers in numeric columns using z-score
# A row is kept only if every numeric column has an absolute z-score below 3.
# NOTE(review): numeric_cols is every numeric column in df, which includes
# any categorical columns that were already integer-encoded (e.g. Sex,
# Embarked); consider restricting the filter to continuous features.
numeric_cols = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(stats.zscore(df[numeric_cols]))
within_limits = (z_scores < 3).all(axis=1)
# .copy() makes the filtered frame independent of the original, so later
# column assignments do not raise SettingWithCopyWarning.
df = df[within_limits].copy()

In [None]:
# Rescale the numeric columns to the 0-1 range with min-max scaling.
# The fitted scaler stays available as `scaler` for later reuse.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values