# Titanic Data Cleaning & Preprocessing
This notebook completes Task 1 of the AI & ML Internship.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load the dataset
# The path is relative, so it is resolved against the current working
# directory of the kernel running this notebook.
csv_path = "Titanic-Dataset.csv"
df = pd.read_csv(csv_path)
# Trailing expression: the notebook renders the first five rows as a preview.
df.head(n=5)

In [None]:
# Basic Info
# Print a structural summary of the frame followed by the per-column count
# of missing values.
print("Initial Dataset Info:\n")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would add a stray
# "None" line after the summary.
df.info()
print("\nMissing Values:\n")
print(df.isnull().sum())

In [None]:
# Handle missing values
# Age gaps are filled with the column mean and Embarked gaps with the most
# frequent value. The filled Series is assigned back to the column rather
# than calling fillna(inplace=True) on df[...]: that form is chained
# assignment through an inplace method, which raises a FutureWarning on
# pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop columns with too many missing values
# errors='ignore' silently skips whichever of these columns is absent, which
# replaces the separate "if column in df.columns" guards.
df.drop(columns=['Cabin', 'Body'], errors='ignore', inplace=True)

In [None]:
# Encode categorical variables
# Sex: replace the string labels with the integer codes learned by a
# LabelEncoder fitted on the column itself.
le = LabelEncoder()
le.fit(df['Sex'])
df['Sex'] = le.transform(df['Sex'])

# Embarked: expand into indicator columns; drop_first=True omits the first
# category level so the remaining indicators are not redundant.
df = pd.get_dummies(
    df,
    columns=['Embarked'],
    drop_first=True,
)

In [None]:
# Standardize numerical features
# Fit a StandardScaler on Age and Fare, then overwrite both columns with
# their scaled values.
# NOTE(review): the scaler is fitted on every row currently in df, including
# the high-Fare rows that the later "Remove outliers in Fare" cell filters
# out, so the saved columns are scaled with statistics that include them.
scaler = StandardScaler()
scaler.fit(df[['Age', 'Fare']])
df[['Age', 'Fare']] = scaler.transform(df[['Age', 'Fare']])

In [None]:
# Visualize outliers
# Side-by-side boxplots of Age and Fare. Both columns were standardized in
# the preceding cell, so the plotted values are the scaled ones.
ax = sns.boxplot(data=df[['Age', 'Fare']])
ax.set_title("Boxplot for Age and Fare")
plt.show()

In [None]:
# Remove outliers in Fare
# Keep only the rows whose Fare is strictly below the column's 99th
# percentile; rows at or above that cutoff are discarded.
fare_limit = df['Fare'].quantile(q=0.99)
below_limit = df['Fare'] < fare_limit
df = df[below_limit]

In [None]:
# Save cleaned dataset
# Write the processed frame to a CSV in the working directory;
# index=False keeps the row index out of the file.
output_path = "Cleaned_Titanic.csv"
df.to_csv(output_path, index=False)
print("Cleaned dataset saved as 'Cleaned_Titanic.csv'")