# Titanic Dataset - Data Cleaning & Preprocessing
*Date: 2025-06-23*

This notebook performs data cleaning and preprocessing on the Titanic dataset as part of an AI & ML internship task.

## Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from google.colab import files

## Step 2: Upload and Load the Dataset

In [None]:
# Open Colab's upload dialog. files.upload() returns a dict keyed by the name
# each file was stored under in the working directory.
uploaded = files.upload()  # Upload Titanic-Dataset.csv

# Read the name Colab actually stored rather than assuming it: the upload may
# have a different name (or be renamed on a repeat upload -- TODO confirm the
# exact renaming rule). If nothing was uploaded, fall back to the expected
# default name so a file already present in the session still loads.
csv_name = next(iter(uploaded), 'Titanic-Dataset.csv')
df = pd.read_csv(csv_name)

# Preview the first rows to confirm the load worked.
df.head()

## Step 3: Explore Dataset

In [None]:
# Print the column dtypes and non-null counts.
df.info()

# Count the missing values in each column; as the cell's last expression,
# this Series is what the notebook displays.
missing_per_column = df.isna().sum()
missing_per_column

## Step 4: Handle Missing Values

In [None]:
# Assign the filled columns back to the frame instead of calling
# fillna(inplace=True) on df['col']. The chained in-place form operates on an
# intermediate Series: pandas 2.x warns about it, and with Copy-on-Write it
# no longer updates df at all.

# Age: fill gaps with the column median.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

# Embarked: fill gaps with the most frequent value (mode() returns a Series,
# so [0] picks its first entry).
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

# Cabin is dropped entirely rather than imputed.
df = df.drop(columns='Cabin')

## Step 5: Encode Categorical Features

In [None]:
# Integer codes for Sex. Series.map with a dict turns any value that is not a
# key of the dict into NaN.
sex_codes = {'male': 0, 'female': 1}

# Replace Sex with its codes, then one-hot encode Embarked. drop_first=True
# omits the first category level, leaving k-1 indicator columns.
df = pd.get_dummies(
    df.assign(Sex=df['Sex'].map(sex_codes)),
    columns=['Embarked'],
    drop_first=True,
)

## Step 6: Standardize Numerical Features (Z-score Scaling)

In [None]:
# Columns to standardize (zero mean, unit variance).
numeric_cols = ['Age', 'Fare']

# Fit the scaler on these columns, then overwrite them with the scaled values.
# Note: this runs before the IQR filter in Step 8, so the fitted mean and
# standard deviation include the rows that are later removed as outliers.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

## Step 7: Visualize Outliers

In [None]:
# Draw one boxplot per numeric column, each as its own figure. The columns
# were standardized in Step 6, so the x-axis values are scaled units rather
# than the raw ages or fares.
for column in ('Age', 'Fare'):
    sns.boxplot(x=df[column])
    plt.title(f"Boxplot - {column}")
    plt.show()

## Step 8: Remove Outliers Using IQR

In [None]:
# First and third quartiles of Fare, computed in one call.
Q1, Q3 = df['Fare'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows whose Fare lies within the fences; between() is
# inclusive on both ends by default.
df = df[df['Fare'].between(lower_fence, upper_fence)]

## Step 9: Save Cleaned Data (Optional)

In [None]:
# Write the cleaned frame to CSV without the index column, then hand the
# file to Colab's download helper.
output_csv = 'cleaned_titanic.csv'
df.to_csv(output_csv, index=False)
files.download(output_csv)