# Titanic Dataset – EDA (Task Code: CST_DS_02)

This notebook performs data cleaning and exploratory data analysis (EDA) on the Titanic dataset as part of my Data Science Internship at SkillCraft Technology.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style globally so every plot below shares the same look.
sns.set(style='whitegrid')

In [None]:
# Read the passenger data; titanic.csv is expected in the same folder as this notebook.
csv_path = 'titanic.csv'
df = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
df.head()

## Basic Info & Missing Values

In [None]:
# Print the per-column summary produced by DataFrame.info().
df.info()

# Count the missing entries in each column and show the totals as the cell output.
missing_per_column = df.isna().sum()
missing_per_column

## Data Cleaning

In [None]:
# Clean the frame in one chained expression and rebind `df` to the result.
#
# Why not `df['Age'].fillna(..., inplace=True)`: that call operates on the
# intermediate object returned by `df['Age']`. pandas 2.x emits a FutureWarning
# for it, and with Copy-on-Write enabled the parent frame is left unchanged,
# so the missing values would silently survive.
df = (
    df
    .fillna({
        'Age': df['Age'].median(),              # missing Age -> column median
        'Embarked': df['Embarked'].mode()[0],   # missing Embarked -> most frequent value
    })
    # Drop 'Cabin' due to too many missing values. errors='ignore' keeps the
    # cell re-runnable once the column has already been removed.
    .drop(columns=['Cabin'], errors='ignore')
)

## Summary Statistics

In [None]:
# Display descriptive statistics for the current state of df as the cell output.
df.describe()

## Univariate Analysis

In [None]:
# Bar chart: number of rows for each value of 'Survived'.
survival_ax = sns.countplot(x='Survived', data=df)
survival_ax.set_title('Survival Count')
plt.show()

In [None]:
# Bar chart: number of rows for each value of 'Pclass'.
pclass_ax = sns.countplot(x='Pclass', data=df)
pclass_ax.set_title('Passenger Class Distribution')
plt.show()

In [None]:
# Histogram of the 'Age' column: 30 bins, bars outlined in black.
age_ax = df['Age'].plot.hist(bins=30, edgecolor='black')
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
plt.show()

## Bivariate Analysis

In [None]:
# Grouped bar chart: counts per 'Sex', split by 'Survived'.
gender_ax = sns.countplot(x='Sex', hue='Survived', data=df)
gender_ax.set_title('Survival by Gender')
plt.show()

In [None]:
# Grouped bar chart: counts per 'Pclass', split by 'Survived'.
class_ax = sns.countplot(x='Pclass', hue='Survived', data=df)
class_ax.set_title('Survival by Class')
plt.show()

In [None]:
# Box plot: distribution of 'Age' for each value of 'Survived'.
age_box_ax = sns.boxplot(x='Survived', y='Age', data=df)
age_box_ax.set_title('Age vs Survival')
plt.show()

## Correlation Heatmap

In [None]:
# Correlate only the numeric columns. On pandas >= 2.0, DataFrame.corr() no
# longer drops non-numeric columns by default and raises ValueError when it
# meets values it cannot convert to float (df presumably still holds text
# columns at this point, e.g. 'Sex' / 'Embarked' -- verify against df.info()).
# select_dtypes works the same way on older pandas releases too.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 6))
# annot=True writes each coefficient inside its cell of the heatmap.
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()