# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (CSV file)
data = pd.read_csv("titanic.csv")

# Display first 5 rows to understand data structure
print("Sample data:")
print(data.head())

# Data overview: types, missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nData info:")
data.info()

# Summary statistics for numerical columns
print("\nSummary statistics:")
print(data.describe())

# Check for missing values
print("\nMissing values per column:")
print(data.isnull().sum())

# Fill missing values for 'Age' with median age.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# is deprecated in pandas 2.x and, with Copy-on-Write enabled, modifies only
# a temporary object and leaves the DataFrame unchanged.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Fill missing 'Embarked' values with most frequent port
# (mode() returns a Series; its first entry is used as the fill value).
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Drop 'Cabin' column due to too many missing values
data.drop(columns=['Cabin'], inplace=True)

# Verify cleaning
print("\nAfter cleaning missing data:")
print(data.isnull().sum())

# Overall survival rate: the mean of the 'Survived' column.
survived_column = data['Survived']
survival_rate = survived_column.mean()
print("\nOverall survival rate: {:.2%}".format(survival_rate))

# Mean of 'Survived' within each 'Sex' group.
survival_by_gender = data.groupby('Sex')['Survived'].mean()
print("\nSurvival rates by gender:")
print(survival_by_gender)

# Plot 1: count of rows per 'Sex', split by 'Survived'.
gender_axes = sns.countplot(data=data, x='Sex', hue='Survived')
gender_axes.set_title('Survival Counts by Gender')
plt.show()

# Plot 2: stacked 'Age' histogram with KDE overlay, coloured by 'Survived'.
age_axes = sns.histplot(
    data=data,
    x='Age',
    hue='Survived',
    multiple='stack',
    kde=True,
)
age_axes.set_title('Age Distribution by Survival')
plt.show()

# Plot 3: bar plot of 'Survived' against 'Pclass'.
class_axes = sns.barplot(data=data, x='Pclass', y='Survived')
class_axes.set_title('Survival Rate by Passenger Class')
plt.show()
