# Titanic Survival Analysis 🚢

### Objective: Perform Exploratory Data Analysis (EDA) & Predictive Analysis

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Read the passenger records from the "Sheet1" worksheet of the Excel workbook
file_path = "Titanic_Data_Set.xlsx"
df = pd.read_excel(io=file_path, sheet_name="Sheet1")

# Preview the first five rows as a quick sanity check of the load
df.head(5)


### Checking Missing Values & Summary Statistics

In [None]:

# Count the missing (NaN) entries in every column
df.isna().sum()


In [None]:

# Descriptive statistics for the columns pandas selects by default
# (include=None is the default selection, spelled out here for clarity)
df.describe(include=None)


### Data Cleaning

In [None]:

# Fill missing Age values with the median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df["Age"] selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and no longer updates df once
# Copy-on-Write is active (the default from pandas 3.0).
df["Age"] = df["Age"].fillna(df["Age"].median())

# Fill missing Fare values with the median (same assignment pattern as Age)
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# Drop Cabin column due to too many missing values.
# errors="ignore" lets this cell be re-run after Cabin has already been removed.
df.drop(columns=["Cabin"], inplace=True, errors="ignore")

df.head()


### Visualizing Survival Rate by Gender

In [None]:

sns.set_style("whitegrid")

# Encode 'Sex' as 0 (male) / 1 (female).
# Series.map turns every value that is missing from the mapping into NaN, so
# the identity entries (0 -> 0, 1 -> 1) keep already-encoded values intact and
# make this cell safe to run more than once.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1, 0: 0, 1: 1})

# Readable labels for the plot only; df["Sex"] itself stays numeric
sex_labels = df["Sex"].map({0: "Male", 1: "Female"})

# Plot survival rate by gender (bar height is the mean of Survived per group)
plt.figure(figsize=(6, 4))
sns.barplot(
    x=sex_labels,
    y=df["Survived"],
    order=["Male", "Female"],  # same left-to-right order as the 0/1 encoding
    palette="coolwarm",
)
plt.title("Survival Rate by Gender")
plt.ylabel("Survival Probability")
plt.show()


### Survival Rate by Ticket Class

In [None]:

# Bar chart of Survived aggregated per passenger class
# (seaborn's barplot uses the mean as its default estimator)
plt.figure(figsize=(6, 4))
ax = sns.barplot(data=df, x="Pclass", y="Survived", palette="viridis")
ax.set_title("Survival Rate by Ticket Class")
ax.set_xlabel("Passenger Class")
ax.set_ylabel("Survival Probability")
plt.show()


### Age Distribution & Survival

In [None]:

# Plot age distribution with survival overlay
plt.figure(figsize=(8, 5))
sns.histplot(df, x="Age", hue="Survived", bins=30, kde=True, palette={0: "red", 1: "green"})
plt.title("Age Distribution by Survival")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()


### Feature Importance Analysis

In [None]:

# Encode categorical variables on a copy so df itself is left untouched
df_encoded = df.copy()

# Embarked is not imputed anywhere earlier in this notebook. If it contains
# missing values, LabelEncoder either fails on the mixed str/NaN column or
# encodes NaN as an extra category (depending on the scikit-learn version),
# so fill the gaps with the most frequent port first.
embarked = df_encoded["Embarked"]
if embarked.isna().any():
    embarked = embarked.fillna(embarked.mode().iloc[0])

df_encoded["Sex"] = LabelEncoder().fit_transform(df_encoded["Sex"])
df_encoded["Embarked"] = LabelEncoder().fit_transform(embarked)

# Select features and target variable
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = df_encoded[features]
y = df_encoded["Survived"]

# Train a Random Forest model on the full dataset; random_state makes the
# result reproducible. There is no train/test split here because the model is
# only used to rank features, not to estimate predictive accuracy.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Get feature importances, largest first.
# NOTE: these are impurity-based importances, which tend to favour
# high-cardinality numeric features such as Age and Fare.
feature_importance = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

# Plot feature importance
plt.figure(figsize=(8, 5))
sns.barplot(x=feature_importance.values, y=feature_importance.index, palette="coolwarm")
plt.title("Feature Importance in Predicting Survival")
plt.xlabel("Importance Score")
plt.show()
