In [None]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the dataset
# The CSV location can be overridden through the TITANIC_CSV environment variable so
# the notebook is runnable on machines other than the author's. When the variable is
# not set, the original hard-coded location is used, so existing runs are unaffected.
default_data_path = r"C:\Users\subra\Desktop\Projects\Codsoft\Titanic_Survival\data\titanic.csv"
data_path = os.environ.get("TITANIC_CSV", default_data_path)
df = pd.read_csv(data_path)

In [None]:
# Display basic information about the dataset
print("First five rows of the dataset:")
print(df.head())

print("\nSummary statistics of the dataset:")
print(df.describe(include='all'))

print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() appended a stray "None" line after the report. Call it directly.
df.info()

In [None]:
# Count the null entries in every column, then report the per-column totals
missing_per_column = df.isnull().sum()
print("\nMissing values in the dataset:")
print(missing_per_column)


In [None]:
# Handle missing values
# The filled Series are assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment, which emits a
# FutureWarning on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())  # Fill missing ages with the median
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # Fill missing embarked with mode
# Drop Cabin due to too many missing values. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because 'Cabin' is already gone.
df = df.drop(columns=['Cabin'], errors='ignore')


In [None]:
# Exploratory Data Analysis (EDA)
# Fraction of rows for each value of `Survived` (value_counts normalised to proportions)
survival_rate = df['Survived'].value_counts(normalize=True)
print("\nSurvival Rate:")
print(survival_rate)

# Bar chart of the row count for each `Survived` value
ax = sns.countplot(data=df, x='Survived', palette='viridis')
ax.set(
    title="Survival Count",
    xlabel="Survived (0 = No, 1 = Yes)",
    ylabel="Count",
)
plt.show()


In [None]:
# Gender vs Survival: counts per `Survived` value, with one bar per `Sex` category
ax = sns.countplot(data=df, x='Survived', hue='Sex', palette='Set2')
ax.set(
    title="Survival by Gender",
    xlabel="Survived (0 = No, 1 = Yes)",
    ylabel="Count",
)
plt.show()


In [None]:
# Class vs Survival: counts per `Survived` value, with one bar per `Pclass` value
ax = sns.countplot(data=df, x='Survived', hue='Pclass', palette='coolwarm')
ax.set(
    title="Survival by Passenger Class",
    xlabel="Survived (0 = No, 1 = Yes)",
    ylabel="Count",
)
plt.show()


In [None]:
# Age Distribution: 30-bin histogram of `Age` with a KDE curve overlaid
ax = sns.histplot(df['Age'], bins=30, kde=True, color='blue')
ax.set(
    title="Age Distribution",
    xlabel="Age",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Feature Engineering
# Convert categorical columns to numerical codes.
# Series.map() turns every value that is not a key of the mapping into NaN, so running
# this cell a second time (when the columns already hold 0/1/2) used to wipe both
# columns. Only columns that are still non-numeric are mapped, which makes the cell
# safe to re-run while leaving the first execution unchanged.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
for column, codes in (('Sex', sex_codes), ('Embarked', embarked_codes)):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

In [None]:
# Drop unnecessary columns
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because 'Name' and 'Ticket' have already been removed.
# NOTE(review): if the CSV follows the standard Kaggle Titanic schema, a `PassengerId`
# column is still present here and will be fed to the model as a feature — confirm
# against the data and drop it as well if so.
df = df.drop(columns=['Name', 'Ticket'], errors='ignore')


In [None]:
# Separate the target column from the remaining feature columns
y = df['Survived']                  # Target
X = df.drop(columns=['Survived'])   # Features: every column except the target


In [None]:
# Train-test split: hold out 30% of the rows for testing, with a fixed seed so the
# same partition is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Fit a seeded Random Forest classifier on the training split
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)


In [None]:
# Predict labels for the held-out test features
y_pred = model.predict(X=X_test)


In [None]:
# Evaluate the model: compute both metrics on the test split first, then print them
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print("\nAccuracy of the model:")
print(accuracy)

print("\nClassification Report:")
print(report)



In [None]:
# Confusion Matrix: annotated heatmap with actual labels on the y-axis and predicted
# labels on the x-axis
class_labels = ['Did Not Survive', 'Survived']
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()
