In [1]:
import pandas as pd

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [5]:
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session. Be aware this also hides deprecation and
# future-behaviour warnings raised by the libraries used below.
warnings.filterwarnings(action='ignore')

# Confirmation that the setup cells ran to completion.
print("Libraries imported Success")

Libraries imported Success


In [6]:
# Read the passenger table from the CSV sitting next to the notebook.
# The path is relative, so the kernel's working directory must contain it.
df = pd.read_csv('Titanic-Dataset.csv')

# Report the (rows, columns) shape, then leave the preview as the cell's
# final expression so it is rendered with the rich DataFrame display.
print(f"Dataset loaded! Shape: {df.shape}")
print("\nFirst 5 rows:")
df.head(5)


Dataset loaded! Shape: (891, 12)

First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Overall outcome distribution of the 'Survived' column.
survived_col = df['Survived']

# Absolute number of passengers per outcome value.
outcome_counts = survived_col.value_counts()
print("Survival count:")
print(outcome_counts)

# The same tally as a share of all passengers, scaled from fraction to percent.
outcome_pct = survived_col.value_counts(normalize=True) * 100
print("\nSurvival percentage:")
print(outcome_pct)


Survival count:
Survived
0    549
1    342
Name: count, dtype: int64

Survival percentage:
Survived
0    61.616162
1    38.383838
Name: proportion, dtype: float64


In [11]:
# Survival broken down by the 'Sex' column.
# Compute the three summaries first, then print them in order.
by_sex = df.groupby('Sex')

gender_survival = by_sex['Survived'].sum()      # survivors per gender (sum of the 0/1 flag)
gender_total = by_sex.size()                    # all passengers per gender
survival_rate = gender_survival / gender_total  # fraction of each gender that survived

print("Survival by gender:")
print(gender_survival)

print("\nTotal people by gender:")
print(gender_total)

print("\nSurvival rate by gender:")
print(survival_rate)


Survival by gender:
Sex
female    233
male      109
Name: Survived, dtype: int64

Total people by gender:
Sex
female    314
male      577
dtype: int64

Survival rate by gender:
Sex
female    0.742038
male      0.188908
dtype: float64


In [12]:
# Survival broken down by the 'Pclass' (passenger class) column.
# Compute the three summaries first, then print them in order.
by_class = df.groupby('Pclass')

class_survival = by_class['Survived'].sum()   # survivors per class (sum of the 0/1 flag)
class_total = by_class.size()                 # all passengers per class
class_rate = class_survival / class_total     # fraction of each class that survived

print("Survival by class:")
print(class_survival)

print("\nTotal people by class:")
print(class_total)

print("\nSurvival rate by class:")
print(class_rate)


Survival by class:
Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64

Total people by class:
Pclass
1    216
2    184
3    491
dtype: int64

Survival rate by class:
Pclass
1    0.629630
2    0.472826
3    0.242363
dtype: float64


In [19]:
# Step 1: Prepare the data for machine learning
#
# Builds `data`, a cleaned copy of `df` with a numeric gender column and no
# missing ages. `df` itself is never modified, so this cell can be re-run
# and always produces the same `data`.

# Create a copy to work with
data = df.copy()

# Convert 'male' to 0, 'female' to 1 (any other value in 'Sex' becomes NaN)
data['Sex_num'] = data['Sex'].map({'male': 0, 'female': 1})

# Fill missing ages with average age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the data['Age'] selection: that chained in-place
# form is deprecated in pandas 2.x and, with Copy-on-Write enabled, only
# updates a temporary object, leaving the NaNs in `data`.
# NOTE(review): the mean is taken over every row, i.e. before the later
# train/test split, so test-set ages contribute to the imputed value --
# confirm whether that is acceptable for this analysis.
data['Age'] = data['Age'].fillna(data['Age'].mean())

print("Data prepared!")
print("Sample of our prepared data:")
print(data[['Sex', 'Sex_num', 'Pclass', 'Age', 'Survived']].head())


Data prepared!
Sample of our prepared data:
      Sex  Sex_num  Pclass   Age  Survived
0    male        0       3  22.0         0
1  female        1       1  38.0         1
2  female        1       3  26.0         1
3  female        1       1  35.0         1
4    male        0       3  35.0         0


In [18]:

# Step 2: Select features, hold out a test set, and fit the classifier.
#
# train_test_split and RandomForestClassifier are already imported in the
# import cell near the top of the notebook, so they are not imported again
# here.

# Inputs are the numeric gender flag, passenger class, and (imputed) age.
X = data[['Sex_num', 'Pclass', 'Age']]  # Features (inputs)
y = data['Survived']                     # Target (what we want to predict)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model; random_state is fixed here as well so the
# fitted forest is reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

print(" Model trained successfully!")
print(f"Training data: {len(X_train)} passengers")
print(f"Test data: {len(X_test)} passengers")


 Model trained successfully!
Training data: 712 passengers
Test data: 179 passengers


In [17]:
# Step 3: Score the fitted model on the held-out rows and show a few examples.
#
# accuracy_score is already imported in the import cell near the top of the
# notebook, so it is not imported again here.

# Make predictions on test data
predictions = model.predict(X_test)

# Check accuracy (fraction of test rows whose prediction matches the label)
accuracy = accuracy_score(y_test, predictions)

print(f"Model Accuracy: {accuracy:.1%}")
print(f"This means our model correctly predicts survival {accuracy:.1%} of the time!")

# Show some example predictions
print("\n=== EXAMPLE PREDICTIONS ===")
n_examples = 5
example_rows = X_test.head(n_examples)
example_labels = y_test.head(n_examples)

# itertuples keeps each column's own dtype, so Pclass stays an integer and
# prints as "3". Reading it from X_test.iloc[i] goes through a row Series that
# is upcast to float because Age is float, which printed "3.0".
# zip stops at the shortest input, so a test set with fewer than
# `n_examples` rows no longer raises IndexError.
for row, actual, predicted in zip(example_rows.itertuples(index=False), example_labels, predictions):
    sex = "Female" if row.Sex_num == 1 else "Male"

    result = " CORRECT" if actual == predicted else " WRONG"
    print(f"{sex}, Class {row.Pclass}, Age {row.Age:.0f} → Predicted: {'Survived' if predicted else 'Died'}, Actual: {'Survived' if actual else 'Died'} {result}")


Model Accuracy: 79.3%
This means our model correctly predicts survival 79.3% of the time!

=== EXAMPLE PREDICTIONS ===
Male, Class 3.0, Age 30 → Predicted: Died, Actual: Survived  WRONG
Male, Class 2.0, Age 31 → Predicted: Died, Actual: Died  CORRECT
Male, Class 3.0, Age 20 → Predicted: Died, Actual: Died  CORRECT
Female, Class 2.0, Age 6 → Predicted: Survived, Actual: Survived  CORRECT
Female, Class 3.0, Age 14 → Predicted: Died, Actual: Survived  WRONG
