<a href="https://colab.research.google.com/github/Anwerzain/NeuroNexus/blob/main/%F0%9F%93%88_Project_Titanic_Survival_Prediction_(Machine_Learning_Beginner_Project).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                        Step 1: Import Libraries
#<----------------------------------------------------------------------------------------------------------------->

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#<----------------------------------------------------------------------------------------------------------------->
#                                   Machine-learning imports (scikit-learn)
#<----------------------------------------------------------------------------------------------------------------->

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

#<----------------------------------------------------------------------------------------------------------------->
#                                       Step 2: Load the Titanic dataset
#<----------------------------------------------------------------------------------------------------------------->

# The CSV is read straight from this URL, so the cell needs network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

#<----------------------------------------------------------------------------------------------------------------->
#                                   Step 3: Take a first look at the data
#<----------------------------------------------------------------------------------------------------------------->

# Last expression of the cell, so the notebook renders the first five rows.
df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                           🔍 Explanation:
#<----------------------------------------------------------------------------------------------------------------->
#                                           Line	  Explanation
#<----------------------------------------------------------------------------------------------------------------->

# pandas as pd                     <--------------------->   	Data ko load, clean, explore karne ke liye — Excel ki tarah
# numpy as np                      <--------------------->  	Maths/array ka kaam
# seaborn, matplotlib.pyplot	     <--------------------->   	Beautiful graphs banane ke liye
# train_test_split	               <--------------------->   	Model train/test ke liye data divide karta hai
# LogisticRegression               <--------------------->   	Humara ML algorithm (predict karega survival)
# accuracy_score, confusion_matrix <--------------------->   	Model ki performance check karne ke liye


In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                          🔧 Step 4: Data cleaning - count missing values per column
#<----------------------------------------------------------------------------------------------------------------->
# isna() is the same method as isnull(): it marks every empty cell as True.
# Summing down the rows (axis=0) then gives one missing-value count per column.
df.isna().sum(axis=0)

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                            🔧 Step 4: Data Cleaning & Missing Values Check
#<----------------------------------------------------------------------------------------------------------------->
# Check for null (missing) values
# isnull() (alias: isna()) → marks which cells are empty (null); .sum() then counts them per column
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177   👈 has missing values
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687   👈 has missing values
# Embarked         2   👈 has missing values


In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                             🔧 Step 5: Drop columns that are not used by the model
#<----------------------------------------------------------------------------------------------------------------->
# Columns removed before modelling. Cabin has 687 missing values (see the null check);
# Name and Ticket are text columns that are not encoded anywhere in this notebook.
UNUSED_COLUMNS = ['Cabin', 'Name', 'Ticket']

# Rebind `df` to the result instead of using inplace=True, and ignore columns that are
# already gone. This keeps the cell re-runnable: a second run of
# drop(..., inplace=True) raised KeyError because the columns no longer existed.
df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')

#                                                🔍 Explanation:
#<----------------------------------------------------------------------------------------------------------------->
# drop(columns=[...])  → removes the listed columns and returns a new DataFrame
# errors='ignore'      → listed columns that are not present are skipped instead of raising
# df = df.drop(...)    → stores the result back in df so later cells see the change

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                         🔧 Step 6: Fill missing values
#<----------------------------------------------------------------------------------------------------------------->
# Work out both replacement values first, then apply them column by column.

# Age is numeric: use the median of the known ages.
age_fill_value = df['Age'].median()

# Embarked is categorical: use the most frequent value (first entry returned by mode()).
embarked_fill_value = df['Embarked'].mode()[0]

df['Age'] = df['Age'].fillna(age_fill_value)
df['Embarked'] = df['Embarked'].fillna(embarked_fill_value)

#<----------------------------------------------------------------------------------------------------------------->
#                                                 🔍 Explanation:
#<----------------------------------------------------------------------------------------------------------------->
# Column      Filled with   Why?
# Age         Median        Less affected by outliers than the mean
# Embarked    Mode          Fill with the most common value


In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                🔧 Step 7: Convert categorical columns to numbers
#<----------------------------------------------------------------------------------------------------------------->
#                                     Sex:      male → 0, female → 1
#                                     Embarked: S → 0, C → 1, Q → 2
#<----------------------------------------------------------------------------------------------------------------->
SEX_CODES = {'male': 0, 'female': 1}
# NOTE(review): these integer codes put an order S < C < Q on a column that has no natural
# order; one-hot encoding may suit a linear model better - left unchanged here.
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

for column, codes in (('Sex', SEX_CODES), ('Embarked', EMBARKED_CODES)):
    # Encode only while the column still holds the original text labels.
    # Calling .map() a second time on already-encoded numbers finds no matching key
    # and would silently turn the whole column into NaN, so a re-run is skipped.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                              Now we train the model
#<----------------------------------------------------------------------------------------------------------------->

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                            🔹 Step 1: Feature & Label Split
#<----------------------------------------------------------------------------------------------------------------->
# X = input features, y = the label the model should predict.
# 'Survived' is removed from X because it is the target itself.
# 'PassengerId' is removed as well: in the data preview it is just a running row number
# (1, 2, 3, ...), so it describes the row rather than the passenger and should not be
# offered to the model as a feature.
X = df.drop(['Survived', 'PassengerId'], axis=1)
y = df['Survived']

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                             🔍 Explanation:
#<----------------------------------------------------------------------------------------------------------------->
#           Part	                                                       Meaning
#<----------------------------------------------------------------------------------------------------------------->
# df.drop('Survived', axis=1)	<--------------------->   	Ye Survived column ko hata dega, kyunki hum use predict karna chahte hain.
# axis=1	                    <--------------------->   	1 matlab column, agar axis=0 hota toh rows ke liye hota
# X = ...	                    <--------------------->   	Ye saare input features hain (jaise age, fare, sex, etc.)
# y = df['Survived']	        <--------------------->   	Ye hamara target/output hai, jisko model predict karega (0 = not survived, 1 = survived)


In [None]:
# 🤔 Q: Kyu kiya split?
# Model ko batana padta hai ki "Ye inputs hain (X), aur ye output hai (y)". Tabhi model seekh payega.

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                      🔹 Step 2: Train-Test Split
#<----------------------------------------------------------------------------------------------------------------->
# train_test_split is already imported in the first cell of the notebook.

TEST_FRACTION = 0.2   # share of the rows held out for testing
SPLIT_SEED = 42       # fixed seed so the same shuffle is produced on every run

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                             🔍 Explanation:
#<----------------------------------------------------------------------------------------------------------------->
#           Part	                                                       Meaning
#<----------------------------------------------------------------------------------------------------------------->
# train_test_split()  	<--------------------->   	Ye function data ko 2 parts me divide karta hai: training & testing
# X_train, X_test	      <--------------------->   	Input features ka 80% training ke liye, 20% testing ke liye
# y_train, y_test	    	<--------------------->   	Labels (Survived) ka 80% training ke liye, 20% testing ke liye
# test_size=0.2	      	<--------------------->   	Matlab 20% test data
# random_state=42	    	<--------------------->   	Ye fixed random seed hai — same shuffle har baar milega (repeatability ke liye)
#<----------------------------------------------------------------------------------------------------------------->
# 🤔 Q: Kyu split karte hain?
# Training data = Model ko sikhaate hain
# Testing data = Model ko test karte hain ki woh naye data pe kaise perform karega

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                           🔹 Step 3: Train a Classifier (Logistic Regression)
#<----------------------------------------------------------------------------------------------------------------->
# LogisticRegression is already imported in the first cell of the notebook.

# With the default iteration limit the solver stopped early on these unscaled features
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the fitted coefficients were not final.
# A larger max_iter gives the solver room to converge; scaling the features would be the
# alternative remedy suggested by the same warning.
MAX_ITER = 1000

model = LogisticRegression(max_iter=MAX_ITER)
# Last expression of the cell: fits on the training split and displays the estimator.
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                             🔍 Explanation:
#<----------------------------------------------------------------------------------------------------------------->
#           Part	                                                       Meaning
#<----------------------------------------------------------------------------------------------------------------->
# LogisticRegression()  	<--------------------->   		Ye ek machine learning algorithm hai jo 0 ya 1 jaise output ke liye use hota hai
# model.fit()	          	<--------------------->   	Model ko sikhaate hain using training data (X_train, y_train)
#<----------------------------------------------------------------------------------------------------------------->

# 🤔 Q: Logistic Regression kyu?
# Kyunki Titanic dataset me hamare paas binary outcome hai: Survived (1) or Not Survived (0)
# Logistic Regression is ideal for this kind of problem

In [None]:
# 💬 Example Samajhne ke liye:
# Agar tu kisi aadmi ki age, fare, sex jaane, to model predict karega:

# "Yeh aadmi Titanic me bacha hoga ya nahi?" 💡

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                                     📊 Step: Model Accuracy Check + Prediction
#<----------------------------------------------------------------------------------------------------------------->
# Yahan hum dekhenge ki model kitna sahi predict kar raha hai test data pe.

In [None]:
# accuracy_score is already imported in the first cell of the notebook.

# Predicted label for every row of the held-out test features.
y_pred = model.predict(X_test)

# Compare the predictions with the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"📈 Model Accuracy: {accuracy:.2f}")


📈 Model Accuracy: 0.80


In [None]:
# 🔍 Kya ho raha hai is code me?
# Code	Kya karta hai
# model.predict(X_test) <---------------------> 	Test data pe prediction karta hai
# accuracy_score(...)	  <---------------------> Batata hai ki predictions kitne sahi hain
# print(...)	          <---------------------> Final result screen pe dikhata hai

In [None]:
#<----------------------------------------------------------------------------------------------------------------->
#                              🔍 Step: Confusion Matrix + Classification Report
#<----------------------------------------------------------------------------------------------------------------->
# confusion_matrix is already imported in the first cell; only classification_report is new.
from sklearn.metrics import classification_report

# Table of true labels against predicted labels on the test set.
cm = confusion_matrix(y_test, y_pred)
print("🔢 Confusion Matrix:", cm, sep="\n")

# Text summary with precision, recall, f1-score and support.
cr = classification_report(y_test, y_pred)
print("\n📋 Classification Report:", cr, sep="\n")


🔢 Confusion Matrix:
[[90 15]
 [21 53]]

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# 📖 What do these outputs mean?
#<----------------------------------------------------------------------------------------------------------------->
# Confusion Matrix:
#<----------------------------------------------------------------------------------------------------------------->
# [[TN  FP]
#  [FN  TP]]
#<----------------------------------------------------------------------------------------------------------------->
#       Term	                                      Meaning
#<----------------------------------------------------------------------------------------------------------------->
# TN (True Negative)	<------------------>  Actually died, model ne sahi kaha died
# TP (True Positive)	<------------------>  Actually survived, model ne sahi kaha survived
# FP (False Positive)	<------------------>  Actually died, model ne galat kaha survived
# FN (False Negative)	<------------------>  Actually survived, model ne galat kaha died

#<----------------------------------------------------------------------------------------------------------------->
#                                            Classification Report:
#<----------------------------------------------------------------------------------------------------------------->
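# For each class (0 = died, 1 = survived) the report shows:
# Precision: of the passengers predicted to be in this class, the fraction that really were
# Recall: of the passengers actually in this class, the fraction the model caught
# F1-score: harmonic mean of precision and recall (one number that balances the two)
# Support: how many test passengers actually belong to this class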
#<----------------------------------------------------------------------------------------------------------------->