In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
titanic = pd.read_csv("train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
print(titanic.isna().sum())   # will give a sum of all the Nan values each column holds
print("---------------------------------------")

print(titanic['Age'].isna().sum())   # will give a sum of all the Nan values Age column holds


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
---------------------------------------
177


In [4]:
# Predict -> Survived
# Input -> Sex, Age, (Sibsp, Parch if needed)


# Select features and target
x = titanic[['Sex', 'Age', 'SibSp', 'Parch']].copy()
y = titanic['Survived']

# Convert categorical column to numeric
x['Sex'] = x['Sex'].map({'male': 0, 'female': 1})

# Handle missing values
x['Age'].fillna(x['Age'].median(), inplace=True)

# Split dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
reg = LogisticRegression()
reg.fit(x_train, y_train)


yPred = reg.predict(x_test)

pd.DataFrame({
    'Sex': x_test['Sex'],
    'Age': x_test['Age'],
    'SibSp': x_test['SibSp'],
    'Parch': x_test['Parch'],
    'ACTUAL PREDICTIOIN': y_test,
    'Predicted_Survived': yPred
})



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x['Age'].fillna(x['Age'].median(), inplace=True)


Unnamed: 0,Sex,Age,SibSp,Parch,ACTUAL PREDICTIOIN,Predicted_Survived
709,0,28.0,1,1,1,0
439,0,31.0,0,0,0,0
840,0,20.0,0,0,0,0
720,1,6.0,0,1,1,1
39,1,14.0,1,0,1,1
...,...,...,...,...,...,...
433,0,17.0,0,0,0,0
773,0,28.0,0,0,0,0
25,1,38.0,1,5,1,1
84,1,17.0,0,0,1,1


In [5]:
print(f"ACCURACY : {np.round(accuracy_score(y_test, yPred), 2)*100} %")

ACCURACY : 78.0 %


In [6]:
# since working with four columns gave only 78 % result, now we will be working on more columns

# 'Sex'    → Women had a much higher survival rate than men (very strong predictor)
# 'Age'    → Children and younger passengers had better survival chances than older ones
# 'SibSp'  → Number of siblings/spouses aboard (small families often stayed together, extreme counts less likely to survive)
# 'Parch'  → Number of parents/children aboard (family connections could affect rescue chances)
# 'Pclass' → Passenger class (1st > 2nd > 3rd) — richer passengers had better access to lifeboats
# 'Fare'   → Ticket price — correlates with class and wealth, both influencing survival odds

x = titanic[['Sex', 'Age', 'SibSp', 'Parch', 'Pclass', 'Fare']].copy()
y = titanic['Survived']


# Convert categorical 'Sex' to numeric
x['Sex'] = x['Sex'].map({'male': 0, 'female': 1})

# Handle missing values
# x['Age'].fillna(x['Age'].median(), inplace=True)      # if you write inplace=True, it will make changes to the original dataset
x['Age'] = x['Age'].fillna(x['Age'].median())
x['Fare'] = x['Fare'].fillna(x['Fare'].median())


x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)


# reg = LogisticRegression(max_iter=500)        # It means the maximum number of iterations the optimization algorithm will run while trying to find the best model parameters (weights).
reg = LogisticRegression()
reg.fit(x_train, y_train)


yPred = reg.predict(x_test)


results = pd.DataFrame({
    'Sex': x_test['Sex'],
    'Age': x_test['Age'],
    'SibSp': x_test['SibSp'],
    'Parch': x_test['Parch'],
    'Pclass': x_test['Pclass'],
    'Fare': x_test['Fare'],
    'ACTUAL OUTPUT': y_test,
    'PREDICTED OUTPUT': yPred
}).reset_index(drop=True)

results



Unnamed: 0,Sex,Age,SibSp,Parch,Pclass,Fare,ACTUAL OUTPUT,PREDICTED OUTPUT
0,0,28.0,1,1,3,15.2458,1,0
1,0,31.0,0,0,2,10.5000,0,0
2,0,20.0,0,0,3,7.9250,0,0
3,1,6.0,0,1,2,33.0000,1,1
4,1,14.0,1,0,3,11.2417,1,1
...,...,...,...,...,...,...,...,...
174,0,17.0,0,0,3,7.1250,0,0
175,0,28.0,0,0,3,7.2250,0,0
176,1,38.0,1,5,3,31.3875,1,0
177,1,17.0,0,0,2,10.5000,1,1


In [7]:
print(f"ACCURACY : {np.round(accuracy_score(y_test, yPred), 2)*100} %")

ACCURACY : 81.0 %


In [8]:
print(f"ACCURACY : {accuracy_score(y_test, yPred) * 100:.2f} %")

ACCURACY : 81.01 %
