In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import streamlit as st  # For deploying the model

# Set visualization style
sns.set(style="whitegrid")

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")


In [6]:
# Load the Titanic datasets
df_train = pd.read_csv("Titanic_train.csv")
df_test = pd.read_csv("Titanic_test.csv")

In [8]:
# Display dataset structure
print("Train Dataset Info:")
df_train.info()

Train Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
# Display first few rows
print("\nFirst 5 rows of dataset:")
display(df_train.head())


First 5 rows of dataset:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Check for missing values
print("\nMissing Values:")
print(df_train.isnull().sum())


Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [14]:
# Handle missing values in 'Age' column using median imputation
imputer = SimpleImputer(strategy='median')
df_train['Age'] = imputer.fit_transform(df_train[['Age']])
df_test['Age'] = imputer.transform(df_test[['Age']])

# Drop unnecessary columns ('Cabin' has too many missing values, 'Name' & 'Ticket' are irrelevant)
df_train.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)
df_test.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)

# Encode categorical variables (Sex)
le = LabelEncoder()
df_train['Sex'] = le.fit_transform(df_train['Sex'])
df_test['Sex'] = le.transform(df_test['Sex'])

# One-hot encode 'Embarked' column
df_train = pd.get_dummies(df_train, columns=['Embarked'], drop_first=True)
df_test = pd.get_dummies(df_test, columns=['Embarked'], drop_first=True)

# Split features and target variable
X = df_train.drop(columns=['Survived'])  # Features
y = df_train['Survived']  # Target variable

# Train-test split (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Model training completed
print("Logistic Regression model has been trained successfully!")


Logistic Regression model has been trained successfully!


In [18]:
# Make predictions
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]  # Probabilities for ROC-AUC

# Compute evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)

# Display results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


Accuracy: 0.8045
Precision: 0.8000
Recall: 0.7027
F1-score: 0.7482
ROC-AUC: 0.8676


In [20]:
# Make predictions
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]  # Probabilities for ROC-AUC

# Compute evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)

# Display results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


Accuracy: 0.8045
Precision: 0.8000
Recall: 0.7027
F1-score: 0.7482
ROC-AUC: 0.8676


In [22]:
# Extract feature coefficients
coefficients = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_[0]})

# Sort by importance
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

# Display feature importance
display(coefficients)


Unnamed: 0,Feature,Coefficient
8,Embarked_S,0.236209
6,Fare,0.006601
0,PassengerId,0.000497
7,Embarked_Q,-0.023883
3,Age,-0.025498
5,Parch,-0.120931
4,SibSp,-0.301524
1,Pclass,-0.626322
2,Sex,-2.49177


In [24]:
import pickle

# Save the trained model
with open("model.pkl", "wb") as file:
    pickle.dump(model, file)


In [27]:
print(X.shape[1])  # Number of features


9


In [2]:
# Q1: What is the difference between precision and recall?

print("Precision vs. Recall:\n"
      "- Precision measures how many of the predicted positive cases are actually positive.\n"
      "- Formula: Precision = True Positives / (True Positives + False Positives).\n\n"
      "- Recall (Sensitivity) measures how many actual positive cases were correctly identified.\n"
      "- Formula: Recall = True Positives / (True Positives + False Negatives).\n\n"
      "Key Difference:\n"
      "- Precision focuses on reducing false positives, useful in cases like spam detection.\n"
      "- Recall focuses on reducing false negatives, critical in medical diagnoses.")


Precision vs. Recall:
- Precision measures how many of the predicted positive cases are actually positive.
- Formula: Precision = True Positives / (True Positives + False Positives).

- Recall (Sensitivity) measures how many actual positive cases were correctly identified.
- Formula: Recall = True Positives / (True Positives + False Negatives).

Key Difference:
- Precision focuses on reducing false positives, useful in cases like spam detection.
- Recall focuses on reducing false negatives, critical in medical diagnoses.


In [4]:
# Q2: What is cross-validation, and why is it important in binary classification?

print("Cross-Validation:\n"
      "- Cross-validation is a technique to assess how well a model generalizes to unseen data.\n"
      "- The dataset is split into multiple training and testing subsets, and the model is trained and validated iteratively.\n\n"
      "Importance in Binary Classification:\n"
      "- Helps prevent overfitting by ensuring the model performs well on different data splits.\n"
      "- Provides a more reliable estimate of model performance compared to a single train-test split.\n"
      "- Common methods: k-Fold Cross-Validation, Stratified k-Fold (for imbalanced datasets), and Leave-One-Out Cross-Validation.")


Cross-Validation:
- Cross-validation is a technique to assess how well a model generalizes to unseen data.
- The dataset is split into multiple training and testing subsets, and the model is trained and validated iteratively.

Importance in Binary Classification:
- Helps prevent overfitting by ensuring the model performs well on different data splits.
- Provides a more reliable estimate of model performance compared to a single train-test split.
- Common methods: k-Fold Cross-Validation, Stratified k-Fold (for imbalanced datasets), and Leave-One-Out Cross-Validation.
