In [1]:
# Import required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings("ignore")


In [4]:
import pandas as pd

# Titanic passenger list, read straight from the Data Science Dojo
# datasets repository on GitHub (requires network access).
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows
df.head(n=5)




Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# ***Preprocessing***

In [5]:
# Clean the raw frame in one chain. Every step returns a new object instead of
# calling fillna(..., inplace=True) on a column selection: that chained
# in-place pattern emits a FutureWarning in pandas 2.x and no longer updates
# the parent DataFrame once Copy-on-Write is the default (pandas 3.0).
df = (
    df
    # Drop unnecessary columns
    .drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
    .assign(
        # Fill missing values: median for Age, most frequent value for Embarked
        Age=lambda d: d["Age"].fillna(d["Age"].median()),
        Embarked=lambda d: d["Embarked"].fillna(d["Embarked"].mode()[0]),
        # Convert categorical to numeric
        Sex=lambda d: d["Sex"].map({"male": 0, "female": 1}),
    )
    # One-hot encode Embarked; drop_first=True omits the first category
    .pipe(pd.get_dummies, columns=["Embarked"], drop_first=True)
)

# Fail early if anything is still missing (e.g. an unmapped Sex value),
# because LogisticRegression does not accept NaN inputs.
assert not df.isna().any().any(), "unexpected missing values after preprocessing"

df.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,False,True
1,1,1,1,38.0,1,0,71.2833,False,False
2,1,3,1,26.0,0,0,7.925,False,True
3,1,1,1,35.0,1,0,53.1,False,True
4,0,3,0,35.0,0,0,8.05,False,True


In [6]:
# Separate the target column from the feature matrix
y = df["Survived"]
X = df.drop(columns=["Survived"])


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline: logistic regression with scikit-learn's default hyperparameters.
# fit() returns the estimator itself, so the fitted model is bound directly.
default_model = LogisticRegression().fit(X_train, y_train)
y_pred_default = default_model.predict(X_test)

# Score the test-set predictions with each metric
default_results = {
    label: scorer(y_test, y_pred_default)
    for label, scorer in [
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    ]
}

default_results


{'Accuracy': 0.8044692737430168,
 'Precision': 0.7931034482758621,
 'Recall': 0.6666666666666666,
 'F1-Score': 0.7244094488188977}

## Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 4 values of C x 3 values of max_iter, lbfgs solver only
param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "max_iter": [100, 200, 500],
    "solver": ["lbfgs"],
}

# 5-fold cross-validation on the training set, ranking candidates by F1
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the best candidate on the test set
best_grid_model = grid_search.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)

grid_results = {
    label: scorer(y_test, y_pred_grid)
    for label, scorer in [
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    ]
}

grid_results


{'Accuracy': 0.7988826815642458,
 'Precision': 0.8,
 'Recall': 0.6376811594202898,
 'F1-Score': 0.7096774193548387}

# **Randomized Search**

In [10]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space: 20 log-spaced values of C from 1e-3 to 1e2, four iteration
# caps, lbfgs solver only
param_dist = {
    "C": np.logspace(start=-3, stop=2, num=20),
    "max_iter": [100, 200, 300, 500],
    "solver": ["lbfgs"],
}

# Try 15 sampled candidates with 5-fold cross-validation, ranked by F1;
# random_state fixes which candidates are drawn.
random_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=param_dist,
    n_iter=15,
    scoring="f1",
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Evaluate the best candidate on the test set
best_random_model = random_search.best_estimator_
y_pred_random = best_random_model.predict(X_test)

random_results = {
    label: scorer(y_test, y_pred_random)
    for label, scorer in [
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    ]
}

random_results


{'Accuracy': 0.7932960893854749,
 'Precision': 0.7857142857142857,
 'Recall': 0.6376811594202898,
 'F1-Score': 0.704}

In [11]:
# One row per model, one column per metric, for side-by-side comparison
results_df = pd.DataFrame(
    data=[default_results, grid_results, random_results],
    index=pd.Index(["Default Model", "Grid Search", "Randomized Search"]),
)

results_df


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Default Model,0.804469,0.793103,0.666667,0.724409
Grid Search,0.798883,0.8,0.637681,0.709677
Randomized Search,0.793296,0.785714,0.637681,0.704


# Project Summary
**In this notebook, a Logistic Regression model is applied to the Titanic dataset to predict passenger survival. First, a baseline model is trained using the default parameters. Then, hyperparameter optimization is performed using Grid Search and Randomized Search with 5-fold cross-validation, selecting the best candidate by F1-score, to try to improve model performance. All three models are evaluated on the held-out test set using accuracy, precision, recall, and F1-score. Finally, the performance of the default model is compared with the tuned models to show the impact of hyperparameter optimization: on this test split the tuned models did not outperform the baseline (F1-score 0.724 for the default model versus 0.710 for Grid Search and 0.704 for Randomized Search).**