In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

### 1. Load and Preprocess Dataset

In [2]:
df = sns.load_dataset('titanic')
print("Dataset loaded. Shape:", df.shape)
print("\nDataset Info:")
print(df.info())
print("\nTarget distribution:")
print(df['survived'].value_counts())

# Drop irrelevant columns
df.drop(['deck', 'embark_town', 'alive', 'who', 'adult_male', 'class'], axis=1, inplace=True)

# Drop rows with missing target
df.dropna(subset=['survived'], inplace=True)

# Fill missing values
df['age'] = df['age'].fillna(df['age'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

print(f"\nMissing values after preprocessing:\n{df.isnull().sum()}")

Dataset loaded. Shape: (891, 15)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None

Target distribution:
0   

In [None]:
# Encode categorical value
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

print(f"\nFinal dataset shape: {df.shape}")
print(f"Features: {list(df.columns)}")


Final dataset shape: (891, 9)
Features: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone']


### 2. Feature and Target Selection

In [4]:
X = df.drop('survived', axis=1)
y = df['survived']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")


Feature matrix shape: (891, 8)
Target vector shape: (891,)


### 3. Train-Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 4. Train Multiple Models

In [6]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

print("Model Evaluation Metrics:\n")

for name, model in models.items():
    if name == 'SVM':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

Model Evaluation Metrics:


Logistic Regression
Accuracy:  0.7989
Precision: 0.7714
Recall:    0.7297
F1 Score:  0.7500
Confusion Matrix:
[[89 16]
 [20 54]]

Random Forest
Accuracy:  0.8268
Precision: 0.8028
Recall:    0.7703
F1 Score:  0.7862
Confusion Matrix:
[[91 14]
 [17 57]]

SVM
Accuracy:  0.8156
Precision: 0.8060
Recall:    0.7297
F1 Score:  0.7660
Confusion Matrix:
[[92 13]
 [20 54]]


### 5. Hyperparameter Tuning

In [7]:
# Logistic Regression - GridSearchCV
log_params = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid_lr = GridSearchCV(LogisticRegression(), log_params, cv=5, scoring='accuracy')
grid_lr.fit(X_train_scaled, y_train)
print("\nBest Logistic Regression Params (GridSearchCV):", grid_lr.best_params_)


Best Logistic Regression Params (GridSearchCV): {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [8]:
# Random Forest - RandomizedSearchCV
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 5, 10]
}

random_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_params, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_rf.fit(X_train, y_train)
print("Best Random Forest Params (RandomizedSearchCV):", random_rf.best_params_)

Best Random Forest Params (RandomizedSearchCV): {'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 6}


### 6. Final Model Comparison

In [9]:
final_models = {
    'Logistic Regression (Tuned)': grid_lr.best_estimator_,
    'Random Forest (Tuned)': random_rf.best_estimator_,
    'SVM': SVC()
}

print("\nFinal Tuned Model Performance:\n")

for name, model in final_models.items():
    if name == 'SVM':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")


Final Tuned Model Performance:


Logistic Regression (Tuned)
Accuracy:  0.7933
Precision: 0.7761
Recall:    0.7027
F1 Score:  0.7376

Random Forest (Tuned)
Accuracy:  0.8268
Precision: 0.8525
Recall:    0.7027
F1 Score:  0.7704

SVM
Accuracy:  0.8156
Precision: 0.8060
Recall:    0.7297
F1 Score:  0.7660


# Assignment Summary:
- Trained 3 models: Logistic Regression, Random Forest, SVM
- Evaluated using accuracy, precision, recall, F1-score
- Applied GridSearchCV and RandomizedSearchCV
- Selected best model based on evaluation metrics