In [None]:
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
data = sns.load_dataset('titanic')
print("Data loaded: ", data.shape)

Data loaded:  (891, 15)


In [4]:
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [5]:
cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'survived']
df = data[cols].copy()
print("Selected columns: ", df.columns.tolist())

Selected columns:  ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'survived']


In [6]:
print("Handling missing numeric values:-")
for col in ['age', 'fare']:
    median_value = df[col].median()
    print(f"Filling missing in {col} with median {median_value}")
    df[col] = df[col].fillna(median_value)

Handling missing numeric values:-
Filling missing in age with median 28.0
Filling missing in fare with median 14.4542


In [7]:
print("Handling missing categorical values for embarked:-")
mode_value = df['embarked'].mode()[0]
print(f"Filling missing embarked with mode '{mode_value}'")
df['embarked'] = df['embarked'].fillna(mode_value)

Handling missing categorical values for embarked:-
Filling missing embarked with mode 'S'


In [8]:
print("Encoding categorical variables:-")
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)
print("New columns after encoding: ", df.columns.tolist())

Encoding categorical variables:-
New columns after encoding:  ['pclass', 'age', 'sibsp', 'parch', 'fare', 'survived', 'sex_male', 'embarked_Q', 'embarked_S']


In [9]:
y = df['survived']
X = df.drop('survived', axis=1)
print("Feature shape: ", X.shape, "; Target shape: ", y.shape)

Feature shape:  (891, 8) ; Target shape:  (891,)


In [10]:
print("Splitting into train and test sets:-")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
print("Training set size: ", X_train.shape)
print("Test set size: ", X_test.shape)

Splitting into train and test sets:-
Training set size:  (712, 8)
Test set size:  (179, 8)


In [11]:
def get_models_and_params():
    print("Defining models and parameter grids ->")
    models = {}

    # Logistic Regression
    lr = LogisticRegression(max_iter=500)
    lr_params = {'C': [0.1, 1, 10]}
    print("Logistic Regression params: ", lr_params)
    models['Logistic Regression'] = (lr, lr_params)

    # Decision Tree
    dt = DecisionTreeClassifier()
    dt_params = {'max_depth': [None, 3, 5]}
    print("Decision Tree params: ", dt_params)
    models['Decision Tree'] = (dt, dt_params)

    # Random Forest
    rf = RandomForestClassifier()
    rf_params = {'n_estimators': [50, 100]}
    print("Random Forest params: ", rf_params)
    models['Random Forest'] = (rf, rf_params)

    # Gradient Boosting
    gb = GradientBoostingClassifier()
    gb_params = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}
    print("Gradient Boosting params: ", gb_params)
    models['Gradient Boosting'] = (gb, gb_params)

    return models

models = get_models_and_params()
results = []

Defining models and parameter grids ->
Logistic Regression params:  {'C': [0.1, 1, 10]}
Decision Tree params:  {'max_depth': [None, 3, 5]}
Random Forest params:  {'n_estimators': [50, 100]}
Gradient Boosting params:  {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}


In [12]:
print("Starting training and evaluation-->")
auto_tune = True  # set True to use GridSearchCV
for name, (model, params) in models.items():
    print(f"\nProcessing model: {name}")

    if auto_tune:
        print("  Using GridSearchCV for tuning")
        grid = GridSearchCV(model, params, cv=5)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        print(f"  Best params for {name}: {grid.best_params_}")
    else:
        print("  Fitting default model (no tuning)")
        best_model = model.fit(X_train, y_train)

    # Predictions
    preds = best_model.predict(X_test)

    # Compute metrics
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print(f"  Metrics — Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}, F1: {f1:.2f}")

    # Append to results
    results.append({
        'Model': name,
        'Accuracy': round(acc, 2),
        'Precision': round(prec, 2),
        'Recall': round(rec, 2),
        'F1 Score': round(f1, 2)
    })

Starting training and evaluation-->

Processing model: Logistic Regression
  Using GridSearchCV for tuning
  Best params for Logistic Regression: {'C': 0.1}
  Metrics — Accuracy: 0.81, Precision: 0.76, Recall: 0.74, F1: 0.75

Processing model: Decision Tree
  Using GridSearchCV for tuning
  Best params for Decision Tree: {'max_depth': 5}
  Metrics — Accuracy: 0.82, Precision: 0.81, Recall: 0.68, F1: 0.74

Processing model: Random Forest
  Using GridSearchCV for tuning
  Best params for Random Forest: {'n_estimators': 50}
  Metrics — Accuracy: 0.84, Precision: 0.81, Recall: 0.75, F1: 0.78

Processing model: Gradient Boosting
  Using GridSearchCV for tuning
  Best params for Gradient Boosting: {'learning_rate': 0.1, 'n_estimators': 100}
  Metrics — Accuracy: 0.84, Precision: 0.90, Recall: 0.67, F1: 0.77


In [13]:
print("\nModel Evaluation Summary:")
print(pd.DataFrame(results))


Model Evaluation Summary:
                 Model  Accuracy  Precision  Recall  F1 Score
0  Logistic Regression      0.81       0.76    0.74      0.75
1        Decision Tree      0.82       0.81    0.68      0.74
2        Random Forest      0.84       0.81    0.75      0.78
3    Gradient Boosting      0.84       0.90    0.67      0.77
