In [8]:
import pandas as pd
import numpy as np
from pathlib import Path

def load_data(path : str | Path = r"C:\Users\DELL\OneDrive\Desktop\anu course\datasets\titanic.csv"):
    path = Path(path)

    df = pd.read_csv(path)
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna(df['Cabin'].mode()[0])
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    x = df.drop(columns="Survived",axis=1)
    y = df["Survived"]

    return x ,y

In [10]:
df=pd.read_csv( r"C:\Users\DELL\OneDrive\Desktop\anu course\datasets\titanic.csv")
df.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [12]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [14]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.pipeline import Pipeline

def build_preprocessor(x):
    num_cols = x.select_dtypes(include = ["int64","float64"]).columns
    cat_cols = x.select_dtypes(include = ["object"]).columns


    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(handle_unknown="ignore")


    preprocessor = ColumnTransformer(transformers = [("num",num_transformer,num_cols)
                                                     ,("cat",cat_transformer,cat_cols)])

    return preprocessor,num_cols,cat_cols

In [18]:
from sklearn.preprocessing import LabelEncoder
def encoder_target(y):

    if y.dtypes == "objects" or "bool" :
        le = LabelEncoder()
        enc_y= le.fit_transform(y)
        problem_type = "classification"

    else:
        le = None
        enc_y=y.value
        problem_type = "regression"


    return enc_y,le,problem_type
    

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier,StackingClassifier,RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

def build_pipeline(preprocessor, model="vote", problem_type="classification"):
    cl1 = LogisticRegression(max_iter=1000, random_state=42)
    cl2 = RandomForestClassifier(n_estimators=100, random_state=42)
    cl3 = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    cl4 = MLPClassifier(max_iter=500)

    if model == "vote":
        final_model = VotingClassifier(
            estimators=[("lr", cl1), ("rf", cl2), ("xgb", cl3), ("mlp", cl4)],
            voting="soft"
        )
    elif model == "stack":
        final_model = StackingClassifier(
            estimators=[("lr", cl1), ("rf", cl2), ("xgb", cl3)],
            final_estimator=LogisticRegression()
        )
    else:
        raise ValueError("Unknown model type")

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", final_model) ])  

    return pipeline


In [34]:
from sklearn.metrics import confusion_matrix, classification_report,f1_score,recall_score,accuracy_score
from sklearn.model_selection import train_test_split
def main(model= "vote"):
    x , y= load_data()
    preprocessor, num_cols,cat_cols = build_preprocessor(x)
    enc_y,le,problem_type = encoder_target(y)


    x_train, x_test, y_train, y_test = train_test_split(
    x, enc_y, test_size=0.33, random_state=42
)

    x_train_preprocessed = preprocessor.fit_transform(x_train)
    x_test_preprocessed = preprocessor.transform(x_test)

    model = build_pipeline(preprocessor, model="vote",problem_type="classification")
    model.fit(x_train,y_train)
    y_proba = model.predict_proba(x_test)[:,1]
    y_pred = (y_proba >= 0.4).astype(int)

    print("Accuracy:", round(accuracy_score(y_test, y_pred)*100,2),"%")
    print("F1 score:", round(f1_score(y_test, y_pred)*100,2),"%")
    print("Recall:", round(recall_score(y_test, y_pred)*100,2),"%")
    print("Classification report:\n", classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    



In [36]:
if __name__ == "__main__" :
    main(model ="voting" )

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 83.05 %
F1 score: 79.34 %
Recall: 80.0 %
Classification report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.86       175
           1       0.79      0.80      0.79       120

    accuracy                           0.83       295
   macro avg       0.82      0.83      0.82       295
weighted avg       0.83      0.83      0.83       295

Confusion matrix:
 [[149  26]
 [ 24  96]]
