# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from scipy.stats import uniform, randint

#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier
# Read the Titanic training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"D:\code\ML\Datasets\titanic_train.csv",
        r"D:\code\ML\Datasets\titanic_test.csv",
    )
)
# Keep the test passenger ids aside: "PassengerId" is dropped from the
# feature frame during preprocessing.
test_id = test["PassengerId"]
# General preprocessing of the training data.
titanic_label = train["Survived"]  # target column
titanic = train.drop(columns="Survived")  # feature frame without the target

# Deck is the leading letter of the cabin code (e.g. "C72" -> "C").
# Passengers without a cabin get a default deck picked from their class
# (assumed layout: Pclass 1 -> A/B/C, 2 -> D/E, 3 -> F/G/T).
default_deck_by_class = titanic["Pclass"].map({1: "B", 2: "D", 3: "F"})
titanic["Deck"] = titanic["Cabin"].str[0].fillna(default_deck_by_class)

# Missing embarkation ports are replaced by the most frequent port.
most_common_port = titanic["Embarked"].mode().iloc[0]
titanic["Embarked"] = titanic["Embarked"].fillna(most_common_port)

# Family features: the passenger plus siblings/spouses and parents/children.
titanic["FamilySize"] = titanic["SibSp"].add(titanic["Parch"]).add(1)
titanic["IsAlone"] = titanic["FamilySize"].eq(1).astype(int)

# Title is the word directly before the first "." in the name; spelling
# variants and rare titles are folded into a few groups.
title_groups = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Noble': ['Don', 'Dona', 'Lady', 'Countess', 'Jonkheer', 'Sir'],
}
title_lookup = {
    raw_title: group
    for group, raw_titles in title_groups.items()
    for raw_title in raw_titles
}
extracted_titles = titanic["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
titanic["Title"] = extracted_titles.replace(title_lookup)

# Drop the raw columns that are no longer needed once the engineered
# features exist.
drop_attribute = ["PassengerId", "Name", "Ticket", "Cabin"]
titanic = titanic.drop(columns=drop_attribute)
# Categorical and numerical attributes, derived from the column dtypes.
cat_attributes = titanic.select_dtypes(include=["object", "category"]).columns.tolist()
# "number" matches every numeric width. Listing only int64/float64 silently
# drops columns such as IsAlone on platforms where astype(int) yields int32,
# because the column transformer discards columns it was not given.
num_attributes = titanic.select_dtypes(include="number").columns.tolist()

# Numerical columns: fill missing values with the mean, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
)

# Categorical columns are one-hot encoded; categories unseen during fit are
# encoded as all zeros instead of raising.
preprocessor = make_column_transformer(
    (num_pipeline, num_attributes),
    (OneHotEncoder(handle_unknown="ignore"), cat_attributes),
)

# Split BEFORE fitting the preprocessor so that imputation means, scaling
# statistics and one-hot categories are learned from the training rows only.
# Fitting on the full frame first leaks hold-out information into X_train.
titanic_train_raw, titanic_holdout_raw, y_train, y_test = train_test_split(
    titanic, titanic_label, random_state=42
)
X_train = preprocessor.fit_transform(titanic_train_raw)
X_test = preprocessor.transform(titanic_holdout_raw)

# Transformed view of the whole training frame, kept for inspection.
columns = preprocessor.get_feature_names_out()
titanic_transformed = preprocessor.transform(titanic)
titanic_transformed_df = pd.DataFrame(titanic_transformed, columns=columns)
# Base learners and the voting ensemble that the search tunes. No `voting`
# argument is passed, so scikit-learn's default voting mode applies.
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)
svc = SVC(random_state=42)
voting_clf = VotingClassifier(estimators=[('lr', lr), ('rf', rf), ('svc', svc)])

# Search space, grouped per estimator. The "<name>__" prefix routes each
# parameter to the matching estimator inside the ensemble.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers in [low, high).
lr_space = {
    'lr__C': uniform(0.01, 10),
    'lr__penalty': ['l2'],
    'lr__solver': ['liblinear'],
}
rf_space = {
    'rf__n_estimators': randint(50, 175),
    'rf__max_depth': [None, 5, 10, 15],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__min_samples_split': randint(2, 10),
}
svc_space = {
    'svc__C': uniform(0.1, 6),
    'svc__kernel': ['rbf'],
    'svc__gamma': ['scale', 'auto'],
}
param_distributions = {**lr_space, **rf_space, **svc_space}

# 45 sampled configurations, each scored by 4-fold cross-validated accuracy.
search_settings = dict(
    n_iter=45,
    cv=4,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    estimator=voting_clf,
    param_distributions=param_distributions,
    **search_settings,
)
random_search.fit(X_train, y_train)

# Report the winning configuration and how it does on the hold-out split.
search_report = (
    ("Best parameters:", random_search.best_params_),
    ("Best cross-val score:", random_search.best_score_),
    ("Test score:", random_search.score(X_test, y_test)),
)
for label, value in search_report:
    print(label, value)

# Rebuild each base learner with the hyperparameters chosen by the
# randomized search.
best_params = random_search.best_params_

lr_best = LogisticRegression(
    C=best_params['lr__C'],
    penalty=best_params['lr__penalty'],
    solver=best_params['lr__solver'],
    # Same iteration budget as the `lr` estimator the search tuned. Without
    # it the rebuilt model falls back to the library default and is not the
    # model that was actually evaluated.
    max_iter=1000,
    random_state=42
)

rf_best = RandomForestClassifier(
    n_estimators=best_params['rf__n_estimators'],
    max_depth=best_params['rf__max_depth'],
    max_features=best_params['rf__max_features'],
    min_samples_split=best_params['rf__min_samples_split'],
    random_state=42
)

svc_best = SVC(
    C=best_params['svc__C'],
    kernel=best_params['svc__kernel'],
    gamma=best_params['svc__gamma'],
    probability=True,  # soft voting needs predict_proba
    random_state=42
)

# Voting classifier (soft voting usually performs better).
# NOTE(review): the search tuned `voting_clf`, which was built without
# voting='soft', so these hyperparameters were selected under a different
# voting mode than the one used here — confirm this is intended.
voting_clf_2 = VotingClassifier(
    estimators=[
        ('lr', lr_best),
        ('rf', rf_best),
        ('svc', svc_best)
    ],
    voting='soft',
    n_jobs=-1
)
voting_clf_2.fit(X_train, y_train)

# Evaluate on the training split and on the hold-out split.
print("Train Accuracy:", voting_clf_2.score(X_train, y_train))
print("Test Accuracy:", voting_clf_2.score(X_test, y_test))

y_pred_tuned = voting_clf_2.predict(X_test)
print("Accuracy:", voting_clf_2.score(X_test, y_test))

# Print the 5-fold scores: as a bare expression the result is discarded
# whenever this runs as a script (or is not the last line of a cell).
tuned_cv_scores = cross_val_score(voting_clf_2, X_train, y_train, cv=5, scoring="accuracy")
print("Cross-val accuracy:", tuned_cv_scores)

# Hold-out accuracy of each fitted base learner on its own.
for name, clf in voting_clf_2.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

# Baseline: the voting ensemble with its untuned base learners, for
# comparison against the tuned models.
voting_clf.fit(X_train, y_train)

y_pred = voting_clf.predict(X_test)
print("Accuracy:", voting_clf.score(X_test, y_test))

# Print the 5-fold scores: as a bare expression the result is discarded
# whenever this runs as a script (or is not the last line of a cell).
baseline_cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=5, scoring="accuracy")
print("Cross-val accuracy:", baseline_cv_scores)

# Hold-out accuracy of each fitted base learner on its own.
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

# Stacked ensemble: the tuned base learners feed a random-forest
# meta-learner.
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', lr_best),
        ('rf', rf_best),
        ('svc', svc_best)
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validation folds
)
stacking_clf.fit(X_train, y_train)
print("Train Accuracy:", stacking_clf.score(X_train, y_train))

# Print the 5-fold scores: as a bare expression the result is discarded
# whenever this runs as a script (or is not the last line of a cell).
stacking_cv_scores = cross_val_score(stacking_clf, X_train, y_train, cv=5, scoring="accuracy")
print("Cross-val accuracy:", stacking_cv_scores)

# Test data preprocessing: the same feature engineering as the training
# frame, then the already-fitted preprocessor.

# Deck is the leading letter of the cabin code; a missing deck is filled
# with the same per-class defaults used for the training data.
test["Deck"] = test["Cabin"].str[0]
for pclass, default_deck in {1: "B", 2: "D", 3: "F"}.items():
    test.loc[test["Deck"].isnull() & (test["Pclass"] == pclass), "Deck"] = default_deck

# Fill missing ports with the most frequent port of the TRAINING data, the
# same statistic the training frame was filled with, not a hard-coded value.
test["Embarked"] = test["Embarked"].fillna(train["Embarked"].mode()[0])

test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
test["IsAlone"] = (test["FamilySize"] == 1).astype(int)

# Title is the word directly before the first "." in the name, grouped the
# same way as for the training data.
test["Title"] = test["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
test["Title"] = test["Title"].replace(['Mlle', 'Ms'], 'Miss')
test["Title"] = test["Title"].replace('Mme', 'Mrs')
test["Title"] = test["Title"].replace(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer')
test["Title"] = test["Title"].replace(['Don', 'Dona', 'Lady', 'Countess', 'Jonkheer', 'Sir'], 'Noble')

test = test.drop(drop_attribute, axis=1)

# transform only, never fit: the test set must not influence the learned
# imputation, scaling or encoding.
test_transformed = preprocessor.transform(test)
feature_names = preprocessor.get_feature_names_out()
test_transformed_df = pd.DataFrame(test_transformed, columns=feature_names)

# Print the preview: as a bare expression it is discarded when this runs as
# a script.
print(test_transformed_df.head())

# KeyboardInterrupt: (notebook output; execution appears to have been interrupted here)