In [11]:
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Read the training CSV and immediately discard the three columns that are
# not used as features in this notebook.
df = pd.read_csv("../datasets/spaceship_titanic/train.csv").drop(
    columns=["Name", "PassengerId", "Cabin"]
)

# "Transported" is the prediction target; every other remaining column is a
# model input.
y = df["Transported"]
X = df.drop(columns="Transported")

# Column groups; each list is handed to its own branch of the
# ColumnTransformer built later in this cell.
bool_cols = ["CryoSleep", "VIP"]
cat_cols = ["HomePlanet", "Destination"]
num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]


# Branch for `num_cols`: replace missing values with the column median, then
# standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Branch for `cat_cols`: replace missing values with the most frequent
# category, then one-hot encode. Categories not seen during fit are ignored
# at transform time instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Branch for `bool_cols`: only impute with the most frequent value; no
# encoding or scaling step follows.
bool_pipeline = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent"))]
)

# Route each column group through its branch. Columns not named in any of
# the three lists are left to the transformer's default handling.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
        ("bool", bool_pipeline, bool_cols),
    ]
)



In [12]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
import numpy as np

# End-to-end estimator: the shared preprocessing step followed by a logistic
# regression allowed up to 1000 solver iterations.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("LRmodel", LogisticRegression(max_iter=1000)),
    ]
)

# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment
# reproducible between runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold.
scores = cross_val_score(model, X, y, scoring="accuracy", cv=kfold)

# Report the per-fold values together with their mean and spread.
print("Scores per fold:", scores)
print("Mean:", np.mean(scores))
print("Std:", np.std(scores))


Scores per fold: [0.77573318 0.77745831 0.79930995 0.78250863 0.79574223]
Mean: 0.7861504601337621
Std: 0.009617799710088014


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Single hold-out split: 20% of the rows become the test set, with a fixed
# seed.
# NOTE(review): in the recorded output the hold-out accuracy matches the
# first cross-validation fold score to the displayed precision. Both
# splitters are seeded with 42, so this split presumably coincides with that
# fold -- confirm before treating the two numbers as independent estimates.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fits the shared `model` pipeline in place on the training rows, then
# predicts the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

single_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the hold-out result next to the cross-validated mean computed in the
# previous cell (`scores`).
print("Single split accuracy:", single_score)
print("CV mean accuracy:", scores.mean())


Single split accuracy: 0.7757331799884991
CV mean accuracy: 0.7861504601337621
