In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('dataset/space.csv')
df.tail()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
8688,9274_01,,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True
8689,9275_01,Europa,False,A/97/P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Polaton Conable,True
8690,9275_03,Europa,,A/97/P,TRAPPIST-1e,30.0,False,0.0,3208.0,0.0,2.0,330.0,Atlasym Conable,True
8691,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8692,9280_02,Europa,False,E/608/S,TRAPPIST-1e,44.0,False,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,True


In [3]:
# Custom transformer for splitting 'Cabin' feature
class CabinSplitter(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        cabin_df = X['Cabin'].fillna('Unknown/0/Unknown').str.split('/', expand=True)
        cabin_df.columns = ['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']
        return cabin_df
        
    def get_feature_names_out(self):
        return self.columns

In [4]:
#preprocessing pipelines
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
cabin_feature = ['Cabin'] 

# Num pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Cat pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Cabin pipeline
cabin_transformer = Pipeline(steps=[
    ('cabin_split', CabinSplitter()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Full preprocessor pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('cabin', cabin_transformer, cabin_feature)
    ])

In [5]:
preprocessor

In [6]:
from sklearn.model_selection import train_test_split

target = 'Transported'
X = df.drop(columns=[target, 'Name', 'PassengerId'])
y = df[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
logistic_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(max_iter=100))
    ]
)

cross_val_score(logistic_pipeline, X_train, y_train, cv=5, scoring="f1_macro").mean()

0.7821822602801741

In [8]:
param_grid = {
    "classifier__C": [0.001, 0.001, 0.01, 0.1, 1, 10, 100],
    "classifier__penalty": ["l1", "l2"],
    "classifier__class_weight": ["balanced", None],
    "preprocessing__num__imputer__strategy": ["mean", "median"],
    "preprocessing__cat__imputer__strategy": ["most_frequent", "constant"],
    "classifier__solver": ["liblinear"]
}


from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(logistic_pipeline, param_distributions=param_grid, n_iter=84, cv=5, scoring="recall")
random_search.fit(X_train, y_train)

225 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\emmanuel.osademe\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_encode.py", line 174, in _unique_python
    uniques = sorted(uniques_set)
              ^^^^^^^^^^^^^^^^^^^
TypeError: '<' not supported between instances of 'bool' and 'str'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\emmanuel.osademe\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\e

In [9]:
print("Best parameters found: ", random_search.best_params_)
print("Best recall score: ", random_search.best_score_)

Best parameters found:  {'preprocessing__num__imputer__strategy': 'mean', 'preprocessing__cat__imputer__strategy': 'most_frequent', 'classifier__solver': 'liblinear', 'classifier__penalty': 'l1', 'classifier__class_weight': None, 'classifier__C': 0.1}
Best recall score:  0.7982978723404255


In [10]:
y_pred = random_search.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print((f1, accuracy, precision, recall))

(0.8007951698179852, 0.8009969325153374, 0.7944403803950256, 0.8202416918429003)


In [11]:
knn_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', KNeighborsClassifier())
])

cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring="f1_macro").mean()

0.7653637114096457

In [12]:
param_grid = {
    "classifier__n_neighbors": [5, 10, 15, 20, 30, 50],
    "classifier__weights": ["uniform", "distance"],
    "classifier__p": [1, 2],  # Manhattan or Euclidean distance
    "preprocessing__num__imputer__strategy": ["mean", "median"],
    "preprocessing__cat__imputer__strategy": ["most_frequent", "constant"]
}

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    knn_pipeline,
    param_distributions=param_grid,
    n_iter=96,  # safe as total space = 6 * 2 * 2 * 2 * 2 = 96
    cv=5,
    scoring="recall",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

240 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
229 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\emmanuel.osademe\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_encode.py", line 174, in _unique_python
    uniques = sorted(uniques_set)
              ^^^^^^^^^^^^^^^^^^^
TypeError: '<' not supported between instances of 'str' and 'bool'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\emmanuel.osademe\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\

In [13]:
print("Best parameters found: ", random_search.best_params_)
print("Best recall score: ", random_search.best_score_)

Best parameters found:  {'preprocessing__num__imputer__strategy': 'median', 'preprocessing__cat__imputer__strategy': 'most_frequent', 'classifier__weights': 'distance', 'classifier__p': 1, 'classifier__n_neighbors': 20}
Best recall score:  0.7727600547342437


In [14]:
y_pred = random_search.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print((f1, accuracy, precision, recall))

(0.7979270413960986, 0.7979294478527608, 0.8081979891724671, 0.7892749244712991)


In [15]:
xgb_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=0))
])

In [16]:
param_grid = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [3, 5, 7, 10],
    "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "classifier__subsample": [0.6, 0.8, 1.0],
    "classifier__colsample_bytree": [0.6, 0.8, 1.0],
    "classifier__gamma": [0, 1, 5],
    "classifier__reg_lambda": [1, 5, 10],
    "classifier__reg_alpha": [0, 0.5, 1],
    "preprocessing__num__imputer__strategy": ["mean", "median"],
    "preprocessing__cat__imputer__strategy": ["most_frequent", "constant"]
}

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_grid,
    n_iter=84,  # adjust as needed for performance
    cv=5,
    scoring="recall",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


200 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
194 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\emmanuel.osademe\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_encode.py", line 174, in _unique_python
    uniques = sorted(uniques_set)
              ^^^^^^^^^^^^^^^^^^^
TypeError: '<' not supported between instances of 'str' and 'bool'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\emmanuel.osademe\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\

In [17]:
print("Best parameters found: ", random_search.best_params_)
print("Best recall score: ", random_search.best_score_)

Best parameters found:  {'preprocessing__num__imputer__strategy': 'median', 'preprocessing__cat__imputer__strategy': 'most_frequent', 'classifier__subsample': 0.8, 'classifier__reg_lambda': 5, 'classifier__reg_alpha': 0, 'classifier__n_estimators': 300, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.05, 'classifier__gamma': 5, 'classifier__colsample_bytree': 1.0}
Best recall score:  0.8418394998792629


In [18]:
y_pred = random_search.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print((f1, accuracy, precision, recall))

(0.7987523383880434, 0.7994631901840491, 0.778318276580959, 0.8459214501510574)
