In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [12]:
data = pd.read_csv('../data/train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6606 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   6606 non-null   object 
 1   HomePlanet    6606 non-null   object 
 2   CryoSleep     6606 non-null   object 
 3   Cabin         6606 non-null   object 
 4   Destination   6606 non-null   object 
 5   Age           6606 non-null   float64
 6   VIP           6606 non-null   object 
 7   RoomService   6606 non-null   float64
 8   FoodCourt     6606 non-null   float64
 9   ShoppingMall  6606 non-null   float64
 10  Spa           6606 non-null   float64
 11  VRDeck        6606 non-null   float64
 12  Name          6606 non-null   object 
 13  Transported   6606 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 729.0+ KB


In [15]:
data_encoded = pd.get_dummies(data, columns=['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name'], dummy_na=True, dtype=int)
print(data_encoded.dtypes)

PassengerId                object
Age                       float64
RoomService               float64
FoodCourt                 float64
ShoppingMall              float64
                           ...   
Name_Zosmark Unaasor        int64
Name_Zosmas Ineedeve        int64
Name_Zosmas Mormonized      int64
Name_Zubeneb Pasharne       int64
Name_nan                    int64
Length: 11919, dtype: object


In [24]:
X = data_encoded.drop(['Transported', 'PassengerId'], axis=1)
y = data_encoded.Transported

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=43)

In [26]:
from sklearn.metrics import f1_score, accuracy_score

xgb_classifier = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=1,
    n_jobs=-1,
    early_stopping_rounds=10
)

xgb_classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=True
)


val_predictions = xgb_classifier.predict(X_valid)

val_accuracy = accuracy_score(y_valid, val_predictions)
val_macro_f1 = f1_score(y_valid, val_predictions, average='macro')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Macro-F1: {val_macro_f1:.4f}")

predictions = xgb_classifier.predict(X_test)

[0]	validation_0-logloss:0.65903
[1]	validation_0-logloss:0.62605
[2]	validation_0-logloss:0.60107
[3]	validation_0-logloss:0.57587
[4]	validation_0-logloss:0.55571
[5]	validation_0-logloss:0.53763
[6]	validation_0-logloss:0.52218
[7]	validation_0-logloss:0.50903
[8]	validation_0-logloss:0.49818
[9]	validation_0-logloss:0.48857
[10]	validation_0-logloss:0.47890
[11]	validation_0-logloss:0.47090
[12]	validation_0-logloss:0.46401
[13]	validation_0-logloss:0.45837
[14]	validation_0-logloss:0.45369
[15]	validation_0-logloss:0.44848
[16]	validation_0-logloss:0.44529
[17]	validation_0-logloss:0.44199
[18]	validation_0-logloss:0.43775
[19]	validation_0-logloss:0.43482
[20]	validation_0-logloss:0.43251
[21]	validation_0-logloss:0.43013
[22]	validation_0-logloss:0.42771
[23]	validation_0-logloss:0.42606
[24]	validation_0-logloss:0.42482
[25]	validation_0-logloss:0.42390
[26]	validation_0-logloss:0.42259
[27]	validation_0-logloss:0.42213
[28]	validation_0-logloss:0.42172
[29]	validation_0-loglos

In [27]:
from sklearn.metrics import classification_report

accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

f1 = report['macro avg']['f1-score']
recall = report['macro avg']['recall']
precision = report['macro avg']['precision']

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.7973
F1: 0.7972
Recall: 0.7971
Precision: 0.7974
