In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load the preprocessed Titanic splits --------------------------------
train = pd.read_csv("../data/processed/train_clean.csv")
test = pd.read_csv("../data/processed/test_clean.csv")

# --- Build the design matrix and the target ------------------------------
# Columns fed to the model; `Survived` is the label being predicted.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "Title",
]
X = train[features]
y = train["Survived"]

# --- Hold out 20% of the labelled rows for validation --------------------
# A fixed random_state keeps the split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Configure and fit the gradient-boosted classifier -------------------
xgb_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "learning_rate": 0.1,
    "eval_metric": "logloss",
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# --- Score the model on the validation split -----------------------------
y_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("\n Accuracy:", val_accuracy)
print("\n Confusion Matrix:\n", val_confusion)
print("\n Classification Report:\n", val_report)

# --- Persist the fitted model so later cells can reload it ---------------
joblib.dump(model, "../models/xgb_titanic_model.pkl")
print("\n Model saved in '../models/xgb_titanic_model.pkl'")



 Accuracy: 0.7988826815642458

 Confusion Matrix:
 [[90 15]
 [21 53]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


 Model saved in '../models/xgb_titanic_model.pkl'


In [7]:
# --- Predict on the test split and write the submission file -------------
# This cell relies on `test`, `features`, `pd` and `joblib` created by the
# training cell; run that cell first on a fresh kernel.
#
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files produced by this project.
model = joblib.load("../models/xgb_titanic_model.pkl")

# Feed the model exactly the columns it was trained on, in training order.
# Passing the whole `test` frame only works while the CSV happens to contain
# those columns and nothing else; an extra or reordered column would break
# (or misalign) the prediction.
test_predictions = model.predict(test[features])

# Prefer the real passenger ids when the test frame carries them, so the
# submission does not depend on row order. Otherwise fall back to consecutive
# ids starting at 892 -- presumably the first id of the Kaggle Titanic test
# split; confirm against the raw test file.
if "PassengerId" in test.columns:
    passenger_ids = test["PassengerId"].to_numpy()
else:
    passenger_ids = range(892, 892 + len(test_predictions))

submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": test_predictions
})
submission.to_csv("../submissions/xgb_submission.csv", index=False)
print(" Prediction saved in '../submissions/xgb_submission.csv'")


 Prediction saved in '../submissions/xgb_submission.csv'
