In [1]:
import pathlib

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import compose, impute, linear_model, model_selection, pipeline, preprocessing, metrics
from sklearn.ensemble import GradientBoostingClassifier
import torch
from torch import nn, optim, utils
import torchmetrics

In [2]:
INPUT_DIR = pathlib.Path("/kaggle/input/kaust-academy-ai-week-november-2022")
WORKING_DIR = pathlib.Path("/kaggle/working")

In [3]:
!ls -lh $INPUT_DIR

total 1.2M
-rw-r--r-- 1 nobody nogroup  59K Nov 27 05:24 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 364K Nov 27 05:24 test.csv
-rw-r--r-- 1 nobody nogroup 787K Nov 27 05:24 train.csv


In [4]:
# Q1: load the training data
_train_df = pd.read_csv(
    INPUT_DIR/ "train.csv",
    index_col = "PassengerId"
)

_seed = 42
train_df, val_df = model_selection.train_test_split(
    _train_df,
    test_size=0.1,
    random_state = np.random.RandomState(_seed),
    stratify=_train_df.loc[:, "Transported"],
)

In [5]:
train_features = train_df.drop(["Transported", "Name"] , axis = 1)
train_target = train_df[["Transported"]]

val_features = val_df.loc[:, train_df.columns != "Transported"]
val_target = val_df[["Transported"]]

In [6]:
boolean_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
)

categorical_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OneHotEncoder(),
)

numeric_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="mean")
)

data_preprocessing = compose.make_column_transformer(
    (boolean_preprocessing, ["CryoSleep", "VIP"]),
    (categorical_preprocessing, ["HomePlanet", "Destination"]),
    (numeric_preprocessing, compose.make_column_selector(dtype_include=np.float64)),
    remainder = "drop",
)

In [7]:
GradientBoostingClassifier?

In [8]:
_seed = 42
_hyperparameters = {
   
    "loss": 'deviance',
    "learning_rate": 0.09,
    "n_estimators": 100,
    "subsample": 1.0,
    "criterion": 'friedman_mse',
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "min_weight_fraction_leaf": 0.0,
    "max_depth": 3,
    "min_impurity_decrease": 0.0,
    "init": None,
    "random_state": None,
    "max_features": None,
    "verbose": 0,
    "max_leaf_nodes": None,
    "warm_start": False,
    "validation_fraction": 0.1,
    "n_iter_no_change": None,
    "tol": 0.0001,
    "ccp_alpha": 0.0,
}
estimator = GradientBoostingClassifier(**_hyperparameters)

ml_pipeline = pipeline.make_pipeline(
    data_preprocessing,
    estimator
)
_ = ml_pipeline.fit(train_features, train_target)

_train_predictions = ml_pipeline.predict(train_features)
_report = metrics.classification_report(
    _train_predictions,
    train_target
)
print(_report)

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

       False       0.76      0.84      0.80      3543
        True       0.85      0.78      0.82      4280

    accuracy                           0.81      7823
   macro avg       0.81      0.81      0.81      7823
weighted avg       0.81      0.81      0.81      7823



In [9]:
# Q3: load the testing features
test_features = pd.read_csv(
    INPUT_DIR / "test.csv",
    index_col="PassengerId",
)

In [10]:
predictions = ml_pipeline.predict(test_features)

In [11]:
# Q4: load the sample submission file!
sample_submission_df = pd.read_csv(
    INPUT_DIR / "sample_submission.csv",
    index_col="PassengerId"
)

In [12]:
_ = (pd.DataFrame({"Transported": predictions}, index=sample_submission_df.index)
       .to_csv(WORKING_DIR / "submission.csv"))    

In [13]:
!cat $WORKING_DIR/submission.csv | head

PassengerId,Transported
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
0027_01,True
0029_01,True
0032_01,True
0032_02,True
