In [3]:
import pandas as pd

In [4]:
# Load the cleaned training data from a CSV file in the working directory
df = pd.read_csv("train_clean.csv")

# Bare last expression: the notebook renders the frame as this cell's output
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S


## Split Train-Validation-Test (save the test set for later)

In [5]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the target column
X = df.drop("Transported", axis=1)
y = df["Transported"]

# Two-stage split: first hold out 15% of all rows as the test set, then take
# 15% of the remaining rows as the validation set (roughly 72% / 13% / 15%).
# The fixed random_state makes both splits reproducible.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.15, random_state=42)

# Save test dataset for later.
# The label is attached with .assign(), which returns a new frame, instead of
# being written into X_test in place: X_test stays features-only for any later
# cell, and no chained-assignment warning can be raised on the split result.
# .assign() aligns y_test on the index exactly like column assignment does, so
# the contents of test.csv are unchanged.
X_test.assign(Transported=y_test).to_csv("test.csv", index=False)

## Feature Engineering

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn import set_config

# Set transformer output to be a pandas DataFrame instead of numpy array.
# set_config changes scikit-learn's global configuration, so this applies to
# every transformer used from here on, not only the ones defined in this cell.
set_config(transform_output = "pandas")

# Categorical / flag columns: handed to the one-hot encoder
cat_columns = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "VIP",
    "Deck",
    "Side",
]

# Numeric columns: handed to the scaler
num_columns = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

def make_feature_pipeline(cat_columns, num_columns):
    """Build the unfitted preprocessing step of the model pipeline.

    Parameters
    ----------
    cat_columns : list of str
        Names of the columns to one-hot encode.
    num_columns : list of str
        Names of the columns to rescale with ``MinMaxScaler``.

    Returns
    -------
    ColumnTransformer
        A new, unfitted transformer. Columns that appear in neither list are
        dropped, because ``remainder`` is left at its default.
    """
    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            # (step name, transformer, column list to apply transformation to)
            (
                "onehot_encoding",
                # sparse_output=False: dense output is needed because the
                # notebook configures transformers to return pandas DataFrames.
                # handle_unknown="ignore": a category that was not present when
                # the encoder was fitted (e.g. a rare value that only occurs in
                # the validation/test rows) is encoded as all zeros instead of
                # raising an error at transform/predict time.
                OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                cat_columns,
            ),
            ("minmax_scaling", MinMaxScaler(), num_columns),
        ]
    )

    return feature_engineering_pipeline

## Make pipeline

In [7]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline

def make_pipeline(cat_columns, num_columns):
    """Assemble the full, unfitted model pipeline.

    The pipeline first runs the preprocessing returned by
    ``make_feature_pipeline(cat_columns, num_columns)`` and then feeds the
    result to an ``LGBMClassifier`` created with default arguments.

    Returns
    -------
    Pipeline
        A new pipeline with the steps ``"feature_engineering"`` and ``"model"``.
    """
    # (step name, estimator) pairs, applied in this order
    pipeline_steps = [
        ("feature_engineering", make_feature_pipeline(cat_columns, num_columns)),
        ("model", LGBMClassifier()),
    ]
    return Pipeline(steps=pipeline_steps)

# Build one pipeline as the cell's last expression so the notebook displays it;
# the result is not stored anywhere
make_pipeline(cat_columns, num_columns)

## Hyperparameter search using Optuna

In [8]:
import optuna

def objective(
    trial,
    X_train,
    y_train,
    X_val,
    y_val,
    cat_columns,
    num_columns
):
    """Optuna objective: train one candidate pipeline and score it on validation data.

    Parameters
    ----------
    trial : optuna trial
        Used to draw this trial's parameter values.
    X_train, y_train
        Data the candidate pipeline is fitted on.
    X_val, y_val
        Data the fitted pipeline is scored on.
    cat_columns, num_columns : list of str
        Column lists forwarded to ``make_pipeline``.

    Returns
    -------
    float
        ``Pipeline.score`` on the validation data (mean accuracy for a
        scikit-learn style classifier). The study maximizes this value.
    """
    # Settings that never vary. Each one is registered as a single-choice
    # categorical so that it is stored with the trial's parameters and the
    # winning configuration can later be re-applied from the study alone.
    fixed = {
        "model__objective": trial.suggest_categorical("model__objective", ["binary"]),
        "model__metric": trial.suggest_categorical("model__metric", ["binary_logloss"]),
        "model__boosting_type": trial.suggest_categorical("model__boosting_type", ["gbdt"]),
        "model__verbosity": trial.suggest_categorical("model__verbosity", [-1]),
        "model__random_state": trial.suggest_categorical("model__random_state", [42]),
    }

    # Values that are actually searched over
    tuned = {
        "model__num_leaves": trial.suggest_int("model__num_leaves", 10, 100),
        "model__learning_rate": trial.suggest_float("model__learning_rate", 0.01, 0.1, log=True),
        "model__feature_fraction": trial.suggest_float("model__feature_fraction", 0.1, 1.0),
        "model__bagging_fraction": trial.suggest_float("model__bagging_fraction", 0.1, 1.0),
        "model__bagging_freq": trial.suggest_int("model__bagging_freq", 1, 10),
        "model__min_child_samples": trial.suggest_int("model__min_child_samples", 1, 50),
    }

    # The "model__" prefix routes every entry to the pipeline's "model" step
    params = {**fixed, **tuned}

    # Fresh pipeline per trial, configured with this trial's parameters
    candidate = make_pipeline(cat_columns, num_columns).set_params(**params)

    # Train on the training split, evaluate on the validation split
    candidate.fit(X_train, y_train)
    return candidate.score(X_val, y_val)

In [9]:
# Silence optuna's per-trial log lines; the progress bar reports progress instead
optuna.logging.set_verbosity(optuna.logging.WARNING)

# The objective returns a validation score, so higher is better.
# The sampler is seeded so that re-running the notebook proposes the same
# sequence of trials; without a seed every run searches differently and the
# best parameters cannot be reproduced. TPESampler is the sampler optuna uses
# by default, so only the seeding is new.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(
    # Bind the data splits and column lists; optuna only passes in the trial
    func = lambda trial: objective(trial, X_train, y_train, X_val, y_val, cat_columns, num_columns),
    n_trials = 100,
    show_progress_bar=True
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
# Plot the optimization history of all trials in the study
optuna.visualization.plot_optimization_history(study)

In [11]:
# Parameters of the best trial. The keys keep their "model__" prefix, so the
# dict can be passed directly to the pipeline's set_params
study.best_params

{'model__objective': 'binary',
 'model__metric': 'binary_logloss',
 'model__boosting_type': 'gbdt',
 'model__verbosity': -1,
 'model__random_state': 42,
 'model__num_leaves': 61,
 'model__learning_rate': 0.03004838117593054,
 'model__feature_fraction': 0.6958280573652637,
 'model__bagging_fraction': 0.5348271477306089,
 'model__bagging_freq': 6,
 'model__min_child_samples': 35}

## Reproduce validation score

In [10]:
# Fresh pipeline configured with the parameters of the best trial
# (set_params returns the pipeline itself)
model_pipeline = make_pipeline(cat_columns, num_columns).set_params(**study.best_params)

# Refit on the same training split the study used
model_pipeline.fit(X_train, y_train)

# The refit must give exactly the score the study recorded for its best trial
print(
    "Study Reproduced:",
    study.best_value == model_pipeline.score(X_val, y_val),
)

Study Reproduced: True


## Save model

In [11]:
import pickle

# Serialize the fitted pipeline (preprocessing + model) to disk.
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(model_pipeline, model_file)