In [2]:
import pickle
import numpy as np
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
with open('/Users/adamboesky/Research/ay98/clean_data/gbm_data.pkl', 'rb') as f:
    (X_train, y_train, X_test, y_test) = pickle.load(f)

In [111]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 5,  # Specify the number of classes
    'metric': 'multi_logloss',
    'max_bin': 100,            # ***  [100 ---> 15000]
    'num_iterations': 500,      # ***  [400 ---> 1000]
    'learning_rate': 0.1,      # ***  [0.1  ---> 0.0001]
    'num_leaves': 30            # *** [30 ---> 150]
}

In [112]:
max_bins = np.linspace(200, 1500, num=3)
num_iters = np.linspace(100, 1000, num=4)
lrs = np.logspace(-4, -1, num=4)
n_leaves = np.linspace(30, 150, num=5)
max_bins, num_iters, lrs, n_leaves

(array([ 200.,  850., 1500.]),
 array([ 100.,  400.,  700., 1000.]),
 array([0.0001, 0.001 , 0.01  , 0.1   ]),
 array([ 30.,  60.,  90., 120., 150.]))

In [113]:
# K fold confusion matrix
kf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
# two_step_classifier = TwoStepClassifier(ia_thresh=max_thresh)

# Initialize a matrix to hold the summed confusion matrix
cumulative_cm = np.array([[0 for _ in range(5)] for _ in range(5)])

# Initialize lists of the scores for the purity vs. completeness graphs
y_tests = []
y_proba = []
f1_scores = []

# X and y set
preds = ['$\\log(\\rm{Separation})$ [$^{\\prime\\prime}$]', '$\\log(M_*) \\ [M_\\odot]$', '$\\log(\\rm{SFR}) \\ [M_\\odot \\rm{yr}^{-1}$]', 'Redshift']

# Iterate over each fold
for train_index, test_index in kf.split(X_train, y_train):

    ### NN-INFERRED DATA ###
    # Split the data into training and test sets for the current fold
    X_train_set, X_val = X_train[train_index], X_train[test_index]
    y_train_set, y_val = y_train[train_index], y_train[test_index]
    train_set = lgb.Dataset(X_train_set, label=y_train_set)
    # val_set = lgb.Dataset(X_val, label=y_val)

    # Fit and get confusion matrix
    # Non-balanced weights
    bst = lgb.train(params=params, train_set=train_set)#, valid_sets=val_set)
    y_pred_proba = bst.predict(X_val)
    y_pred = np.argmax(y_pred_proba, axis=1)
    cumulative_cm += confusion_matrix(y_val, y_pred, labels=[i for i in range(5)])

    # Get the info for the purity vs. completeness curve
    y_tests.append(y_val)
    y_proba.append(y_pred_proba)

    # Get the f1 score
    f1 = f1_score(y_val, y_pred, average='macro')  # Adjust the 'average' parameter as needed
    f1_scores.append(f1)



In [114]:
cumulative_cm

array([[5604,   55,    8,   15,  334],
       [ 459,   19,    1,    0,   78],
       [  73,    0,    0,    1,    5],
       [ 211,    1,    0,    5,   15],
       [1123,   43,    2,    7,  188]])