In [None]:
# Importing necessary libraries
import lightgbm as lgb
import optuna 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
import import_ipynb 
import Data_Processing as dp


In [None]:
import warnings 
warnings.filterwarnings('ignore')

In [2]:
## import cleaned and processed data frame from Data_processing file 

def main(): 
    client_df = dp.import_client()
    invoice_df = dp.import_invoice()
    client_df = dp.convert_date(client_df) # Convert date cols
    invoice_df = dp.convert_date(invoice_df)
    client_df = dp.drop_duplicates(client_df) # Drop duplicates rows
    invoice_df = dp.drop_duplicates(invoice_df)
    categorical_column_names = ['region', 'dis', 'id', 'catg', 'target']
    client_df = dp.convert_to_categorical( # Convert categorical cols
        client_df, cols = categorical_column_names
        )
    invoice_df = dp.aggregate_invoice(invoice_df) # Aggregate invoices
    invoice_df = dp.manual_fix_names( # Fix column names manually
        invoice_df, 
        new_col_names = [
            'id', 
            'cons_level_1_sum', 'cons_level_1_mean', 
            'cons_level_1_max', 'cons_level_1_std',
            'cons_level_2_sum', 'cons_level_2_mean', 
            'cons_level_2_max', 'cons_level_2_std',
            'cons_level_3_sum', 'cons_level_3_mean', 
            'cons_level_3_max', 'cons_level_3_std',
            'cons_level_4_sum', 'cons_level_4_mean', 
            'cons_level_4_max', 'cons_level_4_std',
            'date_sum', 'date_mean', 'date_max', 'date_std',
            'num_invoices'
            ]
        )
    df = dp.merge(client_df = client_df, invoice_df = invoice_df) # Merge
    df = dp.prep_dataframe( # Prep for PCA
        df = df,
        response_col_name = 'target',
        cat_col_names = categorical_column_names
    )
    
    df = dp.principal_component_analysis( # Do PCA
        df = df,
        response_col_name = 'target'
    )
    # Do Low Variance Filter
    df = dp.filter_low_variance(df, response_col_name = 'target')
    df = dp.balance_data( # Do balancing 
        df = df, 
        response_col_name = 'target', 
        prop_synthetic_data = 0.4 # Final proportion of synthetic data
        ) 
    return df 


In [90]:
# not sure if we should consider the categorical explanatory variables?
def cat_df():    
    client_df = dp.import_client()
    invoice_df = dp.import_invoice()
    client_df = dp.convert_date(client_df) # Convert date cols
    invoice_df = dp.convert_date(invoice_df)
    client_df = dp.drop_duplicates(client_df) # Drop duplicates rows
    invoice_df = dp.drop_duplicates(invoice_df)
    categorical_column_names = ['region', 'dis', 'id', 'catg', 'target']
    client_df = dp.convert_to_categorical( # Convert categorical cols
        client_df, cols = categorical_column_names
        )
    invoice_df = dp.aggregate_invoice(invoice_df) # Aggregate invoices
    invoice_df = dp.manual_fix_names( # Fix column names manually
        invoice_df, 
        new_col_names = [
            'id', 
            'cons_level_1_sum', 'cons_level_1_mean', 
            'cons_level_1_max', 'cons_level_1_std',
            'cons_level_2_sum', 'cons_level_2_mean', 
            'cons_level_2_max', 'cons_level_2_std',
            'cons_level_3_sum', 'cons_level_3_mean', 
            'cons_level_3_max', 'cons_level_3_std',
            'cons_level_4_sum', 'cons_level_4_mean', 
            'cons_level_4_max', 'cons_level_4_std',
            'date_sum', 'date_mean', 'date_max', 'date_std',
            'num_invoices'
            ]
        )
    cate_df = dp.merge(client_df = client_df, invoice_df = invoice_df) # Merge
    return cate_df

In [None]:
pcdata = main()
pcdata.head()

#Convert categorical target to numeric 
pcdata['target'] = pcdata['target'].astype(int)  



In [None]:
pcdata["target"].value_counts()

In [5]:
# Select all columns except for the last one 
X = pcdata.iloc[:,:-1]
y = pcdata.iloc[:,-1]

# Split data into train,validation and test set via a 80-10-10 split
# First split into training + validation (90% of data) and test set (10% of data)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Second split into training (80% of total) and validation (10% of total)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=1/9, random_state=42)



In [17]:
def objective(trial):
    # Define hyperparameters to search
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators',0, 100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.15),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'max_depth': trial.suggest_int('max_depth', 2, 128),
        'min_split_gain': trial.suggest_loguniform("min_split_gain",0.001,0.1),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10)
    }

    # Train the model
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
    params, 
    train_data, 
    valid_sets=[valid_data], 
    num_boost_round=100, 
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
      # Use a callback for early stopping
)

    # Predict and calculate the accuracy on the test set
    y_val_pred_proba = model.predict(X_val)
    roc_auc = roc_auc_score(y_val, y_val_pred_proba)

    return roc_auc


In [20]:
# Create a study object and optimize
  # We want to maximize accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the best parameters found
print("Best parameters found: ", study.best_params)

[I 2024-10-24 21:36:45,571] A new study created in memory with name: no-name-38770857-e07a-49f6-ad23-81a32f3b7a6d


Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:45,989] Trial 0 finished with value: 0.9151415689187555 and parameters: {'n_estimators': 55, 'learning_rate': 0.07107985528910561, 'num_leaves': 169, 'max_depth': 78, 'min_split_gain': 0.03632609723510134, 'feature_fraction': 0.916462725369, 'bagging_freq': 10}. Best is trial 0 with value: 0.9151415689187555.


Did not meet early stopping. Best iteration is:
[55]	valid_0's binary_logloss: 0.376783
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:46,372] Trial 1 finished with value: 0.8395840171368356 and parameters: {'n_estimators': 60, 'learning_rate': 0.06941823904386503, 'num_leaves': 171, 'max_depth': 30, 'min_split_gain': 0.004489759352571418, 'feature_fraction': 0.13386085340315546, 'bagging_freq': 2}. Best is trial 0 with value: 0.9151415689187555.


Did not meet early stopping. Best iteration is:
[60]	valid_0's binary_logloss: 0.517741
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:47,077] Trial 2 finished with value: 0.9392471539378445 and parameters: {'n_estimators': 74, 'learning_rate': 0.0747263751703812, 'num_leaves': 248, 'max_depth': 119, 'min_split_gain': 0.01104933255005825, 'feature_fraction': 0.4421182025740694, 'bagging_freq': 0}. Best is trial 2 with value: 0.9392471539378445.


Did not meet early stopping. Best iteration is:
[74]	valid_0's binary_logloss: 0.331737
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:47,354] Trial 3 finished with value: 0.8702679358059668 and parameters: {'n_estimators': 34, 'learning_rate': 0.010333040840170404, 'num_leaves': 233, 'max_depth': 88, 'min_split_gain': 0.03733119868571864, 'feature_fraction': 0.22235367603898848, 'bagging_freq': 10}. Best is trial 2 with value: 0.9392471539378445.
[I 2024-10-24 21:36:47,525] Trial 4 finished with value: 0.7896539810320107 and parameters: {'n_estimators': 87, 'learning_rate': 0.001621419844547359, 'num_leaves': 35, 'max_depth': 47, 'min_split_gain': 0.0012641256399731405, 'feature_fraction': 0.5655969417232354, 'bagging_freq': 3}. Best is trial 2 with value: 0.9392471539378445.


Did not meet early stopping. Best iteration is:
[34]	valid_0's binary_logloss: 0.62575
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[87]	valid_0's binary_logloss: 0.667324
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:47,923] Trial 5 finished with value: 0.9228773299704598 and parameters: {'n_estimators': 40, 'learning_rate': 0.10553298795214522, 'num_leaves': 386, 'max_depth': 17, 'min_split_gain': 0.05225878546281163, 'feature_fraction': 0.9858270688952944, 'bagging_freq': 2}. Best is trial 2 with value: 0.9392471539378445.


Did not meet early stopping. Best iteration is:
[40]	valid_0's binary_logloss: 0.362212
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:48,870] Trial 6 finished with value: 0.9090918513655917 and parameters: {'n_estimators': 100, 'learning_rate': 0.013504218823901484, 'num_leaves': 261, 'max_depth': 59, 'min_split_gain': 0.029183024172381895, 'feature_fraction': 0.31647936240536656, 'bagging_freq': 1}. Best is trial 2 with value: 0.9392471539378445.


Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.49422
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:49,693] Trial 7 finished with value: 0.911721111821307 and parameters: {'n_estimators': 86, 'learning_rate': 0.028989556020230327, 'num_leaves': 231, 'max_depth': 119, 'min_split_gain': 0.04058124468012041, 'feature_fraction': 0.9181968578093205, 'bagging_freq': 8}. Best is trial 2 with value: 0.9392471539378445.
[I 2024-10-24 21:36:49,799] Trial 8 finished with value: 0.769062138303937 and parameters: {'n_estimators': 49, 'learning_rate': 0.0016061969174129187, 'num_leaves': 25, 'max_depth': 95, 'min_split_gain': 0.010571851149399571, 'feature_fraction': 0.8958792805893417, 'bagging_freq': 9}. Best is trial 2 with value: 0.9392471539378445.


Did not meet early stopping. Best iteration is:
[86]	valid_0's binary_logloss: 0.399069
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[49]	valid_0's binary_logloss: 0.677965
Training until validation scores don't improve for 10 rounds


[I 2024-10-24 21:36:50,193] Trial 9 finished with value: 0.8779207766856116 and parameters: {'n_estimators': 36, 'learning_rate': 0.001965147684503104, 'num_leaves': 446, 'max_depth': 65, 'min_split_gain': 0.025512769220791334, 'feature_fraction': 0.5586634955618639, 'bagging_freq': 4}. Best is trial 2 with value: 0.9392471539378445.


Did not meet early stopping. Best iteration is:
[36]	valid_0's binary_logloss: 0.67073
Best parameters found:  {'n_estimators': 74, 'learning_rate': 0.0747263751703812, 'num_leaves': 248, 'max_depth': 119, 'min_split_gain': 0.01104933255005825, 'feature_fraction': 0.4421182025740694, 'bagging_freq': 0}


In [23]:
# use best parameters to train the model
best_params = study.best_params
best_params['objective'] = 'binary'
best_params['metric'] = 'binary_logloss'  


train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

final_model = lgb.train(
    best_params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
)

# Predict and calculate the accuracy on the test set
y_val_pred_proba = final_model.predict(X_val)
roc_auc = roc_auc_score(y_val, y_val_pred_proba)
roc_auc.round(2)



np.float64(0.94)