In [38]:
!pip install opendatasets scikit-learn optuna --quiet

In [44]:
import opendatasets as od
import os

import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier

import optuna

In [68]:
resource_url= 'https://www.kaggle.com/competitions/GiveMeSomeCredit/data'
od.download(resource_url)
os.chdir('GiveMeSomeCredit')

train_set= pd.read_csv('cs-training.csv')
test_set= pd.read_csv('cs-test.csv')
sample_entry= pd.read_csv('sampleEntry.csv')

input_features= train_set.iloc[:, 2:].columns.to_list()
target_label= train_set.iloc[:, 1].to_frame().columns.to_list()

input_features_dataframe= train_set[input_features]
median_imputer= SimpleImputer(strategy= 'median')
imputed_data= median_imputer.fit_transform(input_features_dataframe)
imputed_features_dataframe= pd.DataFrame(data= imputed_data, columns= input_features_dataframe.columns)

x= imputed_features_dataframe[input_features]
y= train_set[target_label]

#x_train, x_validation, y_train, y_validation= train_test_split(x, y.values.ravel(), test_size= 0.25, random_state= 42)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: redeyedraven
Your Kaggle Key: ··········
Downloading GiveMeSomeCredit.zip to ./GiveMeSomeCredit


100%|██████████| 5.16M/5.16M [00:00<00:00, 5.45MB/s]



Extracting archive ./GiveMeSomeCredit/GiveMeSomeCredit.zip to ./GiveMeSomeCredit


In [55]:
#  Testing Cross Validaiton

def objective(trial):
  parameters= {'objective'        :'binary',
              'metric'            :'binary_logloss',
              'boosting_type'     :trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
              'num_leaves'        :trial.suggest_int('num_leaves', 2, 256),
              'learning_rate'     :trial.suggest_float('learning_rate', 0.001, 0.5),
              'feature_fraction'  :trial.suggest_float('feature_fraction', 0.1, 1.0),
              'bagging_fraction'  :trial.suggest_float('bagging_fraction', 0.1, 1.0),
              'bagging_freq'      :trial.suggest_int('bagging_freq', 1, 10),
              'reg_alpha'         :trial.suggest_float('reg_alpha', 1e-8, 10.0, log= True),
              'reg_lambda'        :trial.suggest_float('reg_lambda', 1e-8, 10.0, log= True),
              'min_child_samples' :trial.suggest_int('min_child_samples', 5, 100)}

  LGBM= LGBMClassifier(**parameters,
                       force_col_wise= True)

  scores= cross_val_score(LGBM,
                          x,
                          y,
                          n_jobs= -1,
                          cv= 5,
                          scoring= 'accuracy')
  accuracy= scores.mean()

  return accuracy

In [None]:
study= optuna.create_study(direction= 'maximize')
study.optimize(objective, n_trials= 100)

In [60]:
best_model= study.best_trial
print("Best trial:")
print("  Value (Accuracy): ", best_model.value)
print("  Params: ")
for key, value in best_model.params.items():
    print(f"    {key}: {value}")

Best trial:
  Value (Accuracy):  0.9376866666666667
  Params: 
    boosting_type: dart
    num_leaves: 34
    learning_rate: 0.12790357571059024
    feature_fraction: 0.491139423521851
    bagging_fraction: 0.5928349766122926
    bagging_freq: 4
    reg_alpha: 6.8372689571593e-05
    reg_lambda: 0.0017614372265715404
    min_child_samples: 97


# Final Prediction

In [61]:
test_set_input_features= test_set.iloc[:, 2:]

imputed_test_data= median_imputer.fit_transform(test_set_input_features)
imputed_test_features_dataframe= pd.DataFrame(data= imputed_test_data, columns= test_set_input_features.columns)

In [63]:
final_model= LGBMClassifier(boosting_type= 'dart',
                            num_leaves= 34,
                            learning_rate= 0.12790357571059024,
                            feature_fraction= 0.491139423521851,
                            bagging_fraction= 0.5928349766122926,
                            bagging_freq= 4,
                            reg_alpha= 6.8372689571593e-05,
                            reg_lambda= 0.0017614372265715404,
                            min_child_samples= 97,
                            force_col_wise= True)

In [65]:
final_model.fit(x, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 10026, number of negative: 139974
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 150000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066840 -> initscore=-2.636275
[LightGBM] [Info] Start training from score -2.636275


In [66]:
final_model_prediction= final_model.predict_proba(imputed_test_features_dataframe)



In [78]:
final_prediction_dataframe= sample_entry.copy()
final_prediction_dataframe.set_index('Id', inplace= True)
final_prediction_dataframe.Probability= final_model_prediction[:, 1]

In [80]:
final_prediction_dataframe.to_csv('final_prediction.csv')