In [1]:
import lightgbm as lgb
import optuna
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, RobustScaler
from datetime import datetime
from optuna.samplers import TPESampler
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
def clean_column_names(train_df):
    # Replace all non-alphanumeric characters (except underscores) with underscores
    train_df.columns = [re.sub(r'\W+', '_', col) for col in train_df.columns]
    return train_df

def clean_column_names(test_df):
    # Replace all non-alphanumeric characters (except underscores) with underscores
    test_df.columns = [re.sub(r'\W+', '_', col) for col in test_df.columns]
    return test_df

In [3]:
# Load training and test datasets
train_df = pd.read_csv("C:/Users/SHARON/Downloads/Train (1).csv")
test_df = pd.read_csv("C:/Users/SHARON/Downloads/Test (1).csv")
train_df.head()

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,7,Repeat Loan,120.85,0.014305,121.0,0
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,7,Repeat Loan,7768.5,0.3,7794.0,0
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,7,Repeat Loan,1380.0,0.2,1428.0,0
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,7,Repeat Loan,2687.4,0.3,2770.0,0
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,7,Repeat Loan,1369.2,0.3,1418.0,0


In [4]:
# Combine train and test data for encoding
combined_df = pd.concat([train_df, test_df], axis=0)

categorical_cols = ['country_id', 'loan_type', 'New_versus_Repeat']
target_col = 'target'

# 1. Convert categorical columns to numerical values using LabelEncoder
label_encoder = LabelEncoder()

# Ensure that all categorical columns are properly encoded in the combined DataFrame
for col in categorical_cols:
    combined_df[col] = label_encoder.fit_transform(combined_df[col])

# Split the combined DataFrame back into train and test sets
train_df[categorical_cols] = combined_df[categorical_cols].iloc[:len(train_df)]
test_df[categorical_cols] = combined_df[categorical_cols].iloc[len(train_df):]

# 2. Define features (X) and target (y) for training
X = train_df[categorical_cols]
y = train_df[target_col]

# Check if all categorical columns are now numeric
print(X.dtypes)

country_id           int32
loan_type            int32
New_versus_Repeat    int32
dtype: object


In [5]:
# Handle date columns
date_cols = ['disbursement_date', 'due_date']
for col in date_cols:
    train_df[col] = pd.to_datetime(train_df[col])
    test_df[col] = pd.to_datetime(test_df[col])
    
    # Create new features from date columns
    train_df[f'{col}_year'] = train_df[col].dt.year
    train_df[f'{col}_month'] = train_df[col].dt.month
    train_df[f'{col}_day'] = train_df[col].dt.day
    
    test_df[f'{col}_year'] = test_df[col].dt.year
    test_df[f'{col}_month'] = test_df[col].dt.month
    test_df[f'{col}_day'] = test_df[col].dt.day
    

# Drop original date columns
train_df = train_df.drop(columns=date_cols)
test_df = test_df.drop(columns=date_cols)

In [6]:
# Example of incorporating unique features
# Assuming economic_indicators is a DataFrame containing additional data
economic_indicators = pd.read_csv("C:/Users/SHARON/Downloads/economic_indicators.csv")

economic_indicators_long = pd.melt(economic_indicators, id_vars=['Country', 'Indicator'],
                                   var_name='Year', value_name='Value')
economic_indicators_long['Year'] = economic_indicators_long['Year'].str.extract('(\d{4})').astype(int)
economic_indicators_pivot = economic_indicators_long.pivot_table(index=['Country', 'Year'],
                                                                 columns='Indicator',
                                                                 values='Value',
                                                                 aggfunc='first').reset_index()
economic_indicators_pivot = economic_indicators_pivot.rename(columns={'Country': 'country_id'})


In [7]:
train_df['country_id'] = train_df['country_id'].astype(str)
test_df['country_id'] = test_df['country_id'].astype(str) 
economic_indicators_pivot['country_id'] = economic_indicators_pivot['country_id'].astype(str)
print(train_df['country_id'].dtype)  # Should output: 'object' or 'str'
print(test_df['country_id'].dtype)   # Should output: 'object' or 'str'
print(economic_indicators_pivot['country_id'].dtype) 

object
object
object


In [8]:
# Extract year from disbursement_date and merge economic indicators with train and test datasets
train_eco_merge = pd.merge(train_df, economic_indicators_pivot,
                    left_on=['country_id', 'disbursement_date_year'], 
                    right_on=['country_id', 'Year'], how='left').drop(columns=['Year'])
test_eco_merge = pd.merge(train_df, economic_indicators_pivot,
                    left_on=['country_id', 'disbursement_date_year'], 
                    right_on=['country_id', 'Year'], how='left').drop(columns=['Year'])

# Ensure no duplicated columns in the merged dataframes before concatenation
train_eco_merge = train_eco_merge.loc[:, ~train_eco_merge.columns.duplicated()]
test_eco_merge = test_eco_merge.loc[:, ~test_eco_merge.columns.duplicated()]

# Remove duplicates from train_df and test_df before concatenation
train_df = train_df.loc[:, ~train_df.columns.duplicated()]
test_df = test_df.loc[:, ~test_df.columns.duplicated()]

# Concatenate the merged economic indicators with the original train and test dataframes
train_df = pd.concat([train_df, train_eco_merge], axis=1) 
test_df = pd.concat([test_df, test_eco_merge], axis=1)

# Remove any potential duplicates in the concatenated dataframes
train_df = train_df.loc[:, ~train_df.columns.duplicated()]
test_df = test_df.loc[:, ~test_df.columns.duplicated()]
# Fill NaN values with specific strategies
train_df = train_df.fillna(method='ffill').fillna(method='bfill').fillna(0) 
test_df = test_df.fillna(method='ffill').fillna(method='bfill').fillna(0)

In [9]:
train_df = clean_column_names(train_df)
test_df = clean_column_names(test_df)

In [10]:
# Convert country_id properly
train_df['country_id'] = train_df['country_id'].apply(lambda x: x[0] if isinstance(x, tuple) else x).fillna(-1).astype(int)
test_df['country_id'] = test_df['country_id'].apply(lambda x: x[0] if isinstance(x, tuple) else x).fillna(-1).astype(int)

# Print the dtype of the 'country_id' column specifically to avoid confusion
print("train_df 'country_id' dtype after conversion:", train_df['country_id'].dtypes)  # Should be 'int32'
print("test_df 'country_id' dtype after conversion:", test_df['country_id'].dtypes)


train_df 'country_id' dtype after conversion: int32
test_df 'country_id' dtype after conversion: int32


In [11]:
# Separate features and target
X = train_df.drop(columns=['ID', 'target'])
y = train_df['target']
X_test = test_df.drop(columns=['ID'])

In [12]:
# Align the columns of the test set with the training set
X_test = X_test[X.columns]

In [13]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns 

In [14]:
scaler = RobustScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols]) 
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [15]:
train_df = train_df.drop(columns=['ID'])

In [16]:
print(y.value_counts())

target
0    67396
1     1258
Name: count, dtype: int64


In [17]:
# Function to calculate credit score
def calculate_credit_score(probability_of_default):
    # Ensure the probability is within the valid range [0, 1]
    probability_of_default = max(0, min(1, probability_of_default))
    
    # Define the risk categories and corresponding score ranges
    if 0 <= probability_of_default <= 0.2:
        # Low risk (Excellent)
        credit_score = 750 + (probability_of_default * 100)  # Score between 750 and 850
    elif 0.2 < probability_of_default <= 0.4:
        # Moderate Low risk (Good)
        credit_score = 650 + ((probability_of_default - 0.2) * 99)  # Score between 650 and 749
    elif 0.4 < probability_of_default <= 0.6:
        # Moderate risk (Average)
        credit_score = 550 + ((probability_of_default - 0.4) * 99)  # Score between 550 and 649
    elif 0.6 < probability_of_default <= 0.8:
        # Moderate High risk (Risky)
        credit_score = 450 + ((probability_of_default - 0.6) * 99)  # Score between 450 and 549
    else:
        # High risk (Very Risky)
        credit_score = 300 + ((probability_of_default - 0.8) * 149)  # Score between 300 and 449

    return round(credit_score)

In [18]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights for imbalanced classes
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

# Define the LightGBM model
base_lgb_model = lgb.LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=13,
    random_state=42,
    n_jobs=-1,
    num_leaves=81,
    min_child_samples=30,
    class_weight=class_weight_dict
)

# Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
accuracy_scores = []

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Fit the LightGBM model on the current fold
    base_lgb_model.fit(X_train_fold, y_train_fold)
    
    # Get predicted probabilities for the positive class (class 1)
    y_prob = base_lgb_model.predict_proba(X_val_fold)[:, 1]
    
    # Convert probabilities to credit scores (ensure calculate_credit_score is defined)
    credit_scores = [calculate_credit_score(prob) for prob in y_prob]
    print(f"Credit Scores for Fold {fold}:")
    print(credit_scores)
    
    # Evaluate performance using F1 Score and Accuracy
    y_pred = base_lgb_model.predict(X_val_fold)
    f1_score_val = f1_score(y_val_fold, y_pred, average='weighted')
    accuracy_score_val = accuracy_score(y_val_fold, y_pred)
    f1_scores.append(f1_score_val)
    accuracy_scores.append(accuracy_score_val)
    
    # Print classification report for this fold
    print(f"Classification Report for Fold {fold}:")
    print(classification_report(y_val_fold, y_pred))
    print(f"F1 Score for Fold {fold}: {f1_score_val:.4f}")
    print(f"Accuracy for Fold {fold}: {accuracy_score_val:.4f}\n")

# Print average F1 score and accuracy across all folds
print("Average F1 Score across all folds:", np.mean(f1_scores))
print("Average Accuracy across all folds:", np.mean(accuracy_scores))

# Train the final model on the entire dataset (X, y)
base_lgb_model.fit(X, y)

# Make predictions on the test set
y_prob_test = base_lgb_model.predict_proba(X_test)[:, 1]
credit_scores_test = [calculate_credit_score(prob) for prob in y_prob_test]

# Print the credit scores for the test set
print(f"Credit Scores for Test Set:")
print(credit_scores_test)


[LightGBM] [Info] Number of positive: 1007, number of negative: 53916
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1957
[LightGBM] [Info] Number of data points in the train set: 54923, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500153 -> initscore=0.000611
[LightGBM] [Info] Start training from score 0.000611
Credit Scores for Fold 1:
[750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 330, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 330, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750, 750