In [1]:
# Import libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
#from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [11]:
# Read both competition files (train and test) from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

# Preview each frame: a label, its first rows, then its (rows, columns) shape
for label, frame in (("Train", train), ("Test", test)):
    display(label, frame.head(), frame.shape)

'Train'

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,7,Repeat Loan,120.85,0.014305,121.0,0
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,7,Repeat Loan,7768.5,0.3,7794.0,0
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,7,Repeat Loan,1380.0,0.2,1428.0,0
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,7,Repeat Loan,2687.4,0.3,2770.0,0
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,7,Repeat Loan,1369.2,0.3,1418.0,0


(68654, 16)

'Test'

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid
0,ID_269404226088267278,269404,Kenya,226088,267278,Type_1,1919.0,1989.0,2022-07-27,2022-08-03,7,Repeat Loan,575.7,0.3,597.0
1,ID_255356300042267278,255356,Kenya,300042,267278,Type_1,2138.0,2153.0,2022-11-16,2022-11-23,7,Repeat Loan,0.0,0.0,0.0
2,ID_257026243764267278,257026,Kenya,243764,267278,Type_1,8254.0,8304.0,2022-08-24,2022-08-31,7,Repeat Loan,207.0,0.025079,208.0
3,ID_264617299409267278,264617,Kenya,299409,267278,Type_1,3379.0,3379.0,2022-11-15,2022-11-22,7,Repeat Loan,1013.7,0.3,1014.0
4,ID_247613296713267278,247613,Kenya,296713,267278,Type_1,120.0,120.0,2022-11-10,2022-11-17,7,Repeat Loan,36.0,0.3,36.0


(18594, 15)

In [7]:
# Summarise the training frame: one row per column with its non-null count and
# dtype, so wrongly-typed columns (e.g. dates stored as object) and columns
# with missing values are easy to spot.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68654 entries, 0 to 68653
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country_id                   68654 non-null  object 
 1   lender_id                    68654 non-null  int64  
 2   loan_type                    68654 non-null  object 
 3   Total_Amount                 68654 non-null  float64
 4   Total_Amount_to_Repay        68654 non-null  float64
 5   disbursement_date            68654 non-null  object 
 6   due_date                     68654 non-null  object 
 7   duration                     68654 non-null  int64  
 8   New_versus_Repeat            68654 non-null  object 
 9   Amount_Funded_By_Lender      68654 non-null  float64
 10  Lender_portion_Funded        68654 non-null  float64
 11  Lender_portion_to_be_repaid  68654 non-null  float64
 12  target                       68654 non-null  int64  
dtypes: float64(5), i

In [8]:
# Total number of NaN cells in the training frame (per-column counts, then summed)
n_missing = train.isna().sum().sum()
print(f"There are {n_missing} missing values in the data.")

There are 0 missing values in the data.


In [25]:
# Stack train and test so every transformation below is applied identically to
# both. Test has no `target`, so that column is NaN for test rows, which also
# upcasts it to float for the whole frame (restored for train further down).
data = pd.concat([train, test]).reset_index(drop=True)

# Feature engineering: loan term
# Parse the date columns once; unparseable values become NaT instead of raising.
date_cols = ['disbursement_date', 'due_date']
for col in date_cols:
    data[col] = pd.to_datetime(data[col], errors='coerce')

# Calculate loan term in days
data['loan_term_days'] = (data['due_date'] - data['disbursement_date']).dt.days

# Extract month, day, and year from the (already parsed) date columns
for col in date_cols:
    data[col+'_month'] = data[col].dt.month
    data[col+'_day'] = data[col].dt.day
    data[col+'_year'] = data[col].dt.year

# Select all categorical columns from the dataset and label encode them or one hot encode
cat_cols = data.select_dtypes(include='object').columns
num_cols = [col for col in data.select_dtypes(include='number').columns if col not in ['target']]
print(f"The categorical columns are: {cat_cols}.")
print("-"* 100)
print(f"The numerical columns are: {num_cols}")
print("-"* 100)

# Target encoding: replace each loan type by its mean target in train.
# A loan type that never occurs in train has no mean and would map to NaN, so
# fall back to the overall train target mean for such rows.
# NOTE(review): the means are computed on the full training set, so any later
# cross-validation on train_df sees validation labels through this feature.
loan_type_means = train.groupby('loan_type')['target'].mean()
global_target_mean = train['target'].mean()
data['loan_type'] = data['loan_type'].map(loan_type_means).fillna(global_target_mean)

# Label-encoding for the other remaining categorical columns
# (`ID` is kept as-is; `loan_type` was target-encoded above)
le = LabelEncoder()
label_encoded_cols = [col for col in cat_cols if col not in ('loan_type', 'ID')]
for col in label_encoded_cols:
    data[col] = le.fit_transform(data[col])

# deal with numerical columns: we saw loan amount is highly right skewed, so log transform it
data['Total_Amount'] = np.log1p(data['Total_Amount']) # study other numerical columns and see if they are skewed as well

# Drop the original datetime columns after feature engineering
data = data.drop(columns=date_cols)

# Splitting the data back into train and test.
# The concat above put the train rows first and reset the index, so a positional
# split recovers each part exactly, even if an ID were to appear in both files.
# .copy() makes the parts independent frames rather than slices of `data`.
n_train_rows = len(train)
train_df = data.iloc[:n_train_rows].copy()
test_df = data.iloc[n_train_rows:].copy()
assert len(train_df) == len(train) and len(test_df) == len(test), "train/test split lost or duplicated rows"

# Restore the label dtype that the concat with the unlabelled test rows upcast to float
train_df['target'] = train_df['target'].astype(train['target'].dtype)

# we are also going to drop the country id as we saw we have only one country in train
# NOTE(review): customer_id and tbl_loan_id are identifiers but are kept as
# features here - confirm that this is intended.
features_for_modelling = [col for col in train_df.columns if col not in date_cols + ['ID', 'target', 'country_id']]

# Check if the new datasets have the same rows as train and test datasets
print(f"The shape of train_df is: {train_df.shape}")
print(f"The shape of test_df is: {test_df.shape}")
print(f"The shape of train is: {train.shape}")
print(f"The shape of test is: {test.shape}")
print(f"The features for modelling are:\n{features_for_modelling}")


The categorical columns are: Index(['ID', 'country_id', 'loan_type', 'New_versus_Repeat'], dtype='object').
----------------------------------------------------------------------------------------------------
The numerical columns are: ['customer_id', 'tbl_loan_id', 'lender_id', 'Total_Amount', 'Total_Amount_to_Repay', 'duration', 'Amount_Funded_By_Lender', 'Lender_portion_Funded', 'Lender_portion_to_be_repaid', 'loan_term_days', 'disbursement_date_month', 'disbursement_date_day', 'disbursement_date_year', 'due_date_month', 'due_date_day', 'due_date_year']
----------------------------------------------------------------------------------------------------
The shape of train_df is: (68654, 21)
The shape of test_df is: (18594, 21)
The shape of train is: (68654, 16)
The shape of test is: (18594, 15)
The features for modelling are:
['customer_id', 'tbl_loan_id', 'lender_id', 'loan_type', 'Total_Amount', 'Total_Amount_to_Repay', 'duration', 'New_versus_Repeat', 'Amount_Funded_By_Lender', 'L

In [23]:
# Confirm the engineered training frame: every column except `ID` should now be
# numeric and there should be no nulls before modelling.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68654 entries, 0 to 68653
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           68654 non-null  object 
 1   customer_id                  68654 non-null  int64  
 2   country_id                   68654 non-null  int32  
 3   tbl_loan_id                  68654 non-null  int64  
 4   lender_id                    68654 non-null  int64  
 5   loan_type                    68654 non-null  float64
 6   Total_Amount                 68654 non-null  float64
 7   Total_Amount_to_Repay        68654 non-null  float64
 8   duration                     68654 non-null  int64  
 9   New_versus_Repeat            68654 non-null  int32  
 10  Amount_Funded_By_Lender      68654 non-null  float64
 11  Lender_portion_Funded        68654 non-null  float64
 12  Lender_portion_to_be_repaid  68654 non-null  float64
 13  target               

### CROSS VALIDATION
* Because the target is heavily imbalanced, we evaluate with 5-fold `StratifiedKFold` cross-validation instead of a single train/test split, so that every fold keeps the same class ratio as the full training set.

In [26]:
# Stratified 5-fold cross-validation of an XGBoost classifier.
# StratifiedKFold, xgb, roc_auc_score and classification_report come from the
# notebook's top import cell.
# NOTE(review): `loan_type` was target-encoded using the whole training set
# before this split, so the fold scores below may be optimistic.

# Features and target (labels as integers so the reports show 0/1, not 0.0/1.0)
X = train_df[features_for_modelling]
y = train_df['target'].astype(int)

# Splitting data using StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# For storing evaluation results
auc_scores = []

for fold, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
    print(f"Training on Fold {fold}...")

    # Splitting data into train and validation for this fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Weight the positive class by the negative/positive ratio of this fold's
    # training part to counter the class imbalance
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()

    # Initialize XGBoost model (the deprecated `use_label_encoder` argument is
    # no longer passed)
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=n_negative / n_positive,
    )

    # Fit the model
    model.fit(X_train, y_train)

    # Evaluate on validation data: hard labels for the report, scores for AUC
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    # Calculate AUC
    auc = roc_auc_score(y_val, y_val_prob)
    auc_scores.append(auc)

    # Display metrics
    print(f"Fold {fold} AUC: {auc:.4f}")
    print(classification_report(y_val, y_val_pred))

# Average AUC across folds
print(f"\nAverage AUC across folds: {np.mean(auc_scores):.4f}")


Training on Fold 1...
Fold 1 AUC: 0.9795
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     13480
         1.0       0.63      0.83      0.72       251

    accuracy                           0.99     13731
   macro avg       0.81      0.91      0.85     13731
weighted avg       0.99      0.99      0.99     13731

Training on Fold 2...
Fold 2 AUC: 0.9897
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     13479
         1.0       0.65      0.88      0.75       252

    accuracy                           0.99     13731
   macro avg       0.82      0.94      0.87     13731
weighted avg       0.99      0.99      0.99     13731

Training on Fold 3...
Fold 3 AUC: 0.9778
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     13479
         1.0       0.63      0.79      0.70       252

    accuracy                           0.99     13731
   ma

In [27]:
# Refit on ALL labelled rows before predicting. After the cross-validation loop
# `model` only holds the last fold's fit, i.e. it was trained on 4/5 of the
# training data. clone() copies its hyper-parameters without the fitted state;
# the class weight is recomputed for the full training set.
final_model = clone(model).set_params(
    scale_pos_weight=(y == 0).sum() / (y == 1).sum()
)
final_model.fit(X, y)

# Make predictions on the test dataset
test_predictions = final_model.predict(test_df[features_for_modelling]).astype(int)

# Build the submission frame.
# assign() returns a new frame instead of writing into `test_df` in place, so
# the cell can be re-run safely and raises no SettingWithCopy warning.
test_df = test_df.assign(target=test_predictions)
sub = test_df[['ID', 'target']]
sub.head()


Unnamed: 0,ID,target
68654,ID_269404226088267278,0
68655,ID_255356300042267278,0
68656,ID_257026243764267278,0
68657,ID_264617299409267278,0
68658,ID_247613296713267278,0


In [28]:
# Write the submission (ID, target) to disk; the frame's index is not written
sub.to_csv(path_or_buf='baseline_submission.csv', index=False)