1. Extract all data from CSV files.
2. Transform data into inputs/targets.
3. Split inputs/targets into train/test sets.
4. Sample inputs/targets (the tail of the training split) for the small test/prototyping data.
5. Load the inputs/targets into parquet files.

# Imports 

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder


# Config 

In [65]:
# Relative path of the raw credit-risk CSV that is read into `train_data`.
DATA_PATH = '../loan_data/credit_risk_dataset.csv'

In [66]:
# Settings for the train/test split and for the small tail sample.
SHUFFLE = False  # time-sensitive: passed to train_test_split so row order is kept
TEST_SIZE = 0.2  # passed to train_test_split as the test fraction
SAMPLE_SIZE = 2000  # number of trailing training rows kept as the sample

In [67]:
# Output locations for the parquet files.
# NOTE: these six names are assigned again in the "Save data" section before
# anything is written, so the values there are the ones actually used.
INPUTS_TRAIN_FILE = "../data/inputs_train.parquet"
INPUTS_TEST_FILE = "../data/inputs_test.parquet"
TARGETS_TRAIN_FILE = "../data/targets_train.parquet"
TARGETS_TEST_FILE = "../data/targets_test.parquet"
INPUTS_SAMPLE_FILE = "../tests/data/inputs_sample.parquet"
TARGETS_SAMPLE_FILE = "../tests/data/targets_sample.parquet"

In [68]:
# Name of the label column that is split off from the inputs.
TARGET_COL = "loan_status"

# Load the dataframe

In [69]:
# Read the raw CSV into a single DataFrame.
train_data = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [70]:
# Separate the features from the label. drop() is used instead of a `[::-1]`
# slice: slicing a DataFrame that way reverses the row order and keeps every
# column, including the target.
feature_df = train_data.drop(columns=TARGET_COL)
target_df = train_data[TARGET_COL]

# Isolate categorical data from numerical 

In [71]:
# Split the raw frame by dtype. `.copy()` gives each half its own data, so the
# encoded columns added to `categorical_columns` later are written to an
# independent frame rather than to something pandas may treat as a slice of
# `train_data` (which can raise chained-assignment warnings).
numeric_columns = train_data.select_dtypes(include=[np.number]).copy()
categorical_columns = train_data.select_dtypes(exclude=[np.number]).copy()

# Encode categorical data

Because the categorical columns differ in nature, a different encoding method is applied to each:

| Column |  Encoding Method|  Reason |
|----------|-----------|---------|
|person_home_ownership| Label Encoding | Nominal (non-ordinal) with a small number of categories.|
|loan_intent| Label Encoding | Nominal (non-ordinal) with a small number of categories.|
|loan_grade	|Ordinal Encoding|	Ordered categories.|
|cb_person_default_on_file|	Binary Encoding	| Binary variable.|

In [72]:
# Preview the first five rows of the non-numeric columns.
categorical_columns.head(n=5)

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,RENT,PERSONAL,D,Y
1,OWN,EDUCATION,B,N
2,MORTGAGE,MEDICAL,C,N
3,RENT,MEDICAL,C,N
4,RENT,MEDICAL,C,Y


In [73]:
# Binary encoding for 'cb_person_default_on_file': 'Y' becomes 1, 'N' becomes 0.
default_flag_codes = {'Y': 1, 'N': 0}
categorical_columns['cb_person_default_on_file_encoded'] = (
    categorical_columns['cb_person_default_on_file'].map(default_flag_codes)
)

In [74]:
# List the distinct loan grades in descending order so that the ordinal
# encoder assigns 'A' the highest code and 'G' the lowest.
grades_descending = categorical_columns['loan_grade'].sort_values(ascending=False)
ordinal_values = grades_descending.unique().tolist()
ordinal_values

['G', 'F', 'E', 'D', 'C', 'B', 'A']

In [75]:
# Ordinal encoding for 'loan_grade', using the explicit category order
# collected in `ordinal_values`.
ordinal_encoder = OrdinalEncoder(categories=[ordinal_values])
loan_grade_frame = categorical_columns[['loan_grade']]
categorical_columns['loan_grade_encoded'] = ordinal_encoder.fit_transform(loan_grade_frame)

In [76]:
# Fit one LabelEncoder per nominal column and keep each fitted encoder under
# its column name so the same mapping can be reused later.
encoders = {col: LabelEncoder() for col in ('person_home_ownership', 'loan_intent')}
for col, label_encoder in encoders.items():
    categorical_columns[f'{col}_encoded'] = label_encoder.fit_transform(categorical_columns[col])

In [77]:
# Show the categories each label encoder learned, in code order.
for encoded_column in ('person_home_ownership', 'loan_intent'):
    print(encoders.get(encoded_column).classes_)

['MORTGAGE' 'OTHER' 'OWN' 'RENT']
['DEBTCONSOLIDATION' 'EDUCATION' 'HOMEIMPROVEMENT' 'MEDICAL' 'PERSONAL'
 'VENTURE']


In [78]:
# Display the frame with the raw columns next to their encoded counterparts.
categorical_columns

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,cb_person_default_on_file_encoded,loan_grade_encoded,person_home_ownership_encoded,loan_intent_encoded
0,RENT,PERSONAL,D,Y,1,3.0,3,4
1,OWN,EDUCATION,B,N,0,5.0,2,1
2,MORTGAGE,MEDICAL,C,N,0,4.0,0,3
3,RENT,MEDICAL,C,N,0,4.0,3,3
4,RENT,MEDICAL,C,Y,1,4.0,3,3
...,...,...,...,...,...,...,...,...
32576,MORTGAGE,PERSONAL,C,N,0,4.0,0,4
32577,MORTGAGE,PERSONAL,A,N,0,6.0,0,4
32578,RENT,HOMEIMPROVEMENT,B,N,0,5.0,3,2
32579,MORTGAGE,PERSONAL,B,N,0,5.0,0,4


In [79]:
# Remove the raw text columns now that each one has an encoded counterpart.
encoded_sources = ['loan_intent', 'loan_grade', 'cb_person_default_on_file', 'person_home_ownership']
categorical_columns.drop(columns=encoded_sources, inplace=True)

In [80]:
# Display the frame again; only the encoded columns remain.
categorical_columns

Unnamed: 0,cb_person_default_on_file_encoded,loan_grade_encoded,person_home_ownership_encoded,loan_intent_encoded
0,1,3.0,3,4
1,0,5.0,2,1
2,0,4.0,0,3
3,0,4.0,3,3
4,1,4.0,3,3
...,...,...,...,...
32576,0,4.0,0,4
32577,0,6.0,0,4
32578,0,5.0,3,2
32579,0,5.0,0,4


In [81]:
# Put the numeric columns and the encoded categorical columns side by side.
loan = pd.concat((numeric_columns, categorical_columns), axis="columns")

In [82]:
# Display the combined, fully numeric frame.
loan

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,cb_person_default_on_file_encoded,loan_grade_encoded,person_home_ownership_encoded,loan_intent_encoded
0,22,59000,123.0,35000,16.02,1,0.59,3,1,3.0,3,4
1,21,9600,5.0,1000,11.14,0,0.10,2,0,5.0,2,1
2,25,9600,1.0,5500,12.87,1,0.57,3,0,4.0,0,3
3,23,65500,4.0,35000,15.23,1,0.53,2,0,4.0,3,3
4,24,54400,8.0,35000,14.27,1,0.55,4,1,4.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,0,4.0,0,4
32577,54,120000,4.0,17625,7.49,0,0.15,19,0,6.0,0,4
32578,65,76000,3.0,35000,10.99,1,0.46,28,0,5.0,3,2
32579,56,150000,5.0,15000,11.48,0,0.10,26,0,5.0,0,4


# Split the data

In [83]:
# Everything except the target column becomes the model inputs.
inputs = loan.drop(columns=TARGET_COL)
print("Inputs:", inputs.shape)
inputs.head()

Inputs: (32581, 11)


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,cb_person_default_on_file_encoded,loan_grade_encoded,person_home_ownership_encoded,loan_intent_encoded
0,22,59000,123.0,35000,16.02,0.59,3,1,3.0,3,4
1,21,9600,5.0,1000,11.14,0.1,2,0,5.0,2,1
2,25,9600,1.0,5500,12.87,0.57,3,0,4.0,0,3
3,23,65500,4.0,35000,15.23,0.53,2,0,4.0,3,3
4,24,54400,8.0,35000,14.27,0.55,4,1,4.0,3,3


In [84]:
# Keep the target as a one-column DataFrame rather than a Series.
targets = loan[[TARGET_COL]]
print("Targets:", targets.shape)
targets.head()

Targets: (32581, 1)


Unnamed: 0,loan_status
0,1
1,0
2,1
3,1
4,1


In [85]:
# Split inputs and targets together; TEST_SIZE and SHUFFLE come from the
# config section.
split_frames = train_test_split(inputs, targets, test_size=TEST_SIZE, shuffle=SHUFFLE)
inputs_train, inputs_test, targets_train, targets_test = split_frames
tuple(frame.shape for frame in split_frames)

((26064, 11), (6517, 11), (26064, 1), (6517, 1))

In [45]:
# Take the last SAMPLE_SIZE rows of the training inputs and targets as a
# small matching sample.
inputs_train_sample, targets_train_sample = (
    frame.tail(SAMPLE_SIZE) for frame in (inputs_train, targets_train)
)
inputs_train_sample.shape, targets_train_sample.shape

((2000, 11), (2000, 1))

# Save data

In [52]:
# File paths (replace these with your actual paths).
# These assignments replace the values given to the same names in the config
# section; the save loop below writes to the paths defined here.
INPUTS_TRAIN_FILE = "../data/training/inputs_train.parquet"
INPUTS_TEST_FILE = "../data/training/inputs_test.parquet"
TARGETS_TRAIN_FILE = "../data/training/targets_train.parquet"
TARGETS_TEST_FILE = "../data/training/targets_test.parquet"
INPUTS_SAMPLE_FILE = "../data/prototyping/inputs_sample.parquet"
TARGETS_SAMPLE_FILE = "../data/prototyping/targets_sample.parquet"

In [53]:
# Function to ensure the directory exists
def ensure_directory(file_path):
    """Create the parent directory of *file_path* if it is missing.

    Args:
        file_path: Path (string or path-like) of a file about to be written.

    Raises:
        FileExistsError: If the parent path exists but is not a directory.
    """
    # mkdir(exist_ok=True) already tolerates an existing directory, so no
    # separate exists() check is needed (and there is no check-then-create race).
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)

In [54]:
# Pair every output path with the frame stored there, create any missing
# parent directory, then write the frame as parquet.
outputs = [
    (INPUTS_TRAIN_FILE, inputs_train),
    (INPUTS_TEST_FILE, inputs_test),
    (TARGETS_TRAIN_FILE, targets_train),
    (TARGETS_TEST_FILE, targets_test),
    (INPUTS_SAMPLE_FILE, inputs_train_sample),
    (TARGETS_SAMPLE_FILE, targets_train_sample),
]
file_paths = [path for path, _ in outputs]

for file_path, df in outputs:
    ensure_directory(file_path)
    df.to_parquet(file_path)

# Save the encoders 

In [58]:
# Save the encoder for future use
import pickle

In [56]:
# Pickle destinations: one file for the fitted OrdinalEncoder and one for the
# dict of fitted LabelEncoders.
ORDINAL_ENCODER_PATH = "../encoders/ordinal_encoder.pkl"
ENCODERS_PATH = "../encoders/label_encoders.pkl"

In [59]:
# Persist the fitted ordinal encoder, creating the target directory first.
ensure_directory(ORDINAL_ENCODER_PATH)
with open(ORDINAL_ENCODER_PATH, mode='wb') as encoder_file:
    pickle.dump(ordinal_encoder, encoder_file)

In [60]:
# Persist the dict of fitted label encoders. The target directory is created
# here as well so this step does not rely on another cell having made it.
ensure_directory(ENCODERS_PATH)
with open(ENCODERS_PATH, 'wb') as f:
    pickle.dump(encoders, f)

In [61]:
# Load the pickled dict of label encoders back; it is keyed by source column name.
# NOTE(review): pickle.load can execute code stored in the file, so only load
# pickles this project wrote itself.
with open(ENCODERS_PATH, 'rb') as f:
    encoder = pickle.load(f)
# Example (the loaded object is a dict, so select the column's encoder first):
# new_data['column_encoded'] = encoder['column'].transform(new_data['column'])

In [63]:
# Inspect the classes of one reloaded encoder to confirm the dict was saved
# and restored correctly.
encoder.get('person_home_ownership').classes_

array(['MORTGAGE', 'OTHER', 'OWN', 'RENT'], dtype=object)