##Pre-requisites

In [1]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import sys
sys.path.insert(0, '/content/src/')

####Do NOT execute below

In [3]:
import os
from dotenv import load_dotenv


# Load the variables from .env file
load_dotenv()

# Define the global variables to load AWS credentials and dataset file names
AWS_S3_BUCKET = os.environ.get("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
AWS_S3_DATA_DIRECTORY = os.environ.get("AWS_S3_DATA_DIRECTORY")
AWS_S3_CLEAN_DATA_DIRECTORY = os.environ.get("AWS_S3_CLEAN_DATA_DIRECTORY")
AWS_S3_DATA_DIRECTORY_RAW = os.environ.get("AWS_S3_DATA_DIRECTORY_RAW")
APPLICATION_DATASET = os.environ.get("APPLICATION_DATASET")
AWS_S3_DATA_DIRECTORY_PROCESSED = os.environ.get("AWS_S3_DATA_DIRECTORY_PROCESSED")
AWS_S3_DATA_DIRECTORY_MODELS = os.environ.get("AWS_S3_DATA_DIRECTORY_MODELS")
DEMO_DATASET = os.environ.get("DEMO_DATASET")

###Data Preprocessing

In [4]:
import pandas as pd
from utils.load_EnvVars import (
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_S3_BUCKET,
    AWS_S3_DATA_DIRECTORY,
    AWS_S3_DATA_DIRECTORY_RAW,
    AWS_S3_DATA_DIRECTORY_PROCESSED,
    APPLICATION_DATASET,
)
from utils.s3_Functions import S3Utils
from utils.data_Functions import (
    null_value_column_list,
    preprocess_data,
)


# Create an instance of S3Utils class to access various methods
s3_utils = S3Utils(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_BUCKET, AWS_S3_DATA_DIRECTORY)

# Load data into a dataframe
applicationDf = s3_utils.load_dataframe(AWS_S3_DATA_DIRECTORY_RAW, APPLICATION_DATASET)

""" From EDA following decisions were made on application data. \
    1. Columns with Null values greater than or equal to 40 percentage can be dropped.
    2. Regardless of amount of null values, EXT_SOURCE_X can be removed as they don't correlate \
        with Target values.
    3. All FLAG_DOCUMENT_X columns except FLAG_DOCUMENT_3 can be deleted as submitting various \
        documents does not influence the loan default rate.
    4. All columns related to contact parameters shall be dropped.
    5. Drop unnecessary columns such as 'WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START', \
        'FLAG_LAST_APPL_PER_CONTRACT','NFLAG_LAST_APPL_IN_DAY'.
"""
unwanted_columns = null_value_column_list(applicationDf, 35) + [
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    "FLAG_DOCUMENT_2",
    "FLAG_DOCUMENT_4",
    "FLAG_DOCUMENT_5",
    "FLAG_DOCUMENT_6",
    "FLAG_DOCUMENT_7",
    "FLAG_DOCUMENT_8",
    "FLAG_DOCUMENT_9",
    "FLAG_DOCUMENT_10",
    "FLAG_DOCUMENT_11",
    "FLAG_DOCUMENT_12",
    "FLAG_DOCUMENT_13",
    "FLAG_DOCUMENT_14",
    "FLAG_DOCUMENT_15",
    "FLAG_DOCUMENT_16",
    "FLAG_DOCUMENT_17",
    "FLAG_DOCUMENT_18",
    "FLAG_DOCUMENT_19",
    "FLAG_DOCUMENT_20",
    "FLAG_DOCUMENT_21",
    "FLAG_MOBIL",
    "FLAG_EMP_PHONE",
    "FLAG_WORK_PHONE",
    "FLAG_CONT_MOBILE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
    "WEEKDAY_APPR_PROCESS_START_x",
    "HOUR_APPR_PROCESS_START_x",
    "WEEKDAY_APPR_PROCESS_START_y",
    "HOUR_APPR_PROCESS_START_y",
    "FLAG_LAST_APPL_PER_CONTRACT",
    "NFLAG_LAST_APPL_IN_DAY",
    "SK_ID_CURR",
    "SK_ID_PREV",
]

scale_columns = [
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT_x",
    "AMT_ANNUITY_x",
    "AMT_GOODS_PRICE_x",
    "REGION_POPULATION_RELATIVE",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "DAYS_REGISTRATION",
    "DAYS_ID_PUBLISH",
    "CNT_FAM_MEMBERS",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
    "DAYS_LAST_PHONE_CHANGE",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
    "AMT_ANNUITY_y",
    "AMT_APPLICATION",
    "AMT_CREDIT_y",
    "AMT_GOODS_PRICE_y",
    "SELLERPLACE_AREA",
    "CNT_PAYMENT",
]

Loading 's3://loan-default-prediction/dataset-3/raw/merged_application_data.csv' file as dataframe.


In [5]:
applicationDf.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,422584,0,Cash loans,F,N,N,0,94500.0,508495.5,21541.5,...,XNA,24.0,low_normal,Cash Street: low,,,,,,
1,267589,0,Cash loans,M,N,Y,0,135000.0,405000.0,24601.5,...,Connectivity,6.0,high,POS mobile with interest,365243.0,-2263.0,-2113.0,-2173.0,-2166.0,1.0
2,425007,0,Cash loans,F,N,Y,2,225000.0,239850.0,23850.0,...,Consumer electronics,6.0,middle,POS household with interest,365243.0,-702.0,-552.0,-552.0,-547.0,0.0
3,385766,0,Cash loans,M,Y,Y,0,225000.0,1350000.0,39474.0,...,Consumer electronics,60.0,low_normal,Cash X-Sell: low,365243.0,-397.0,1373.0,-367.0,-361.0,0.0
4,157094,0,Cash loans,M,Y,N,0,157500.0,503721.0,19710.0,...,XNA,,XNA,Cash,,,,,,


In [6]:
preprocessing_pipeline, columns = preprocess_data(applicationDf, unwanted_columns, scale_columns)

Dropping SK_ID_CURR
Passing through TARGET
Applying mode Imputing and LabelEncoder to NAME_CONTRACT_TYPE_x
Applying mode Imputing and LabelEncoder to CODE_GENDER
Applying mode Imputing and LabelEncoder to FLAG_OWN_CAR
Applying mode Imputing and LabelEncoder to FLAG_OWN_REALTY
Applying Median Imputing and scaling to CNT_CHILDREN
Applying Median Imputing and scaling to AMT_INCOME_TOTAL
Applying Median Imputing and scaling to AMT_CREDIT_x
Applying Median Imputing and scaling to AMT_ANNUITY_x
Applying Median Imputing and scaling to AMT_GOODS_PRICE_x
Applying mode Imputing and LabelEncoder to NAME_TYPE_SUITE_x
Applying mode Imputing and LabelEncoder to NAME_INCOME_TYPE
Applying mode Imputing and LabelEncoder to NAME_EDUCATION_TYPE
Applying mode Imputing and LabelEncoder to NAME_FAMILY_STATUS
Applying mode Imputing and LabelEncoder to NAME_HOUSING_TYPE
Applying Median Imputing and scaling to REGION_POPULATION_RELATIVE
Applying Custom days_transform and scaling to DAYS_BIRTH
Applying standard

In [7]:
data_transform_pipeline = preprocessing_pipeline.fit_transform(applicationDf)

In [8]:
transformed_df = pd.DataFrame(
        data_transform_pipeline,
        columns = columns
    )

In [9]:
transformed_df.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,...,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION
0,0.0,0.0,0.0,0.0,0.0,-0.564414,-0.395824,-0.205286,-0.39233,-0.207079,...,2.0,26.0,2.0,1.0,5.0,-0.040866,10.0,0.789325,3.0,4.0
1,0.0,0.0,1.0,0.0,1.0,-0.564414,-0.192379,-0.474165,-0.172975,-0.347261,...,0.0,18.0,3.0,0.0,4.0,-0.035687,2.0,-0.453064,1.0,13.0
2,0.0,0.0,0.0,0.0,1.0,2.223557,0.259721,-0.90322,-0.226846,-0.857015,...,2.0,5.0,3.0,0.0,4.0,-0.002021,4.0,-0.453064,4.0,9.0
3,0.0,0.0,1.0,1.0,1.0,-0.564414,0.259721,1.980917,0.893158,2.328947,...,2.0,26.0,2.0,2.0,6.0,0.075669,4.0,3.274104,3.0,7.0
4,0.0,0.0,1.0,1.0,0.0,-0.564414,-0.079354,-0.21769,-0.523621,-0.21916,...,2.0,26.0,4.0,0.0,3.0,-0.040866,10.0,-0.867193,0.0,2.0


In [10]:
#with open("transforms.txt", "w") as text_file:
#    text_file.write(f'{preprocessing_pipeline}')#

In [11]:
#transformed_df.to_csv('/content/transformed_data.csv')

In [12]:
#s3_utils.save_dataframe(
#          AWS_S3_DATA_DIRECTORY_PROCESSED, 'transformed_data.csv', transformed_df
#       )

Saving dataframe as 'transformed_data.csv' at 's3://loan-default-prediction/dataset-3/processed/transformed_data.csv'
Error saving dataframe as csv to S3: 'S3FileSystem' object is not callable


In [13]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Example DataFrame
data = {
    'Category1': ['A', 'B', 'C', 'B', None],
    'Category2': ['X', 'Y', 'Z', None, 'X'],
    'Category3': ['P', 'Q', 'Q', None, 'S']
}
df = pd.DataFrame(data)

# Define categorical feature columns
categorical_features = ['Category1', 'Category2', 'Category3']

# Custom transformer combining SimpleImputer and LabelEncoder
class CategoricalImputerEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = LabelEncoder()

    def fit(self, X):
        #self.imputer.fit(X)
        self.encoder.fit(X)
        return self

    def transform(self, X):
        #X_imputed = self.imputer.transform(X)
        X_encoded = self.encoder.transform(X)
        return X_encoded.reshape(-1, 1)
        #return X_encoded

# Loop through categorical columns and create transformers
transformers = []
for col in categorical_features:
    null_percentage = df[col].isnull().mean() * 100
    print(null_percentage)

    transformer = (
            col,
            Pipeline([
                (f'impute_{col}', SimpleImputer(missing_values=None, strategy='most_frequent')),
                (f'encode_{col}',CategoricalImputerEncoder())
            ]), [col]
        )
    transformers.append(transformer)

print(transformers)

# Create ColumnTransformer
preprocessor = ColumnTransformer(transformers)

# Apply the preprocessor to the DataFrame
transformed_data = preprocessor.fit_transform(df)

# Create a DataFrame from the transformed data
transformed_ex_df = pd.DataFrame(transformed_data, columns=categorical_features)

print(transformed_ex_df)


20.0
20.0
20.0
[('Category1', Pipeline(steps=[('impute_Category1',
                 SimpleImputer(missing_values=None, strategy='most_frequent')),
                ('encode_Category1', CategoricalImputerEncoder())]), ['Category1']), ('Category2', Pipeline(steps=[('impute_Category2',
                 SimpleImputer(missing_values=None, strategy='most_frequent')),
                ('encode_Category2', CategoricalImputerEncoder())]), ['Category2']), ('Category3', Pipeline(steps=[('impute_Category3',
                 SimpleImputer(missing_values=None, strategy='most_frequent')),
                ('encode_Category3', CategoricalImputerEncoder())]), ['Category3'])]
   Category1  Category2  Category3
0          0          0          0
1          1          1          1
2          2          2          1
3          1          0          1
4          1          0          2


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


###Model Training

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

def split_data(
    df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Function to split the data into train and test samples"""

    # Split the features and target variables
    X, y = df.drop(target_column, axis=1), df[target_column]

    # Split the data into training and test samples
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X, X_train, X_test, y_train, y_test

def train_model(data_pipeline: any, model, X: pd.DataFrame, y: pd.DataFrame, n: int=5):
    """Function to train the Random Forest Classifier model with preprocessed data using K-Fold Cross Validation to find the best model and accuracy."""

    # Create model Pipeline
    model_pipeline = Pipeline(
        [
            ("preprocessing", data_pipeline),  # Preprocessing pipeline
            ("model", model),  # Random Forest Classifier
        ]
    )

    # Create a KFold object for cross-validation
    kf = KFold(n_splits=n, shuffle=True, random_state=42)

    # Perform K-fold cross-validation
    accuracy = cross_val_score(model_pipeline, X, y, cv=kf).mean()
    model_pipeline.fit(X, y)

    print(f"Model trained: {model}")
    print(f"Best Model: {model},\nBest Score: {accuracy}")

    return model_pipeline, accuracy

def evaluate_model(model, X_test, y_test):
    """Function to evaluate the model on test data and print model metrics."""

    # Make predictions on test data
    y_pred = model.predict(X_test)

    # Print Classification report
    print("Classification report:\n", classification_report(y_test, y_pred))

    # Display COnfusion Matrix
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred))
    disp.plot()
    plt.show()



In [None]:
X, X_train, X_test, y_train, y_test = split_data(applicationDf, "TARGET")

In [16]:
data_pipline, columns = preprocess_data(X, unwanted_columns, scale_columns)

Dropping SK_ID_CURR
Applying mode Imputing and LabelEncoder to NAME_CONTRACT_TYPE_x
Applying mode Imputing and LabelEncoder to CODE_GENDER
Applying mode Imputing and LabelEncoder to FLAG_OWN_CAR
Applying mode Imputing and LabelEncoder to FLAG_OWN_REALTY
Applying Median Imputing and scaling to CNT_CHILDREN
Applying Median Imputing and scaling to AMT_INCOME_TOTAL
Applying Median Imputing and scaling to AMT_CREDIT_x
Applying Median Imputing and scaling to AMT_ANNUITY_x
Applying Median Imputing and scaling to AMT_GOODS_PRICE_x
Applying mode Imputing and LabelEncoder to NAME_TYPE_SUITE_x
Applying mode Imputing and LabelEncoder to NAME_INCOME_TYPE
Applying mode Imputing and LabelEncoder to NAME_EDUCATION_TYPE
Applying mode Imputing and LabelEncoder to NAME_FAMILY_STATUS
Applying mode Imputing and LabelEncoder to NAME_HOUSING_TYPE
Applying Median Imputing and scaling to REGION_POPULATION_RELATIVE
Applying Custom days_transform and scaling to DAYS_BIRTH
Applying standardization, days_transform

####1. XGB Classifier

In [17]:
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

xgb_classifier = XGBClassifier()
nb_classifier = GaussianNB()

nb_model, accuracy = train_model(data_pipline, nb_classifier, X_train, y_train)

evaluate_model(nb_model, X_test, y_test)

Model trained: GaussianNB()
Best Model: GaussianNB(),
Best Score: 0.7749033873362938


ValueError: ignored