In [1]:
import sys
# Make the modules under ../src/ importable from this notebook. The guard keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
if '../src/' not in sys.path:
    sys.path.append('../src/')

In [2]:
import os
from dotenv import load_dotenv


# Read the .env file so its entries become visible through os.environ
load_dotenv()

# AWS credentials and target bucket (each is None when the variable is unset)
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")

# S3 directory settings
AWS_S3_DATA_DIRECTORY = os.getenv("AWS_S3_DATA_DIRECTORY")
AWS_S3_CLEAN_DATA_DIRECTORY = os.getenv("AWS_S3_CLEAN_DATA_DIRECTORY")
AWS_S3_DATA_DIRECTORY_RAW = os.getenv("AWS_S3_DATA_DIRECTORY_RAW")
AWS_S3_DATA_DIRECTORY_PROCESSED = os.getenv("AWS_S3_DATA_DIRECTORY_PROCESSED")
AWS_S3_DATA_DIRECTORY_MODELS = os.getenv("AWS_S3_DATA_DIRECTORY_MODELS")

# Dataset file names
APPLICATION_DATASET = os.getenv("APPLICATION_DATASET")
DEMO_DATASET = os.getenv("DEMO_DATASET")

In [3]:
from utils.s3_Functions import S3Utils
# Build the S3 helper that the cells below use to load and save dataframes
s3_utils = S3Utils(
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_S3_BUCKET,
    AWS_S3_DATA_DIRECTORY,
)

In [4]:
# Load application_data.csv from the 'original' folder on S3
applicationDf = s3_utils.load_dataframe(
    'original',
    'application_data.csv',
)

Loading 's3://loan-default-prediction/dataset-3/original/application_data.csv' file as dataframe.


Loading: 100%|██████████| 158M/158M [00:17<00:00, 9.42MB/s]    


In [5]:
# Load previous_application.csv from the 'original' folder on S3
previousDf = s3_utils.load_dataframe(
    'original',
    'previous_application.csv',
)

Loading 's3://loan-default-prediction/dataset-3/original/previous_application.csv' file as dataframe.


Loading: 100%|██████████| 386M/386M [00:40<00:00, 9.88MB/s]    


In [28]:
import pandas as pd
# Inner-join the two tables on SK_ID_CURR; columns present in both frames
# receive pandas' default _x (application) / _y (previous) suffixes.
loan_process_df = applicationDf.merge(
    previousDf,
    on="SK_ID_CURR",
    how="inner",
)

In [29]:
# (rows, columns) of the merged frame
loan_process_df.shape

(1413701, 158)

In [30]:
# Preview the first rows of the merged frame
loan_process_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,Auto technology,24.0,low_normal,POS other with interest,365243.0,-565.0,125.0,-25.0,-17.0,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-716.0,-386.0,-536.0,-527.0,1.0
2,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Furniture,6.0,middle,POS industry with interest,365243.0,-797.0,-647.0,-647.0,-639.0,0.0
3,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Consumer electronics,12.0,middle,POS household with interest,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0
4,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,Connectivity,4.0,middle,POS mobile without interest,365243.0,-784.0,-694.0,-724.0,-714.0,0.0


In [10]:
# Read a small sample CSV (path is relative to the working directory); the
# following cells use it to try out None/NaN conversions.
test_df = pd.read_csv("test_sample.csv")
test_df.head()

Unnamed: 0,ID,Name,Percentile,Age,Working,Designation
0,1,Sai,87.0,27.0,1.0,Lead Developer
1,2,Darshan,91.2,29.0,1.0,Lead Devops
2,3,Anwar,89.3,,0.0,
3,4,,94.8,38.0,1.0,Architect
4,5,XYZ,38.0,42.0,,


In [11]:
# Keep every non-null cell as is and substitute None wherever a value is missing
not_missing = test_df.notna()
test_df = test_df.where(not_missing, None)
test_df.head()

Unnamed: 0,ID,Name,Percentile,Age,Working,Designation
0,1,Sai,87.0,27.0,1.0,Lead Developer
1,2,Darshan,91.2,29.0,1.0,Lead Devops
2,3,Anwar,89.3,,0.0,
3,4,,94.8,38.0,1.0,Architect
4,5,XYZ,38.0,42.0,,


In [26]:
import numpy as np

# Map None back to NaN, producing a new frame and leaving test_df untouched
test1_df = test_df.replace({None: np.nan})
test1_df.head()

Unnamed: 0,ID,Name,Percentile,Age,Working,Designation
0,1,Sai,87.0,27.0,1.0,Lead Developer
1,2,Darshan,91.2,29.0,1.0,Lead Devops
2,3,Anwar,89.3,,0.0,
3,4,,94.8,38.0,1.0,Architect
4,5,XYZ,38.0,42.0,,


In [31]:
from typing import Tuple
from sklearn.model_selection import train_test_split

def split_data(
    df: pd.DataFrame,
    target_column: str,
    test_size: float = 0.2,
    random_state: int = 42,
    stratify: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Separate features from the target and split them into train/test samples.

    Args:
        df: Frame holding the feature columns and the target column.
        target_column: Name of the column in ``df`` to use as the target.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.
        stratify: When True, the split is stratified on the target so both
            samples keep its class proportions. Defaults to False, which
            matches the previous (unstratified) behaviour.

    Returns:
        ``(X, X_train, X_test, y_train, y_test)`` where ``X`` is the complete
        feature frame (``df`` without ``target_column``), ``X_train``/``X_test``
        are its train/test parts, and ``y_train``/``y_test`` are the matching
        target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Fail early with a readable message instead of the generic drop() error
    if target_column not in df.columns:
        raise KeyError(f"Target column '{target_column}' not found in dataframe")

    # Split the features and target variables
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split the data into training and test samples
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    return X, X_train, X_test, y_train, y_test

In [32]:
# Hold out 7.5% of the merged rows as a separate set. A fixed seed (the same
# default split_data uses) makes the split reproducible across kernel restarts,
# so the files saved to S3 can be regenerated exactly.
# NOTE(review): rows are split individually and loan_process_df contains
# several rows per SK_ID_CURR, so one applicant can end up in both sets —
# consider a group-aware split if the held-out set must contain unseen applicants.
Train_set, Test_set = train_test_split(
    loan_process_df, test_size=0.075, random_state=42
)

In [33]:
# Report the (rows, columns) of each split
print(f"Train_set: {Train_set.shape}")
print(f"Test_set: {Test_set.shape}")

Train_set: (1307673, 158)
Test_set: (106028, 158)


In [25]:
# Count distinct value combinations across all object-dtype columns of Train_set
Train_set.select_dtypes(include='object').value_counts()

NAME_CONTRACT_TYPE_x  CODE_GENDER  FLAG_OWN_CAR  FLAG_OWN_REALTY  NAME_TYPE_SUITE_x  NAME_INCOME_TYPE      NAME_EDUCATION_TYPE            NAME_FAMILY_STATUS  NAME_HOUSING_TYPE    OCCUPATION_TYPE  WEEKDAY_APPR_PROCESS_START_x  ORGANIZATION_TYPE       FONDKAPREMONT_MODE     HOUSETYPE_MODE  WALLSMATERIAL_MODE  EMERGENCYSTATE_MODE  NAME_CONTRACT_TYPE_y  WEEKDAY_APPR_PROCESS_START_y  FLAG_LAST_APPL_PER_CONTRACT  NAME_CASH_LOAN_PURPOSE  NAME_CONTRACT_STATUS  NAME_PAYMENT_TYPE      CODE_REJECT_REASON  NAME_TYPE_SUITE_y  NAME_CLIENT_TYPE  NAME_GOODS_CATEGORY   NAME_PORTFOLIO  NAME_PRODUCT_TYPE  CHANNEL_TYPE             NAME_SELLER_INDUSTRY  NAME_YIELD_GROUP  PRODUCT_COMBINATION        
Cash loans            F            N             Y                Group of people    Commercial associate  Secondary / secondary special  Married             House / apartment    Sales staff      WEDNESDAY                     Trade: type 3           not specified          block of flats  Stone, brick        No  

In [26]:
# Count distinct value combinations across all object-dtype columns of Test_set
Test_set.select_dtypes(include='object').value_counts()

NAME_CONTRACT_TYPE_x  CODE_GENDER  FLAG_OWN_CAR  FLAG_OWN_REALTY  NAME_TYPE_SUITE_x  NAME_INCOME_TYPE      NAME_EDUCATION_TYPE            NAME_FAMILY_STATUS    NAME_HOUSING_TYPE  OCCUPATION_TYPE        WEEKDAY_APPR_PROCESS_START_x  ORGANIZATION_TYPE       FONDKAPREMONT_MODE     HOUSETYPE_MODE  WALLSMATERIAL_MODE  EMERGENCYSTATE_MODE  NAME_CONTRACT_TYPE_y  WEEKDAY_APPR_PROCESS_START_y  FLAG_LAST_APPL_PER_CONTRACT  NAME_CASH_LOAN_PURPOSE  NAME_CONTRACT_STATUS  NAME_PAYMENT_TYPE      CODE_REJECT_REASON  NAME_TYPE_SUITE_y  NAME_CLIENT_TYPE  NAME_GOODS_CATEGORY  NAME_PORTFOLIO  NAME_PRODUCT_TYPE  CHANNEL_TYPE             NAME_SELLER_INDUSTRY  NAME_YIELD_GROUP  PRODUCT_COMBINATION        
Cash loans            F            N             N                Unaccompanied      Working               Secondary / secondary special  Single / not married  House / apartment  Sales staff            MONDAY                        Business Entity Type 3  reg oper account       block of flats  Stone, brick 

In [35]:
# Persist the training split under the raw-data directory on S3
s3_utils.save_dataframe(
    AWS_S3_DATA_DIRECTORY_RAW,
    'merged_application_data.csv',
    Train_set,
)

Saving dataframe as 'merged_application_data.csv' at 's3://loan-default-prediction/dataset-3/raw/merged_application_data.csv'
csv file 'merged_application_data.csv' is saved to S3 successfully.


In [34]:
# Persist the held-out split under the raw-data directory on S3
s3_utils.save_dataframe(
    AWS_S3_DATA_DIRECTORY_RAW,
    'merged_demo_data.csv',
    Test_set,
)

Saving dataframe as 'merged_demo_data.csv' at 's3://loan-default-prediction/dataset-3/raw/merged_demo_data.csv'
csv file 'merged_demo_data.csv' is saved to S3 successfully.


In [19]:
# Remove the cap on displayed rows. pd.set_option changes the setting for the
# rest of the kernel session, so later cells are affected as well.
pd.set_option("display.max_rows", None)
# Print the full per-column summary of the training split
Train_set.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1307673 entries, 884843 to 636128
Data columns (total 158 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    SK_ID_CURR                    int64  
 1    TARGET                        int64  
 2    NAME_CONTRACT_TYPE_x          object 
 3    CODE_GENDER                   object 
 4    FLAG_OWN_CAR                  object 
 5    FLAG_OWN_REALTY               object 
 6    CNT_CHILDREN                  int64  
 7    AMT_INCOME_TOTAL              float64
 8    AMT_CREDIT_x                  float64
 9    AMT_ANNUITY_x                 float64
 10   AMT_GOODS_PRICE_x             float64
 11   NAME_TYPE_SUITE_x             object 
 12   NAME_INCOME_TYPE              object 
 13   NAME_EDUCATION_TYPE           object 
 14   NAME_FAMILY_STATUS            object 
 15   NAME_HOUSING_TYPE             object 
 16   REGION_POPULATION_RELATIVE    float64
 17   DAYS_BIRTH                    int64  
 18   D

In [23]:
# Frequency of each occupation; dropna=False keeps missing values as their own row
Train_set["OCCUPATION_TYPE"].value_counts(dropna=False)

OCCUPATION_TYPE
None                     423157
Laborers                 232338
Sales staff              139911
Core staff               109494
Managers                  87772
Drivers                   77839
High skill tech staff     44905
Accountants               38776
Medicine staff            37114
Security staff            29327
Cooking staff             26603
Cleaning staff            22532
Private service staff     11293
Low-skill Laborers         8596
Waiters/barmen staff       5681
Secretaries                5337
Realty agents              3280
HR staff                   2140
IT staff                   1578
Name: count, dtype: int64