In [1]:
import os
import pprint

import numpy as np
import pandas as pd
from typing import Union
from pandas.core.series import Series
from pandas.core.frame import DataFrame

In [2]:
# Global configuration: relative path of the directory that holds the
# dataset CSV files read by the cells below.
DATA_DIR = "../data/dataset"

In [3]:
# Load the training data from DATA_DIR and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV. Confirm whether it should be read
# with index_col=0 instead of being carried along as a numeric feature.
df = pd.read_csv(os.path.join(DATA_DIR, "train_data.csv"))
df.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,450407,Cash loans,F,N,Y,1,67500.0,227520.0,11065.5,180000.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,271298,Cash loans,M,Y,Y,1,247500.0,1882372.5,65560.5,1719000.0,...,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,0
2,122238,Cash loans,M,Y,Y,1,180000.0,101880.0,10827.0,90000.0,...,0,0,0,0.0,0.0,0.0,2.0,0.0,1.0,0
3,305311,Cash loans,M,N,N,0,81000.0,405000.0,20677.5,405000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0
4,414121,Cash loans,F,N,Y,0,157500.0,888840.0,29506.5,675000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0


In [4]:
# Count rows that are exact copies (across every column) of an earlier row.
duplicated_rows = int(df.duplicated().sum())
print(f"There are {duplicated_rows} duplicate rows in the dataset.")

There are 0 duplicate rows in the dataset.


In [5]:
# (rows, columns) of the data as loaded, before any column is dropped.
df.shape

(184506, 122)

In [6]:
# Report near-constant numeric columns: those whose standard deviation is
# below the threshold. Only numeric columns are inspected, so there is no need
# for a blanket try/except that would also hide genuine errors.
low_std_threshold = 0.02
for column in df.select_dtypes(include="number").columns:
    std_column = df[column].std()
    if std_column < low_std_threshold:
        print(column, std_column)

REGION_POPULATION_RELATIVE 0.013817632579011042
FLAG_MOBIL 0.0023280631315212843
FLAG_DOCUMENT_2 0.0061593759302808315
FLAG_DOCUMENT_4 0.00959843398631762
FLAG_DOCUMENT_7 0.013372544706006448
FLAG_DOCUMENT_10 0.004656088409229477
FLAG_DOCUMENT_12 0.0
FLAG_DOCUMENT_17 0.016459706190843363
FLAG_DOCUMENT_21 0.01803021598109628


In [7]:
# Drop the near-constant numeric columns reported above (std < 0.02).
# numeric_only=True is passed explicitly: relying on the implicit default
# triggers a deprecation warning on older pandas and raises on object columns
# in newer releases. The std is also computed once instead of twice.
column_std = df.std(numeric_only=True)
low_std_columns = column_std[column_std < 0.02].index
df = df.drop(columns=low_std_columns)
print(df.shape)

  df = df.drop(df.std()[df.std() < 0.02].index.values, axis=1)


(184506, 113)


In [8]:
# Columns judged irrelevant for the analysis, each with the author's rationale.
unnecessary_columns_to_drop = [
    # unique identifier of the row, carries no signal
    "SK_ID_CURR",
    # office phone number is not considered important
    "FLAG_EMP_PHONE",
    # home phone number is not considered important
    "FLAG_WORK_PHONE",
    # the weekday on which the loan was applied for does not matter
    "WEEKDAY_APPR_PROCESS_START",
    # the hour at which the loan was applied for does not matter
    "HOUR_APPR_PROCESS_START",
    # region level: whether permanent and contact address match is irrelevant
    "REG_REGION_NOT_LIVE_REGION",
    # region level: whether permanent and work address match is irrelevant
    "REG_REGION_NOT_WORK_REGION",
    # region level: whether contact and work address match is irrelevant
    "LIVE_REGION_NOT_WORK_REGION",
    # city level: whether permanent and contact address match is irrelevant
    "REG_CITY_NOT_LIVE_CITY",
    # city level: whether permanent and work address match is irrelevant
    "REG_CITY_NOT_WORK_CITY",
    # city level: whether contact and work address match is irrelevant
    "LIVE_CITY_NOT_WORK_CITY",
    # a phone can be changed for many unrelated reasons, so this reveals little
    "DAYS_LAST_PHONE_CHANGE",
    # social-surroundings features are biased and say nothing about the
    # applicant's own character
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
]

# errors="ignore" skips any listed column that is not present in the frame.
df = df.drop(columns=unnecessary_columns_to_drop, errors="ignore")
print("Shape of dataframe:", df.shape, sep="\n")

Shape of dataframe:
(184506, 97)


In [9]:
def calculate_iqr_range(
        data: Union[list, Series],
        scaled_factor: float = 1.7,
        percentile_range: tuple = (10, 90),
) -> tuple:
    """
    Calculates the percentile spread of the data together with the lower and
    upper bounds used to detect outliers.

    Note that with the default ``percentile_range`` the spread is the
    10th-90th percentile range, not the classical inter-quartile range.

    Args:
        data: Union[list, Series]
            Data for which the range, lower bound, and upper bound need to be
            calculated. Missing values (None / NaN) are ignored.
        scaled_factor: float
            Defaults to 1.7.

            Multiplier applied to the range when computing the bounds.
            Raise this value to widen the bounds, i.e., more data points will
            be considered as regular data points.
            Lower this value to narrow the bounds, i.e., more data points will
            be considered as outliers.
        percentile_range: tuple
            Defaults to (10, 90).

            The (lower, upper) percentiles between which the range is
            measured.

    Returns:
        tuple:
            ``(iqr_range, lower_bound, upper_bound)`` where ``iqr_range`` is
            the difference between the upper and lower percentile,
            ``lower_bound`` is the lower percentile minus
            ``scaled_factor * iqr_range`` and ``upper_bound`` is the upper
            percentile plus ``scaled_factor * iqr_range``.
    """
    # Wrap the input so plain lists get the same missing-value handling as a
    # Series (a list has no .dropna()).
    values = pd.Series(data).dropna()
    p1, p3 = np.percentile(values, percentile_range)
    iqr_range = p3 - p1
    lower_bound = p1 - (scaled_factor * iqr_range)
    upper_bound = p3 + (scaled_factor * iqr_range)
    return iqr_range, lower_bound, upper_bound

In [10]:
# Replace outliers in every numeric feature with the column median.
#
# Two guards keep this step from destroying information:
#   * the label column is excluded, so the target is never rewritten;
#   * columns whose percentile range is zero are skipped. When the lower and
#     upper percentile coincide, both bounds collapse onto one value and every
#     other value would be flagged and overwritten, leaving the column
#     constant (this is what happens to rare-positive 0/1 columns).
label_column = "TARGET"
numeric_feature_columns = [
    column
    for column in df.select_dtypes(exclude=["object"]).columns
    if column != label_column
]

for column in numeric_feature_columns:
    iqr_range, lower_bound, upper_bound = calculate_iqr_range(
        df[column],
    )
    if iqr_range == 0:
        continue

    outlier_mask = (df[column] > upper_bound) | (df[column] < lower_bound)
    median_of_column = df[column].median()  # NaNs are skipped by default
    df.loc[outlier_mask, column] = median_of_column

In [11]:
# Outlier handling overwrites values in place, so the shape should not change.
df.shape

(184506, 97)

In [12]:
# Summarise the categorical (object-dtype) columns that contain missing
# values: number of distinct values (NaN included), column name, number of
# missing entries, followed by the class frequencies.
print("Categorical columns:")
categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
print(f"There are {len(categorical_columns)} categorical columns.\n")

missing_per_column = df[categorical_columns].isna().sum()
for column, missing_values in missing_per_column[missing_per_column > 0].items():
    print(df[column].nunique(dropna=False), column, missing_values)
    print(df[column].value_counts())
    print("\n\n")

Categorical columns:
There are 15 categorical columns.

8 NAME_TYPE_SUITE 770
Unaccompanied      149059
Family              24044
Spouse, partner      6869
Children             1982
Other_B              1081
Other_A               528
Group of people       173
Name: NAME_TYPE_SUITE, dtype: int64



19 OCCUPATION_TYPE 57867
Laborers                 32968
Sales staff              19351
Core staff               16492
Managers                 12780
Drivers                  11238
High skill tech staff     6859
Accountants               5897
Medicine staff            5130
Security staff            4074
Cooking staff             3621
Cleaning staff            2766
Private service staff     1565
Low-skill Laborers        1223
Waiters/barmen staff       819
Secretaries                787
Realty agents              434
HR staff                   321
IT staff                   314
Name: OCCUPATION_TYPE, dtype: int64



5 FONDKAPREMONT_MODE 126254
reg oper account         44300
reg oper spec accoun

In [13]:
def get_replacing_criteria_for_categorical_features(
        df: DataFrame
) -> dict:
    """
    Builds the mapping that says which value replaces the missing entries of
    each categorical feature.

    Args:
        df: DataFrame
            Pandas DataFrame containing the training data; its
            NAME_TYPE_SUITE column is used to find the most common class.

    Returns:
        dict:
            A dictionary with the column name as key and the replacement
            value as value, one entry for each of the six categorical
            features that contain missing values in the training dataset.
    """
    # NAME_TYPE_SUITE has very few missing values compared to the size of the
    # dataset, and one class dominates the column, so missing entries are
    # assumed to belong to that most common class.
    most_common_suite = df["NAME_TYPE_SUITE"].value_counts().index[0]

    # The remaining features miss a large share of their values (e.g.
    # EMERGENCYSTATE_MODE has only the classes Yes/No but a very large number
    # of gaps), so for now missing entries become their own category; this
    # may be refined later.
    placeholder = "Missing"

    return {
        "NAME_TYPE_SUITE": most_common_suite,
        "OCCUPATION_TYPE": placeholder,
        "FONDKAPREMONT_MODE": placeholder,
        "HOUSETYPE_MODE": placeholder,
        "WALLSMATERIAL_MODE": placeholder,
        "EMERGENCYSTATE_MODE": placeholder,
    }

In [14]:
# Derive the column -> replacement-value mapping from the training data.
replacing_criteria_for_cat_features = (
    get_replacing_criteria_for_categorical_features(df)
)

In [15]:
# Display the column -> replacement-value mapping computed above.
replacing_criteria_for_cat_features

{'NAME_TYPE_SUITE': 'Unaccompanied',
 'OCCUPATION_TYPE': 'Missing',
 'FONDKAPREMONT_MODE': 'Missing',
 'HOUSETYPE_MODE': 'Missing',
 'WALLSMATERIAL_MODE': 'Missing',
 'EMERGENCYSTATE_MODE': 'Missing'}

In [16]:
# Fill the missing values of each categorical feature with its replacement.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[column_name] acts on an intermediate object, which is deprecated and
# leaves df untouched when pandas Copy-on-Write is active.
for column_name, replace_with_value in replacing_criteria_for_cat_features.items():
    df[column_name] = df[column_name].fillna(replace_with_value)

In [17]:
# Sanity check: re-run the missing-value report. Nothing is printed when no
# categorical column still contains missing entries.
for column in categorical_columns:
    column_values = df[column]
    missing_values = column_values.isna().sum()
    if missing_values == 0:
        continue
    print(len(column_values.unique()), column, missing_values)
    print(column_values.value_counts())
    print("\n\n")

In [18]:
# Filling missing values should leave the number of rows and columns unchanged.
df.shape

(184506, 97)