In [1]:
import os
import pprint

import numpy as np
import pandas as pd
from typing import Union
from pandas.core.series import Series

In [2]:
# set global variables
DATA_DIR = "../data"

In [3]:
# load the dataset
df = pd.read_csv(os.path.join(DATA_DIR, "train_data.csv"))
df.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,450407,Cash loans,F,N,Y,1,67500.0,227520.0,11065.5,180000.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,271298,Cash loans,M,Y,Y,1,247500.0,1882372.5,65560.5,1719000.0,...,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,0
2,122238,Cash loans,M,Y,Y,1,180000.0,101880.0,10827.0,90000.0,...,0,0,0,0.0,0.0,0.0,2.0,0.0,1.0,0
3,305311,Cash loans,M,N,N,0,81000.0,405000.0,20677.5,405000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0
4,414121,Cash loans,F,N,Y,0,157500.0,888840.0,29506.5,675000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0


In [4]:
duplicated_rows = len(df.index[df.duplicated()])
print(f"There are {duplicated_rows} duplicate rows in the dataset.")

There are 0 duplicate rows in the dataset.


In [5]:
df.shape

(184506, 122)

In [6]:
for column in df.columns:
    try:
        std_column = df[column].std()
        if std_column < 0.02:
            print(column, std_column)
    except:
        pass

REGION_POPULATION_RELATIVE 0.013817632579011042
FLAG_MOBIL 0.0023280631315212843
FLAG_DOCUMENT_2 0.0061593759302808315
FLAG_DOCUMENT_4 0.00959843398631762
FLAG_DOCUMENT_7 0.013372544706006448
FLAG_DOCUMENT_10 0.004656088409229477
FLAG_DOCUMENT_12 0.0
FLAG_DOCUMENT_17 0.016459706190843363
FLAG_DOCUMENT_21 0.01803021598109628


In [7]:
df = df.drop(df.std()[df.std() < 0.02].index.values, axis=1)
print(df.shape)

  df = df.drop(df.std()[df.std() < 0.02].index.values, axis=1)


(184506, 113)


In [8]:
unnecessary_columns_to_drop = [
        "SK_ID_CURR",  # this is just a unique identifier for each row
        "FLAG_EMP_PHONE",  # office phone number is not important
        "FLAG_WORK_PHONE",  # home phone number is not important
        "WEEKDAY_APPR_PROCESS_START",  # does not matter on what day the loan is applied for
        "HOUR_APPR_PROCESS_START",  # does not matter during what hour the loan is applied for
        "REG_REGION_NOT_LIVE_REGION",  # permanent address and contact address (region) are different addresses, and does not matter if they match or not
        "REG_REGION_NOT_WORK_REGION",  # permanent address and work address (region) are different addresses, and does not matter if they match or not
        "LIVE_REGION_NOT_WORK_REGION",  # contact address and work address (region) are different addresses, and does not matter if they match or not
        "REG_CITY_NOT_LIVE_CITY",  # permanent address and contact address (region) are different addresses, and does not matter if they match or not
        "REG_CITY_NOT_WORK_CITY",  # permanent address and work address (region) are different addresses, and does not matter if they match or not
        "LIVE_CITY_NOT_WORK_CITY",  # contact address and work address (region) are different addresses, and does not matter if they match or not,
        "DAYS_LAST_PHONE_CHANGE",  # phone change information does not reveal something important as one can change phone due to multiple things,
        "OBS_30_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
        "DEF_30_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
        "OBS_60_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
        "DEF_60_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
    ]

df = df.drop(unnecessary_columns_to_drop, axis=1, errors="ignore")
print("Shape of dataframe:")
print(df.shape)

Shape of dataframe:
(184506, 97)


In [9]:
print("Categorical columns:")
categorical_columns = list(df.select_dtypes(include=["object"]).columns)

Categorical columns:


In [10]:
def calculate_iqr_range(
        data: Union[list, Series],
        scaled_factor: float = 1.7,
        percentile_range: tuple = (10, 90),
) -> tuple:
    """
    Calculates the IQR range, lower bound, and upper bound of the data to
    detect outliers.

    Args:
        data: Union[list, Series]
            Data for which IQR range, lower bound, and upper bound needs to be
            calculated
        scaled_factor: float
            Defaults to 1.5.

            Set this high to impose more stricter outlier detection i.e.,
            more outliers will be considered as regular data points.
            Lower this value to impose less stricter outlier detection i.e.,
            more data points will be considered as outliers.
        percentile_range: tuple
            Defaults to (25, 75).

            Denotes the percentile range needed to calculate the inter quartile
            range.

    Returns:
    """
    
    p1, p3 = np.percentile(data.dropna(), percentile_range)
    iqr_range = p3 - p1
    lower_bound = p1 - (scaled_factor * iqr_range)
    upper_bound = p3 + (scaled_factor * iqr_range)
    return iqr_range, lower_bound, upper_bound

In [11]:
for column in list(df.select_dtypes(exclude=["object"]).columns):
    iqr_range, lower_bound, upper_bound = calculate_iqr_range(
        df[column],
    )

    index_of_outliers = df[
        (df[column] > upper_bound) | (df[column] < lower_bound)].index
    median_of_column = df[column].dropna().median()
    df.loc[index_of_outliers, column] = median_of_column

In [12]:
df.shape

(184506, 97)