In [1]:
import os
import pprint
import pickle as pkl

import numpy as np
import pandas as pd
from typing import Union
from category_encoders import HashingEncoder
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# set global variables
DATA_DIR = "../data/dataset"

In [3]:
# load the dataset
df = pd.read_csv(os.path.join(DATA_DIR, "train_data.csv"))
df.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,450407,Cash loans,F,N,Y,1,67500.0,227520.0,11065.5,180000.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,271298,Cash loans,M,Y,Y,1,247500.0,1882372.5,65560.5,1719000.0,...,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,0
2,122238,Cash loans,M,Y,Y,1,180000.0,101880.0,10827.0,90000.0,...,0,0,0,0.0,0.0,0.0,2.0,0.0,1.0,0
3,305311,Cash loans,M,N,N,0,81000.0,405000.0,20677.5,405000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0
4,414121,Cash loans,F,N,Y,0,157500.0,888840.0,29506.5,675000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0


In [4]:
duplicated_rows = len(df.index[df.duplicated()])
print(f"There are {duplicated_rows} duplicate rows in the dataset.")

There are 0 duplicate rows in the dataset.


In [5]:
df.shape

(184506, 122)

In [6]:
for column in df.columns:
    try:
        std_column = df[column].std()
        if std_column <= 0.05:
            print(column, std_column)
    except:
        pass

REGION_POPULATION_RELATIVE 0.013817632579011042
FLAG_MOBIL 0.0023280631315212843
FLAG_CONT_MOBILE 0.04419002525707454
NONLIVINGAPARTMENTS_AVG 0.04805894151949643
NONLIVINGAPARTMENTS_MODE 0.04613080888129596
NONLIVINGAPARTMENTS_MEDI 0.04761193548581488
FLAG_DOCUMENT_2 0.0061593759302808315
FLAG_DOCUMENT_4 0.00959843398631762
FLAG_DOCUMENT_7 0.013372544706006448
FLAG_DOCUMENT_10 0.004656088409229477
FLAG_DOCUMENT_12 0.0
FLAG_DOCUMENT_15 0.0359677620448801
FLAG_DOCUMENT_17 0.016459706190843363
FLAG_DOCUMENT_19 0.02506619189055882
FLAG_DOCUMENT_20 0.02350584877516304
FLAG_DOCUMENT_21 0.01803021598109628


In [7]:
df = df.drop(df.std()[df.std() < 0.05].index.values, axis=1)
print(df.shape)

  df = df.drop(df.std()[df.std() < 0.05].index.values, axis=1)


(184506, 106)


In [8]:
unnecessary_columns_to_drop = [
        "SK_ID_CURR",  # this is just a unique identifier for each row
#         "FLAG_EMP_PHONE",  # office phone number is not important
#         "FLAG_WORK_PHONE",  # home phone number is not important
#         "WEEKDAY_APPR_PROCESS_START",  # does not matter on what day the loan is applied for
#         "HOUR_APPR_PROCESS_START",  # does not matter during what hour the loan is applied for
#         "REG_REGION_NOT_LIVE_REGION",  # permanent address and contact address (region) are different addresses, and does not matter if they match or not
#         "REG_REGION_NOT_WORK_REGION",  # permanent address and work address (region) are different addresses, and does not matter if they match or not
#         "LIVE_REGION_NOT_WORK_REGION",  # contact address and work address (region) are different addresses, and does not matter if they match or not
#         "REG_CITY_NOT_LIVE_CITY",  # permanent address and contact address (region) are different addresses, and does not matter if they match or not
#         "REG_CITY_NOT_WORK_CITY",  # permanent address and work address (region) are different addresses, and does not matter if they match or not
#         "LIVE_CITY_NOT_WORK_CITY",  # contact address and work address (region) are different addresses, and does not matter if they match or not,
#         "DAYS_LAST_PHONE_CHANGE",  # phone change information does not reveal something important as one can change phone due to multiple things,
#         "OBS_30_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
#         "DEF_30_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
#         "OBS_60_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
#         "DEF_60_CNT_SOCIAL_CIRCLE",  # surroundings is biased and does not reveal anything about the person's character
    ]

df = df.drop(unnecessary_columns_to_drop, axis=1, errors="ignore")
print("Shape of dataframe:")
print(df.shape)

Shape of dataframe:
(184506, 105)


In [9]:
def calculate_iqr_range(
        data: Union[list, Series],
        scaled_factor: float = 1.7,
        percentile_range: tuple = (10, 90),
) -> tuple:
    """
    Calculates the IQR range, lower bound, and upper bound of the data to
    detect outliers.

    Args:
        data: Union[list, Series]
            Data for which IQR range, lower bound, and upper bound needs to be
            calculated
        scaled_factor: float
            Defaults to 1.5.

            Set this high to impose more stricter outlier detection i.e.,
            more outliers will be considered as regular data points.
            Lower this value to impose less stricter outlier detection i.e.,
            more data points will be considered as outliers.
        percentile_range: tuple
            Defaults to (25, 75).

            Denotes the percentile range needed to calculate the inter quartile
            range.

    Returns:
    """
    
    p1, p3 = np.percentile(data.dropna(), percentile_range)
    iqr_range = p3 - p1
    lower_bound = p1 - (scaled_factor * iqr_range)
    upper_bound = p3 + (scaled_factor * iqr_range)
    return iqr_range, lower_bound, upper_bound

In [10]:
for column in list(df.select_dtypes(exclude=["object"]).columns):
    if column == "TARGET":
        continue
        
    iqr_range, lower_bound, upper_bound = calculate_iqr_range(
        df[column],
    )

    index_of_outliers = df[
        (df[column] > upper_bound) | (df[column] < lower_bound)].index
    median_of_column = df[column].dropna().median()
    df.loc[index_of_outliers, column] = median_of_column

In [11]:
print("Categorical columns:")
categorical_columns = list(df.select_dtypes(include=["object"]).columns)
print(f"There are {len(categorical_columns)} categorical columns.\n")
for column in categorical_columns:
    missing_values = df[column].isna().sum()
    if missing_values > 0:
        print(len(df[column].unique()), column, missing_values)
        print(df[column].value_counts())
        print("\n\n")

Categorical columns:
There are 16 categorical columns.

8 NAME_TYPE_SUITE 770
Unaccompanied      149059
Family              24044
Spouse, partner      6869
Children             1982
Other_B              1081
Other_A               528
Group of people       173
Name: NAME_TYPE_SUITE, dtype: int64



19 OCCUPATION_TYPE 57867
Laborers                 32968
Sales staff              19351
Core staff               16492
Managers                 12780
Drivers                  11238
High skill tech staff     6859
Accountants               5897
Medicine staff            5130
Security staff            4074
Cooking staff             3621
Cleaning staff            2766
Private service staff     1565
Low-skill Laborers        1223
Waiters/barmen staff       819
Secretaries                787
Realty agents              434
HR staff                   321
IT staff                   314
Name: OCCUPATION_TYPE, dtype: int64



5 FONDKAPREMONT_MODE 126254
reg oper account         44300
reg oper spec accoun

In [12]:
def get_replacing_criteria_for_categorical_features(
        df: DataFrame
) -> dict:
    """
    Returns the replacing criteria for missing values of categorical features
    i.e., with what value should missing values in categorical features should
    be replaced.

    Args:
        df: DatFrame
            Pandas DataFrame containing the training data, needed to compute
            the replacing criteria for missing values

    Returns:
        dict:
            A dictionary containing column-name and replacing value, as key-
            value pair for each of the categorical feature in training datsaet.
    """
    # There are 6 categorical features in train data that contain missing
    # values - NAME_TYPE_SUITE, OCCUPATION_TYPE, FONDKAPREMONT_MODE,
    # HOUSETYPE_MODE, WALLSMATERIAL_MODE, EMERGENCYSTATE_MODE
    replace_cat_with = dict()
    replace_cat_with["NAME_TYPE_SUITE"] = None
    replace_cat_with["OCCUPATION_TYPE"] = None
    replace_cat_with["FONDKAPREMONT_MODE"] = None
    replace_cat_with["HOUSETYPE_MODE"] = None
    replace_cat_with["WALLSMATERIAL_MODE"] = None
    replace_cat_with["EMERGENCYSTATE_MODE"] = None

    # Replace missing values in NAME_TYPE_SUITE with most common class as there
    # are only 770 missing values as compared to 1,84,506 data points and in
    # which 1,49,059 data points belong to NAME_TYPE_SUITE. So it's safe to
    # assume that most of the data points belong to this category.
    replace_cat_with["NAME_TYPE_SUITE"] = \
    df["NAME_TYPE_SUITE"].value_counts().index[0]

    # In feature column EMERGENCYSTATE_MODE, there are only two categories -
    # Yes (wih 1,443 data points), and No (with 95,727 data points). And the
    # number of values missing is really large (87,336). Here, we can create a
    # new category 'Missing' to replace null values.
    replace_cat_with["EMERGENCYSTATE_MODE"] = "Missing"

    # For now, replace missing values in all other categorical features columns
    # with another cateogir 'Missing', and later, we will try to improve it.
    replace_cat_with["OCCUPATION_TYPE"] = "Missing"
    replace_cat_with["FONDKAPREMONT_MODE"] = "Missing"
    replace_cat_with["HOUSETYPE_MODE"] = "Missing"
    replace_cat_with["WALLSMATERIAL_MODE"] = "Missing"
    return replace_cat_with

In [13]:
replacing_criteria_for_cat_features = \
            get_replacing_criteria_for_categorical_features(df)

In [14]:
replacing_criteria_for_cat_features

{'NAME_TYPE_SUITE': 'Unaccompanied',
 'OCCUPATION_TYPE': 'Missing',
 'FONDKAPREMONT_MODE': 'Missing',
 'HOUSETYPE_MODE': 'Missing',
 'WALLSMATERIAL_MODE': 'Missing',
 'EMERGENCYSTATE_MODE': 'Missing'}

In [15]:
df.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_14,FLAG_DOCUMENT_16,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,Cash loans,F,N,Y,1,67500.0,227520.0,11065.5,180000.0,Family,...,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,Cash loans,M,Y,Y,1,247500.0,1882372.5,65560.5,1719000.0,Unaccompanied,...,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,0
2,Cash loans,M,Y,Y,1,180000.0,101880.0,10827.0,90000.0,"Spouse, partner",...,0,0,0,0.0,0.0,0.0,2.0,0.0,1.0,0
3,Cash loans,M,N,N,0,81000.0,405000.0,20677.5,405000.0,Unaccompanied,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0
4,Cash loans,F,N,Y,0,157500.0,888840.0,29506.5,675000.0,Unaccompanied,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0


In [16]:
for column_name, replace_with_value in replacing_criteria_for_cat_features.items():
    df[column_name].fillna(replace_with_value, inplace=True)

In [17]:
for column in categorical_columns:
    missing_values = df[column].isna().sum()
    if missing_values > 0:
        print(len(df[column].unique()), column, missing_values)
        print(df[column].value_counts())
        print("\n\n")

In [18]:
df.shape

(184506, 105)

In [19]:
print("Numerical columns:")
numerical_columns = list(df.select_dtypes(exclude=["object"]).columns)
print(f"There are {len(numerical_columns)} numerical columns.\n")
missing_column_count = 0
for column in numerical_columns:
    missing_values = df[column].isna().sum()
    if missing_values > 0:
        missing_column_count += 1
        print(len(df[column].unique()), column, missing_values)
#         print(df[column].value_counts())
#         print("\n\n")

Numerical columns:
There are 89 numerical columns.

12269 AMT_ANNUITY 6
783 AMT_GOODS_PRICE 167
55 OWN_CAR_AGE 121633
7 CNT_FAM_MEMBERS 1
73058 EXT_SOURCE_1 104074
94606 EXT_SOURCE_2 413
791 EXT_SOURCE_3 36656
1993 APARTMENTS_AVG 93575
3353 BASEMENTAREA_AVG 107975
122 YEARS_BEGINEXPLUATATION_AVG 89900
117 YEARS_BUILD_AVG 122757
2299 COMMONAREA_AVG 128971
222 ELEVATORS_AVG 98320
254 ENTRANCES_AVG 92807
349 FLOORSMAX_AVG 91726
271 FLOORSMIN_AVG 125244
2964 LANDAREA_AVG 109543
1579 LIVINGAPARTMENTS_AVG 126212
4549 LIVINGAREA_AVG 92534
2004 NONLIVINGAREA_AVG 101788
559 APARTMENTS_MODE 93575
3413 BASEMENTAREA_MODE 107975
120 YEARS_BEGINEXPLUATATION_MODE 89900
119 YEARS_BUILD_MODE 122757
2233 COMMONAREA_MODE 128971
18 ELEVATORS_MODE 98320
21 ENTRANCES_MODE 92807
24 FLOORSMAX_MODE 91726
26 FLOORSMIN_MODE 125244
2980 LANDAREA_MODE 109543
575 LIVINGAPARTMENTS_MODE 126212
4628 LIVINGAREA_MODE 92534
1981 NONLIVINGAREA_MODE 101788
947 APARTMENTS_MEDI 93575
3345 BASEMENTAREA_MEDI 107975
122 YEARS_B

In [20]:
def get_replacing_criteria_for_numerical_features(
        df: DataFrame
) -> dict:
    """
    Returns the replacing criteria for missing values of numerical features
    i.e., with what value should missing values in categorical features should
    be replaced.

    Args:
        df: DatFrame
            Pandas DataFrame containing the training data, needed to compute
            the replacing criteria for missing values of numerical features.

    Returns:
        dict:
            A dictionary containing column-name and replacing value, as key-
            value pair for each of the numerical feature in training dataset.
    """
    replace_num_with = dict()

    numerical_columns = list(df.select_dtypes(exclude=["object"]).columns)
    

    numerical_columns_with_missing_values = list()
    for column in numerical_columns:
        missing_values = df[column].isna().sum()
        if missing_values > 0:
            numerical_columns_with_missing_values.append(column)
            replace_num_with[column] = df[column].median()  # default criteria

    
    # It is a categorical columns that contains integer count, hence missing
    # values can be replaced with the mode of the feature
    replace_num_with["CNT_FAM_MEMBERS"] = \
        df["CNT_FAM_MEMBERS"].value_counts().index[0]
    return replace_num_with

In [21]:
replacing_criteria_for_num_features = \
            get_replacing_criteria_for_numerical_features(df)

In [22]:
replacing_criteria_for_num_features

{'AMT_ANNUITY': 24939.0,
 'AMT_GOODS_PRICE': 450000.0,
 'OWN_CAR_AGE': 9.0,
 'CNT_FAM_MEMBERS': 2.0,
 'EXT_SOURCE_1': 0.506691720318096,
 'EXT_SOURCE_2': 0.5659990279431356,
 'EXT_SOURCE_3': 0.5352762504724826,
 'APARTMENTS_AVG': 0.0876,
 'BASEMENTAREA_AVG': 0.0761,
 'YEARS_BEGINEXPLUATATION_AVG': 0.9816,
 'YEARS_BUILD_AVG': 0.7552,
 'COMMONAREA_AVG': 0.0212,
 'ELEVATORS_AVG': 0.0,
 'ENTRANCES_AVG': 0.1379,
 'FLOORSMAX_AVG': 0.1667,
 'FLOORSMIN_AVG': 0.2083,
 'LANDAREA_AVG': 0.0483,
 'LIVINGAPARTMENTS_AVG': 0.0756,
 'LIVINGAREA_AVG': 0.0742,
 'NONLIVINGAREA_AVG': 0.0035,
 'APARTMENTS_MODE': 0.084,
 'BASEMENTAREA_MODE': 0.0744,
 'YEARS_BEGINEXPLUATATION_MODE': 0.9816,
 'YEARS_BUILD_MODE': 0.7648,
 'COMMONAREA_MODE': 0.0191,
 'ELEVATORS_MODE': 0.0,
 'ENTRANCES_MODE': 0.1379,
 'FLOORSMAX_MODE': 0.1667,
 'FLOORSMIN_MODE': 0.2083,
 'LANDAREA_MODE': 0.0461,
 'LIVINGAPARTMENTS_MODE': 0.0762,
 'LIVINGAREA_MODE': 0.073,
 'NONLIVINGAREA_MODE': 0.001,
 'APARTMENTS_MEDI': 0.0869,
 'BASEMENTAREA_ME

In [23]:
for column_name, replace_with_value in replacing_criteria_for_num_features.items():
    df[column_name].fillna(replace_with_value, inplace=True)

In [24]:
df.shape

(184506, 105)

In [25]:
numerical_columns = list(df.select_dtypes(exclude=["object"]).columns)
categorical_columns = list(df.select_dtypes(include=["object"]).columns)

In [26]:
print(len(numerical_columns))
print(len(categorical_columns))

89
16


In [27]:
for column in categorical_columns:
    print(column, len(df[column].unique()))

NAME_CONTRACT_TYPE 2
CODE_GENDER 3
FLAG_OWN_CAR 2
FLAG_OWN_REALTY 2
NAME_TYPE_SUITE 7
NAME_INCOME_TYPE 8
NAME_EDUCATION_TYPE 5
NAME_FAMILY_STATUS 6
NAME_HOUSING_TYPE 6
OCCUPATION_TYPE 19
WEEKDAY_APPR_PROCESS_START 7
ORGANIZATION_TYPE 58
FONDKAPREMONT_MODE 5
HOUSETYPE_MODE 4
WALLSMATERIAL_MODE 8
EMERGENCYSTATE_MODE 3


In [28]:
target_column = df[["TARGET"]]
to_scale = df[numerical_columns]
to_scale = to_scale.drop(["TARGET"], axis=1)

In [29]:
to_scale.shape

(184506, 88)

In [30]:
to_scale.columns

Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
       'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
       'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
       'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG',
       'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
       'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG',
       'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE',
       'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'CO

In [31]:
from sklearn.preprocessing import RobustScaler, StandardScaler, Normalizer

In [32]:
# scaler = MinMaxScaler()
scaler = StandardScaler()
scaler.fit(to_scale)

In [33]:
scaled = scaler.transform(to_scale)
scaled_df = pd.DataFrame(scaled, columns=to_scale.columns)

In [34]:
scaled_df

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_16,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,0.982854,-1.195826,-0.926464,-1.144011,-0.974700,-0.269010,-0.457289,-0.168825,1.482901,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.172436,-0.404390,0.130522
1,0.982854,1.020039,3.201972,2.775649,3.213598,-0.596881,-0.452490,-0.205988,0.542476,3.637222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,1.570789,0.702773
2,0.982854,0.189090,-1.239904,-1.161165,-1.219630,0.318177,-0.495195,0.910328,-1.042339,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.703295,-0.404390,-0.441729
3,-0.585228,-1.029636,-0.483697,-0.452649,-0.362376,0.750829,-0.493504,-0.409960,-1.075453,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,0.130522
4,-0.585228,-0.087893,0.723361,0.182394,0.372413,-0.067014,-0.454253,1.013307,-1.079427,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,0.130522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184501,0.982854,-0.364877,-1.135050,-0.825194,-1.097165,0.893670,-0.462017,0.311177,-1.166846,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,-0.441729
184502,-0.585228,0.466073,-0.595960,-0.645233,-0.484841,-1.475484,2.133158,-2.038618,-1.139031,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,1.570789,-1.013980
184503,-0.585228,-0.364877,0.751203,1.365413,0.984737,0.229216,-0.452434,-0.011377,1.862382,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,-0.441729
184504,-0.585228,-0.508908,0.364488,-0.466567,0.562883,-1.043061,2.133158,-0.375918,-0.429076,-0.102647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,0.130522


In [35]:
df = pd.concat([df[categorical_columns], scaled_df, target_column], axis=1)
df

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,...,FLAG_DOCUMENT_14,FLAG_DOCUMENT_16,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,Cash loans,F,N,Y,Family,Commercial associate,Secondary / secondary special,Married,House / apartment,Sales staff,...,0.0,0.0,0.0,0.0,0.0,0.0,2.172436,-0.404390,0.130522,0
1,Cash loans,M,Y,Y,Unaccompanied,Working,Higher education,Civil marriage,House / apartment,Managers,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,1.570789,0.702773,0
2,Cash loans,M,Y,Y,"Spouse, partner",Working,Higher education,Married,House / apartment,Laborers,...,0.0,0.0,0.0,0.0,0.0,0.0,4.703295,-0.404390,-0.441729,0
3,Cash loans,M,N,N,Unaccompanied,Working,Secondary / secondary special,Single / not married,With parents,Missing,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,0.130522,0
4,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Sales staff,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,0.130522,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184501,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Secretaries,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,-0.441729,0
184502,Revolving loans,F,N,Y,Unaccompanied,Pensioner,Secondary / secondary special,Married,House / apartment,Missing,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,1.570789,-1.013980,0
184503,Cash loans,F,N,N,Unaccompanied,Commercial associate,Secondary / secondary special,Married,With parents,Laborers,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,-0.441729,0
184504,Cash loans,F,N,Y,Family,Pensioner,Secondary / secondary special,Married,House / apartment,Missing,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.358424,-0.404390,0.130522,0


In [36]:
for column in categorical_columns:
    missing_values = df[column].isna().sum()
    if missing_values > 0:
        print(len(df[column].unique()), column, missing_values)
        print(df[column].value_counts())
        print("\n\n")

In [37]:
print("Categorical columns:")
categorical_columns = list(df.select_dtypes(include=["object"]).columns)
print(f"There are {len(categorical_columns)} categorical columns.\n")
for column in categorical_columns:
#     print(len(df[column].unique()), column)
    if len(df[column].unique()) < 3:
        print(column)

Categorical columns:
There are 16 categorical columns.

NAME_CONTRACT_TYPE
FLAG_OWN_CAR
FLAG_OWN_REALTY


In [38]:
df["NAME_CONTRACT_TYPE"].replace({"Cash loans": 1, "Revolving loans": 0}, inplace=True)
df["FLAG_OWN_CAR"].replace({"Y": 1, "N": 0}, inplace=True)
df["FLAG_OWN_REALTY"].replace({"Y": 1, "N": 0}, inplace=True)

In [39]:
print("Categorical columns:")
categorical_columns = list(df.select_dtypes(include=["object"]).columns)
print(f"There are {len(categorical_columns)} categorical columns.\n")
for column in categorical_columns:
    print(len(df[column].unique()), column)

Categorical columns:
There are 13 categorical columns.

3 CODE_GENDER
7 NAME_TYPE_SUITE
8 NAME_INCOME_TYPE
5 NAME_EDUCATION_TYPE
6 NAME_FAMILY_STATUS
6 NAME_HOUSING_TYPE
19 OCCUPATION_TYPE
7 WEEKDAY_APPR_PROCESS_START
58 ORGANIZATION_TYPE
5 FONDKAPREMONT_MODE
4 HOUSETYPE_MODE
8 WALLSMATERIAL_MODE
3 EMERGENCYSTATE_MODE


In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [41]:
columns_using_onehot_encoder = [
    "OCCUPATION_TYPE",
    "ORGANIZATION_TYPE",
    "CODE_GENDER",
    "NAME_TYPE_SUITE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
    "WEEKDAY_APPR_PROCESS_START"
]


In [42]:
transformer = make_column_transformer(
    (OneHotEncoder(), columns_using_onehot_encoder),
    remainder='passthrough')

In [43]:
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]
print(X.shape)
print(Y.shape)

(184506, 104)
(184506,)


In [44]:
transformed = transformer.fit_transform(X)

In [45]:
transformed.shape

(184506, 230)

In [46]:
stratified_splits = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.25,
    random_state=42,
)

In [47]:
# standard
clf = LogisticRegression(random_state=42, max_iter=3000, n_jobs=-1)
iter = 0
for train_indices, test_indices in stratified_splits.split(transformed, Y):
    iter +=1 
    X_train, X_val = transformed[train_indices], transformed[test_indices]
    y_train, y_val = Y.loc[train_indices], Y.loc[test_indices]
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_predict)
    f1_macro = f1_score(y_val.values, y_predict, average="macro")
    print(f"F1 macro score is: {f1_macro}")

F1 macro score is: 0.4898804386355013
F1 macro score is: 0.48727841347528456
F1 macro score is: 0.4914734818500509
F1 macro score is: 0.4880098245484216
F1 macro score is: 0.4927329815621138


In [None]:
# standard
clf = LogisticRegression(random_state=42, max_iter=3000, n_jobs=-1, solver="saga", penalty="elasticnet", l1_ratio=0.5)
iter = 0
for train_indices, test_indices in stratified_splits.split(transformed, Y):
    iter +=1 
    X_train, X_val = transformed[train_indices], transformed[test_indices]
    y_train, y_val = Y.loc[train_indices], Y.loc[test_indices]
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_predict)
    f1_macro = f1_score(y_val.values, y_predict, average="macro")
    print(f"F1 macro score is: {f1_macro}")

In [51]:
clf = LogisticRegression(random_state=42, max_iter=3000, n_jobs=-1)
iter = 0
for train_indices, test_indices in stratified_splits.split(transformed, Y):
    iter +=1 
    X_train, X_val = transformed[train_indices], transformed[test_indices]
    y_train, y_val = Y.loc[train_indices], Y.loc[test_indices]
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_predict)
    f1_macro = f1_score(y_val.values, y_predict, average="macro")
    print(f"F1 macro score is: {f1_macro}")

F1 macro score is: 0.4891295466261802
F1 macro score is: 0.48676177453527225
F1 macro score is: 0.49175554063584137
F1 macro score is: 0.48806794488685806
F1 macro score is: 0.4920284235670436


In [48]:
test = pd.read_csv("../data/dataset/test_data.csv")
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,367294,Cash loans,F,N,Y,0,180000.0,265306.5,25317.0,252000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,2.0,5.0
1,439847,Cash loans,F,N,Y,0,202500.0,346500.0,21069.0,346500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
2,380562,Cash loans,M,Y,N,0,360000.0,545040.0,36553.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,5.0
3,407238,Cash loans,F,N,Y,0,135000.0,307557.0,20682.0,265500.0,...,0,0,0,0,0.0,0.0,1.0,0.0,1.0,2.0
4,239910,Cash loans,F,N,Y,0,157500.0,1056447.0,31018.5,922500.0,...,0,0,0,0,,,,,,


In [49]:
columns_to_drop = [
    "REGION_POPULATION_RELATIVE",
    "FLAG_MOBIL",
    "FLAG_DOCUMENT_2",
    "FLAG_DOCUMENT_4",
    "FLAG_DOCUMENT_7",
    "FLAG_DOCUMENT_10",
    "FLAG_DOCUMENT_12",
    "FLAG_DOCUMENT_17",
    "FLAG_DOCUMENT_21",
]
test = test.drop(columns=columns_to_drop)
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_16,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,367294,Cash loans,F,N,Y,0,180000.0,265306.5,25317.0,252000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,2.0,5.0
1,439847,Cash loans,F,N,Y,0,202500.0,346500.0,21069.0,346500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
2,380562,Cash loans,M,Y,N,0,360000.0,545040.0,36553.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,5.0
3,407238,Cash loans,F,N,Y,0,135000.0,307557.0,20682.0,265500.0,...,0,0,0,0,0.0,0.0,1.0,0.0,1.0,2.0
4,239910,Cash loans,F,N,Y,0,157500.0,1056447.0,31018.5,922500.0,...,0,0,0,0,,,,,,


In [50]:
for column_name, replace_with_value in replacing_criteria_for_cat_features.items():
    test[column_name].fillna(replace_with_value, inplace=True)

In [51]:
for column_name, replace_with_value in replacing_criteria_for_num_features.items():
    test[column_name].fillna(replace_with_value, inplace=True)

In [52]:
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_16,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,367294,Cash loans,F,N,Y,0,180000.0,265306.5,25317.0,252000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,2.0,5.0
1,439847,Cash loans,F,N,Y,0,202500.0,346500.0,21069.0,346500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
2,380562,Cash loans,M,Y,N,0,360000.0,545040.0,36553.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,5.0
3,407238,Cash loans,F,N,Y,0,135000.0,307557.0,20682.0,265500.0,...,0,0,0,0,0.0,0.0,1.0,0.0,1.0,2.0
4,239910,Cash loans,F,N,Y,0,157500.0,1056447.0,31018.5,922500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [53]:
numerical_columns = list(test.select_dtypes(exclude=["object"]).columns)
categorical_columns = list(test.select_dtypes(include=["object"]).columns)

In [54]:
numerical_columns = numerical_columns[1:]
numerical_columns

['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMON

In [55]:
id_column = test[["SK_ID_CURR"]]
to_scale = test[numerical_columns]

In [56]:
to_scale.shape

(123005, 95)

In [57]:
scaled = scaler.transform(to_scale)
scaled_df = pd.DataFrame(scaled, columns=to_scale.columns)
test_df = pd.concat([id_column, test[categorical_columns], scaled_df], axis=1)
test_df.head()

Feature names unseen at fit time:
- FLAG_CONT_MOBILE
- FLAG_DOCUMENT_15
- FLAG_DOCUMENT_19
- FLAG_DOCUMENT_20
- NONLIVINGAPARTMENTS_AVG
- ...



ValueError: X has 95 features, but StandardScaler is expecting 88 features as input.

In [59]:
test_df["NAME_CONTRACT_TYPE"].replace({"Cash loans": 1, "Revolving loans": 0}, inplace=True)
test_df["FLAG_OWN_CAR"].replace({"Y": 1, "N": 0}, inplace=True)
test_df["FLAG_OWN_REALTY"].replace({"Y": 1, "N": 0}, inplace=True)

In [60]:
test_transformed = transformer.transform(test_df)

In [61]:
test_transformed.shape

(123005, 237)

In [63]:
y_predict = clf.predict(test_transformed)

In [65]:
submission_df = pd.DataFrame(columns=["SK_ID_CURR", "TARGET"])
submission_df["SK_ID_CURR"] = id_column
submission_df["TARGET"] = y_predict.astype(int)
submission_df.to_csv("submission.csv", index=False)

In [64]:
set(y_predict)

{0, 1}

In [83]:
clf = LogisticRegression(random_state=42, max_iter=3000, n_jobs=-1, solver="saga", penalty="elasticnet",  l1_ratio=0.5)
iter = 0
for train_indices, test_indices in stratified_splits.split(X, Y):
    iter +=1 
#     print(X[train_indices].shape)
#     break
    X_train, X_val = X[train_indices], X[test_indices]
    y_train, y_val = Y[train_indices], Y[test_indices]
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_predict)
    f1 = f1_score(y_val, y_predict)
    f1_macro = f1_score(y_val, y_predict, average="macro")
    precision = precision_score(y_val, y_predict)
    recall = recall_score(y_val, y_predict)
    print(f"F1 macro score is: {f1_macro}")


F1 macro score is: 0.4891381245021095
F1 macro score is: 0.48676177453527225


KeyboardInterrupt: 