In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import os
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from category_encoders import *
from pathlib import Path


In [2]:
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR       = ROOT / "csv_files" / "train"
TEST_DIR        = ROOT / "csv_files" / "test"
BASE_TRAIN_PATH = TRAIN_DIR / "train_base.csv"
BASE_TEST_PATH  = TEST_DIR / "test_base.csv"


MODEL_PATH = "/kaggle/working/homecredit-dataset/best_model.pth"
LOAD_MODEL = False

In [3]:
main_file = pl.read_csv(BASE_TRAIN_PATH)

In [4]:
d = os.listdir("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train")

In [5]:
def check_dtype(file_list):
    for file_name in file_list:
        print("*"*5+file_name+"*"*5)
        col_dtype = {}
        file = pd.read_csv(TRAIN_DIR/file_name)
        for col in file.columns:
            col_dtype[col] = file[col].dtype
        
        print(col_dtype)
            

In [6]:
# check_dtype(os.listdir("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train"))

In [7]:
for file in d:
    temp_file = pl.read_csv(TRAIN_DIR/file)
#     print("\n","*"*10)
    print(file)
    print(temp_file.schema)

train_credit_bureau_a_1_3.csv
OrderedDict([('case_id', Int64), ('annualeffectiverate_199L', Float64), ('annualeffectiverate_63L', Float64), ('classificationofcontr_13M', String), ('classificationofcontr_400M', String), ('contractst_545M', String), ('contractst_964M', String), ('contractsum_5085717L', Float64), ('credlmt_230A', Float64), ('credlmt_935A', Float64), ('dateofcredend_289D', String), ('dateofcredend_353D', String), ('dateofcredstart_181D', String), ('dateofcredstart_739D', String), ('dateofrealrepmt_138D', String), ('debtoutstand_525A', Float64), ('debtoverdue_47A', Float64), ('description_351M', String), ('dpdmax_139P', Float64), ('dpdmax_757P', Float64), ('dpdmaxdatemonth_442T', Float64), ('dpdmaxdatemonth_89T', Float64), ('dpdmaxdateyear_596T', Float64), ('dpdmaxdateyear_896T', Float64), ('financialinstitution_382M', String), ('financialinstitution_591M', String), ('instlamount_768A', Float64), ('instlamount_852A', Float64), ('interestrate_508L', Float64), ('lastupdate_11

In [8]:
def set_table_dtypes(df):
    for col in df.columns:
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
        if col[-1] in ("M"):
            df = df.with_columns(pl.col(col).cast(pl.String).alias(col))

    return df

In [9]:
def handle_dates(df):
    for col in df.columns:
        if col[-1] in ("D"):
            df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days())
            
    return df

In [10]:
def filter_cols(df):
    
    # Drop if null count of column higher than 80%
    
    for col in df.columns:
        if col in ["case_id", "WEEK_NUM"]:
            continue
            
        isnull = df[col].is_null().mean()
        
        if isnull > 0.8:
            df = df.drop(col)
            
    # Drop if number of unique values of column is not between 2-100
            
    for col in df.columns[1:]:
        if col in ["case_id", "WEEK_NUM"]:
            continue
        if df[col].dtype != pl.String:
            continue
            
        freq = df[col].n_unique()
        
        if (freq == 1) | (freq > 100):
            df = df.drop(col)

    return df

In [11]:
def read_file(path):
    df = pl.read_csv(path)
    df = df.pipe(set_table_dtypes)
    
    return df

In [12]:

def feature_eng(df_base, df_person_1, df_static, df_static_cb, df_credit_bureau_b_2):
    df_base = (
        df_base
        .with_columns(
            date_decision = pl.col("date_decision").cast(pl.Date),
            WEEK_NUM = pl.col("WEEK_NUM").cast(pl.Int32),
        )
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    
    df_person_1 = (
        df_person_1
        .group_by("case_id")
        .agg(
            [pl.max(col) for col in df_person_1.columns if col != "case_id"],
        )
    )
    
    df_credit_bureau_b_2 = (
        df_credit_bureau_b_2
        .group_by("case_id")
        .agg(
            [pl.max(col) for col in df_credit_bureau_b_2.columns if col != "case_id"],
        )
    )

    df_data = (
        df_base
        .join(df_person_1, how="left", on="case_id", suffix="_p1")
        .join(df_static, how="left", on="case_id", suffix="_s")
        .join(df_static_cb, how="left", on="case_id", suffix="_scb")
        .join(df_credit_bureau_b_2, how="left", on="case_id", suffix="cbb2")
    )
    
    return df_data

In [13]:
def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    
    return df_data, cat_cols

In [14]:
# d1 = pl.read_csv(TRAIN_DIR/'train_person_1.csv')
# d2 = pl.read_csv(TRAIN_DIR/'train_person_2.csv')
# count = 0
# for i in d1.columns:
#     for j in d2.columns:
#         if i == j:
#             count=+1
#             print(i,j)

In [15]:
# df_base              = read_file(TRAIN_DIR / "train_base.csv")
# df_static_cb         = read_file(TRAIN_DIR / "train_static_cb_0.cvs")
# df_person_1          = read_file(TRAIN_DIR / "train_person_1.csv")
# df_credit_bureau_b_2 = read_file(TRAIN_DIR / "train_credit_bureau_b_2.cvs")
# df_person_1          = read_file(TRAIN_DIR / "train_person_1.csv")
# df_credit_bureau_b_1 = pl.concat([
#     read_file(TRAIN / "train_credit_bureau_a_1_0.csv"),
#     read_file(TRAIN / "train_credit_bureau_a_1_0.csv"),
#     read_file(TRAIN / "train_credit_bureau_a_1_0.csv"),
# ],how="vertical_relaxed")

# df_applprev = pl.concat([
#     read_file(TRAIN / "train_applprev_1_0.csv"),
#     read_file(TRAIN / "train_applprev_1_1.csv")
# ],how="vertical_relaxed")
# df_static = pl.concat([
#     read_file(TRAIN_DIR / "train_static_0_0.parquet"),
#     read_file(TRAIN_DIR / "train_static_0_1.parquet"),
# ], how="vertical_relaxed")

In [16]:
# def check_id(file_list):
#     count = 0
#     l = []
#     t = len(file_list)
#     print(t)
#     for file_name in file_list:
#         file = pl.read_csv(TRAIN_DIR/file_name)
#         if file.columns[0] == 'case_id':
#             l.append(file_name)
#             count+=1
    
#     print(f"CSV's {l} have case_id and total number is {count}")
#     print(file.columns[0])
        
# check_id(d)
            

In [17]:
main_file.head(10)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1
5,"""2019-01-02""",201901,0,0
6,"""2019-01-03""",201901,0,0
7,"""2019-01-03""",201901,0,0
8,"""2019-01-03""",201901,0,0
9,"""2019-01-03""",201901,0,0


In [18]:
main_file.schema

OrderedDict([('case_id', Int64),
             ('date_decision', String),
             ('MONTH', Int64),
             ('WEEK_NUM', Int64),
             ('target', Int64)])

In [19]:
for col in main_file.columns:
    print(main_file[col].dtype)

Int64
String
Int64
Int64
Int64
