## Libraries

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Working directory that the relative "./data/..." paths below resolve against.
# Defaults to the original author's folder; set the TANGELO_DIR environment
# variable to run the notebook on another machine without editing this cell.
os.chdir(os.environ.get("TANGELO_DIR", "C:/Users/diego/Desktop/tangelo/"))

## Reading Data

### Application Records Dataset

In [144]:
# Application Records dataset
# Load the applicant records from the CSV under ./data (the path is relative to
# the current working directory) and preview the first rows.
app_df = pd.read_csv("./data/application_record.csv")
app_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


- `ID`: Client Id (Unique)
- `CODE_GENDER`: Gender (M: Male, F: Female). _(Boolean)_
- `FLAG_OWN_CAR`: Is there a car. _(Boolean)_
- `FLAG_OWN_REALTY`: Is there a property. _(Boolean)_
- `CNT_CHILDREN`: Number of children. _(Int)_
- `AMT_INCOME_TOTAL`: Annual Income. _(Float)_
- `NAME_INCOME_TYPE`: Income Category. _(Nominal Variable)_
- `NAME_EDUCATION_TYPE`: Educational Level. _(Nominal Variable 1-5)_ 
- `NAME_FAMILY_STATUS`: Marital Status. _(Nominal Variable 1-6)_
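- `DAYS_BIRTH`: Number of days since birth, counted backwards (negative in the sample above). _(Int)_
- `DAYS_EMPLOYED`: Number of days since the start of employment, counted backwards (negative in the sample above). _(Int)_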
- `FLAG_MOBIL`: Is there a mobile-phone. _(Boolean)_
- `FLAG_WORK_PHONE`: Is there a work-phone. _(Boolean)_
- `FLAG_PHONE`: Is there a phone. _(Boolean)_
- `FLAG_EMAIL`: Is there an email. _(Boolean)_
- `OCCUPATION_TYPE`: Occupation. _(Nominal Variable 1-20)_
- `CNT_FAM_MEMBERS`: Family Size. _(Int)_

In [145]:
# OCCUPATION_TYPE is the only column with NA values (around 30% of the dataset).
# Instead of removing those rows, the missing occupations are relabelled 'Other'.
print(app_df["OCCUPATION_TYPE"].isna().mean())
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the in-place form is chained assignment, which raises a
# FutureWarning in pandas 2.x and leaves app_df unchanged under Copy-on-Write.
app_df['OCCUPATION_TYPE'] = app_df['OCCUPATION_TYPE'].fillna('Other')

0.30601039317580153


### Credit Records Dataset

In [147]:
# Credit Records
cred_df = pd.read_csv("./data/credit_record.csv")
# We only want the most recent month of information for each client.
# Select the row holding the largest MONTHS_BALANCE per ID explicitly, instead
# of pairing max(MONTHS_BALANCE) with first(STATUS): that pairing only yields
# the latest status when the file happens to be sorted newest-first within
# each ID.
latest_rows = cred_df.groupby('ID')['MONTHS_BALANCE'].idxmax()
# MONTHS_BALANCE is not needed after the selection, so only ID and STATUS are
# kept; the index is reset so rows are numbered in ID order.
cred_df = cred_df.loc[latest_rows, ['ID', 'STATUS']].reset_index(drop=True)
# Filter the dataset: keep only the IDs that have information in app_df.
# .copy() makes the result an independent frame so the column assignment below
# does not trigger SettingWithCopyWarning.
cred_df = cred_df[cred_df['ID'].isin(app_df['ID'].unique())].copy()
# People with no loans or that have paid off ('X', 'C') are categorised as
# 0 (No Risk); the remaining statuses are digit strings, so cast to int.
cred_df['STATUS'] = cred_df['STATUS'].replace(['X', 'C'], '0').astype(int)

cred_df.head()

Unnamed: 0,ID,STATUS
3231,5008804,0
3232,5008805,0
3233,5008806,0
3234,5008808,0
3235,5008809,0


- `ID`: Client Id (Unique)
- `MONTHS_BALANCE`: Record Month _(Int)_
- `STATUS`: Status of the Loan

In [148]:
# Prints, for every credit status level, the share of IDs at or above it.
def ratio(df:pd.DataFrame):
    """Print the cumulative share of clients per STATUS level.

    For each distinct value of ``df["STATUS"]`` (ascending), prints one line
    ``"<level>: <share>"`` where the share is the count of non-null ``ID``
    values among rows with ``STATUS >= level`` divided by the total number of
    rows, formatted as a percentage with two decimals.

    Args:
        df: Frame with at least the columns ``ID`` and ``STATUS``.

    Returns:
        None. The result is written to standard output.
    """
    total_rows = len(df)
    for level in sorted(df["STATUS"].unique()):
        at_or_above = df.loc[df["STATUS"] >= level, "ID"].count()
        share = at_or_above / total_rows
        print(f"{level}: {share:.2%}")


# Print, for each STATUS level in cred_df, the share of clients at or above it.
ratio(cred_df)

0: 100.00%
1: 1.05%
2: 0.29%
3: 0.24%
4: 0.22%
5: 0.21%


Generally, at-risk users are expected to make up about 3% of the population, so I choose users who are more than 30 days overdue as the target risk users. Those samples are marked as '1'; all others are marked as '0'.

In [149]:
# Binary target: 0 when STATUS is at most 0, 1 for any higher status.
cred_df["target"] = cred_df["STATUS"].le(0).map({True: 0, False: 1})
# STATUS is fully encoded in target now, so the raw column is dropped.
cred_df.drop(columns="STATUS", inplace=True)
# Class balance of the target, as proportions.
cred_df["target"].value_counts(normalize=True)

0    0.989522
1    0.010478
Name: target, dtype: float64