In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw credit record file and preview its first rows.
raw_path = '../data/raw/credit_record.csv'
df = pd.read_csv(raw_path)
df.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [3]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

ID                 int64
MONTHS_BALANCE     int64
STATUS            object
dtype: object

In [4]:
# Report the frame's dimensions (rows first, then columns).
n_rows, n_cols = df.shape
print(f'Rows: {n_rows} \nColumns: {n_cols}')

Rows: 1048575 
Columns: 3


In [5]:
# Basic integrity checks: total missing cells across all columns, and the
# number of rows that exactly repeat an earlier row.
n_missing = df.isna().sum().sum()
n_duplicates = df.duplicated().sum()

print(f'Null values: {n_missing}')
print(f'Duplicated values: {n_duplicates}')

Null values: 0
Duplicated values: 0


In [32]:
# Pick one ID to inspect by hand. The seed is fixed so that a fresh
# "Restart & Run All" selects the same ID every time; without it the
# inspected history changes on each run.
random_id = df.sample(1, random_state=42).ID.values[0]

In [34]:
# Display every record belonging to the sampled ID.
df.loc[df['ID'] == random_id]

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
104290,5009512,0,C
104291,5009512,-1,C
104292,5009512,-2,C
104293,5009512,-3,C
104294,5009512,-4,C
104295,5009512,-5,C
104296,5009512,-6,C
104297,5009512,-7,C
104298,5009512,-8,C
104299,5009512,-9,C


In [7]:
# Frequency of each STATUS value over all records, most common first.
df.loc[:, ['STATUS']].value_counts()

STATUS
C         442031
0         383120
X         209230
1          11090
5           1693
2            868
3            320
4            223
Name: count, dtype: int64

In [8]:
# Count records per (ID, STATUS) pair, then pivot the STATUS values into
# columns so each ID becomes one row. Combinations that never occur are NaN.
df_g = (
    df
    .groupby(['ID', 'STATUS'])
    .size()
    .unstack()
)
df_g.head()

STATUS,0,1,2,3,4,5,C,X
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5001711,3.0,,,,,,,1.0
5001712,10.0,,,,,,9.0,
5001713,,,,,,,,22.0
5001714,,,,,,,,15.0
5001715,,,,,,,,60.0


In [9]:
# STATUS codes grouped into the two buckets used for the ratio below.
bad = ['0', '1', '2', '3', '4', '5']
good = ['C', 'X']

# Sum only the raw STATUS count columns. A bare df_g.sum(axis=1) adds up
# every column, so re-running this cell would fold the derived columns
# (total / bad / good / good_ratio) into 'total' and corrupt the ratio.
# NaN counts (status never seen for that ID) are skipped by sum().
df_g['total'] = df_g[bad + good].sum(axis=1)

df_g['bad'] = df_g[bad].sum(axis=1)
df_g['good'] = df_g[good].sum(axis=1)

# Share of an ID's records whose status falls in the "good" bucket.
df_g['good_ratio'] = df_g['good'] / df_g['total']

df_g[['bad', 'good', 'total', 'good_ratio']].describe()

STATUS,bad,good,total,good_ratio
count,45985.0,45985.0,45985.0,45985.0
mean,8.640078,14.162466,22.802544,0.524158
std,8.507858,14.711766,15.492771,0.35411
min,0.0,0.0,1.0,0.0
25%,3.0,1.0,10.0,0.166667
50%,6.0,9.0,19.0,0.571429
75%,12.0,23.0,34.0,0.844828
max,61.0,61.0,61.0,1.0


In [10]:
# Minimum share of "good" records for an ID to be labelled CLASS = 1;
# anything below the threshold is labelled 0.
GOOD_RATIO_THRESHOLD = 0.7

df_g['CLASS'] = np.where(df_g['good_ratio'] >= GOOD_RATIO_THRESHOLD, 1, 0)

# Fixed seed so the preview shows the same rows on every run.
df_g.sample(3, random_state=42)

STATUS,0,1,2,3,4,5,C,X,total,bad,good,good_ratio,CLASS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5126118,7.0,,,,,,,1.0,8.0,7.0,1.0,0.125,0
5011423,19.0,,,,,,3.0,,22.0,19.0,3.0,0.136364,0
5150070,3.0,,,,,,54.0,,57.0,3.0,54.0,0.947368,1


In [11]:
# Keep only the identifier and its label; reset_index() moves ID from the
# index back into a regular column.
df_g = df_g.reset_index()[['ID', 'CLASS']]

# value_counts() on a one-column frame keys each count by a 1-tuple,
# hence the key[0] when printing.
label_counts = df_g[['CLASS']].value_counts()
class_count = dict(label_counts)

total = sum(class_count.values())

# Print each class with its absolute count and percentage of all IDs.
for key, value in class_count.items():
    perc = (value / total) * 100

    print(f'{key[0]}: {value} ({perc:.2f}%)')

0: 27639 (60.10%)
1: 18346 (39.90%)


In [12]:
# Write the current df_g frame to the interim data folder, omitting the index.
output_path = '../data/interim/class_record.csv'
df_g.to_csv(output_path, index=False)