# Credit card case

### Summary - Data processing

a) Data acquisition

b) Importing libraries

c) Reading the files

d) Missing values

e) Feature engineering

f) Encoding for categorical features

g) Normalization of numerical features

h) Saving essential files for the next steps

---

#### a) Data acquisition

- The dataset was downloaded from Kaggle and is available at the following link: <br>
https://www.kaggle.com/datasets/arjunbhasin2013/ccdata

#### b) Importing libraries

In [14]:
# Standard-library and third-party dependencies used by this notebook
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, '../') # prepend '../' (relative to the working directory) to the module search path so `src` can be imported below
# NOTE(review): the wildcard import is presumably where check_nan, cat_encoder and
# norm_features come from -- confirm, and consider importing those names explicitly.
from src.features.funcs import *

import warnings
warnings.filterwarnings("ignore")  # suppresses all warnings for the rest of the session

#### c) Reading the files

In [15]:
# Read the raw credit-card CSV into memory; the bare name on the last line
# renders the frame as this cell's output.
df_raw = pd.read_csv(filepath_or_buffer='../data/raw/CC GENERAL.csv')
df_raw

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,,0.000000,12
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,,0.000000,6
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


#### d) Missing values

In [16]:
# check_nan is a project helper (presumably reports missing values per column -- confirm
# in src.features.funcs); its return value is not displayed here.
check_nan(df_raw)
# Discard, in place, every row that has at least one missing value.
df_raw.dropna(axis=0, how='any', inplace=True)

#### e) Feature engineering
- From this point on, a new dataframe holding the engineered features, ready to be used in modeling, is built: `df_clean`

In [17]:
# Lowercase the column names so the rest of the notebook can refer to them uniformly
df_raw.columns = [col.lower() for col in df_raw.columns]

# Creation of features that can bring gain during modeling.
# Ratios of one-off / installment purchases to total purchases:
#   * 0 / 0 (no purchases at all) gives NaN, which is mapped to 0 by fillna;
#   * x / 0 with x > 0 gives +/-inf, which fillna does NOT touch -- those rows are removed below.
df_raw['perc_oneoff_purchases'] = (df_raw['oneoff_purchases'] / df_raw['purchases']).round(2).fillna(0)
df_raw['perc_installments_purchases'] = (df_raw['installments_purchases'] / df_raw['purchases']).round(2).fillna(0)
# Flag customers whose purchases exceed their credit limit
df_raw['defaulter'] = np.where(df_raw.purchases > df_raw.credit_limit, 'yes', 'no')

# Removing infinite values: turn +/-inf into NaN, then drop the affected rows
df_raw.replace([np.inf, -np.inf], np.nan, inplace=True)
df_raw.dropna(inplace=True)

# Take the snapshot only AFTER the infinite ratios are gone. Copying earlier left
# inf values in df_clean and gave it more rows than the frame that is encoded and
# normalized afterwards, so the two saved datasets did not describe the same customers.
df_clean = df_raw.copy()

#### f) Encoding for categorical features
- From this point on, a new dataframe is built to hold the encoded (and, in the next step, normalized) data for modeling: `df_norm`

In [18]:
# Categorical columns to encode; 'defaulter' holds the 'yes'/'no' labels created earlier.
cols_enc = ['defaulter']
# cat_encoder is a project helper; its result becomes df_norm.
# NOTE(review): presumably it returns a new frame with the listed columns replaced by
# numeric codes -- confirm in src.features.funcs whether df_raw itself is modified.
df_norm = cat_encoder(df_raw, cols_enc)

#### g) Normalization of numerical features

In [19]:
# norm_features is a project helper; df_norm is rebound to whatever it returns.
# NOTE(review): the second argument presumably lists columns to leave out of the
# scaling (here the already-encoded 'defaulter') -- confirm in src.features.funcs.
df_norm = norm_features(df_norm, ['defaulter'])
# Bare name on the last line renders the resulting frame as the cell output.
df_norm

Unnamed: 0,cust_id,balance,balance_frequency,purchases,oneoff_purchases,installments_purchases,cash_advance,purchases_frequency,oneoff_purchases_frequency,purchases_installments_frequency,...,cash_advance_trx,purchases_trx,credit_limit,payments,minimum_payments,prc_full_payment,tenure,perc_oneoff_purchases,perc_installments_purchases,defaulter
0,C10001,0.002148,0.818182,0.001945,0.000000,0.004240,0.000000,0.166667,0.000000,0.083333,...,0.00000,0.005587,0.031720,0.003978,0.001826,0.000000,1.0,0.000000,0.337838,0
1,C10002,0.168169,0.909091,0.000000,0.000000,0.000000,0.136685,0.000000,0.000000,0.000000,...,0.03252,0.000000,0.232053,0.080892,0.014034,0.222222,1.0,0.000000,0.000000,0
2,C10003,0.131026,1.000000,0.015766,0.018968,0.000000,0.000000,1.000000,1.000000,0.000000,...,0.00000,0.033520,0.248748,0.012263,0.008210,0.000000,1.0,0.555556,0.000000,0
4,C10005,0.042940,1.000000,0.000326,0.000393,0.000000,0.000000,0.083333,0.083333,0.000000,...,0.00000,0.002793,0.038397,0.013373,0.003204,0.000000,1.0,0.555556,0.000000,0
5,C10006,0.095038,1.000000,0.027188,0.000000,0.059257,0.000000,0.666667,0.000000,0.583333,...,0.00000,0.022346,0.058431,0.027602,0.031506,0.000000,1.0,0.000000,0.337838,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8943,C19184,0.000308,0.500000,0.000426,0.000513,0.000000,0.000000,0.166667,0.166667,0.000000,...,0.00000,0.002793,0.015025,0.001155,0.000569,0.000000,0.0,0.555556,0.000000,0
8945,C19186,0.001496,1.000000,0.005936,0.000000,0.012939,0.000000,1.000000,0.000000,0.833333,...,0.00000,0.016760,0.031720,0.006418,0.000640,0.500000,0.0,0.000000,0.337838,0
8947,C19188,0.001229,0.833333,0.002945,0.000000,0.006418,0.000000,0.833333,0.000000,0.666667,...,0.00000,0.013966,0.031720,0.001601,0.001078,0.250000,0.0,0.000000,0.337838,0
8948,C19189,0.000707,0.833333,0.000000,0.000000,0.000000,0.000776,0.000000,0.000000,0.000000,...,0.01626,0.000000,0.015025,0.001035,0.000729,0.250000,0.0,0.000000,0.000000,0


#### h) Saving essential files for the next steps

In [20]:
# Persist both frames for the following notebooks; index=False keeps the
# pandas row index out of the written files.
df_clean.to_csv(path_or_buf='../data/processed/cc_data_clean.csv', index=False)
df_norm.to_csv(path_or_buf='../data/processed/cc_data_norm.csv', index=False)