In [None]:
# One-time repository bootstrap: create a README, initialise git, make the
# first commit on `main`, and push it to the GitHub remote.
# NOTE(review): not meant to be re-run — `>>` appends another title line to
# README.md on every execution, and the git steps presumably assume a
# directory that has not been initialised yet.
! echo "# imbalanced-german-credit" >> README.md
! git init
! git add README.md
! git commit -m "first commit"
! git branch -M main
! git remote add origin https://github.com/AlisonZa/imbalanced-german-credit.git
! git push -u origin main

# 1. Setup

In [1]:
# Project configuration object: later cells read their folder and file paths
# from its attributes.
from src.entities import EnvironmentConfiguration
environment_configuration = EnvironmentConfiguration()

In [2]:
import os

# Create every output folder used by the notebook. `exist_ok=True` keeps the
# cell safe to re-run: folders that already exist are left as they are.
# Attributes are resolved one at a time inside the loop, so the folders are
# looked up and created in the same order as before.
for _folder_attribute in (
    "artifacts_folder",
    "eda_folder",
    "two_way_tables_folder",
    "plots_folder",
    "univariate_plots_folder",
    "bivariate_plots_folder",
    "y_data_profiling_folder",
    "pipelines_folder",
):
    os.makedirs(getattr(environment_configuration, _folder_attribute), exist_ok=True)

In [3]:
# Load the raw dataset into a DataFrame.
import pandas as pd

# NOTE(review): the attribute is named `raw_data_folder`, but it is handed
# straight to `read_csv`, so it presumably holds the path of the CSV file
# itself — confirm.
raw_data_path = environment_configuration.raw_data_folder
raw_data = pd.read_csv(raw_data_path)

In [4]:
# Preview the first five rows as a quick sanity check of the load.
raw_data.head(n=5)

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


# 2. Explore the data to gain insights.


## 1. Data Quality

In [5]:
# Automated data-quality report built with ydata-profiling
from ydata_profiling import ProfileReport

# Destination of the rendered report, taken from the project configuration.
report_path = environment_configuration.y_data_profiling_file

# Build the report in explorative mode and write it to disk.
profile = ProfileReport(raw_data, explorative=True)
profile.to_file(report_path)

print(f"Report saved to {report_path}")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 317/317 [02:07<00:00,  2.49it/s, Completed]                                                                 
Generate report structure: 100%|██████████| 1/1 [00:45<00:00, 45.40s/it]
Render HTML: 100%|██████████| 1/1 [00:48<00:00, 48.69s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]

Report saved to artifacts\exploratory_data_analysys\y_data_profiling\data_profiling.html





In [5]:
# Export the unique values found in `raw_data` (presumably per column) to an
# Excel spreadsheet; the helper prints the output path when it finishes.
from src.eda.unique_values import export_unique_values_to_excel

export_unique_values_to_excel(raw_data, environment_configuration)

Excel file created successfully: artifacts\exploratory_data_analysys\unique_values_spreadsheet.xlsx


In [6]:
# Number of distinct values in each column (missing values are not counted)
raw_data.nunique(axis=0, dropna=True)

CUST_ID                             8950
BALANCE                             8871
BALANCE_FREQUENCY                     43
PURCHASES                           6203
ONEOFF_PURCHASES                    4014
INSTALLMENTS_PURCHASES              4452
CASH_ADVANCE                        4323
PURCHASES_FREQUENCY                   47
ONEOFF_PURCHASES_FREQUENCY            47
PURCHASES_INSTALLMENTS_FREQUENCY      47
CASH_ADVANCE_FREQUENCY                54
CASH_ADVANCE_TRX                      65
PURCHASES_TRX                        173
CREDIT_LIMIT                         205
PAYMENTS                            8711
MINIMUM_PAYMENTS                    8636
PRC_FULL_PAYMENT                      47
TENURE                                 7
dtype: int64

## 2. Study each attribute and its characteristics:


Now we can define the feature-definition object that will be passed to the code that performs the data visualization.

In [13]:
from src.entities import FeatureDefinition

# Every column of the raw data except the customer identifier `CUST_ID`
# is treated as a numeric feature.
numeric_features = [
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
    'TENURE',
]

# The variable name (including its spelling) is kept as-is because the
# plotting cell further down refers to it.
feature_defintion = FeatureDefinition(
    type_of_task="clustering",
    data_frame=raw_data,
    numeric=numeric_features,
)

## 3. Visualize the data.


In [14]:
# Univariate and bivariate plots
from src.eda.plots import InteractiveEDAPlotter

# The plotter is given the project configuration and the feature definition
# built in the previous section.
interactive_eda_plotter = InteractiveEDAPlotter(
    env_config=environment_configuration,
    feature_def=feature_defintion,
)
interactive_eda_plotter.perform_eda()

The conclusions are in `Data_Exploration_Conclusions.md`; please refer to that file.