# Projeto German Credit Risk

In [1]:
import requests
import logging
import zipfile
import pandas as pd

from pathlib import Path



## Carregando os dados

In [2]:
# Datasets are kept in a `data` folder that sits next to the current working directory.
DATA_DIR = Path.cwd().parent.joinpath('data')

In [3]:
def load_data(data_dir: Path, timeout: float = 30.0) -> 'pd.DataFrame | None':
    """Load the Statlog German Credit dataset, downloading and caching it on first use.

    Lookup order:
      1. ``data_dir / 'data.pkl'`` -- pickled DataFrame written by a previous call.
      2. ``data_dir / 'german.data'`` -- raw file left by a previous extraction.
      3. Download the zip archive from the UCI repository and extract it.

    Args:
        data_dir: Directory holding the archive, the extracted files and the
            pickle cache. Created if it does not exist.
        timeout: Seconds to wait for the HTTP server before giving up; without
            it ``requests`` would wait indefinitely on a stalled connection.

    Returns:
        DataFrame with the 21 named columns, or ``None`` when the server
        answers with a status code other than 200.
    """
    pickle_file_path = data_dir / 'data.pkl'
    if pickle_file_path.exists():
        # NOTE: unpickling can execute arbitrary code; only trusted files belong in data_dir.
        return pd.read_pickle(pickle_file_path)

    data_dir.mkdir(parents=True, exist_ok=True)
    raw_file_path = data_dir / 'german.data'

    # Only hit the network when the raw file has not been extracted before.
    if not raw_file_path.exists():
        url = 'https://archive.ics.uci.edu/static/public/144/statlog+german+credit+data.zip'
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # ERROR level so the failure is visible with the default logging configuration.
            logging.error('Failed to download dataset (HTTP %s).', response.status_code)
            return None

        file_path = data_dir / 'data.zip'
        with open(file_path, 'wb') as file:
            file.write(response.content)
        logging.info('Dataset downloaded successfully.')

        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(path=data_dir)

    # The raw file has no header row, so the column names are supplied here.
    columns = [
        'status',
        'duration',
        'history',
        'purpose',
        'amount',
        'savings',
        'employment',
        'installment',
        'status_sex',
        'guarantors',
        'residence',
        'property_type',
        'age',
        'plans',
        'housing',
        'credits',
        'job',
        'dependents',
        'telephone',
        'foreign',
        'target',
    ]
    data = pd.read_csv(raw_file_path, sep=' ', header=None, names=columns)

    # Cache the parsed frame so later calls skip both download and parsing.
    data.to_pickle(pickle_file_path)
    return data

In [4]:
german = load_data(DATA_DIR)
# load_data returns None when the download fails; stop here with a clear message
# instead of failing later with an AttributeError on None.
if german is None:
    raise RuntimeError('German credit dataset could not be loaded; check the network connection and retry.')

## Análise Exploratória

In [5]:
# Column overview: dtype and non-null count per column, to spot missing values early.
german.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   status         1000 non-null   object
 1   duration       1000 non-null   int64 
 2   history        1000 non-null   object
 3   purpose        1000 non-null   object
 4   amount         1000 non-null   int64 
 5   savings        1000 non-null   object
 6   employment     1000 non-null   object
 7   installment    1000 non-null   int64 
 8   status_sex     1000 non-null   object
 9   guarantors     1000 non-null   object
 10  residence      1000 non-null   int64 
 11  property_type  1000 non-null   object
 12  age            1000 non-null   int64 
 13  plans          1000 non-null   object
 14  housing        1000 non-null   object
 15  credits        1000 non-null   int64 
 16  job            1000 non-null   object
 17  dependents     1000 non-null   int64 
 18  telephone      1000 non-null 

In [6]:
# Preview the first rows to see the coded category labels next to the numeric fields.
german.head()

Unnamed: 0,status,duration,history,purpose,amount,savings,employment,installment,status_sex,guarantors,...,property_type,age,plans,housing,credits,job,dependents,telephone,foreign,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [7]:
# Summary statistics; by default describe() reports only the numeric columns.
german.describe()

Unnamed: 0,duration,amount,installment,residence,age,credits,dependents,target
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155,1.3
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086,0.458487
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0,1.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0,1.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0,1.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0,2.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0,2.0


In [8]:
# Columns stored as integers (int64 in `german.info()`), excluding the target.
numerical_features = [
    'duration', 'amount', 'installment', 'residence',
    'age', 'credits', 'dependents',
]

# Columns holding coded category labels such as 'A11' or 'A201'.
categorical_features = [
    'status', 'history', 'purpose', 'savings', 'employment',
    'status_sex', 'guarantors', 'property_type', 'plans',
    'housing', 'job', 'telephone', 'foreign',
]

In [9]:
# Separate the predictors from the label column.
Y = german['target']
X = german.drop(columns='target')

In [10]:
# Frequency of every level, ordered by level code, for each categorical column.
for col in categorical_features:
    level_counts = X[col].value_counts()
    print(level_counts.sort_index())

status
A11    274
A12    269
A13     63
A14    394
Name: count, dtype: int64
history
A30     40
A31     49
A32    530
A33     88
A34    293
Name: count, dtype: int64
purpose
A40     234
A41     103
A410     12
A42     181
A43     280
A44      12
A45      22
A46      50
A48       9
A49      97
Name: count, dtype: int64
savings
A61    603
A62    103
A63     63
A64     48
A65    183
Name: count, dtype: int64
employment
A71     62
A72    172
A73    339
A74    174
A75    253
Name: count, dtype: int64
status_sex
A91     50
A92    310
A93    548
A94     92
Name: count, dtype: int64
guarantors
A101    907
A102     41
A103     52
Name: count, dtype: int64
property_type
A121    282
A122    232
A123    332
A124    154
Name: count, dtype: int64
plans
A141    139
A142     47
A143    814
Name: count, dtype: int64
housing
A151    179
A152    713
A153    108
Name: count, dtype: int64
job
A171     22
A172    200
A173    630
A174    148
Name: count, dtype: int64
telephone
A191    596
A192    404
Name: c