In [1]:
pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 144 in the UCI repository.
statlog_german_credit_data = fetch_ucirepo(id=144)

# Print the dataset-level metadata, then the per-variable description.
print(statlog_german_credit_data.metadata)
print(statlog_german_credit_data.variables)

# Features and targets as delivered by ucimlrepo.
# NOTE: both names are reassigned further down, when the features/target are
# rebuilt from the `data` frame, so the values bound here are not what the
# model is trained on.
X, y = (
    statlog_german_credit_data.data.features,
    statlog_german_credit_data.data.targets,
)


{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

In [3]:
# Importando bibliotecas necessárias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [4]:
# Load the data set straight from the UCI archive.
# The file is read with header=None, so the column names are supplied here,
# in file order; the last column is the target.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
column_names = [
    'Status of existing checking account',
    'Duration in months',
    'Credit history',
    'Purpose',
    'Credit amount',
    'Savings account/bonds',
    'Present employment since',
    'Installment rate in percentage of disposable income',
    'Personal status and sex',
    'Other debtors / guarantors',
    'Present residence since',
    'Property',
    'Age in years',
    'Other installment plans',
    'Housing',
    'Number of existing credits at this bank',
    'Job',
    'Number of people being liable to provide maintenance for',
    'Telephone',
    'Foreign worker',
    'Creditability',
]
data = pd.read_csv(url, names=column_names, header=None, sep=' ')

In [5]:
# Preview the loaded frame (pandas abbreviates the display of long/wide frames).
data

Unnamed: 0,Status of existing checking account,Duration in months,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,Creditability
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,...,A121,31,A143,A152,1,A172,1,A191,A201,1
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,...,A122,40,A143,A152,1,A174,1,A192,A201,1
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,...,A123,38,A143,A152,1,A173,1,A191,A201,1
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,...,A124,23,A143,A153,1,A173,1,A192,A201,2


In [6]:
# Encode the categorical attributes as integer codes, in place on `data`.
# Columns are picked by "not numeric" instead of `dtype == 'object'`: newer
# pandas releases can infer a dedicated string dtype for text columns, and the
# 'object' comparison would then skip them silently and leave raw strings in
# the feature matrix.
# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original category labels with `inverse_transform`.
label_encoders = {}
for column in data.select_dtypes(exclude="number").columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [7]:
# Split the frame into the target column and the remaining feature columns.
y = data['Creditability']
X = data.drop(columns=['Creditability'])

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Train the classifier: a random forest with default hyper-parameters and a
# fixed seed so repeated runs build the same forest.
model = RandomForestClassifier(random_state=42)
# `fit` is left as the cell's last expression so the fitted estimator is shown.
model.fit(X=X_train, y=y_train)

In [10]:
# Evaluate the model on the held-out test set.
y_pred = model.predict(X_test)

# Precision, recall and F1 are binary metrics computed for ONE class only.
# `pos_label=1` is scikit-learn's default and is spelled out here on purpose:
# the target is coded 1/2 (1 = good credit, 2 = bad credit according to the
# UCI dataset description), so these three numbers describe how well
# good-credit applicants are recognised, not bad-credit ones.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

# Rows are the true classes and columns the predicted classes, both in sorted
# label order (1, then 2).
conf_matrix = confusion_matrix(y_test, y_pred)

print("Acurácia:", accuracy)
print("Precisão:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Matriz de Confusão:", conf_matrix, sep="\n")

Acurácia: 0.805
Precisão: 0.81875
Recall: 0.9290780141843972
F1 Score: 0.8704318936877077
Matriz de Confusão:
[[131  10]
 [ 29  30]]


In [11]:
# Interpretation. The data set is about credit risk (not card fraud). The metrics above use
# class 1 as the positive class (1 = good credit, 2 = bad credit per the UCI dataset description).
# Accuracy: the model classifies 80.5% of the test-set credit applicants correctly.
# Precision: about 81.9% of the applicants predicted as good credit really are good credit.
# Recall: the model identifies about 92.9% of the good-credit applicants.
# F1 Score: 87.0% for the good-credit class, i.e. a good balance between precision and recall for that class.
# Confusion matrix (rows = true class, columns = predicted class, label order 1 then 2):
#TP: 131 good-credit applicants correctly classified as good.
#TN: 30 bad-credit applicants correctly classified as bad.
#FP: 29 bad-credit applicants classified as good (only 30 of the 59 bad credits, about 50.8%, are caught).
#FN: 10 good-credit applicants classified as bad.