**A base de dados**

link: https://www.kaggle.com/datasets/rohitudageri/credit-card-details


**Dicionário de dados**

* Ind_ID: Client ID
* GENDER: Gender information
* Car_Owner: Having car or not
* Propert_Owner: Having property or not
* Children: Count of children
* Annual_income: Annual income
* Type_Income: Income type
* EDUCATION: Education level
* Marital_status: Marital status
* Housing_type: Living style
* Birthday_count: Use backward count from current day (0), -1 means yesterday.
* Employed_days: Start date of employment. Use backward count from current day (0). A positive value means the individual is currently unemployed.
* Mobile_phone: Any mobile phone
* Work_Phone: Any work phone
* Phone: Any phone number
* EMAIL_ID: Any email ID
* Type_Occupation: Occupation
* Family_Members: Family size
* label: 0 is application approved and 1 is application rejected.





In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the applicant features and the label file; the two frames are joined
# on `Ind_ID` further down.
# NOTE(review): the '/content/...' prefix looks like a Google Colab path —
# adjust it when running in another environment.
df = pd.read_csv('/content/Credit_card.csv')
df_label = pd.read_csv('/content/Credit_card_label.csv')

#### Verificação dos dados

In [None]:
df.head(5)

In [None]:
df_label.head(5)

In [None]:
df.tail(5)

In [None]:
df_label.tail(5)

In [None]:
df.info()

In [None]:
df_label.info()

In [None]:
df_full = df.merge(df_label, how = 'left', on = 'Ind_ID')

In [None]:
df_full.info()

In [None]:
df_full.isna().sum()

In [None]:
df_full.index

In [None]:
df_numeros = df_full.select_dtypes("number")

In [None]:
df_numeros['EMAIL_ID'].unique()

In [None]:
# The identifier, the contact flags (analysed as categorical later on) and the
# target have a numeric dtype but are left out of the numeric analysis.
colunas_to_drop = ['Ind_ID', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID', 'label']

df_numeros = df_numeros.drop(columns=colunas_to_drop)

In [None]:
df_numeros.describe()

In [None]:
# One histogram per numeric column, coloured by the target class.
for coluna in df_numeros.columns:
    figura, eixo = plt.subplots(figsize=(4, 4))
    sns.histplot(data=df_full, x=coluna, hue='label', ax=eixo)
    eixo.set_title(f'Histograma de {coluna} por classe')
    eixo.set_xlabel(coluna)
    eixo.set_ylabel('Contagem')
    plt.show()


In [None]:
# Box plot of every numeric column, laid out on a single 6x4 grid.
fig = plt.figure(figsize=(14, 15))
for posicao, col in enumerate(df_numeros.columns, start=1):
    eixo = fig.add_subplot(6, 4, posicao)
    sns.boxplot(y=col, data=df_numeros, ax=eixo)
fig.tight_layout(pad=1.0)

In [None]:
df_full[ list(df_numeros.columns) + ['label'] ]

In [None]:
df_full[['label'] + list(df_numeros.columns)].corr()

In [None]:
sns.heatmap(df_full[['label'] + list(df_numeros.columns)].corr(), annot=True)

In [None]:
sns.heatmap(df_full[['label'] + list(df_numeros.columns)].corr('spearman'), annot=True);

Analisando as variáveis categóricas

In [None]:
df_full.dtypes

In [None]:
cat_columns = df_full.select_dtypes('object').columns
cat_columns

In [None]:
cat_columns = list(cat_columns) + ['Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID']
df_full[cat_columns]

In [None]:
# Bar chart of category frequencies for each categorical column, split by the
# target class.
for cat_column in cat_columns:
    figura, eixo = plt.subplots(figsize=(5, 3))
    sns.countplot(data=df_full, x=cat_column, hue='label', ax=eixo)
    eixo.set_title(f'Contagem de {cat_column} por classe')
    eixo.set_xlabel(cat_column)
    eixo.set_ylabel('Contagem')
    # Rotate the category names so long labels stay readable.
    eixo.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# Print, for each categorical column, the row-normalised contingency table
# against the target, expressed as percentages with two decimals.
for coluna in cat_columns:
    mensagem = f"Tabela de contigência entre {coluna} e o label"
    largura = len(mensagem)
    tabela = pd.crosstab(df_full[coluna], df_full['label'], normalize='index')
    percentuais = (tabela * 100).round(2)
    print("-" * largura, mensagem, "-" * largura, sep="\n")
    print(percentuais)
    print("#" * largura)


#### Construção de features

In [None]:
# Binary flag: 1 when Employed_days is zero or negative, 0 otherwise (the data
# dictionary states that positive values mean currently unemployed).
df_full['Employed'] = df_full['Employed_days'].le(0).astype('int64')
taxas_por_emprego = pd.crosstab(df_full['Employed'], df_full['label'], normalize='index')
(taxas_por_emprego * 100).round(2)

#### Recorte da base de dados

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_full['Mobile_phone'].value_counts()

In [None]:
columns_to_drop = ['Ind_ID', 'Mobile_phone', 'Employed_days']
df_selected = df_full.drop(columns_to_drop, axis = 1)

In [None]:
df_selected

In [None]:
df_selected.dtypes

In [None]:
X = df_selected.drop('label', axis = 1)
y = df_selected['label']

In [None]:
X

In [None]:
# Stratify on the target so the minority class keeps the same share in the
# training and test partitions; a purely random split of an imbalanced target
# can leave the test set with noticeably fewer positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
y.value_counts(normalize = True)

In [None]:
y_train.value_counts(normalize = True)

In [None]:
y_test.value_counts(normalize = True)

In [None]:
y_train.value_counts(normalize = True)

In [None]:
y_test.value_counts(normalize = True)

#### Tratamento das features categóricas

In [None]:
X_test

In [None]:
# Mobile_phone was dropped from the modelling frame, so it must leave the list
# of categorical columns too. Filtering (instead of list.remove) keeps the cell
# safe to re-run: remove() raises ValueError once the item is already gone.
cat_columns = [nome for nome in cat_columns if nome != 'Mobile_phone']

In [None]:
X_train[cat_columns].isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
X_train['Type_Occupation'].value_counts()

In [None]:
# Missing occupations become the explicit category 'Ausente'. The imputer is
# fitted on the training split and then applied to both splits.
imp_fill = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='Ausente')
ocupacao_treino = X_train['Type_Occupation'].to_numpy().reshape(-1, 1)
ocupacao_teste = X_test['Type_Occupation'].to_numpy().reshape(-1, 1)
imp_fill.fit(ocupacao_treino)
X_train['Type_Occupation'] = imp_fill.transform(ocupacao_treino)
X_test['Type_Occupation'] = imp_fill.transform(ocupacao_teste)




In [None]:
# Missing GENDER values are replaced by the most frequent value of the
# training split; the same fitted imputer is reused on the test split.
imp_most = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
genero_treino = X_train['GENDER'].to_numpy().reshape(-1, 1)
genero_teste = X_test['GENDER'].to_numpy().reshape(-1, 1)
imp_most.fit(genero_treino)
X_train['GENDER'] = imp_most.transform(genero_treino)
X_test['GENDER'] = imp_most.transform(genero_teste)

In [None]:
X_train[cat_columns]

In [None]:
#pip install category_encoders

In [None]:
from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder  import TargetEncoder
from category_encoders.ordinal  import OrdinalEncoder

In [None]:
X_train['Car_Owner'].unique()

In [None]:
# One-hot encode the listed categorical columns. The encoder is fitted on the
# training split only; the same fitted object is reused for the test split
# later on.
cat_to_onehot = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income']
one_hot_enc = OneHotEncoder(cols = cat_to_onehot)
one_hot_enc.fit(X_train)
X_train_encode = one_hot_enc.transform(X_train)
# Display the encoded training frame.
X_train_encode

In [None]:
X_train['EDUCATION'].unique()

In [None]:
# EDUCATION is ordinal, so it is mapped to increasing integers (1 = lowest
# level listed, 5 = highest) instead of being one-hot encoded.
cat_to_ordinal = ['EDUCATION']


# Explicit category -> rank mapping handed to the encoder.
# NOTE(review): values absent from this mapping (including missing values) fall
# back to the encoder's default handling — confirm that is acceptable.
mapping_education = [ { 'col' :  'EDUCATION',
   'mapping': {
    'Lower secondary' : 1,
    'Secondary / secondary special' : 2,
    'Incomplete higher' : 3,
    'Higher education' : 4,
    'Academic degree' : 5
   }
}
]

# Fit on the already one-hot-encoded training frame and replace it with the
# transformed version.
ordinal_enc = OrdinalEncoder(cols= cat_to_ordinal, mapping=mapping_education)
ordinal_enc.fit(X_train_encode)
X_train_encode = ordinal_enc.transform(X_train_encode)

In [None]:
X_train_encode['EDUCATION']

In [None]:
X_train['Housing_type'].unique()

In [None]:
# Target-encode the remaining categorical columns. The encoder is fitted with
# the training features and training labels only, so no information from the
# test split is used to build the encoding.
cat_to_target = ['Marital_status', 'Housing_type', 'Type_Occupation']
target_enc = TargetEncoder(cols= cat_to_target)
target_enc.fit(X_train_encode, y_train)
X_train_encode = target_enc.transform(X_train_encode)
# Display the fully encoded training frame.
X_train_encode

In [None]:
X_train_encode.info()

In [None]:
# Apply the three encoders fitted on the training split to the test split, in
# the same order in which they were applied to the training data.
X_test_encode = one_hot_enc.transform(X_test)
X_test_encode = ordinal_enc.transform(X_test_encode)
X_test_encode = target_enc.transform(X_test_encode)

In [None]:
X_train_encode['Annual_income'].hist()

In [None]:
# Outliers: cap Annual_income at mean + 3 standard deviations.
# The previous threshold was 3 * std alone, which ignores where the
# distribution is centred and therefore is not a 3-sigma upper bound.
# The threshold is computed on the training split only and reused for the test
# split; missing values are left untouched by clip().
qtd_desvios = 3
renda_treino = X_train_encode['Annual_income']
upper_income = renda_treino.mean() + qtd_desvios * renda_treino.std()
X_train_encode['Annual_income'] = X_train_encode['Annual_income'].clip(upper=upper_income)
X_test_encode['Annual_income'] = X_test_encode['Annual_income'].clip(upper=upper_income)

In [None]:
X_train_encode['Annual_income'].hist()

In [None]:
X_train_encode.isna().sum()

In [None]:
# Fill the remaining missing values with the per-column median of the training
# split; the imputer is configured to return pandas DataFrames.
imp_numbers = SimpleImputer(missing_values=np.nan, strategy='median')
imp_numbers.set_output(transform="pandas")
X_train_encode = imp_numbers.fit_transform(X_train_encode)
X_test_encode = imp_numbers.transform(X_test_encode)

In [None]:
X_train_encode.isna().sum()

In [None]:
X_train_encode.describe()

In [None]:
# Normalization: min-max scale every feature, with the scaling learned from
# the training split and reused unchanged on the test split.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.set_output(transform="pandas")
X_train_norm = scaler.fit_transform(X_train_encode)
X_test_norm = scaler.transform(X_test_encode)

In [None]:
X_train_norm.describe()

#### Modelagem/Treinamento

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=42)
clf.fit(X_train_norm, y_train)

In [None]:
# Small sample of the encoded training frame for inspection. Named
# `input_sample` rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
input_sample = X_train_encode.head()

In [None]:
predict_train = clf.predict(X_train_norm)
predict_test = clf.predict(X_test_norm)

In [None]:
from sklearn.metrics import accuracy_score as ACC

ACC(y_train, predict_train)

In [None]:
y_train.value_counts(normalize = True)

In [None]:
ACC(y_test, predict_test)

In [None]:
y_test.value_counts(normalize = True)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, predict_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, predict_test, zero_division = 0))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [None]:
neigh.fit(X_train_norm, y_train)

In [None]:
predict_train = neigh.predict(X_train_norm)
predict_test = neigh.predict(X_test_norm)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, predict_test)

In [None]:
print(classification_report(y_train, predict_train, zero_division = 0))

In [None]:
print(classification_report(y_test, predict_test, zero_division = 0))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_norm, y_train)

In [None]:
predict_train = clf.predict(X_train_norm)
predict_test = clf.predict(X_test_norm)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, predict_test)

In [None]:
print(classification_report(y_train, predict_train, zero_division = 0))

In [None]:
print(classification_report(y_test, predict_test, zero_division = 0))

##### Modelagem com gridsearch

In [None]:
# Hold out 20% of the training split for model selection. Stratifying keeps the
# share of the minority class equal in both parts, which matters because the
# candidates are compared by F1 on a small number of positive examples.
X_train_reduzido, X_val, y_train_reduzido, y_val = train_test_split(
    X_train_norm, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [None]:
from sklearn.metrics import f1_score

# Manual search over penalty/solver pairs for logistic regression, scored by
# F1 on the validation split.
penaltys = [None, 'l1', 'l2', 'elasticnet']
solvers = ['lbfgs', 'liblinear']

# Start below the minimum possible F1 (0.0) so the first valid combination is
# always recorded. With a starting value of 0 and a strict ">" comparison, a
# run where every candidate scores 0 would leave best_solver as None and the
# later refit would fail.
best_f1_score = -1.0
best_penalty = None
best_solver = None


for penalty in penaltys:
    for solver in solvers:
        print(f'{penalty} e {solver}')
        try:
            logistic = LogisticRegression(penalty=penalty, solver=solver, random_state=42)
            logistic.fit(X_train_reduzido, y_train_reduzido)
        except ValueError:
            # Unsupported penalty/solver pairs are rejected with ValueError;
            # skip them and move on to the next combination.
            print(f"{penalty} e {solver} não podem ser combinados!")
            continue

        pred_val = logistic.predict(X_val)
        f1 = f1_score(y_val, pred_val)
        print(f1)
        if f1 > best_f1_score:
            best_f1_score = f1
            best_penalty = penalty
            best_solver = solver

None e lbfgs
0.12121212121212122
None e liblinear
None e liblinear não podem ser combinados!
l1 e lbfgs
l1 e lbfgs não podem ser combinados!
l1 e liblinear
0.0
l2 e lbfgs
0.0
l2 e liblinear
0.0
elasticnet e lbfgs
elasticnet e lbfgs não podem ser combinados!
elasticnet e liblinear
elasticnet e liblinear não podem ser combinados!


In [None]:
(best_penalty, best_solver)

(None, 'lbfgs')

In [None]:
# Refit the winning configuration on the whole training split.
# It must be trained on the min-max scaled features (X_train_norm): the search
# was run on scaled data and every later predict() call receives X_*_norm.
# Fitting on the unscaled X_train_encode makes the model see inputs on a
# different scale at prediction time and collapses its predictions.
logistic = LogisticRegression(penalty=best_penalty, solver=best_solver, random_state=42)
logistic.fit(X_train_norm, y_train)

In [None]:
# Manual search over the number of neighbours, scored by F1 on the validation
# split.
ks = [3, 5, 7, 9, 11, 13, 15, 17, 19]
# Start below the minimum possible F1 so best_k is always set, even if every
# candidate scores 0 (otherwise best_k would stay None and the refit would
# fail).
best_f1_score = -1.0
best_k = None
for k in ks:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_reduzido, y_train_reduzido)
    y_pred = neigh.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    if f1 > best_f1_score:
        best_f1_score = f1
        best_k = k

In [None]:
best_f1_score

0.3673469387755102

In [None]:
neigh = KNeighborsClassifier(n_neighbors=best_k)
neigh.fit(X_train_norm, y_train)

In [None]:
# Manual search over split criterion and maximum depth for the decision tree,
# scored by F1 on the validation split.
criterios = ['gini', 'entropy', 'log_loss']
max_depths = [10, 20, 30, 40, 50, 100]
# Start below the minimum possible F1 so a best configuration is always
# recorded, even if every candidate scores 0 (otherwise best_criterio would
# stay None and the refit would fail).
best_f1_score = -1.0
best_criterio = None
best_max_depth = None


for criterio in criterios:
    for max_depth in max_depths:
        decTree = DecisionTreeClassifier(criterion=criterio, max_depth=max_depth, random_state=42)
        decTree.fit(X_train_reduzido, y_train_reduzido)

        y_pred = decTree.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        if f1 > best_f1_score:
            best_f1_score = f1
            best_criterio = criterio
            best_max_depth = max_depth

In [None]:
decTree = DecisionTreeClassifier(criterion=best_criterio, max_depth=best_max_depth, random_state=42)
decTree.fit(X_train_norm, y_train)

In [None]:
#### Comparing the models

# Predictions of the three tuned models on the training split ...
y_pred_train_logist = logistic.predict(X_train_norm)
y_pred_train_knn = neigh.predict(X_train_norm)
y_pred_train_dt = decTree.predict(X_train_norm)

# ... and on the test split (reported in the next cell).
y_pred_test_logist = logistic.predict(X_test_norm)
y_pred_test_knn = neigh.predict(X_test_norm)
y_pred_test_dt = decTree.predict(X_test_norm)

# Training-split classification report for each model, in a fixed order.
predicoes_treino = {
    'Regressão Logistica': y_pred_train_logist,
    'KNN': y_pred_train_knn,
    'Decision Tree': y_pred_train_dt,
}
for nome_modelo, predicoes in predicoes_treino.items():
    print(f'------------{nome_modelo} Treinamento:')
    print(classification_report(y_train, predicoes, zero_division=0))

------------Regressão Logistica Treinamento:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1093
           1       0.12      1.00      0.21       145

    accuracy                           0.12      1238
   macro avg       0.56      0.50      0.11      1238
weighted avg       0.90      0.12      0.03      1238

------------KNN Treinamento:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1093
           1       0.74      0.57      0.65       145

    accuracy                           0.93      1238
   macro avg       0.84      0.77      0.80      1238
weighted avg       0.92      0.93      0.92      1238

------------Decision Tree Treinamento:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1093
           1       0.99      0.96      0.97       145

    accuracy                           0.99      1238
   macro avg   

In [None]:
# Test-split classification report for each model, in a fixed order.
predicoes_teste = {
    'Regressão Logistica': y_pred_test_logist,
    'KNN': y_pred_test_knn,
    'Decision Tree': y_pred_test_dt,
}
for nome_modelo, predicoes in predicoes_teste.items():
    print(f'------------{nome_modelo} Teste:')
    print(classification_report(y_test, predicoes, zero_division=0))

------------Regressão Logistica Teste:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       280
           1       0.10      1.00      0.18        30

    accuracy                           0.10       310
   macro avg       0.05      0.50      0.09       310
weighted avg       0.01      0.10      0.02       310

------------KNN Teste:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       280
           1       0.46      0.20      0.28        30

    accuracy                           0.90       310
   macro avg       0.69      0.59      0.61       310
weighted avg       0.87      0.90      0.88       310

------------Decision Tree Teste:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       280
           1       0.33      0.37      0.35        30

    accuracy                           0.87       310
   macro avg       0.63      0.64

#### Técnicas para aumentar o desempenho

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid of tree hyper-parameters, including class weights that increasingly
# favour the minority class (label 1).
parameters = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [10, 20, 30, 40, 50, 100],
    'class_weight' : [ {0:1, 1:1}, {0:1, 1:2}, {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:10} ]
  }

# Seed the base estimator like every other model in this notebook; an unseeded
# tree can give different scores (and a different winner) on each run.
base_tree = DecisionTreeClassifier(random_state=42)
# 5-fold cross-validated grid search on the training split, scored by macro F1.
dt = GridSearchCV(base_tree, parameters, cv=5, scoring='f1_macro')
dt.fit(X_train_norm, y_train)


In [None]:
pd.DataFrame( dt.cv_results_ )[['params','mean_test_score']]

In [None]:
dt.best_params_

{'class_weight': {0: 1, 1: 10}, 'criterion': 'entropy', 'max_depth': 100}

In [None]:
best_model = dt.best_estimator_

In [None]:
predict_train = best_model.predict(X_train_norm)
predict_test= best_model.predict(X_test_norm)

In [None]:
print(f'------------Decision Tree Treinamento:')
print(classification_report(y_train, predict_train, zero_division = 0))

In [None]:
print(f'------------Decision Tree Teste:')
print(classification_report(y_test, predict_test, zero_division = 0))

#### StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Same hyper-parameter grid as the previous search.
parameters = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [10, 20, 30, 40, 50, 100],
    'class_weight' : [ {0:1, 1:1}, {0:1, 1:2}, {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:10} ]
  }

kfolds = StratifiedKFold(5)
# Seed the base estimator so repeated runs select the same configuration.
base_tree = DecisionTreeClassifier(random_state=42)
# Hand the splitter object itself to GridSearchCV rather than the one-shot
# generator returned by kfolds.split(...): the search then produces the folds
# from the data passed to fit() and stays reusable.
dt = GridSearchCV(base_tree, parameters, scoring='f1_macro', cv=kfolds)
dt.fit(X_train_norm, y_train)

In [None]:
dt.best_params_

In [None]:
best_model = dt.best_estimator_
predict_train = best_model.predict(X_train_norm)
predict_test= best_model.predict(X_test_norm)

In [None]:
print(f'------------Decision Tree Treinamento:')
print(classification_report(y_train, predict_train, zero_division = 0))

In [None]:
print(f'------------Decision Tree Teste:')
print(classification_report(y_test, predict_test, zero_division = 0))