In [None]:
#!pip install -U imblearn 
#!pip install -U Texttable
#!pip install -U tabulate
#!pip install -U imgkit

import pandas as pd
import numpy  as np

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from tabulate import tabulate
from texttable import Texttable

from plotly.subplots import make_subplots

import os

---

<center><strong><font color = "darkblue" size=4> Importação e visualização dos dados</font></strong></center>

---

In [None]:
# Load the GiveMeSomeCredit training file into `base`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
caminho_treino = "C:/Users/jaque/Desktop/Behavior Score/Bases/GiveMeSomeCredit/cs-training.csv"
base = pd.read_csv(caminho_treino, sep=",")

# Show the original column names (they are renamed in the next cell).
base.columns

In [None]:
# Rename all columns positionally: the order of this list must match the order
# of the columns read from the CSV.
# The spelling "N_EmeprestimosAbertos" is kept on purpose because later cells
# refer to the column (and its "CAT_" derivative) by this exact name.
novos_nomes = [
    "Id",
    "Perf_final",
    "UltPercLimit",
    "Idade",
    "N_Atraso30_59Dias",
    "RazaoGastos",
    "RendaMensal",
    "N_EmeprestimosAbertos",
    "N_atrasos_Ult90Dias",
    "N_emprestimos",
    "N_Atraso60_89Dias",
    "N_dependentes",
]
base.columns = novos_nomes

In [None]:
# Inspect the dtype pandas inferred for each (renamed) column.
base.dtypes

In [None]:
# Draw a reproducible 60% random sample of the full base, without replacement.
# All later transformations are applied to this sample (`base1`).
FRACAO_AMOSTRA = 0.6
SEMENTE_AMOSTRA = 1

base1 = base.sample(frac=FRACAO_AMOSTRA, replace=False, random_state=SEMENTE_AMOSTRA)
base1

In [None]:
# Percentile table for every column of the full base.
# The table is only displayed here; nothing is written to disk.
# Each percentile is listed once (0.99 used to appear twice, which produced a
# duplicated row in the output).
percentis = [0.1, 0.5, 0.8, 0.85, 0.9, 0.95, 0.99, 1]
base.quantile(percentis)


In [None]:
# 99th percentile of every column of the full base (a one-row DataFrame).
# Later cells use it as the upper cap for outliers.
P99 = base.quantile([0.99])

# Labels for the predictor columns at positions 2..11, in column order.
rotulos_p99 = [
    "UltPercLimit:",
    "Idade:",
    "N_Atraso30_59Dias:",
    "RazaoGastos:",
    "RendaMensal:",
    "N_EmeprestimosAbertos:",
    "N_atrasos_Ult90Dias:",
    "N_emprestimos:",
    "N_Atraso60_89Dias:",
    "N_dependentes:",
]

# Build one flat argument list (label, value, newline, ...) so a single
# print call emits the whole listing.
partes = ["\n"]
for posicao, rotulo in enumerate(rotulos_p99, start=2):
    partes.extend([rotulo, P99.iloc[0, posicao], "\n"])

print(*partes)

In [None]:
# Cap outliers in the sampled base: every predictor (column positions 2..11)
# is limited to the 99th percentile computed on the full base (`P99`).
#
# Each column is reassigned as a whole. The previous chained form
# `base1.iloc[:, i][mask] = value` writes into a temporary object and is not
# guaranteed to modify `base1` (it has no effect at all under pandas
# copy-on-write). Missing values are left untouched by `clip`.
for i in range(2, 12):
    coluna = base1.columns[i]
    base1[coluna] = base1[coluna].clip(upper=P99.iloc[0, i])

# Percentiles after capping: the maximum of each capped column should now
# equal its P99 value.
base1.quantile([0.1, 0.5, 0.8, 0.85, 0.9, 0.95, 0.99, 1])

In [None]:
# Initial manipulations.

# A customer who never took a loan has no observable behaviour, so they are
# treated as doubtful: zero loans is recoded to 5.
# A single `.loc` assignment is used instead of chained indexing, which is not
# guaranteed to write back into `base1`.
base1.loc[base1["N_emprestimos"] == 0, "N_emprestimos"] = 5

# Smooth the distribution of RazaoGastos with two candidate transforms.
# The 0.1 offset keeps the logarithm finite when the ratio is 0.
base1["lnRazaoGastos"] = np.log(base1["RazaoGastos"] + 0.1)
base1["SqrtRazaoGastos"] = np.sqrt(base1["RazaoGastos"])

# Side-by-side histograms of the two transforms.
transf_hist1 = make_subplots(rows=1, cols=2)

trace0 = go.Histogram(x=base1["lnRazaoGastos"], name="lnRazaoGastos atual")
trace1 = go.Histogram(x=base1["SqrtRazaoGastos"], name="SqrtRazaoGastos")

# `add_trace` with row/col replaces the deprecated `append_trace`.
transf_hist1.add_trace(trace0, row=1, col=1)
transf_hist1.add_trace(trace1, row=1, col=2)

transf_hist1.show()

In [None]:
# Compare the percentiles of the raw ratio with those of its two transforms.
# Each percentile is listed once (0.99 used to appear twice, duplicating a row).
percentis_transf = [0, 0.1, 0.5, 0.8, 0.85, 0.9, 0.95, 0.99, 1]
base1[["lnRazaoGastos", "SqrtRazaoGastos", "RazaoGastos"]].quantile(percentis_transf)

In [None]:
# Frequency table of the target variable: absolute count per class plus its
# percentage share, rounded to one decimal place.
tab1 = pd.crosstab(index=base1["Perf_final"], columns=["count"])
participacao = tab1["count"] / tab1["count"].sum() * 100
tab1["percent"] = participacao.round(1)

# Turn the target value back into a regular column.
tab1 = tab1.reset_index()
tab1

In [None]:
# Donut (pie) chart of the target distribution.
# The labels are positional: the first row of tab1 is shown as "Bom" and the
# second as "Mau".
# NOTE(review): assumes tab1 is ordered Perf_final = 0, then 1 - confirm.
rosca_target = go.Pie(labels=["Bom", "Mau"], values=tab1["percent"], hole=.5)
titulo_target = dict(title=dict(text="Distribuição da variável Target"))

fig = go.Figure(data=[rosca_target], layout=titulo_target)

# Show the percentage value inside each slice and separate slices with a white border.
fig.update_traces(
    hoverinfo="label+percent",
    textinfo="value",
    textfont_size=20,
    marker=dict(line=dict(color="#ffffff", width=2)),
)
fig.show()


In [None]:
# Histograms of the predictors.

# Column positions to plot (position 5 is skipped; 12 is a derived column).
# `keep` is reused by the box-plot cell below.
keep = [2, 3, 4, 6, 7, 8, 9, 10, 11, 12]

N_COLUNAS_GRADE = 4  # subplots per row in the 3 x 4 grid

fig_hist1 = make_subplots(rows=3, cols=N_COLUNAS_GRADE,
                          subplot_titles=base1.columns[keep])

# Fill the grid row by row. Deriving the cell from the position in `keep`
# adds each histogram exactly once (the last one used to be appended twice to
# the same subplot, which doubled its bars).
for posicao, k in enumerate(keep):
    linha, coluna = divmod(posicao, N_COLUNAS_GRADE)
    fig_hist1.add_trace(
        go.Histogram(x=base1.iloc[:, k], marker_color="#0066cc"),
        row=linha + 1,
        col=coluna + 1,
    )

fig_hist1.update_layout(height=600, width=1000,
                        title="Distribuição das variáveis preditoras", showlegend=False)

fig_hist1.show()


In [None]:
# Box plots of each predictor split by the target class.
# Uses the column positions in `keep`, defined in the histogram cell.
fig_box1 = make_subplots(rows=3, cols=4,
                         subplot_titles=base1.columns[keep])

# One box trace per plotted column, keyed the same way as before: (position, 0).
trace = {}
for k in keep:
    trace[k, 0] = go.Box(y=base1.iloc[:, k], x=base1["Perf_final"], marker_color="#0099cc")

# Place the traces row by row on the 3 x 4 grid, following the order of `keep`.
for posicao, k in enumerate(keep):
    linha, coluna = divmod(posicao, 4)
    fig_box1.append_trace(trace[k, 0], linha + 1, coluna + 1)

fig_box1.update_layout(
    height=600,
    width=1000,
    title="BoxPlot das variáveis preditoras segundo a target",
    showlegend=False,
)
fig_box1.show()
    

In [None]:
# Pearson correlation matrix of the ten original predictors
# (column positions 2..11 of the sample), drawn as a heatmap.

# Axis labels, in the same order as the columns selected below.
# `labels` is reused by the Spearman cell.
labels = [
    "UltPercLimit",
    "Idade",
    "N_Atraso30_59Dias",
    "RazaoGastos",
    "RendaMensal",
    "N_EmeprestimosAbertos",
    "N_atrasos_Ult90Dias",
    "N_emprestimos",
    "N_Atraso60_89Dias",
    "N_dependentes",
]

preditoras = base1.iloc[:, 2:12]
cor_matrix = preditoras.corr(method="pearson")

mapa_pearson = go.Heatmap(z=cor_matrix, x=labels, y=labels, hoverongaps=False)
fig_cor = go.Figure(data=mapa_pearson)

fig_cor.show()

In [None]:
# Spearman (rank) correlation matrix of the ten original predictors
# (column positions 2..11 of the sample), drawn as a heatmap.
cor_matrix1 = base1.iloc[:, 2:12].corr(method="spearman")

# Plot the Spearman matrix itself. The heatmap used to receive `cor_matrix`
# (the Pearson result), so the Spearman values were computed but never shown.
# The axis labels come from the matrix, so they always match its rows and columns.
rotulos_spearman = list(cor_matrix1.columns)

fig_cor1 = go.Figure(data=go.Heatmap(
                   z = cor_matrix1,
                   x = rotulos_spearman,
                   y = rotulos_spearman,
                   hoverongaps = False))
fig_cor1.update_layout(title = "Correlação entre as preditoras")
fig_cor1.show()

---

<center><strong><font color = "darkblue" size=4> Categorizando e criando WOE</font></strong></center>

---

In [None]:
# Helper to evaluate a categorisation
def resumo_CAT(k, dados=None):
    """Summarise the column at position ``k`` within each of its categories.

    The categories are read from the companion column named
    ``"CAT_" + <column name>``, which must already exist in the frame.

    Parameters
    ----------
    k : int
        Position of the numeric column to summarise.
    dados : pandas.DataFrame, optional
        Frame to summarise. Defaults to the notebook-level ``base1``.

    Returns
    -------
    pandas.DataFrame
        One row per category with the columns ``CAT_<name>``, ``Media``,
        ``Std``, ``Min``, ``Max`` and ``Percent`` (share of the rows that
        have a category, in percent).
    """
    if dados is None:
        dados = base1

    coluna = dados.columns[k]
    coluna_cat = "CAT_" + coluna

    # One groupby feeds every statistic, so all of them (and the share) are
    # aligned by category label rather than by row position.
    grupos = dados.groupby(coluna_cat)[coluna]
    resumo = grupos.agg(["mean", "std", "min", "max"])
    resumo.columns = ["Media", "Std", "Min", "Max"]

    # Share of each category among the rows that have a category assigned.
    tamanho = grupos.size()
    resumo["Percent"] = tamanho / tamanho.sum() * 100

    return resumo.reset_index()

In [None]:
# Bin the selected columns (given by position) into five equal-frequency
# groups labelled 1..5, store each result as "CAT_<column name>", and print a
# per-category summary to check the binning.
n = [2, 3, 5, 7, 12]
group_names = [1, 2, 3, 4, 5]

for j in n:
    nome_coluna = base1.columns[j]
    base1["CAT_" + nome_coluna] = pd.qcut(base1[nome_coluna], 5, labels=group_names)
    print(j)
    print(tabulate(resumo_CAT(j), headers="keys"))

In [None]:
# Function to compute WoE and IV
# https://towardsdatascience.com/attribute-relevance-analysis-in-python-iv-and-woe-b5651443fc04

def calcular_woe_iv(dataset, feature, target):
    """Compute Weight of Evidence (WoE) and Information Value (IV) of a feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data containing both ``feature`` and ``target``.
    feature : str
        Name of the categorical/discrete column to evaluate.
    target : str
        Name of the binary target column; 0 is counted as "Bom" and 1 as "Mau".

    Returns
    -------
    (pandas.DataFrame, float)
        A table with one row per distinct non-missing value of ``feature``, in
        order of first appearance, with the columns ``Value``, ``All``,
        ``Bom``, ``Mau``, ``Distr_Bom``, ``Distr_Mau``, ``WoE`` and ``IV``;
        and the total IV (sum of the ``IV`` column).
    """
    valores = dataset[feature]
    eh_bom = dataset[target] == 0
    eh_mau = dataset[target] == 1

    # Iterate over the non-missing distinct values only. `nunique()` ignores
    # NaN while `unique()` includes it, so pairing the two produced a bogus NaN
    # row and could drop a real category when the feature had missing values.
    linhas = []
    for val in valores.dropna().unique():
        pertence = valores == val
        linhas.append({
            "Value": val,
            "All": int(pertence.sum()),
            "Bom": int((pertence & eh_bom).sum()),
            "Mau": int((pertence & eh_mau).sum()),
        })

    # Explicit columns keep the table well-formed even when there are no rows.
    dset = pd.DataFrame(linhas, columns=["Value", "All", "Bom", "Mau"])
    dset["Distr_Bom"] = dset["Bom"] / dset["Bom"].sum()
    dset["Distr_Mau"] = dset["Mau"] / dset["Mau"].sum()

    # A category with no "Bom" or no "Mau" yields +/-inf; those are set to 0
    # below, so the numpy warnings for log(0) / division by zero are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        dset["WoE"] = np.log(dset["Distr_Bom"] / dset["Distr_Mau"])
    dset["WoE"] = dset["WoE"].replace([np.inf, -np.inf], 0)

    dset["IV"] = (dset["Distr_Bom"] - dset["Distr_Mau"]) * dset["WoE"]
    iv = dset["IV"].sum()

    return dset, iv

In [None]:
# Variables for which WoE/IV is computed: discrete counts used as-is, plus the
# quintile-binned ("CAT_") versions of the continuous predictors.
variaveis = ["N_atrasos_Ult90Dias", "N_emprestimos", "N_Atraso60_89Dias","N_dependentes",
            "CAT_UltPercLimit","CAT_Idade", "CAT_lnRazaoGastos","CAT_N_EmeprestimosAbertos"]

# df1 maps the position of each variable in `variaveis` to its WoE/IV table;
# the plotting cell reads the tables back by that position.
df1 = {}
for l, col in enumerate(variaveis):
    print("WoE e IV: {}".format(col))
    df, iv = calcular_woe_iv(base1, col, "Perf_final")

    # Tag the table with the name of the WoE column it generates.
    df["nome"] = "WOE_" + col
    df1[l] = df

    print(l)
    print(tabulate(df, headers="keys"))
    print("IV score: {:.2f}".format(iv))
    print("\n")

    # Write each category's WoE into a new "WOE_<col>" column of the sample.
    # Columns are addressed by name ("Value", "WoE") rather than by position,
    # so this keeps working if the layout of the WoE table changes.
    # Rows matching no listed value (e.g. missing data) stay NaN.
    for valor, woe in zip(df["Value"], df["WoE"]):
        base1.loc[base1[col] == valor, "WOE_" + col] = woe

In [None]:
# WoE and IV plots: one row of subplots per variable, with WoE per category on
# the left and IV per category on the right.

clrred  = "#cc0000"   # negative WoE
clrblue = "#2d2d86"   # non-negative WoE, and all IV bars

# Size the grid from the tables that were actually computed instead of
# assuming exactly eight variables.
n_variaveis = len(df1)

# Each table stores its WoE column name in "nome"; read it positionally so a
# table with a single category works too.
nomes = [df1[k]["nome"].iloc[0] for k in range(n_variaveis)]

# Every variable name is used twice: once for its WoE panel, once for its IV panel.
titulos = [nome for nome in nomes for _ in range(2)]
fig_WOE = make_subplots(rows=n_variaveis, cols=2, subplot_titles=titulos)

for k in range(n_variaveis):
    tabela = df1[k]
    cores_woe = [clrred if woe < 0 else clrblue for woe in tabela["WoE"]]

    # `add_trace` with row/col replaces the deprecated `append_trace`.
    fig_WOE.add_trace(
        go.Bar(x=tabela["Value"], y=tabela["WoE"], marker=dict(color=cores_woe)),
        row=k + 1, col=1,
    )
    fig_WOE.add_trace(
        go.Bar(x=tabela["Value"], y=tabela["IV"], marker=dict(color=clrblue)),
        row=k + 1, col=2,
    )

# 175 px per row reproduces the original 1400 px height for eight variables.
fig_WOE.update_layout(height=175 * n_variaveis, width=900, showlegend=False, title="WOE e IV")

fig_WOE.show()


In [None]:
# Keep only the rows with no missing value in any column, then preview the result.
# NOTE(review): `base1` is overwritten, so the unfiltered sample is no longer
# available after this cell runs.
base1 = base1.dropna(axis=0, how="any")
base1.head(5)

In [None]:
# Salvando a base

#base1.to_csv(r"C:/Users/jaque/Desktop/Behavior Score/Bases/AmostraBehavior.csv", header=True, sep = ";",index = False)