In [151]:
import pandas as pd
import requests
import io
import numpy as np
    
# Download the train and test CSV files from GitHub.
# The URLs must point at the raw file content, not the GitHub HTML page.

url_train = "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/treino.csv"
url_test = "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/teste.csv"

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as CSV.
response_train = requests.get(url_train, timeout=30)
response_train.raise_for_status()
download_train = response_train.content

response_test = requests.get(url_test, timeout=30)
response_test.raise_for_status()
download_test = response_test.content

# Decode the downloaded bytes and parse them into pandas DataFrames

df_train = pd.read_csv(io.StringIO(download_train.decode('utf-8')))
df_test = pd.read_csv(io.StringIO(download_test.decode('utf-8')))

# Uncomment to inspect column dtypes and non-null counts

#print(df_train.info())
#print(df_test.info())

In [152]:
# Drop rows with a missing salary: filling them (e.g. with 0) could bias the model.
# Then store numero_de_dependentes as int and remove duplicated rows.
# The chained calls build a new frame, so no column is assigned on a filtered
# slice (the pattern that triggers pandas' SettingWithCopyWarning).
df_train = (
    df_train
    .dropna(subset=['salario_mensal'])
    .astype({'numero_de_dependentes': int})
    .drop_duplicates(keep='first')
)

In [None]:
def exclude_outliers(DataFrame, col_name, n_std=2.9):
    """Drop rows whose value in ``col_name`` lies in the upper tail.

    A row is kept only when its value is strictly below
    ``mean + n_std * std`` of the column, both computed on the frame passed in.
    Only the upper tail is trimmed; low values are never removed. Rows whose
    value is NaN are dropped as well, because a comparison with NaN is False.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    col_name : str
        Column used both to compute the cut-off and to filter the rows.
    n_std : float, default 2.9
        Number of standard deviations above the mean used as the cut-off.

    Returns
    -------
    pandas.DataFrame
        The rows of ``DataFrame`` whose ``col_name`` value is below the cut-off.
    """
    column = DataFrame[col_name]
    upper_limit = column.mean() + n_std * column.std()
    return DataFrame[column < upper_limit]

# First trimming pass: remove upper-tail outliers one column at a time.
# Each call receives the frame already filtered by the previous columns.
outlier_column = [
    'util_linhas_inseguras',
    'idade',
    'vezes_passou_de_30_59_dias',
    'razao_debito',
    'salario_mensal',
    'numero_linhas_crdto_aberto',
    'numero_emprestimos_imobiliarios',
    'numero_de_dependentes',
]

for col in outlier_column:
    df_train = exclude_outliers(DataFrame=df_train, col_name=col)

df_train.describe()

In [None]:
# Second trimming pass, restricted to a smaller set of columns
remain_column = [
    'razao_debito',
    'util_linhas_inseguras',
    'salario_mensal',
    'numero_linhas_crdto_aberto',
]

for col in remain_column:
    df_train = exclude_outliers(DataFrame=df_train, col_name=col)

df_train.describe()

In [None]:
# Third trimming pass: only razao_debito is trimmed again
df_train = exclude_outliers(DataFrame=df_train, col_name='razao_debito')

df_train.describe()

In [None]:
# One histogram figure per column of the trimmed training data
for col in df_train.columns:
    df_train.hist(column=col, bins=20)
df_train.describe()

In [None]:
# Histogram of the monthly salaries below 1000.
# Filter the rows on the salary column only: comparing the whole frame with
# 1000 masks every column element-wise just to plot one of them.
df_train[df_train['salario_mensal'] < 1000].hist('salario_mensal', bins=40)

In [None]:
# Keep only the rows with a monthly salary above 200
salary_above_200 = df_train['salario_mensal'].gt(200)
df_train = df_train.loc[salary_above_200]
df_train.describe()

In [None]:
# Print each column name followed by its distinct values
for column_name, column_values in df_train.items():
    print(column_name)
    print(column_values.unique())

In [None]:
# Summary statistics of the training frame after the filtering steps
df_train.describe()

In [None]:
# Replace any remaining missing value with 0, then display the frame
df_train = df_train.fillna(value=0)
df_train

The data is now clean, so we can move on to fitting the model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, classification_report,\
                            accuracy_score, confusion_matrix, auc

In [None]:
# Binomial GLM of inadimplente on all ten candidate predictors
predictors_full = [
    'util_linhas_inseguras',
    'idade',
    'vezes_passou_de_30_59_dias',
    'razao_debito',
    'salario_mensal',
    'numero_linhas_crdto_aberto',
    'numero_vezes_passou_90_dias',
    'numero_emprestimos_imobiliarios',
    'numero_de_vezes_que_passou_60_89_dias',
    'numero_de_dependentes',
]
formula_full = 'inadimplente ~ ' + ' + '.join(predictors_full)
glm_full = smf.glm(formula=formula_full, data=df_train, family=sm.families.Binomial())
modelo = glm_full.fit()
print(modelo.summary())

In [159]:
# Refit without razao_debito, which the previous fit showed was not relevant
predictors_reduced = [
    'util_linhas_inseguras',
    'idade',
    'vezes_passou_de_30_59_dias',
    'salario_mensal',
    'numero_linhas_crdto_aberto',
    'numero_vezes_passou_90_dias',
    'numero_emprestimos_imobiliarios',
    'numero_de_vezes_que_passou_60_89_dias',
    'numero_de_dependentes',
]
formula_reduced = 'inadimplente ~ ' + ' + '.join(predictors_reduced)
glm_reduced = smf.glm(formula=formula_reduced, data=df_train, family=sm.families.Binomial())
modelo = glm_reduced.fit()
print(modelo.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           inadimplente   No. Observations:                81214
Model:                            GLM   Df Residuals:                    81204
Model Family:                Binomial   Df Model:                            9
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -15664.
Date:                Tue, 27 Apr 2021   Deviance:                       31327.
Time:                        11:56:11   Pearson chi2:                 1.42e+05
No. Iterations:                     7                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [160]:
# Exponentiated coefficients (odds ratios), skipping the first parameter
odds_ratios = np.exp(modelo.params.iloc[1:])
print(odds_ratios)

util_linhas_inseguras                    5.632036
idade                                    0.985799
vezes_passou_de_30_59_dias               1.564476
salario_mensal                           0.999929
numero_linhas_crdto_aberto               1.033461
numero_vezes_passou_90_dias              1.906019
numero_emprestimos_imobiliarios          1.073974
numero_de_vezes_que_passou_60_89_dias    1.857242
numero_de_dependentes                    1.065726
dtype: float64


In [161]:
# Odds ratios expressed as a percentage change: (exp(coef) - 1) * 100
odds_ratios = np.exp(modelo.params.iloc[1:])
(odds_ratios - 1) * 100

util_linhas_inseguras                    463.203596
idade                                     -1.420054
vezes_passou_de_30_59_dias                56.447572
salario_mensal                            -0.007078
numero_linhas_crdto_aberto                 3.346111
numero_vezes_passou_90_dias               90.601895
numero_emprestimos_imobiliarios            7.397422
numero_de_vezes_que_passou_60_89_dias     85.724221
numero_de_dependentes                      6.572594
dtype: float64

In [None]:
# Now do the same with scikit-learn to take advantage of its metrics
# NOTE(review): newer scikit-learn releases expect penalty=None rather than the
# string 'none' — confirm against the installed version.
model = LogisticRegression(penalty='none', solver='newton-cg')

# Predictors of the reduced model, listed once and reused for both frames below
model_features = [
    'util_linhas_inseguras',
    'idade',
    'vezes_passou_de_30_59_dias',
    'salario_mensal',
    'numero_linhas_crdto_aberto',
    'numero_vezes_passou_90_dias',
    'numero_emprestimos_imobiliarios',
    'numero_de_vezes_que_passou_60_89_dias',
    'numero_de_dependentes',
]
baseline_df = df_train[['inadimplente'] + model_features].dropna()
y = baseline_df.inadimplente
X = pd.get_dummies(baseline_df[model_features], drop_first=True)
print(X)

In [None]:
# Fit the scikit-learn logistic regression on the feature matrix X and target y
model.fit(X, y)

In [None]:
# Coefficients of the fitted scikit-learn model
print(model.coef_)

In [165]:
# Predicting the probabilities
yhat = model.predict_proba(X)

In [166]:
# Keep only the positive-class column. Recomputing from the model, instead of
# slicing yhat in place, keeps this cell safe to re-run: slicing an already
# 1-D yhat with [:, 1] raises IndexError.
yhat = model.predict_proba(X)[:, 1]

In [None]:
confusion_matrix(y, model.predict(X)) # using the sklearn function

In [None]:
# Accuracy of the predicted classes on the same X, y the model was fitted on
predicted_classes = model.predict(X)
acuracia = accuracy_score(y, predicted_classes)
print('O modelo obteve %0.4f de acurácia.' % acuracia)

In [None]:
# Per-class metrics for the classes predicted on X
print(classification_report(y, model.predict(X)))

In [None]:
# ROC AUC of the scores in yhat against the true labels y
print('AUC: %0.2f' % roc_auc_score(y, yhat))

In [172]:
def plot_roc_curve(y_true, y_score, figsize=(10,6)):
    """Plot the ROC curve of ``y_score`` against ``y_true`` and show it.

    The curve comes from ``roc_curve`` and the area printed in the legend from
    ``roc_auc_score``. A dashed diagonal from (0, 0) to (1, 1) is drawn as a
    reference line. The function displays the figure and returns nothing.

    Parameters
    ----------
    y_true : array-like
        True labels, passed unchanged to the scikit-learn metric functions.
    y_score : array-like
        Scores for the positive class, passed unchanged as well.
    figsize : tuple, default (10, 6)
        Figure size handed to matplotlib.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    area = roc_auc_score(y_true, y_score)

    _, ax = plt.subplots(figsize=figsize)
    ax.plot(false_pos_rate, true_pos_rate, color='orange',
            label='ROC curve (area = %0.2f)' % area)
    # Dashed diagonal used as a visual reference
    ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend()
    plt.show()

In [None]:
# Draw the ROC curve for the scores in yhat against the labels y
plot_roc_curve(y, yhat)

In [None]:
# Drop the test rows without a salary and leave out the razao_debito column,
# which is not among the predictors of the fitted model.
df_test = (
    df_test
    .dropna(subset=['salario_mensal'])
    .drop(columns=['razao_debito'], errors='ignore')
)
df_test.info()

In [None]:
# Store numero_de_dependentes as int in the test set, as was done for the
# training set. The previous version of this cell cast
# df_train['util_linhas_inseguras'] to int instead: astype(int) discards any
# fractional part of that model feature, and the test set was left untouched.
# NOTE(review): duplicates are dropped from df_test on the assumption that this
# cell was meant to prepare the test set — confirm that scoring does not need
# every original row.
df_test = (
    df_test
    .astype({'numero_de_dependentes': int})
    .drop_duplicates(keep='first')
)

In [None]:

# Score the test set with the fitted model. The training feature columns are
# selected explicitly so the model receives the same features, in the same
# order, as X; passing the whole frame only works if df_test happens to have
# exactly that layout.
prob = model.predict_proba(df_test[X.columns])
print(prob)