# 0. Imports

In [41]:
import joblib
import pandera
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import RobustScaler, StandardScaler
from sklearn.linear_model    import LogisticRegression
from sklearn.metrics         import roc_auc_score
from feature_engine.discretisation import EqualFrequencyDiscretiser,EqualWidthDiscretiser
from feature_engine.imputation     import MeanMedianImputer
from feature_engine.wrappers       import SklearnTransformerWrapper
from pandera                       import Check,Column,DataFrameSchema


# 1. Utils

In [3]:
# Quick look at the raw CSV, used only to list the available column names.
# NOTE(review): this path differs from the one DataLoad reads
# ('../data/raw/train.csv'), and `df` is rebound later from DataLoad —
# confirm which location is current.
df = pd.read_csv('../../primeira_etapa/train.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
       'Idade', 'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
       'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
       'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
       'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes'],
      dtype='object')

In [5]:
# Columns kept from the raw CSV: the target followed by the ten feature
# columns. The 'Unnamed: 0' column present in the file is not included.
columns_to_use = [
    'target',
    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
    'Idade',
    'NumeroDeVezes30-59DiasAtrasoNaoPior',
    'TaxaDeEndividamento',
    'RendaMensal',
    'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
    'NumeroDeVezes90DiasAtraso',
    'NumeroDeEmprestimosOuLinhasImobiliarias',
    'NumeroDeVezes60-89DiasAtrasoNaoPior',
    'NumeroDeDependentes',
]

# 2. Data Load

In [7]:
# Wrapping the load in a class helps if the project changes its data source: today the data comes from a CSV file, but it could later come from a SQL query.
class DataLoad:
    """Encapsulates loading of the raw training data.

    Keeping data access behind this class means callers do not change if the
    source moves from a CSV file to something else.
    """

    # File read when no explicit path is given (the previously hard-coded value).
    DEFAULT_PATH = '../data/raw/train.csv'

    def __init__(self, path: str = DEFAULT_PATH) -> None:
        """Remember which CSV file to load.

        Args:
            path: Location of the CSV file. Defaults to ``DEFAULT_PATH`` so
                ``DataLoad()`` keeps reading the same file as before.
        """
        self.path = path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV file at ``self.path``.

        Returns:
            The file contents as a pandas DataFrame, with no column selection
            or cleaning applied.
        """
        return pd.read_csv(self.path)


In [8]:
dl = DataLoad()

In [9]:
df = dl.load_data()[columns_to_use]

In [10]:
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# 3. Data Validation

In [11]:
class DataValidation:
    """Validates that a DataFrame has the expected columns and column types.

    ``run`` is the entry point: it first checks the column names
    (``check_shape_data``) and then the per-column schema (``check_columns``).
    None of the checks modify the DataFrame they receive.
    """

    def __init__(self, columns_to_use) -> None:
        """
        Args:
            columns_to_use: Ordered collection of the column names the
                DataFrame is expected to have.
        """
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        """Check that the DataFrame's columns are exactly ``columns_to_use``.

        The comparison is by name and order. The previous implementation
        assigned ``columns_to_use`` to ``dataframe.columns``, which renamed the
        caller's columns and accepted any frame with the right column count.

        Args:
            dataframe: Frame to inspect; it is only read.

        Returns:
            True when the column names match, False otherwise (including when
            ``dataframe`` has no ``columns`` attribute).
        """
        print('Inicou validacao')
        try:
            expected = list(self.columns_to_use)
            received = list(dataframe.columns)
        except (AttributeError, TypeError) as e:
            # Not a DataFrame-like object, or no usable expected column list.
            print(f'Validacao deu erro {e}')
            return False

        if received != expected:
            print(f'Validacao deu erro colunas esperadas {expected}, recebidas {received}')
            return False
        return True

    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validate column dtypes and allowed values with a pandera schema.

        Args:
            dataframe: Frame to validate.

        Returns:
            True when the frame satisfies the schema, False otherwise. On
            failure the collected failure cases are printed.
        """
        schema = DataFrameSchema(
            {
                # The target is coerced to int and must be binary.
                "target": Column(int, Check.isin([0, 1]), coerce=True),
                "TaxaDeUtilizacaoDeLinhasNaoGarantidas": Column(float, nullable=True),
                "Idade": Column(int, nullable=True),
                "NumeroDeVezes30-59DiasAtrasoNaoPior": Column(int, nullable=True),
                "TaxaDeEndividamento": Column(float, nullable=True),
                "RendaMensal": Column(float, nullable=True),
                "NumeroDeLinhasDeCreditoEEmprestimosAbertos": Column(int, nullable=True),
                "NumeroDeVezes90DiasAtraso": Column(int, nullable=True),
                "NumeroDeEmprestimosOuLinhasImobiliarias": Column(int, nullable=True),
                "NumeroDeVezes60-89DiasAtrasoNaoPior": Column(int, nullable=True),
                "NumeroDeDependentes": Column(float, nullable=True),
            }
        )
        try:
            # lazy=True makes pandera collect every failure and raise
            # SchemaErrors (plural). Without it the first failure raises
            # SchemaError, which the handler below would not catch.
            schema.validate(dataframe, lazy=True)
        except pandera.errors.SchemaErrors as exc:
            print('Validacao deu erro')
            # Plain print: the failure cases are a DataFrame and need no
            # notebook-specific display helper.
            print(exc.failure_cases)
            return False

        print('Validacao da colunas OK')
        return True

    def run(self, dataframe: pd.DataFrame) -> bool:
        """Run both checks and report the overall outcome.

        The schema check is skipped when the column-name check fails.

        Returns:
            True only when both checks pass.
        """
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print('Validacao Sucesso')
            return True

        print('Validacao Falhou')
        return False


In [12]:
dv = DataValidation(columns_to_use)

In [13]:
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [14]:
dv.run(df)

Inicou validacao
Validacao da colunas OK
Validacao Sucesso


top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



True

# 4. Data Transformation

In [15]:
# Class for data transformation; in this case, splitting the data into train and validation sets.
class DataTransformation:
    """Splits a DataFrame into stratified train and validation sets."""

    def __init__(self, dataframe: pd.DataFrame,
                 target_name: str):
        """
        Args:
            dataframe: Frame holding both the feature columns and the target.
            target_name: Name of the target column inside ``dataframe``.
        """
        self.dataframe = dataframe
        self.target_name = target_name

    def train_test_spliting(self, test_size=None, random_state=42):
        """Split features and target into train and validation parts.

        The split is stratified on the target so both parts keep the same
        class proportions.

        Args:
            test_size: Forwarded to ``train_test_split``; ``None`` keeps
                scikit-learn's default validation share.
            random_state: Seed for the shuffle. A fixed default makes the
                split (and every metric computed from it) reproducible across
                kernel restarts; pass ``None`` for a different split each run.

        Returns:
            Tuple ``(X_train, X_valid, y_train, y_valid)``.
        """
        features = self.dataframe.drop(columns=self.target_name)
        target = self.dataframe[self.target_name]

        X_train, X_valid, y_train, y_valid = train_test_split(
            features,
            target,
            test_size=test_size,
            random_state=random_state,
            stratify=target,
        )
        return X_train, X_valid, y_train, y_valid
        

In [16]:
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [17]:
dt = DataTransformation(df,'target')

In [18]:
X_train,X_valid,y_train,y_valid = dt.train_test_spliting()

In [19]:
X_train

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
15247,0.000435,55,0,0.430614,10700.0,8,0,2,0,1.0
26265,1.000000,57,1,0.239965,6800.0,3,2,1,0,1.0
85567,1.006916,55,3,1.073774,2100.0,11,1,1,1,0.0
111525,0.544658,32,0,408.000000,,1,0,0,0,0.0
170,0.055382,72,0,0.089091,4500.0,4,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
83650,0.142012,69,0,0.234549,10500.0,19,0,1,0,0.0
114868,0.009800,52,0,0.002100,3333.0,4,0,0,0,0.0
11784,0.406537,54,0,0.287207,4783.0,9,0,2,0,0.0
79862,1.000000,22,98,0.000000,,0,98,0,98,


# 5. Data Preprocess

In [20]:
class DataPreprocess:
    """Fits a preprocessing pipeline on a DataFrame and applies it.

    The pipeline is fitted at most once per (pipe, dataframe) pair, so calling
    ``pipeline()`` repeatedly to fetch the fitted object no longer refits it.
    """

    def __init__(self, dataframe: pd.DataFrame,
                 pipe: "Pipeline"):
        """
        Args:
            dataframe: Data the pipeline is fitted on (and transformed by ``run``).
            pipe: Object exposing ``fit`` and ``transform``.
        """
        self.dataframe = dataframe
        self.pipe = pipe
        # Identity of the objects used in the last fit; None means "never fitted".
        self._fitted_pipe = None
        self._fitted_on = None

    def pipeline(self, refit: bool = False):
        """Return the pipeline fitted on ``self.dataframe``.

        Args:
            refit: Force a new fit even if this pipe was already fitted on the
                current dataframe.

        Returns:
            ``self.pipe`` after fitting.
        """
        # Refit when asked, or when either the pipe or the dataframe was
        # replaced since the last fit.
        stale = self._fitted_pipe is not self.pipe or self._fitted_on is not self.dataframe
        if refit or stale:
            self.pipe.fit(self.dataframe)
            self._fitted_pipe = self.pipe
            self._fitted_on = self.dataframe
        return self.pipe

    def run(self):
        """Fit the pipeline if needed and transform ``self.dataframe``.

        Returns:
            Whatever the pipeline's ``transform`` returns for ``self.dataframe``.
        """
        print("Inciou o preprocessamento ...")
        trained_pipeline = self.pipeline()
        data_preprocessed = trained_pipeline.transform(self.dataframe)
        print("...Finalizou preprocessamento")
        return data_preprocessed
    
        

In [21]:
# Preprocessing steps, applied in order:
#   imputer     - fills missing values in the two listed columns
#   discretizer - bins the three listed continuous columns into equal-frequency intervals
#   scaler      - StandardScaler applied through the feature_engine wrapper
pipe = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes']),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(
                variables=[
                    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                    'TaxaDeEndividamento',
                    'RendaMensal',
                ]
            ),
        ),
        (
            'scaler',
            SklearnTransformerWrapper(StandardScaler()),
        ),
    ]
)

In [22]:
dp = DataPreprocess(X_train,pipe)

In [23]:
X_trained_preprocessed = dp.run()

Inciou o preprocessamento ...
...Finalizou preprocessamento


In [24]:
X_train.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
15247,0.000435,55,0,0.430614,10700.0,8,0,2,0,1.0
26265,1.0,57,1,0.239965,6800.0,3,2,1,0,1.0
85567,1.006916,55,3,1.073774,2100.0,11,1,1,1,0.0
111525,0.544658,32,0,408.0,,1,0,0,0,0.0
170,0.055382,72,0,0.089091,4500.0,4,0,0,0,0.0


In [25]:
X_trained_preprocessed.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
15247,-1.566699,0.184218,-0.100331,0.174085,1.255831,-0.087703,-0.063576,0.871259,-0.057742,0.234045
26265,1.566699,0.319625,0.138012,-0.522231,0.907564,-1.060695,0.415866,-0.016195,-0.057742,0.234045
85567,1.566699,0.184218,0.614697,0.870402,-1.530302,0.496093,0.176145,-0.016195,0.182761,-0.666158
111525,0.870388,-1.372968,-0.100331,1.21856,-0.137236,-1.449892,-0.063576,-0.903649,-0.057742,-0.666158
170,-0.522233,1.335181,-0.100331,-1.218548,-0.485502,-0.866097,-0.063576,-0.903649,-0.057742,-0.666158


In [26]:
# Persist the preprocessing pipeline fitted on X_train so it can be reloaded later.
joblib.dump(value=dp.pipeline(), filename='preprocessador.joblib')

['preprocessador.joblib']

# 6. Train Models

In [27]:
class TrainModels:
    """Fits a classifier on the training data, persists it and serves predictions."""

    def __init__(self, dados_X: pd.DataFrame,
                 dados_y: pd.DataFrame,
                 model_path: str = 'modelo.joblib'):
        """
        Args:
            dados_X: Training features.
            dados_y: Training labels aligned with ``dados_X``.
            model_path: File the fitted model is written to and, when needed,
                read back from. Defaults to the previously hard-coded name.
        """
        self.dados_X = dados_X
        self.dados_y = dados_y
        self.model_path = model_path
        # Model fitted by this instance; None until train() has been called.
        self._model = None

    def train(self, model):
        """Fit ``model`` on the stored training data and save it to ``model_path``.

        Args:
            model: Estimator exposing ``fit``.

        Returns:
            The same ``model`` object, now fitted.
        """
        model.fit(self.dados_X, self.dados_y)
        joblib.dump(model, self.model_path)
        self._model = model
        return model

    def predict(self, dados_para_prever: pd.DataFrame):
        """Return ``predict_proba`` of the trained model for the given data.

        The model fitted by this instance is used when available. Reading the
        file on every call could silently pick up a model written by another
        instance or an earlier run; the file is only read when this instance
        has not trained anything yet.

        Args:
            dados_para_prever: Already-preprocessed features to score.

        Returns:
            The output of the model's ``predict_proba``.
        """
        model_fitted = self._model if self._model is not None else self._load_model()
        return model_fitted.predict_proba(dados_para_prever)

    def _load_model(self):
        """Load the persisted model from ``model_path``."""
        # joblib.load unpickles the file: only point model_path at trusted files.
        return joblib.load(self.model_path)
        

In [28]:
tm = TrainModels(X_trained_preprocessed,y_train)

In [29]:
tm.train(LogisticRegression())

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [30]:
y_train_pred = tm.predict(X_trained_preprocessed)

In [31]:
y_train_pred

array([[0.9937902 , 0.0062098 ],
       [0.75627415, 0.24372585],
       [0.68419619, 0.31580381],
       ...,
       [0.93870065, 0.06129935],
       [0.34201285, 0.65798715],
       [0.97475892, 0.02524108]])

# 7. Model Evaluation

In [32]:
# Pipeline fitted on X_train; reused below to transform the validation set.
preprocessor = dp.pipeline()
# Bare expression so the notebook renders the pipeline's steps and parameters.
preprocessor

0,1,2
,steps,"[('imputer', ...), ('discretizer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,imputation_method,'median'
,variables,"['RendaMensal', 'NumeroDeDependentes']"

0,1,2
,variables,"['TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'TaxaDeEndividamento', ...]"
,q,10
,return_object,False
,return_boundaries,False
,precision,3

0,1,2
,transformer,StandardScaler()
,variables,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
x_valid_preprocessed = preprocessor.transform(X_valid)

In [35]:
X_valid

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
47716,0.835544,34,1,951.000000,,5,0,0,0,2.0
31967,0.031827,52,0,0.575257,7500.0,12,0,4,0,0.0
139278,0.000000,60,0,0.001071,2800.0,10,0,0,0,1.0
56469,0.196255,53,1,0.354470,3500.0,14,0,0,0,3.0
29736,0.460195,44,0,0.407015,4903.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...
87894,0.274624,44,0,0.095718,11700.0,6,0,0,0,2.0
68645,0.431588,71,0,0.335298,19680.0,12,0,2,0,1.0
74955,0.000000,78,0,389.000000,,9,0,1,0,0.0
96006,0.918270,38,0,0.024595,5000.0,3,0,0,0,3.0


In [34]:
x_valid_preprocessed

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
47716,1.218544,-1.237561,0.138012,1.218560,-0.137236,-0.671498,-0.063576,-0.903649,-0.057742,1.134247
31967,-0.870388,-0.018893,-0.100331,0.522244,0.907564,0.690692,-0.063576,2.646168,-0.057742,-0.666158
139278,-1.566699,0.522737,-0.100331,-1.566706,-1.182035,0.301495,-0.063576,-0.903649,-0.057742,0.234045
56469,0.174078,0.048810,0.138012,-0.174073,-0.833769,1.079889,-0.063576,-0.903649,-0.057742,2.034449
29736,0.870388,-0.560523,-0.100331,0.174085,-0.485502,-0.282301,-0.063576,-0.016195,-0.057742,-0.666158
...,...,...,...,...,...,...,...,...,...,...
87894,0.522233,-0.560523,-0.100331,-1.218548,1.604097,-0.476900,-0.063576,-0.903649,-0.057742,1.134247
68645,0.522233,1.267478,-0.100331,-0.174073,1.604097,0.690692,-0.063576,0.871259,-0.057742,0.234045
74955,-1.566699,1.741404,-0.100331,1.218560,-0.137236,0.106896,-0.063576,-0.016195,-0.057742,-0.666158
96006,1.218544,-0.966746,-0.100331,-1.566706,-0.485502,-1.060695,-0.063576,-0.903649,-0.057742,2.034449


In [36]:
y_valid_pred = tm.predict(x_valid_preprocessed)

In [37]:
y_valid_pred

array([[0.80683431, 0.19316569],
       [0.98389737, 0.01610263],
       [0.99237698, 0.00762302],
       ...,
       [0.99499051, 0.00500949],
       [0.86346206, 0.13653794],
       [0.9690315 , 0.0309685 ]])

In [38]:
class ModelEvaluation:
    """Computes evaluation metrics for model predictions."""

    def __init__(self):
        # Stateless helper: nothing to configure.
        pass

    def eval_metrics(self, dados_reais, dados_preditos):
        """Return the ROC AUC of the predicted scores against the true labels.

        Args:
            dados_reais: True labels.
            dados_preditos: Predicted scores, one per label.

        Returns:
            The value computed by ``roc_auc_score``.
        """
        return roc_auc_score(y_true=dados_reais, y_score=dados_preditos)

In [39]:
me = ModelEvaluation()

In [None]:
# TODO: read up on predict_proba().
# Column 1 of the predict_proba output is used as the score — presumably the
# probability of class 1 (the positive target); confirm against model.classes_.
me.eval_metrics(y_valid,y_valid_pred[:,1])

0.7975689573234787

# 8. Experiments

## 8.1. Experiment 1

In [53]:
#---------------------------- Etapa 1 
pipe = Pipeline([('imputer',MeanMedianImputer(variables=['RendaMensal',
                                                         'NumeroDeDependentes'])),
                 ('discretizer',EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                                                                     'TaxaDeEndividamento',
                                                                     'RendaMensal'])),
                 ('scaler',SklearnTransformerWrapper(RobustScaler()))])
dp = DataPreprocess(X_train,pipe)
X_trained_preprocessed = dp.run()


#---------------------------- Etapa 2
tm = TrainModels(X_trained_preprocessed,y_train)
tm.train(LogisticRegression())
# y_train_pred = tm.predict(X_trained_preprocessed)


#---------------------------- Etapa 3
preprocessor = dp.pipeline()
x_valid_preprocessed_exp = preprocessor.transform(X_valid)
y_valid_pred_exp = tm.predict(x_valid_preprocessed_exp)
me = ModelEvaluation()
me.eval_metrics(y_valid,y_valid_pred_exp[:,1])

Inciou o preprocessamento ...
...Finalizou preprocessamento


0.7974258704296862