# Parámetros

In [1]:
import os

# Google Drive file ids of the two raw CSV files downloaded by BussinessCase.get_file
CREDIT_FILE_ID = '18P5oXUpch9s5Nm4WdV_fwjX4oBneyyuh'
APPLICATION_FILE_ID = '1ET5jQSMcLj7odR1OttvR1qdcWtQ-CnaQ'
# Working directory for the downloaded CSV files and the saved models.
# The default is the author's local path; set the BUSSINESS_CASE_DIR environment variable
# to run the notebook on another machine without editing this cell.
BASE_DIR = os.environ.get('BUSSINESS_CASE_DIR', '/Users/efrain.flores/Desktop/EF/EF/UnDosTres/data')
# True -> run the randomized hyper-parameter search; False -> fit a single LogisticRegression
GOT_TIME_TO_TRAIN = True

# Entorno

In [2]:
# Control de datos
from pathlib import Path
from requests import Session
from pickle import dump as save_pkl, load as load_pkl

# Ingeniería de variables
from numpy import nan
from pandas import DataFrame, read_csv, cut, qcut, isna

# Modelos
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Gráficas
import seaborn as sns
import matplotlib.pyplot as plt

# Código

In [None]:
class BussinessCase:
    '''
    Helper for the whole workflow: download the raw CSV files, clean them,
    plot a confusion matrix and save / load fitted models.

    Attributes:
        base_dir:   working directory (Path) where the CSV files are written.
        models_dir: ``base_dir / 'model'``, where pickled models are stored.
        main_dict:  everything learned so far. ``main_dict['ranges']`` maps a column name to the
                    bin edges fitted by ``to_range``; every saved model is stored under its name.
        credit/app: raw DataFrames, available after ``get_both_files``.
    '''
    def __init__(self, base_dir: str=None) -> None:
        '''
        Only receives the working directory; falls back to the current directory when omitted.
        '''
        # Keep the directory as a Path object to build file paths easily
        self.base_dir = Path(base_dir) if base_dir is not None else Path.cwd()
        # Defined here (not only in save_model) so get_model also works on a fresh instance
        self.models_dir = self.base_dir.joinpath('model')
        self.main_dict = {}


    def get_file(self, file_id: str, file_name: str) -> DataFrame:
        '''
        Downloads a file from Google Drive, stores it as ``<base_dir>/<file_name>.csv``
        and returns it parsed as a DataFrame.

        Raises the ``requests`` HTTP error if the download request is not successful.
        '''
        URL = "https://docs.google.com/uc?export=download"
        file_dir = self.base_dir.joinpath(f'{file_name}.csv')
        # The target directory may not exist yet on a new machine
        self.base_dir.mkdir(parents=True, exist_ok=True)

        # Context managers release the connection even if the download fails half-way
        with Session() as session:
            with session.get(URL, params={'id':file_id, 'confirm':'t'}, stream=True) as response:
                # Fail loudly instead of saving an error page as if it were the CSV
                response.raise_for_status()
                with open(file_dir, "wb") as f:
                    for chunk in response.iter_content(32768):
                        f.write(chunk)

        return read_csv(file_dir)


    def get_both_files(self, credit_file: tuple, app_file: tuple) -> None:
        '''
        Downloads both data sets. Each argument is a tuple whose first item is the
        Google Drive file id and whose last item is the file name (without extension).
        The frames are stored in ``self.credit`` and ``self.app``.
        '''
        self.credit = self.get_file(credit_file[0], credit_file[-1])
        self.app = self.get_file(app_file[0], app_file[-1])
        print(f'Los archivos:\n\t-{credit_file[-1]}.csv\n\t-{app_file[-1]}.csv\nfueron importados exitosamente')


    def mod_credit(self, id_col: str, status_col: str, date_col: str) -> DataFrame:
        '''
        Builds the status history: one row per ``id_col``, one column per ``date_col`` value.

        Status codes 'C', 'X' and '0' become 'good', codes '1' to '5' become 'bad',
        and combinations without a record are filled with 'good'. Returns a frame of strings.
        '''
        status_map = {
            **{'C':'good', 'X':'good', '0':'good'},
            **{str(x):'bad' for x in range(1,6)}
        }
        # Recode only the status column, so ids and dates can never be altered by the mapping
        cred = self.credit.copy()
        cred[status_col] = cred[status_col].replace(status_map)
        # NOTE(review): the identity aggregation presumably relies on one record per (id, date) pair — confirm
        cred = cred.pivot_table(index=id_col, columns=date_col, values=status_col, aggfunc=lambda x:x)
        cred = cred.fillna('good')
        return cred.astype(str)


    def to_drop(self, df: DataFrame, col: str) -> DataFrame:
        '''
        Removes column ``col`` from ``df`` in place and returns the same frame.
        '''
        df.drop(col, axis=1, inplace=True)
        return df


    def to_range(self, df: DataFrame, col: str, is_train: bool, **kwargs) -> DataFrame:
        '''
        Replaces the numeric column ``col`` (in place) by a text label of the bin it falls in.

        When ``is_train`` is True the bins are fitted with ``qcut`` (``**kwargs`` are forwarded, e.g. ``q=10``)
        and the edges are stored in ``main_dict['ranges'][col]``; otherwise the stored edges are reused.
        Missing values, and values outside the stored edges, stay missing.
        '''
        # Convert float: 1.0 --> str: '01'
        def two_char(n): return str(int(n)).zfill(2)

        # Convert an interval to text: (1.0, 5.0] --> '02 a 05'; missing values stay missing
        def to_label(interval):
            if isna(interval): return nan
            return two_char(interval.left+1)+' a '+two_char(interval.right)

        ranges = self.main_dict.setdefault('ranges', {})
        if is_train:
            df[col], ranges[col] = qcut(df[col], retbins=True, duplicates='drop', **kwargs)
        else:
            # include_lowest mirrors qcut, so the lowest training value gets the same bin and label
            df[col] = cut(df[col], bins=ranges[col], include_lowest=True)
        df[col] = df[col].map(to_label)
        return df


    def to_flag(self, df: DataFrame, col: str, option_list: list) -> DataFrame:
        '''
        Replaces column ``col`` (in place) by 0 when the value is in ``option_list`` and 1 otherwise.
        A missing value in ``option_list`` (e.g. ``nan``) matches every missing value of the column.
        '''
        in_options = df[col].isin(option_list)
        # Missing values are matched explicitly: NaN never compares equal to itself
        if any(isna(option) for option in option_list):
            in_options = in_options | df[col].isna()
        df[col] = (~in_options).astype(int)
        return df


    def mod_app(self, id_col: str, to_drop_cols: list, to_range_cols: list, to_flag_cols: list, **kwargs) -> DataFrame:
        '''
        Cleans the application data and returns it as strings, indexed by ``id_col``.

        Keeps the first row of each id, keeps only ids present in ``self.credit``, then applies
        ``to_drop``, ``to_range`` and ``to_flag``. ``to_flag_cols`` is a list of (column, options) pairs.
        ``**kwargs`` are forwarded to ``to_range``, so ``is_train`` must be given when ``to_range_cols`` is not empty.
        '''
        X = self.app.drop_duplicates(id_col)
        X = X[X[id_col].isin(self.credit[id_col])].copy()
        X = X.set_index(id_col)

        # Start from scratch only when fitting: at prediction time the learned bins must be kept
        if kwargs.get('is_train', False) or 'ranges' not in self.main_dict:
            self.main_dict['ranges'] = {}
        for col in to_drop_cols: X = self.to_drop(X, col)
        for col in to_range_cols: X = self.to_range(X, col, **kwargs)
        for col, opt in to_flag_cols: X = self.to_flag(X, col, opt)
        return X.astype(str)


    def cm_sklearn(self, X: DataFrame, y: DataFrame, fit_model, target_encoder) -> DataFrame:
        '''
        Shows the confusion matrix as a heat map and returns the matrix of counts
        (rows: real label, columns: predicted label, zeros replaced by NaN).
        '''
        # Turn the encoded classes back into their original labels
        labels = target_encoder.inverse_transform(fit_model.classes_)
        # Confusion matrix, real vs predicted
        cm = DataFrame(
            confusion_matrix(y_true=y.values, y_pred=fit_model.predict(X)),
            index=labels, columns=labels,
        ).replace({0:nan})

        # Figure size grows with the number of classes; at least 1 so the figure is never empty
        size = max(len(cm)//2, 1)
        fig, ax = plt.subplots(figsize=(size, size))
        # Each row is divided by its total: % of every real label across the predictions
        to_heatmap = cm.div(cm.sum(axis=1), axis=0)
        sns.heatmap(to_heatmap, annot=True, fmt='.0%',cmap='Blues', linewidths=0.5, ax=ax, cbar=False)
        plt.show()
        return cm


    def save_model(self, model, model_name: str) -> None:
        '''
        Stores ``model`` in ``main_dict`` under ``model_name`` and pickles the whole dictionary
        to ``<base_dir>/model/<model_name>.xz``, so the content is self-described when loaded.
        '''
        self.models_dir = self.base_dir.joinpath('model')
        self.models_dir.mkdir(parents=True, exist_ok=True)

        self.main_dict[model_name] = model

        # NOTE(review): despite the .xz extension the file is a plain, uncompressed pickle
        with open(self.models_dir.joinpath(f'{model_name}.xz'), 'wb') as f:
            save_pkl(self.main_dict, f)

        print(f'El modelo {model_name}.xz ha sido guardado exitosamente en:\n{self.models_dir}')


    def get_model(self, model_name: str) -> dict:
        '''
        Loads and returns the dictionary pickled by ``save_model`` for ``model_name``.
        '''
        # Recomputed here so loading works even if save_model was never called on this instance
        self.models_dir = self.base_dir.joinpath('model')
        # SECURITY: unpickling executes arbitrary code; only load files you created yourself
        with open(self.models_dir.joinpath(f'{model_name}.xz'), 'rb') as f:
            model_dict = load_pkl(f)

        # Confirm that the file was loaded
        print(f'El modelo {model_name}.xz fue importado existosamente desde:\n{self.models_dir}')
        return model_dict

# Importar

In [None]:
# Helper rooted at the configured working directory
bc = BussinessCase(base_dir=BASE_DIR)
# Each tuple is (Google Drive file id, local file name without extension)
bc.get_both_files(
    credit_file=(CREDIT_FILE_ID, 'credit'),
    app_file=(APPLICATION_FILE_ID, 'app'),
)

# Transformar

## Clientes

In [None]:
# Clean the application data: drop, bin into deciles (q=10) and flag the listed columns
X_pre = bc.mod_app(
    id_col='ID',
    is_train=True,
    q=10,
    to_drop_cols=['FLAG_MOBIL'],
    to_range_cols=['AMT_INCOME_TOTAL','DAYS_BIRTH','DAYS_EMPLOYED','CNT_FAM_MEMBERS'],
    to_flag_cols=[
        # Flags are computed before mod_app casts everything to str, so a numeric 0 must be listed;
        # '0' is kept in case the column was read as text (assumes CNT_CHILDREN is numeric — TODO confirm)
        ('CNT_CHILDREN',[0, '0']),
        ('NAME_HOUSING_TYPE',['House / apartment']),
        ('OCCUPATION_TYPE', [nan])
    ]
)

X_pre.head()

# Historia

In [None]:
# One row per client, one column per MONTHS_BALANCE value, each cell 'good' or 'bad'
y_pre = bc.mod_credit(
    id_col='ID',
    status_col='STATUS',
    date_col='MONTHS_BALANCE',
)
y_pre.head()

# Modelo

## f(X) = y

In [None]:
# Application features joined (by index) with the status history of the same client
df = X_pre.join(y_pre)

# Target: the status column labelled 0 (MONTHS_BALANCE == 0)
y = df[[0]].copy()

# Predictors: every column up to and including the one labelled -1
# (assumes the history columns are ordered so that -1 comes right before 0 — TODO confirm)
X = df.loc[:, :-1].copy()
X.columns = [str(name) for name in X.columns]

X.sample()

## Conjunto validación

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

skf = StratifiedKFold(n_splits=4, shuffle=False)

# Only the last of the 4 stratified splits is kept: its test fold is the validation set
*_, (train_index, test_index) = skf.split(X, y)
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Encode the target labels as integers; the encoder is fitted on the training part only
y_encod = LabelEncoder()
y_train = DataFrame(y_encod.fit_transform(y_train), index=y_train.index)
y_test = DataFrame(y_encod.transform(y_test), index=y_test.index)

## Hiperparametrización

In [None]:
from sklearn.linear_model import LogisticRegression

if GOT_TIME_TO_TRAIN:
    from sklearn.metrics import make_scorer, matthews_corrcoef
    from sklearn.model_selection import RandomizedSearchCV

    param_logreg = {
        'penalty':['l1', 'l2'],
        # Regularization strengths 0.1, 0.2, 0.3, 0.4, 1.1, ..., 4.4
        'C':[units + tenths/10 for units in range(5) for tenths in range(1,5)],
        'class_weight':['balanced'],
        # liblinear supports both penalties searched above
        'solver':['liblinear'],
    }
    # Randomized search scored with the Matthews correlation coefficient
    search_logreg = RandomizedSearchCV(
        estimator=LogisticRegression(max_iter=100),
        param_distributions=param_logreg,
        scoring=make_scorer(matthews_corrcoef),
        n_iter=4,
        verbose=1,
        n_jobs=-1,
        # Fixed seed: without it the 4 sampled candidates change on every run
        random_state=42,
    )

else:
    # No time to search: a single logistic regression with fixed settings
    search_logreg = LogisticRegression(max_iter=5000, class_weight='balanced', solver='liblinear')

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.metrics import matthews_corrcoef
from category_encoders.cat_boost import CatBoostEncoder

# One pipeline (encode -> oversample -> classify) per fold of the training set
fold_results = {'model': {}, 'scores': {}}
for i, (train, test) in enumerate(skf.split(X_train, y_train)):
    # clone(): every fold needs its own estimator. Sharing the same search_logreg object made
    # all stored pipelines end with the estimator fitted on the last fold.
    pipeline = make_pipeline(
        CatBoostEncoder(),
        SMOTE(sampling_strategy='minority', random_state=42),  # fixed seed for reproducible oversampling
        clone(search_logreg),
    )
    # The target is passed as a 1-D array (y_train has a single column)
    pipeline.fit(X_train.iloc[train,:], y_train.iloc[train,0].values)
    prediction = pipeline.predict(X_train.iloc[test,:])
    fold_results['model'][i] = pipeline
    fold_results['scores'][i] = matthews_corrcoef(y_train.iloc[test,0], prediction)

# Best fold first: columns are 'model' and 'scores', in that order
models = DataFrame(fold_results).sort_values('scores', ascending=False)
models

In [None]:
# `models` is sorted by score (descending), so the first row holds the best pipeline
model = models['model'].iloc[0]
model

# Resultados

In [None]:
# Confusion matrix of the chosen model on the validation set
conf = bc.cm_sklearn(
    X=X_test,
    y=y_test,
    fit_model=model,
    target_encoder=y_encod,
)

# Exportar modelo

In [None]:
# The fitted pipeline is saved together with the target encoder needed to decode its predictions
bc.save_model(
    model={'model':model, 'encod':y_encod},
    model_name='BussinessCase',
)

# Importar modelo

In [None]:
# Load the pickled dictionary back and show the entry stored under the model's name
ready_to_use = bc.get_model('BussinessCase')
ready_to_use['BussinessCase']

# Resumen (por trabajar)

- 47 ID duplicados con diferentes datos (apenas el 0.01%)
- Sólo hay historia de 45985 clientes
- Existen 9528 ID con historia pero que no están registrados
- Clientes registrados y con historia -> 36457 (8.3% de los usuarios únicos registrados)

CODE_GENDER -> cool
FLAG_OWN_CAR -> cool
FLAG_OWN_REALTY -> cool
NAME_INCOME_TYPE -> cool
NAME_EDUCATION_TYPE -> cool
NAME_FAMILY_STATUS -> cool

FLAG_WORK_PHONE -> CAT
FLAG_PHONE -> CAT
FLAG_EMAIL -> CAT

FLAG_MOBIL -> borrar porque todo es 1

AMT_INCOME_TOTAL -> CAT rangos
DAYS_BIRTH -> CAT rangos
DAYS_EMPLOYED -> CAT rangos
CNT_FAM_MEMBERS -> CAT rangos

CNT_CHILDREN -> CAT flag 0 o >0
NAME_HOUSING_TYPE -> CAT flag house?
OCCUPATION_TYPE -> CAT is null?

- ¿Clasificación o regresión?
- ¿0 -> 1?
- ¿X, C -> -1?
- ¿'mean' o 'sum'?