In [1]:
import pandas as pd

## Lectura de Datos

Se leerán las bases básicas, que solo tienen registros a nivel de usuario. Solo se leerá la información de campaña que dependa del tiempo. Queda como mejora pendiente la incorporación de más información temporal.

In [2]:
# Initial train and test tables.
train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/interbank-internacional-2019/ib_base_inicial_train/ib_base_inicial_train.csv",
        "/kaggle/input/interbank-internacional-2019/ib_base_inicial_test/ib_base_inicial_test.csv",
    )
)

# Auxiliary tables, read in the same order as before: sunat, reniec, vehicular, campanias.
sunat, reniec, vehicular, campanias = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/interbank-internacional-2019/ib_base_sunat/ib_base_sunat.csv",
        "/kaggle/input/interbank-internacional-2019/ib_base_reniec/ib_base_reniec.csv",
        "/kaggle/input/interbank-internacional-2019/ib_base_vehicular/ib_base_vehicular.csv",
        "/kaggle/input/interbank-internacional-2019/ib_base_campanias/ib_base_campanias.csv",
    )
)

## Creación del Target de predicción

Se opta por construir un target binario para establecer quiénes son clientes rentables y a quienes, por tanto, conviene hacerles campaña para atraerlos.

In [3]:
# Row key shared by features and labels: "<id_persona>_<codmes>".
prediction_key = train["id_persona"].astype(str) + "_" + train["codmes"].astype(str)

# Labels: keep the raw margin and add a binary target that is 1 when the margin is positive.
y_train = (
    train[["codmes", "id_persona", "margen"]]
    .assign(
        prediction_id=prediction_key,
        target=(train["margen"] > 0).astype(int),
    )
    .set_index("prediction_id")
)

# Features: everything except the two outcome columns, plus the row key.
X_train = train.drop(columns=["codtarget", "margen"])
X_train["prediction_id"] = prediction_key

# The raw frame and the temporary key are no longer needed.
del train, prediction_key

## Consolidación de Bases

Se unen todas las bases por id_persona.

In [4]:
# One row per person, one column per economic activity, holding the summed meses_alta.
sunat = (
    sunat.groupby(["id_persona", "activ_econo"])["meses_alta"]
    .sum()
    .unstack(level=1, fill_value=0)
    .astype("int32")
)

# One row per person, one column per vehicle brand, for each of the two vehicle variables.
by_brand = vehicular.groupby(["id_persona", "marca"])
vehicular1 = by_brand["veh_var1"].sum().unstack(level=1, fill_value=0).astype("float32")
vehicular2 = by_brand["veh_var2"].sum().unstack(level=1, fill_value=0).astype("float32")

# reniec only needs to be keyed by person and cast to float32.
reniec = reniec.set_index("id_persona").astype("float32")

# Drop the raw vehicle table together with the groupby object that still references it.
del vehicular, by_brand

In [5]:
# Both vehicle pivots use the same brand names as columns; tag each set with its
# source variable so the names stay distinct once the frames are joined.
vehicular1.columns = [brand + "_v1" for brand in vehicular1.columns]
vehicular2.columns = [brand + "_v2" for brand in vehicular2.columns]

In [6]:
def _attach_person_features(base):
    """Cast the base columns to int32 and left-join the person-level tables on id_persona.

    prediction_id is moved into the index during the cast so that it is not
    converted, then restored as a regular column. The result is indexed by id_persona.
    """
    keyed = (
        base.set_index("prediction_id")
        .astype("int32")
        .reset_index()
        .set_index("id_persona")
    )
    for person_table in (vehicular1, vehicular2, reniec, sunat):
        keyed = keyed.join(person_table)
    return keyed


X_train = _attach_person_features(X_train)
X_test = _attach_person_features(X_test)
del vehicular1, vehicular2, reniec, sunat

In [7]:
def _monthly_campaign_counts(category):
    """Count campaign rows per (codmes, id_persona), with one column per value of `category`.

    The result is indexed by codmes and sorted on it so that label slices over
    month ranges can be taken with .loc.
    """
    return (
        campanias.groupby(["codmes", "id_persona", category])
        .size()
        .unstack(level=2, fill_value=0)
        .reset_index()
        .set_index("codmes")
        .sort_index()
        .astype("int32")
    )


camp_canal = _monthly_campaign_counts("canal_asignado")
camp_prod = _monthly_campaign_counts("producto")
del campanias

In [8]:
import gc
# Run a garbage-collection pass after the `del` statements in the previous cells;
# the call's return value is what the cell displays.
gc.collect()

11

In [9]:
# Campaign window used for each prediction month. The slices are applied with .loc
# on the sorted codmes index, so both end labels are included: three consecutive
# months, the last one being three months before the prediction month.
meses = {
    201901: slice(201808, 201810),
    201902: slice(201809, 201811),
    201903: slice(201810, 201812),
    201904: slice(201811, 201901),
    201905: slice(201812, 201902),
    201906: slice(201901, 201903),
    201907: slice(201902, 201904),
}

complementos = []
for mes, window in meses.items():
    print("*"*10, mes, "*"*10)
    # Per-person totals over the window, channel counts next to product counts,
    # keyed by (id_persona, prediction month).
    res = (
        pd.concat(
            [
                counts.loc[window].groupby("id_persona").sum()
                for counts in (camp_canal, camp_prod)
            ],
            axis=1,
        )
        .assign(codmes=mes)
        .reset_index()
        .set_index(["id_persona", "codmes"])
        .astype("float32")
    )
    complementos.append(res)

gc.collect()
print("contatenando complementos")
complementos = pd.concat(complementos)
gc.collect()

# Attach the windowed campaign features and switch the index to prediction_id.
print("X_train join")
X_train = (
    X_train.reset_index()
    .join(complementos, on=["id_persona", "codmes"])
    .set_index("prediction_id")
)
gc.collect()
print("X_test join")
X_test = (
    X_test.reset_index()
    .join(complementos, on=["id_persona", "codmes"])
    .set_index("prediction_id")
)
gc.collect()

del camp_canal, camp_prod, complementos, res
gc.collect()

********** 201901 **********
********** 201902 **********
********** 201903 **********
********** 201904 **********
********** 201905 **********
********** 201906 **********
********** 201907 **********
contatenando complementos
X_train join
X_test join


0

## Renombrado de Variables con nombre no ascii

El algoritmo que usamos no se lleva bien con cadenas de texto con caracteres especiales, las renombramos.

In [10]:
# Column names containing at least one character outside the 7-bit ASCII range.
non_ascii = [
    name
    for name in X_train.columns
    if any(ord(ch) >= 128 for ch in name)
]
non_ascii

['LIMA MOTORÂ´S_v1',
 'VICTORIA MOTORÂ´S_v1',
 'VILMENÂ¨S_v1',
 'LIMA MOTORÂ´S_v2',
 'VICTORIA MOTORÂ´S_v2',
 'VILMENÂ¨S_v2',
 'Adquisición Convenios',
 'Adquisición TC',
 'Alcancía',
 'CD Préstamos',
 'Crédito Hipotecario',
 'Crédito Vehicular',
 'Depósito a Plazo',
 'Depósito a Plazo Jubilacion',
 'Depósito a Plazo Renovacion',
 'Déposito a Plazo Renovacion',
 'Membresía',
 'Préstamo Express',
 'Préstamos Personales',
 'Préstamos Reenganche',
 'Retención']

In [11]:
# ASCII-only placeholder for every offending column: "non_ascii_<position in non_ascii>".
# Keeping the mapping makes it possible to trace a placeholder back to its original name.
ascii_names = {
    original: "non_ascii_" + str(position)
    for position, original in enumerate(non_ascii)
}

# Move the renamed columns to the end of each frame in a single pass. The final
# column order matches adding and dropping them one by one, without copying the
# whole frame once per column.
X_train = pd.concat(
    [X_train.drop(columns=non_ascii), X_train[non_ascii].rename(columns=ascii_names)],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=non_ascii), X_test[non_ascii].rename(columns=ascii_names)],
    axis=1,
)

## Entrenamiento del Modelo

Se entrena un modelo con los hiperparámetros por defecto, optimizando únicamente el número de estimadores mediante early stopping, con validación basada en meses (cada mes se reserva una vez como conjunto de validación).

In [12]:
from lightgbm import LGBMClassifier
# Columns removed from the feature matrix before fitting and predicting.
# NOTE(review): id_persona appears to still be a regular column of X_train here,
# so it is used as a feature. Confirm this is intended.
drop_cols = ["codmes"]
fi = []
test_probs = []
train_probs = []

# Leave-one-month-out validation: every distinct codmes is held out once while a
# model is fitted on all the other months.
for mes in X_train.codmes.unique():
    print("*"*10, mes, "*"*10)
    held_out = X_train.codmes == mes

    X_fit = X_train[~held_out]
    y_fit = y_train.loc[X_fit.index, "target"]
    X_fit = X_fit.drop(columns=drop_cols)

    X_val = X_train[held_out]
    y_val = y_train.loc[X_val.index, "target"]
    X_val = X_val.drop(columns=drop_cols)

    # n_estimators is only an upper bound; training stops after 10 rounds
    # without improvement on the evaluation data.
    learner = LGBMClassifier(n_estimators=10000)
    learner.fit(
        X_fit,
        y_fit,
        eval_set=[(X_fit, y_fit), (X_val, y_val)],
        eval_metric="auc",
        early_stopping_rounds=10,
        verbose=50,
    )

    # Probability of the last class, for the test set and for the held-out month.
    test_scores = learner.predict_proba(X_test.drop(columns=drop_cols))[:, -1]
    test_probs.append(pd.Series(test_scores, index=X_test.index, name="fold_" + str(mes)))

    val_scores = learner.predict_proba(X_val)[:, -1]
    train_probs.append(pd.Series(val_scores, index=X_val.index, name="probs"))

    # Importances normalised to sum to 1 so that folds can be averaged.
    importances = learner.feature_importances_
    fi.append(pd.Series(importances / importances.sum(), index=X_fit.columns))

# Test probabilities and importances are averaged over folds; the out-of-fold
# train probabilities are stacked.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)

********** 201902 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.781409	training's binary_logloss: 0.274736	valid_1's auc: 0.717946	valid_1's binary_logloss: 0.291606
Early stopping, best iteration is:
[76]	training's auc: 0.792861	training's binary_logloss: 0.270832	valid_1's auc: 0.719605	valid_1's binary_logloss: 0.291291
********** 201904 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.770439	training's binary_logloss: 0.274381	valid_1's auc: 0.763971	valid_1's binary_logloss: 0.288995
Early stopping, best iteration is:
[55]	training's auc: 0.773112	training's binary_logloss: 0.273532	valid_1's auc: 0.76424	valid_1's binary_logloss: 0.288845
********** 201901 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.776134	training's binary_logloss: 0.288832	valid_1's auc: 0.737187	valid_1's binary_logloss: 0.247373
Early stopping, best iteration is:
[

## Importancia de Variables

Observamos la importancia media que le dieron los modelos a cada variable.

In [13]:
# The 50 features with the highest fold-averaged importance, in ascending order.
fi.sort_values(ascending=True).iloc[-50:].to_frame()

Unnamed: 0,0
HYUNDAI_v1,0.00245
MAZDA_v2,0.002461
FFVV,0.002679
DODGE_v2,0.002765
BAJAJ_v2,0.002769
TOYOTA_v1,0.002839
HONDA_v2,0.002864
HYUNDAI_v2,0.003002
Cuenta Millonaria SuperTasa,0.003051
Membresia,0.003535


## Optimización de punto de corte

Con las probabilidades calculadas en validación, calculamos el punto de corte óptimo para maximizar la ecuación económica de la empresa.

In [14]:
from scipy.optimize import differential_evolution

# differential_evolution is a stochastic optimiser. Fixing its seed makes the
# selected cut-off, and therefore the submission, reproducible between runs.
CUTOFF_SEED = 42

# Out-of-fold probabilities next to the observed margin of every training row.
res = y_train.join(train_probs.rename("probs"))

# Constant across objective evaluations, so it is computed once.
total_margin = res.margen.sum()


def _negative_margin_share(c):
    """Return minus the share of total margin captured by rows scoring above c[0].

    The sign is flipped because differential_evolution minimises its objective.
    """
    return -((res.probs > c[0]) * res.margen / total_margin).sum()


optimization = differential_evolution(_negative_margin_share, [(0, 1)], seed=CUTOFF_SEED)
optimization

     fun: -1.0746612059743577
 message: 'Optimization terminated successfully.'
    nfev: 122
     nit: 7
 success: True
       x: array([0.04297717])

## Guardado de las predicciones para hacer la presentación

In [15]:
# Turn the averaged test probabilities into 0/1 decisions with the optimised
# cut-off and write them as a two-column CSV (prediction_id, class).
cutoff = optimization["x"][0]
test_preds = (
    (test_probs > cutoff)
    .astype(int)
    .rename("class")
    .rename_axis("prediction_id")
)
test_preds.to_csv("benchmark1.csv", header=True)