## Ejercicio de evaluación de modelos

In [None]:
%autosave 0
import pandas as pd
import numpy as np
from plotnine import *
from plotnine.labels import xlab, ylab
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

### Datos

**En este conjunto de datos queremos predecir qué clientes compran un nuevo producto
en términos de varias variables demográficas, variables de segmentación y variables de conducta.**

Todas las variables son numéricas, excepto MOSTYPE y MOSHOOFD, que son categóricas (ver las tablas L0 y L2 más abajo).


MOSTYPE: Customer Subtype; see L0
MAANTHUI: Number of houses 1 - 10
MGEMOMV: Avg size household 1 - 6
MGEMLEEF: Avg age; see L1
MOSHOOFD: Customer main type; see L2


MGODRK: Roman catholic
MGODPR: Protestant …
MGODOV: Other religion
MGODGE: No religion
MRELGE: Married
MRELSA: Living together
MRELOV: Other relation
MFALLEEN: Singles
MFGEKIND: Household without children
MFWEKIND: Household with children
MOPLHOOG: High level education
MOPLMIDD: Medium level education
MOPLLAAG: Lower level education
MBERHOOG: High status
MBERZELF: Entrepreneur
MBERBOER: Farmer
MBERMIDD: Middle management
MBERARBG: Skilled labourers
MBERARBO: Unskilled labourers
MSKA: Social class A
MSKB1: Social class B1
MSKB2: Social class B2
MSKC: Social class C
MSKD: Social class D
MHHUUR: Rented house
MHKOOP: Home owners
MAUT1: 1 car
MAUT2: 2 cars
MAUT0: No car
MZFONDS: National Health Service
MZPART: Private health insurance
MINKM30: Income < 30.000
MINK3045: Income 30-45.000
MINK4575: Income 45-75.000
MINK7512: Income 75-122.000
MINK123M: Income >123.000
MINKGEM: Average income
MKOOPKLA: Purchasing power class

PWAPART: Contribution private third party insurance
PWABEDR: Contribution third party insurance (firms) …
PWALAND: Contribution third party insurance (agriculture)
PPERSAUT: Contribution car policies
PBESAUT: Contribution delivery van policies
PMOTSCO: Contribution motorcycle/scooter policies
PVRAAUT: Contribution lorry policies
PAANHANG: Contribution trailer policies
PTRACTOR: Contribution tractor policies
PWERKT: Contribution agricultural machines policies
PBROM: Contribution moped policies
PLEVEN: Contribution life insurances
PPERSONG: Contribution private accident insurance policies
PGEZONG: Contribution family accidents insurance policies
PWAOREG: Contribution disability insurance policies
PBRAND: Contribution fire policies
PZEILPL: Contribution surfboard policies
PPLEZIER: Contribution boat policies
PFIETS: Contribution bicycle policies
PINBOED: Contribution property insurance policies
PBYSTAND: Contribution social security insurance policies
AWAPART: Number of private third party insurance 1 - 12
AWABEDR: Number of third party insurance (firms) …
AWALAND: Number of third party insurance (agriculture)
APERSAUT: Number of car policies
ABESAUT: Number of delivery van policies
AMOTSCO: Number of motorcycle/scooter policies
AVRAAUT: Number of lorry policies
AAANHANG: Number of trailer policies
ATRACTOR: Number of tractor policies
AWERKT: Number of agricultural machines policies
ABROM: Number of moped policies
ALEVEN: Number of life insurances
APERSONG: Number of private accident insurance policies
AGEZONG: Number of family accidents insurance policies
AWAOREG: Number of disability insurance policies
ABRAND: Number of fire policies
AZEILPL: Number of surfboard policies
APLEZIER: Number of boat policies
AFIETS: Number of bicycle policies
AINBOED: Number of property insurance policies
ABYSTAND: Number of social security insurance policies
CARAVAN: Number of mobile home policies 0 - 1

L0: Customer subtype

1: High Income, expensive child
2: Very Important Provincials
3: High status seniors
4: Affluent senior apartments
5: Mixed seniors
6: Career and childcare
7: Dinki's (double income no kids)
8: Middle class families
9: Modern, complete families
10: Stable family
11: Family starters
12: Affluent young families
13: Young all american family
14: Junior cosmopolitan
15: Senior cosmopolitans
16: Students in apartments
17: Fresh masters in the city
18: Single youth
19: Suburban youth
20: Ethnically diverse
21: Young urban have-nots
22: Mixed apartment dwellers
23: Young and rising
24: Young, low educated
25: Young seniors in the city
26: Own home elderly
27: Seniors in apartments
28: Residential elderly
29: Porchless seniors: no front yard
30: Religious elderly singles
31: Low income catholics
32: Mixed seniors
33: Lower class large families
34: Large family, employed child
35: Village families
36: Couples with teens 'Married with children'
37: Mixed small town dwellers
38: Traditional families
39: Large religious families
40: Large family farms
41: Mixed rurals

L2: customer main type keys:

1: Successful hedonists
2: Driven Growers
3: Average Family
4: Career Loners
5: Living well
6: Cruising Seniors
7: Retired and Religious
8: Family with grown ups
9: Conservative families
10: Farmers

In [None]:
# Load the Caravan insurance CSV from a path relative to the notebook.
# Later cells split its rows by the ORIGIN column ("train" / "test").
caravan = pd.read_csv('../datos/caravan-insurance-challenge.csv')
# Last expression of the cell: shows the loaded DataFrame.
caravan

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Feature columns: positions 2 through 85 of the loaded frame (slice end is exclusive).
# NOTE(review): presumably this skips ORIGIN and MOSTYPE at the front and the
# CARAVAN target at the end -- confirm against the CSV header.
columnas = caravan.columns[2:86]
print(columnas)
def preprocesar_caravan(datos, tipo, columnas):
    """Select one partition of the Caravan data and split it into X and y.

    Parameters
    ----------
    datos : pandas.DataFrame
        Table containing an ``ORIGIN`` column, a ``CARAVAN`` column and every
        column named in ``columnas``. It is not modified.
    tipo : str
        Value of ``ORIGIN`` to keep (the notebook uses "train" and "test").
    columnas : sequence of str
        Names of the feature columns, in the order they should appear in X.

    Returns
    -------
    X : numpy.ndarray
        Values of ``columnas`` for the selected rows.
    y : numpy.ndarray
        Values of ``CARAVAN`` for the selected rows.
    """
    # NOTE: a previous version built one-hot dummies for MOSHOOFD here, but
    # concatenated them along rows and never used the result, so the returned
    # features were always the raw columns. That dead code was removed; the
    # output is unchanged and MOSHOOFD is no longer required to be present.
    datos_origen = datos[datos["ORIGIN"] == tipo].drop(columns=["ORIGIN"])
    X = datos_origen[columnas].values
    y = datos_origen["CARAVAN"].values
    return X, y
# Training features and labels: the rows whose ORIGIN is "train".
X_ent, y_ent = preprocesar_caravan(caravan, "train", columnas)
# NOTE(review): this bare expression is not the last statement of the cell, so a
# notebook will not display it -- wrap it in print() if the shape should be shown.
X_ent.shape
# Distinct label values in the training target and how often each occurs.
np.unique(y_ent, return_counts=True)

Nótese que hay muy pocos valores de la clase positiva (es un producto que no es muy popular).

###  Ajuste de modelo

Probaremos con regresión logística regularizada. Para hacerlo de manera más efectiva, primero estandarizamos los datos.

In [None]:
# Fit the standardizer on the training features only, then transform them.
# The fitted object is reused later to transform the test features.
escalador = StandardScaler()
escalador_ajustado = escalador.fit(X_ent)
X_ent_esc = escalador_ajustado.transform(X_ent)

In [None]:
# Logistic regression (newton-cg solver, C = 0.1) fitted on the standardized
# training features and the training labels.
reg_caravan = LogisticRegression(solver='newton-cg', C = 0.1 )
reg_caravan_ajuste = reg_caravan.fit(X_ent_esc, y_ent)

In [None]:
# Inspect the coefficient array of the fitted model.
reg_caravan_ajuste.coef_

## Evaluación

1. Probaremos con pérdida logarítmica, matriz de confusión, curvas ROC y precisión-exhaustividad.
2. Checaremos la calibración de las probabilidades obtenidas.

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Build the test partition and standardize it with the scaler that was fitted
# on the training data (the scaler is not refitted on test rows).
X_pr, y_pr = preprocesar_caravan(caravan, "test", columnas)
X_pr_esc = escalador_ajustado.transform(X_pr)
# Predicted class probabilities; column 1 is used below as the positive-class score.
probas = reg_caravan_ajuste.predict_proba(X_pr_esc)
# Log loss on the test set.
log_perdida = log_loss(y_pr, probas[:, 1])
# Built-in round() works for both plain Python floats and NumPy scalars,
# whereas the .round() method exists only on the latter.
round(log_perdida, 3)

In [None]:
# Confusion matrix of the test labels against the model's default class
# predictions (the accompanying text notes this corresponds to a 0.5 cut-off).
confusion_matrix(y_pr, reg_caravan_ajuste.predict(X_pr_esc))


Nota que nuestro modelo casi no hace predicciones positivas, pero esto es usando un punto de corte de 0.5. Sin embargo, la curva ROC (abajo) indica que estamos capturando información valiosa acerca de la compra del producto. Podemos buscar otros puntos de corte que puedan ser valiosos para tomar decisiones (por ejemplo, uno con sensibilidad de 50% a una tasa de falsos positivos de 0.25).

In [None]:

# ROC curve on the test set, using the second predict_proba column as the score.
tfp, tvp, cortes = roc_curve(y_pr, probas[:, 1])
datos_roc = pd.DataFrame({"tfp": tfp, "tvp": tvp, "corte": cortes})
# Built-in round() is used so this works whether the metric comes back as a
# plain Python float or as a NumPy scalar.
print("AUC:", round(roc_auc_score(y_pr, probas[:, 1]), 3))
# Step plot of the curve with the diagonal (slope 1, intercept 0) as reference.
(ggplot(datos_roc, aes("tfp", "tvp"))
  + geom_step(size=1.5)
  + geom_abline(slope=1, intercept=0)
  + xlab("Tasa de falsos positivos") + ylab("Sensibilidad"))


In [None]:
# Share of positives among the labels being evaluated. It is computed from
# y_pr instead of hard-coded counts so the reference line always matches the
# data behind the curve.
prop_positivo = float(np.mean(y_pr))
print("Proporción positivos: ", round(prop_positivo, 2))
# NOTE: `cortes` is reassigned here and no longer holds the ROC thresholds.
prec, rec, cortes = precision_recall_curve(y_pr, probas[:, 1])
# The first and last five points of the curve are left out of the plot.
# NOTE(review): presumably to hide the unstable extremes -- confirm.
datos_prc = pd.DataFrame({"Precision": prec[5:-5], "Exhaustividad": rec[5:-5]})
# Built-in round() works for both Python floats and NumPy scalars.
print("Precisión promedio:", round(average_precision_score(y_pr, probas[:, 1]), 3))
# Precision against recall, with a horizontal reference line at the positive share.
(ggplot(datos_prc, aes("Exhaustividad", "Precision"))
  + geom_point(size=1.5)
  + geom_hline(yintercept=prop_positivo)
  + scale_y_continuous(breaks=np.linspace(0.1, 0.8, 8))
)
 

Ahora checamos la calibración de las probabilidades obtenidas.

In [None]:
from sklearn.calibration import calibration_curve
# Number of bins requested for the calibration curve.
num_cubetas = 20
# The two returned arrays are unpacked as the observed positive fraction per
# bin and the mean predicted probability per bin, computed on the test set.
prop_positivos, pred_media_cubeta = calibration_curve(y_pr, probas[:, 1],  
  n_bins = num_cubetas, strategy = "quantile")
# One row per bin: mean prediction next to the observed positive fraction.
curva_cal = pd.DataFrame({"prediccion":pred_media_cubeta, "prop_pos":prop_positivos})
curva_cal

In [None]:
# Approximate number of test rows per bin, used as the sample size for the
# standard errors below.
n_cubeta = round(len(y_pr) / num_cubetas)
# Binomial standard error of each bin's observed positive fraction.
curva_cal["ee"] = np.sqrt(curva_cal["prop_pos"] * (1 - curva_cal["prop_pos"]) / n_cubeta)
# Interval of plus/minus two standard errors around the observed fraction.
curva_cal["min"] = curva_cal["prop_pos"] - 2 * curva_cal["ee"]
curva_cal["max"] = curva_cal["prop_pos"] + 2 * curva_cal["ee"]
# y is mapped to the data-frame column "prop_pos". The earlier mapping named
# "prop_positivos", which is not a column of curva_cal, so the plot depended on
# a same-named notebook variable being in scope.
# The diagonal is the reference where predictions equal observed fractions.
(ggplot(curva_cal, aes("prediccion", "prop_pos", ymin="min", ymax="max"))
  + geom_abline(slope=1, intercept=0)
  + geom_point(color="red")
  + geom_linerange())