In [1]:
import sys
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from datetime import datetime
from itertools import combinations

In [2]:
PARAMETROS = {
    'task_type' : 'CPU',
    'has_time' : True,
    'silent' : True,
    'depth': 4,
    'early_stopping_rounds': 2,
    'iterations' : 100
}

In [3]:
#APERTURA ARCHIVOS
test = pd.read_pickle("Archivos/Arboles_validacion.pkl")
prediccion_arbol = pd.read_csv("Archivos/prediccion_naive_bayes_val.csv")
prediccion_red   = pd.read_csv("Archivos/prediccion_neurona_validacion.csv")
prediccion_arbol_test = pd.read_csv("Archivos/prediccion_naive_bayes.csv")
prediccion_red_test   = pd.read_csv("Archivos/prediccion_neurona_test.csv")

In [4]:
#ORDENO ARCHIVOS PARA QUE COINCIDA OPPORTUNITY_ID
test = test.sort_values('Opportunity_ID')
prediccion_arbol = prediccion_arbol.sort_values('Opportunity_ID')
prediccion_red   = prediccion_red.sort_values('Opportunity_ID')
test_label = test['Stage']

In [5]:
prediccion_red.count()

Opportunity_ID    4515
Target            4515
dtype: int64

In [6]:
prediccion_arbol.count()

Opportunity_ID    4515
Target            4515
dtype: int64

In [7]:
#MAX
def ensamble_max():
    preds = prediccion_arbol['Target'].combine(prediccion_red['Target'], max, 0)
    print('Logloss Max: ', skl.metrics.log_loss(test_label,preds))
    preds = prediccion_arbol_test['Target'].combine(prediccion_red_test['Target'], max, 0)
    return preds

In [8]:
#MIN
def ensamble_min():
    preds = prediccion_arbol['Target'].combine(prediccion_red['Target'], min, 0)
    print('Logloss Min: ', skl.metrics.log_loss(test_label,preds))
    preds = prediccion_arbol_test['Target'].combine(prediccion_red_test['Target'], min, 0)
    return preds

In [9]:
#COMBINACION LINEAL
def ensamble_cl():
    min_a = -1
    min_b = -1
    min_logloss = 1000

    for a in range(0,100):
        for b in range(0,100):
            if(a == 0 & b == 0): continue
            preds = (a*prediccion_arbol['Target'] + b*prediccion_red['Target'])/(a+b)
            logloss = skl.metrics.log_loss(test_label,preds)
            if(logloss < min_logloss):
                min_logloss = logloss
                min_a = a
                min_b = b
    print("Logloss CL: ", min_logloss)
    print("min_a: ", min_a)
    print("min_b: ", min_b)
    preds = (min_a*prediccion_arbol_test['Target'] + min_b*prediccion_red_test['Target'])/(min_a+min_b)
    return preds

In [10]:
#KNN
def ensamble_knn():
    temp = pd.DataFrame()
    temp['Arbol'] = prediccion_arbol['Target']
    temp['Red'] = prediccion_red['Target']
    temp['Stage'] = test_label
    entrenamiento = temp.iloc[:4000,:]
    validacion = temp.iloc[4001:,:]
    entrenamiento_label = entrenamiento.pop('Stage')
    classifier = KNeighborsClassifier(n_neighbors=300)
    classifier.fit(entrenamiento, entrenamiento_label)
    preds = classifier.predict_proba(validacion[['Arbol','Red']])
    print("Logloss KNN: ", skl.metrics.log_loss(validacion['Stage'],preds))

    classifier.fit(temp[['Arbol','Red']], temp['Stage'])
    temp = pd.DataFrame()
    temp['Arbol'] = prediccion_arbol_test['Target']
    temp['Red'] = prediccion_red_test['Target']
    preds = classifier.predict_proba(temp)
    preds = pd.Series([p[1] for p in preds])
    return preds

In [11]:
#Catboost
def ensamble_catboost():
    temp = pd.DataFrame()
    temp['Arbol'] = prediccion_arbol['Target']
    temp['Red'] = prediccion_red['Target']
    temp['Stage'] = test_label
    entrenamiento = temp.iloc[:4000,:]
    validacion = temp.iloc[4001:,:]
    entrenamiento_label = entrenamiento.pop('Stage')
    entrenamiento_pool = cb.Pool(entrenamiento, entrenamiento_label)
    test_pool = cb.Pool(validacion)
    
    model = cb.CatBoostClassifier(**PARAMETROS)
    model.fit(entrenamiento_pool)
    preds = model.predict_proba(test_pool)
    preds = pd.Series([p[1] for p in preds])
    print("Logloss Cat: ", skl.metrics.log_loss(validacion['Stage'],preds))

    model.fit(temp[['Arbol','Red']], temp['Stage'])
    temp = pd.DataFrame()
    temp['Arbol'] = prediccion_arbol_test['Target']
    temp['Red'] = prediccion_red_test['Target']
    test_pool = cb.Pool(temp)
    preds = model.predict_proba(test_pool)
    preds = pd.Series([p[1] for p in preds]) 
    return preds


In [12]:
(prediccion_red['Target'] >= np.inf).value_counts()

False    4515
Name: Target, dtype: int64

In [13]:
#CALCULO Y SELECCION DE PREDS
preds_max = ensamble_max()
preds_min = ensamble_min()
preds_cl  = ensamble_cl()
preds_knn = ensamble_knn()
preds_catboost =ensamble_catboost()

Logloss Max:  0.36552889445835457
Logloss Min:  0.39454965947733617
Logloss CL:  0.19166015977318812
min_a:  2
min_b:  53
Logloss KNN:  0.1512373439261934
Logloss Cat:  0.1354540964015891


In [14]:
print(prediccion_arbol_test['Opportunity_ID'])

0       10689
1       10690
2       10691
3       10692
4       10693
        ...  
1562    12364
1563    12365
1564    12366
1565    12367
1566    12368
Name: Opportunity_ID, Length: 1567, dtype: int64


In [16]:
#GUARDADO DE ARCHIVOS
preds = preds_catboost
resultados = pd.DataFrame()
resultados['Opportunity_ID'] = prediccion_arbol_test['Opportunity_ID']
resultados['Target'] = pd.Series(preds)  
resultados.to_csv("Archivos/prediccion_ensamble_red_naive_bayes.csv", index=False)