# MODELIZACION PARA CLASIFICACION

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.metrics import classification_report

from sklearn.metrics import PrecisionRecallDisplay


%config IPCompleter.greedy=True

pd.options.display.float_format = '{:.2f}'.format

import warnings
warnings.filterwarnings("ignore")

In [12]:
ruta_proyecto = 'C:/Users/mcana/OneDrive/Escritorio/PORTAFOLIO/ML/LEAD_SCORINGLEAD_SCORING_PROYECTO'

In [13]:
nombre_x = 'x_preseleccionado.pickle'
nombre_y = 'y_preseleccionado.pickle'

In [14]:
#cargar los datos 
x = pd.read_pickle(ruta_proyecto + '/Datos/tabajo/' + nombre_x)
y = pd.read_pickle(ruta_proyecto + '/Datos/tabajo/' + nombre_y)

## MODELIZAR


In [15]:
train_x,val_x,train_y,val_y = train_test_split(x,y,test_size=0.3)

### Crear el Pipe y el diccionario de algoritmos, parámetros y valores a testar

In [16]:
pipe = Pipeline([('algoritmo',RandomForestClassifier())])

grid = [{'algoritmo': [LogisticRegression()],
         'algoritmo__n_jobs': [-1],
         'algoritmo__solver': ['saga'],
         'algoritmo__penalty': ['elasticnet', 'l1', 'l2', 'none'],
         'algoritmo__C': [0,0.25,0.5,0.75,1]},
        
       ]

### Optimizar los hiper parámetros

In [17]:
grid_search = GridSearchCV(estimator= pipe, 
                            param_grid = grid, 
                            cv = 3, 
                            scoring = 'roc_auc',
                            verbose = 0,
                            n_jobs = -1)

modelo = grid_search.fit(train_x,train_y)

pd.DataFrame(grid_search.cv_results_).sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algoritmo,param_algoritmo__C,param_algoritmo__n_jobs,param_algoritmo__penalty,param_algoritmo__solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
17,0.11,0.01,0.01,0.0,LogisticRegression(),1.0,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.01,1
13,0.11,0.0,0.01,0.0,LogisticRegression(),0.75,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.01,2
9,0.12,0.01,0.01,0.0,LogisticRegression(),0.5,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.01,3
18,0.07,0.01,0.01,0.0,LogisticRegression(),1.0,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.02,4
5,0.11,0.03,0.01,0.0,LogisticRegression(),0.25,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.02,5
14,0.09,0.0,0.01,0.0,LogisticRegression(),0.75,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.02,6
10,0.12,0.02,0.01,0.0,LogisticRegression(),0.5,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.86,0.87,0.02,7
6,0.07,0.01,0.01,0.0,LogisticRegression(),0.25,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.89,0.85,0.85,0.86,0.02,8
16,0.0,0.0,0.0,0.0,LogisticRegression(),1.0,-1,elasticnet,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",,,,,,9
15,0.0,0.0,0.0,0.0,LogisticRegression(),0.75,-1,none,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",,,,,,9


## EVALUAR

In [18]:
pred = modelo.best_estimator_.predict_proba(val_x) [:,1]

In [19]:
roc_auc_score(val_y, pred)

0.8804638062518076

###### 0.88 está muy bien, podemos predecir que Leads son los que van a convertir con una capacidad predictiva bastante alta utlizando la R.Logistica

In [20]:
#Examinar el mejor modelo 
modelo.best_estimator_

el mejor modelo sería con el coeficiente de penalización de 1; regularización l1; está utilizando el motor de solver saga

In [23]:
#variables que para nuestro modelo funcionan mejor
modelo.best_estimator_.named_steps.algoritmo.coef_

array([[ 4.09212423,  8.12130309,  1.08684705,  0.        , -1.69932965,
         2.01784203, -1.55966438,  1.67920183, -0.65832098, -1.37640075,
        -0.93783071, -1.38252075, -1.70910262, -0.0395892 ,  0.33170004]])

In [24]:
#acceder a los nombres
modelo.best_estimator_.named_steps.algoritmo.feature_names_in_

array(['tiempo_en_site_total_mms', 'score_actividad_mms',
       'ult_actividad_SMS Sent', 'visitas_total_mms',
       'paginas_vistas_visita_mms', 'score_perfil_mms', 'ambito_Select',
       'ocupacion_Working Professional', 'ocupacion_Unemployed',
       'ult_actividad_Converted to Lead',
       'ult_actividad_Page Visited on Website',
       'ult_actividad_Chat Conversation',
       'origen_Landing Page Submission', 'fuente_Google',
       'descarga_lm_No'], dtype=object)

In [25]:
pd.DataFrame(data = modelo.best_estimator_.named_steps.algoritmo.coef_,
            columns= modelo.best_estimator_.named_steps.algoritmo.feature_names_in_)

Unnamed: 0,tiempo_en_site_total_mms,score_actividad_mms,ult_actividad_SMS Sent,visitas_total_mms,paginas_vistas_visita_mms,score_perfil_mms,ambito_Select,ocupacion_Working Professional,ocupacion_Unemployed,ult_actividad_Converted to Lead,ult_actividad_Page Visited on Website,ult_actividad_Chat Conversation,origen_Landing Page Submission,fuente_Google,descarga_lm_No
0,4.09,8.12,1.09,0.0,-1.7,2.02,-1.56,1.68,-0.66,-1.38,-0.94,-1.38,-1.71,-0.04,0.33


In [28]:
pd.DataFrame(data = modelo.best_estimator_.named_steps.algoritmo.coef_,
            columns= modelo.best_estimator_.named_steps.algoritmo.feature_names_in_).unstack().sort_values(ascending=False)

score_actividad_mms                    0    8.12
tiempo_en_site_total_mms               0    4.09
score_perfil_mms                       0    2.02
ocupacion_Working Professional         0    1.68
ult_actividad_SMS Sent                 0    1.09
descarga_lm_No                         0    0.33
visitas_total_mms                      0    0.00
fuente_Google                          0   -0.04
ocupacion_Unemployed                   0   -0.66
ult_actividad_Page Visited on Website  0   -0.94
ult_actividad_Converted to Lead        0   -1.38
ult_actividad_Chat Conversation        0   -1.38
ambito_Select                          0   -1.56
paginas_vistas_visita_mms              0   -1.70
origen_Landing Page Submission         0   -1.71
dtype: float64