<div style="text-align:right">Update date: Feb 21, 2024</div><br>

# Lead Scoring production code<br>
## Objective<br>
Generate the training and execution pipelines of the Lead Scoring model, in preparation for moving it to the production phase.

## Imports

In [3]:
import os
import numpy as np
import pandas as pd
import cloudpickle

%config IPCompleter.greedy=True

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
##############################################
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

## Load data

### Original data file name

In [1]:
# Name of the raw data file; it is read from the 'data/' folder in the next cell.
data_file_name = 'Leads.csv'

In [4]:
# Load the raw file: fields are separated by ';' and the 'id' column becomes the index.
df = pd.read_csv(f'data/{data_file_name}', sep=';', index_col='id')

### Select final variables

In [7]:
# Show, in alphabetical order, the index labels stored in the pickled object
# (used as the reference for the predictors selected in the next cell).
(
    pd.read_pickle('data/final_vars.pickle')
    .sort_index()
    .index
    .to_list()
)

['ambito_Marketing Management',
 'ambito_OTROS',
 'ambito_Select',
 'descarga_lm_No',
 'fuente_Organic Search',
 'ocupacion_Unemployed',
 'ocupacion_Working Professional',
 'origen_Lead Add Form',
 'score_actividad_mms',
 'score_perfil_mms',
 'tiempo_en_site_total_mms',
 'ult_actividad_Chat Conversation',
 'ult_actividad_Converted to Lead',
 'ult_actividad_Page Visited on Website',
 'ult_actividad_SMS Sent',
 'visitas_total_mms']

In [16]:
# Original (pre-encoding, pre-scaling) columns used as predictors.
final_vars = [
    'ambito',
    'descarga_lm',
    'fuente',
    'ocupacion',
    'origen',
    'score_actividad',
    'score_perfil',
    'tiempo_en_site_total',
    'ult_actividad',
    'visitas_total',
]

## Datasets structure

### Delete records

In [12]:
# Remove fully duplicated rows (reassignment instead of in-place mutation).
df = df.drop_duplicates()

In [14]:
# Keep only the rows where 'no_enviar_email' and 'no_llamar' are not 'Yes'
# and 'ult_actividad' is not 'Email Bounced'.
condicion = (
    df['no_enviar_email'].ne('Yes')
    & df['no_llamar'].ne('Yes')
    & df['ult_actividad'].ne('Email Bounced')
)
df = df.loc[condicion]
df.shape

(6838, 20)

### Select final predictor variables

In [17]:
# Independent copy of the predictor columns, so later steps cannot alter `df`.
x = df.loc[:, final_vars].copy()

### Specify the target

In [18]:
# Name of the column to predict, and an independent copy of its values.
target = 'compra'
y = df.loc[:, target].copy()

## Generate the pipeline

### Instantiate data quality

#### Create the function

In [23]:
# Columns grouped by the imputation strategy that `data_quality` applies to them.
# Missing values in these columns are replaced with the literal 'DESCONOCIDO'.
var_impute_value = ['origen', 'fuente', 'ult_actividad', 'descarga_lm']
# Missing values in these columns are replaced with the column's mode.
var_impute_mode = [
    'ocupacion',
    'ambito',
]
# Define helper functions
def impute_mode(variable):
    """Fill the missing values of a Series with its most frequent value.

    Parameters
    ----------
    variable : pandas.Series
        Column to impute.

    Returns
    -------
    pandas.Series
        New Series with missing values replaced by the first mode. When the
        Series has no non-missing value (so no mode exists) an unchanged copy
        is returned instead of raising.
    """
    modes = variable.mode()
    # `mode()` is empty when every value is missing; indexing it would raise.
    if modes.empty:
        return variable.copy()

    return variable.fillna(modes.iloc[0])

def impute_median(variable):
    """Fill the missing values of a numeric Series with its median.

    Parameters
    ----------
    variable : pandas.Series
        Numeric column to impute.

    Returns
    -------
    pandas.Series
        New Series with missing values replaced by the median. For integer
        dtypes the median is truncated to an integer so the dtype is kept.
        When the median is undefined (empty or all-missing column) an
        unchanged copy is returned instead of raising.
    """
    median = variable.median()
    # No median exists for an empty / all-missing column; `int()` of it would raise.
    if pd.isna(median):
        return variable.copy()

    if pd.api.types.is_integer_dtype(variable):
        # Integer columns cannot hold a fractional fill value.
        median = int(median)

    return variable.fillna(median)

def group_rare_categories(variable, criterio=0.05):
    """Relabel the infrequent categories of a Series as 'OTROS'.

    Parameters
    ----------
    variable : pandas.Series
        Column whose categories are inspected.
    criterio : float, default 0.05
        Categories whose relative frequency is strictly below this value
        are relabelled.

    Returns
    -------
    numpy.ndarray
        Values of `variable`, with every rare category replaced by 'OTROS'.
    """
    shares = variable.value_counts(normalize=True)
    rare_labels = shares[shares < criterio].index.tolist()
    is_rare = variable.isin(rare_labels)

    return np.where(is_rare, 'OTROS', variable)

def data_quality(df):
    """Clean a frame of predictors before encoding and scaling.

    Steps, in order:

    1. Cast 'visitas_total' to the nullable 'Int64' dtype.
    2. Fill the missing values of the `var_impute_mode` columns with each
       column's mode.
    3. Fill the missing values of the `var_impute_value` columns with
       'DESCONOCIDO'.
    4. Fill the missing values of every numeric column with its median.
    5. Relabel as 'OTROS' the categories of every non-numeric column whose
       relative frequency is below 0.02.
    6. Clip 'visitas_total' to the range [0, 50].

    NOTE: modes, medians and category frequencies are computed from the
    frame received in each call, so they are recalculated for every batch
    this function is applied to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the predictor columns.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the input frame is not modified.
    """
    cleaned = df.astype({'visitas_total': 'Int64'})

    cleaned[var_impute_mode] = cleaned[var_impute_mode].apply(impute_mode)
    cleaned[var_impute_value] = cleaned[var_impute_value].fillna('DESCONOCIDO')

    numeric_cols = cleaned.select_dtypes(include='number').columns.to_list()
    cleaned[numeric_cols] = cleaned[numeric_cols].apply(impute_median)

    categorical_cols = cleaned.select_dtypes(exclude='number').columns.to_list()
    for column in categorical_cols:
        cleaned[column] = group_rare_categories(cleaned[column], criterio=0.02)

    # Manual winsorization
    cleaned['visitas_total'] = cleaned['visitas_total'].clip(0, 50)

    return cleaned

#### Convert it to transformer

In [24]:
# Wrap `data_quality` so it can be used as a pipeline step. The wrapper simply
# calls the function on the data it is given, so the imputation statistics
# inside `data_quality` are recomputed for every transformed batch.
make_data_quality = FunctionTransformer(func=data_quality)

### Instantiate variable transformation

In [25]:
# Columns to one-hot encode. The encoder returns a dense array and encodes
# categories not seen during fit as all zeros (handle_unknown='ignore').
var_ohe = [
    'ambito',
    'descarga_lm',
    'fuente',
    'ocupacion',
    'origen',
    'ult_actividad',
]
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Columns to rescale with min-max scaling.
var_mms = [
    'score_actividad',
    'score_perfil',
    'tiempo_en_site_total',
    'visitas_total',
]
mms = MinMaxScaler()

### Generate the preprocessing pipeline

#### Generate the column transformer

In [26]:
# Route each group of columns to its transformer; any other column is dropped.
ct = make_column_transformer((ohe, var_ohe), (mms, var_mms), remainder='drop')

#### Generate the preprocessing pipeline

In [27]:
# Preprocessing pipeline: data-quality cleaning first, then the column transformer.
pipe_prepro = make_pipeline(make_data_quality, ct)

### Instantiate the model

#### Instantiate the algorithm

In [28]:
# L1-regularised logistic regression fitted with the 'saga' solver.
# 'saga' shuffles the data while optimising, so the seed is fixed to make
# the trained model reproducible between runs.
model = LogisticRegression(
    n_jobs=-1,
    solver='saga',
    penalty='l1',
    C=1,
    random_state=42,
)

#### Generate the final training pipeline

In [32]:
# Full training pipeline: preprocessing followed by the estimator.
pipe_train = make_pipeline(pipe_prepro, model)

#### Save the final training pipe

In [34]:
# Serialise the training pipeline; at this point it has not been fitted yet.
# Create the output folder first so open() does not fail when it is missing.
os.makedirs('models', exist_ok=True)
with open('models/pipe_train.pickle', mode='wb') as file:
    cloudpickle.dump(pipe_train, file)

#### Train the final execution pipeline

In [35]:
# Fit the whole pipeline on the full dataset. `fit` returns the pipeline
# itself, so `pipe_execution` and `pipe_train` refer to the same fitted object.
pipe_execution = pipe_train.fit(x, y)

In [36]:
# Display the fitted pipeline.
pipe_execution

## Save the pipeline

### Save the final pipe of execution

In [37]:
# Serialise the fitted pipeline used to score new data.
# Create the output folder first so open() does not fail when it is missing.
os.makedirs('models', exist_ok=True)
with open('models/pipe_execution.pickle', mode='wb') as file:
    cloudpickle.dump(pipe_execution, file)