In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

sns.set_style("whitegrid")

In [2]:
# Load the cleaned leads dataset.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# a backslash-separated path only resolves on Windows.
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [3]:
# Preview the first 10 rows to sanity-check the load.
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [4]:
# Summarize the columns: dtypes and non-null counts.
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação dos Dados

In [5]:
# Separate the target column from the predictors.
y = df_leads['Converted']
X = df_leads.drop(columns='Converted')

In [6]:
# Column names grouped by dtype family: object (string) vs. numeric.
categorical_features = X.select_dtypes(include='object').columns
numeric_features = X.select_dtypes(include='number').columns

In [7]:
# Reuse the preprocessor persisted by an earlier step (joblib is imported in the
# notebook's first cell, together with the other dependencies).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this project.
preprocessor = joblib.load('preprocessor__dataset_leads.pkl')

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [9]:
# Apply the preprocessor: fit on the training split, then reuse that fit to
# transform the test split.
# NOTE(review): fit_transform re-fits the loaded preprocessor on X_train — confirm
# this is intended rather than calling transform() with the previously saved fit.
# NOTE(review): X_train/X_test are rebound to the transformed outputs, so this cell
# is presumably not safe to run twice without re-running the split first — verify.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Show the shape of each transformed split, one per line (train first, then test).
print(X_train.shape, X_test.shape, sep='\n')

(7259, 68)
(1815, 68)


### Treinamento do Modelo

In [59]:
# AdaBoost ensemble that boosts logistic-regression base learners.
boosting_model = AdaBoostClassifier(
    estimator=LogisticRegression(),
    random_state=51,
    n_estimators=50,   # upper bound on the number of boosted estimators
    learning_rate=1,   # learning-rate factor applied at each boosting iteration
)

In [60]:
# Fit the boosted ensemble on the preprocessed training split.
boosting_model.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegression()
,n_estimators,50
,learning_rate,1
,algorithm,'deprecated'
,random_state,51

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


### Avaliação do Modelo

In [61]:
# Predict class labels for the test split.
y_pred = boosting_model.predict(X_test)

In [62]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [63]:
# Report each metric on its own line.
print(
    f'Acuracia: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    sep='\n',
)

Acuracia: 0.7856749311294766
Precision: 0.7004279600570613
Recall: 0.7328358208955223
F1-Score: 0.7162654996353027


In [64]:
# Confusion matrix of the test-set predictions, rendered as an annotated heatmap.
# The same label list is used for both axes (fixes the 'Connverted' typo that was
# duplicated in the x and y tick labels).
class_labels = ['Not Converted', 'Converted']
conf_matrix = confusion_matrix(y_test, y_pred)
fig = px.imshow(
    conf_matrix,
    labels=dict(x='Predição', y='Real', color='Contagem'),
    x=class_labels,
    y=class_labels,
    color_continuous_scale='Viridis',
)

# Print the count inside every cell and hide the colour bar.
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)
fig.show()

In [65]:
# Feature importance: mean absolute logistic-regression coefficient across the
# boosted learners, weighted by each learner's AdaBoost weight so that learners
# the ensemble barely relies on do not count as much as the dominant ones.
fitted_estimators = boosting_model.estimators_
# estimator_weights_ has one slot per requested round; keep only the slots that
# line up with the learners actually stored in estimators_.
fitted_weights = boosting_model.estimator_weights_[:len(fitted_estimators)]
importance_features = np.average(
    [np.abs(estimator.coef_[0]) for estimator in fitted_estimators],
    axis=0,
    weights=fitted_weights,
)

In [66]:
# Check the length of the importance vector (expected: one value per transformed feature).
importance_features.shape

(68,)

In [67]:
# Recover readable names for the transformed columns: the numeric columns first,
# followed by the names produced by the 'cat' transformer.
# NOTE(review): assumes the preprocessor emits numeric columns before the 'cat'
# columns — confirm against the saved preprocessor definition.
features_name = numeric_features.tolist()
features_name += (
    preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_features)
    .tolist()
)

In [68]:
# Build the feature/importance table, ordered from least to most important.
df_feature_importance = pd.DataFrame(
    {'Feature': features_name, 'Importance': importance_features}
).sort_values(by='Importance')

In [69]:
# Horizontal bar chart of the importance table, one bar per feature.
fig = px.bar(
    df_feature_importance,
    x='Importance',
    y='Feature',
    orientation='h',
    title="Importância das features baseada nos coeficientes abs",
)

# Tall canvas so every feature label fits; bars ordered by total value.
fig.update_layout(
    height=1280,
    width=1000,
    yaxis=dict(categoryorder='total ascending'),
)

fig.show()

### Saídas do Modelo

In [44]:
# Error recorded for each boosting round.
# The attribute reflects the most recent fit, so re-run this cell after (re)training.
# NOTE(review): trailing 1.0 entries presumably belong to rounds that never ran
# because boosting stopped early — confirm against the scikit-learn documentation.
boosting_model.estimator_errors_

array([0.37994214, 0.25903465, 0.44301389, 0.33267367, 0.43273034,
       0.42626923, 0.43217547, 0.48644275, 0.4268625 , 0.46478223,
       0.46065486, 0.46999515, 0.43488796, 0.48436183, 0.46258432,
       0.47685372, 0.48452293, 0.47781898, 0.47395167, 0.48928659,
       0.47750794, 0.48789657, 0.48545929, 0.48847906, 0.48047169,
       0.48643136, 0.49735856, 0.49632236, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [45]:
# Weight assigned to each boosting round's estimator.
# The attribute reflects the most recent fit, so re-run this cell after (re)training.
# NOTE(review): trailing 0.0 entries presumably belong to rounds that never ran
# because boosting stopped early — confirm against the scikit-learn documentation.
boosting_model.estimator_weights_

array([0.48979381, 1.05099201, 0.22893916, 0.69611714, 0.27072002,
       0.29708911, 0.27298076, 0.05424229, 0.29466371, 0.14110472,
       0.1577066 , 0.12016377, 0.26193558, 0.0625731 , 0.14994301,
       0.09265133, 0.06192807, 0.08878236, 0.10428774, 0.04286018,
       0.09002898, 0.04842316, 0.05817924, 0.04609193, 0.07815299,
       0.05428788, 0.01056587, 0.01471083, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [46]:
# Predict the class probabilities (including the conversion probability) for the test split.
y_pred_proba = boosting_model.predict_proba(X_test)

In [47]:
# Display the predicted probability matrix.
y_pred_proba

array([[0.28892438, 0.71107562],
       [0.55085138, 0.44914862],
       [0.55085138, 0.44914862],
       ...,
       [0.75407715, 0.24592285],
       [0.70374288, 0.29625712],
       [0.30284882, 0.69715118]], shape=(1815, 2))