In [1]:
# EDA
import numpy as np
import pandas as pd
import plotly.express as px

# Machine Learning
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os dados

In [2]:
# Load the already-cleaned leads dataset from the local datasets folder
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [3]:
# Display the first 10 rows of the dataset
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [4]:
# Display the last 10 rows of the dataset
df_leads.tail(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
9064,Landing Page Submission,Google,0,0,0,2.0,870,2.0,Email Opened,0,0,0,0,0,0,0,Email Opened
9065,Landing Page Submission,Google,0,0,1,8.0,1016,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened
9066,Landing Page Submission,Direct Traffic,0,0,0,2.0,1770,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9067,API,Direct Traffic,0,0,1,13.0,1409,2.6,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9068,Landing Page Submission,Direct Traffic,0,0,1,5.0,210,2.5,SMS Sent,0,0,0,0,0,0,0,Modified
9069,Landing Page Submission,Direct Traffic,1,0,1,8.0,1845,2.67,Email Marked Spam,0,0,0,0,0,0,0,Email Marked Spam
9070,Landing Page Submission,Direct Traffic,0,0,0,2.0,238,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9071,Landing Page Submission,Direct Traffic,1,0,0,2.0,199,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9072,Landing Page Submission,Google,0,0,1,3.0,499,3.0,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9073,Landing Page Submission,Direct Traffic,0,0,1,6.0,1279,3.0,SMS Sent,0,0,0,0,0,0,1,Modified


In [5]:
# Show the dataset structure (columns, non-null counts and dtypes)
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparar os dados

In [6]:
# Separate the target vector (y) from the feature matrix (X)
y = df_leads['Converted']
X = df_leads.drop('Converted', axis=1)

In [7]:
# Load the previously saved preprocessor (joblib is imported in the top import cell)
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts that were produced by this project.
preprocessor = joblib.load('./preprocessor_dataset_leads.pkl')

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split; both results are densified with .toarray()
train_encoded = preprocessor.fit_transform(X_train)
test_encoded = preprocessor.transform(X_test)
X_train = train_encoded.toarray()
X_test = test_encoded.toarray()

In [9]:
# Report the (rows, columns) shape of each preprocessed split
for label, matrix in (
    ("Estrutura do conjunto de treino:", X_train),
    ("Estrutura do conjunto de teste:", X_test),
):
    print(label, matrix.shape)

Estrutura do conjunto de treino: (7259, 66)
Estrutura do conjunto de teste: (1815, 66)


In [10]:
# List the output feature names produced by the fitted preprocessor
preprocessor.get_feature_names_out()

array(['num__Do Not Email', 'num__Do Not Call', 'num__TotalVisits',
       'num__Total Time Spent on Website', 'num__Page Views Per Visit',
       'num__Search', 'num__Newspaper Article', 'num__X Education Forums',
       'num__Newspaper', 'num__Digital Advertisement',
       'num__Through Recommendations',
       'num__A free copy of Mastering The Interview',
       'cat__Lead Origin_API', 'cat__Lead Origin_Landing Page Submission',
       'cat__Lead Origin_Lead Add Form', 'cat__Lead Origin_Lead Import',
       'cat__Lead Source_Click2call', 'cat__Lead Source_Direct Traffic',
       'cat__Lead Source_Facebook', 'cat__Lead Source_Google',
       'cat__Lead Source_Live Chat', 'cat__Lead Source_NC_EDM',
       'cat__Lead Source_Olark Chat', 'cat__Lead Source_Organic Search',
       'cat__Lead Source_Pay per Click Ads', 'cat__Lead Source_Reference',
       'cat__Lead Source_Referral Sites', 'cat__Lead Source_Social Media',
       'cat__Lead Source_WeLearn', 'cat__Lead Source_Welingak Webs

### Treinamento do modelo de Voting Classifier

In [11]:
# Build the Voting Classifier from three base estimators

lr_model = LogisticRegression(random_state=51)
# SVC needs probability=True to expose predict_proba, which soft voting requires.
# The probability estimates rely on an internal shuffled cross-validation, so
# random_state is pinned (like the other estimators) to keep runs reproducible.
svc_model = SVC(probability=True, kernel='linear', random_state=51)
tree_model = DecisionTreeClassifier(random_state=51)

voting_model = VotingClassifier(
    estimators=[
        ('logistic regression', lr_model),
        ('svc', svc_model),
        ('decision tree', tree_model)
    ],
    # 'hard' picks the class predicted by the majority of the estimators
    # 'soft' picks the class with the highest averaged predicted probability
    voting='soft'  # 'soft' averages the predicted probabilities
)


In [12]:
# Fit the voting ensemble on the preprocessed training data
voting_model.fit(X_train, y_train)

### Análise dos resultados

In [13]:
# Predict the class labels for the test set
y_pred = voting_model.predict(X_test)

In [14]:
# Print the predicted labels (numpy abbreviates long arrays)
print("Predições do modelo:", y_pred)

Predições do modelo: [0 0 0 ... 0 0 1]


In [15]:
# Score the test-set predictions with each evaluation metric, in a fixed order
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [16]:
# Print the evaluation metrics, one per line
print(
    f"Acurácia: {accuracy}",
    f"Precisão: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Acurácia: 0.7796143250688705
Precisão: 0.7294303797468354
Recall: 0.6681159420289855
F1 Score: 0.697428139183056


In [17]:
# Compute the confusion matrix of true labels vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

# Tick labels shared by both axes, and the axis / color-bar titles
class_labels = ['Não Convertido', 'Convertido']
axis_titles = {'x': "Predicted", 'y': "Real", 'color': "Contagem"}

fig = px.imshow(
    conf_matrix,
    labels=axis_titles,
    x=list(class_labels),
    y=list(class_labels),
    color_continuous_scale='Viridis',
)

# Write each cell's count on top of the heatmap and hide the color bar
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)

fig.show()

In [18]:
# Collect one importance vector per fitted estimator of the ensemble
importances = []

for estimador in voting_model.estimators_:
    # Linear models expose coef_; use the absolute coefficients of the first row
    if hasattr(estimador, 'coef_'):
        importances.append(np.abs(estimador.coef_[0]))
    # Tree-based models expose feature_importances_
    elif hasattr(estimador, 'feature_importances_'):
        importances.append(estimador.feature_importances_)
    else:
        # __name__ (double underscores) is the class-name attribute; the previous
        # `_name_` would raise AttributeError instead of printing this message
        print(f"Não foi possivel obter importâncias para o modelo {type(estimador).__name__}")

# NOTE(review): |coef_| values and tree feature_importances_ are presumably on
# different scales; confirm that mixing them unnormalized downstream is intended.



In [19]:
# Average the collected importance vectors element-wise across estimators
importancia_media = np.mean(importances, axis=0)

In [20]:
# Feature names as emitted by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()

In [21]:
# Build a DataFrame pairing each feature name with its averaged importance
df_features_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importancia_media})

In [22]:
# Sort the DataFrame by importance, smallest first
df_features_importances = df_features_importances.sort_values(by='Importance', ascending=True)

In [23]:
# Horizontal bar chart of the averaged importances, one bar per feature
fig = px.bar(
    data_frame=df_features_importances,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Importância das Features no Modelo de Voting Classifier',
)

# Fixed 800x800 canvas for the figure
fig.update_layout(width=800, height=800)
fig.show()

### Propriedades do Modelo

In [24]:
# Evidence for the ensemble decision - hard (majority) voting

# Take one test row, reshaped into a single-sample 2-D array
X_sample = X_test[7].reshape(1, -1)

# Individual class predictions of each fitted base estimator
log_pred = voting_model.named_estimators_['logistic regression'].predict(X_sample)
svc_pred = voting_model.named_estimators_['svc'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision tree'].predict(X_sample)

# voting_model is configured with voting='soft', so voting_model.predict() would
# return the averaged-probability decision, not a majority vote. Tally the
# individual votes explicitly so the value reported as the majority vote really
# is one, then map the winning class index back through classes_.
votes = np.concatenate([log_pred, svc_pred, tree_pred]).astype(int)
voting_pred = voting_model.classes_[[np.bincount(votes).argmax()]]

In [25]:
# Print each estimator's predicted label followed by the ensemble decision
print(
    f"Predição do Logistic Regression: {log_pred[0]}",
    f"Predição do SVC: {svc_pred[0]}",
    f"Predição do Decision Tree: {tree_pred[0]}",
    f"Predição do Voting Classifier (Votação Majoritária): {voting_pred[0]}",
    sep="\n",
)

Predição do Logistic Regression: 1
Predição do SVC: 1
Predição do Decision Tree: 0
Predição do Voting Classifier (Votação Majoritária): 1


In [26]:
# Evidence for the ensemble decision - soft voting

# Take one test row, kept as a single-sample 2-D array
X_sample = X_test[7:8]

# Class probabilities from each fitted base estimator
fitted = voting_model.named_estimators_
log_proba = fitted['logistic regression'].predict_proba(X_sample)
svc_proba = fitted['svc'].predict_proba(X_sample)
tree_proba = fitted['decision tree'].predict_proba(X_sample)

# Ensemble output: combined probabilities and the resulting label
voting_proba = voting_model.predict_proba(X_sample)
voting_pred = voting_model.predict(X_sample)

In [27]:
# Print each estimator's class probabilities, then the ensemble's probabilities and label
print(
    f"Probabilidade do Logistic Regression: {log_proba}",
    f"Probabilidade do SVC: {svc_proba}",
    f"Probabilidade do Decision Tree: {tree_proba}",
    f"Probabilidade final Soft Voting (Votação Ponderada): {voting_proba}",
    f"Predição do Soft Voting (Votação Majoritária): {voting_pred[0]}",
    sep="\n",
)

Probabilidade do Logistic Regression: [[0.07326807 0.92673193]]
Probabilidade do SVC: [[0.06187108 0.93812892]]
Probabilidade do Decision Tree: [[1. 0.]]
Probabilidade final Soft Voting (Votação Ponderada): [[0.37837972 0.62162028]]
Predição do Soft Voting (Votação Majoritária): 1
