In [None]:
# EDA
import pandas as pd
import plotly.express as px
import seaborn as sns
import numpy as np

# Select seaborn's 'whitegrid' plotting style.
# NOTE(review): the figures visible in this notebook are built with plotly (px),
# so this setting may have no visible effect here — confirm it is still needed.
sns.set_style('whitegrid')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os dados

In [2]:
# Read the already-cleaned leads dataset from disk
df_leads = pd.read_csv(filepath_or_buffer='./datasets/leads_cleaned.csv')

In [3]:
# Preview the first 20 rows
df_leads.iloc[:20]

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [4]:
# Preview the last 20 rows
df_leads.iloc[-20:]

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
9054,Landing Page Submission,Direct Traffic,0,0,0,5.0,20,2.5,SMS Sent,0,0,0,0,0,0,1,Modified
9055,Landing Page Submission,Google,0,0,0,4.0,1347,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9056,API,Google,0,0,0,6.0,228,6.0,SMS Sent,0,0,0,0,0,0,0,Modified
9057,API,Organic Search,0,0,0,7.0,142,7.0,Email Opened,0,0,0,0,0,0,1,Modified
9058,Landing Page Submission,Google,0,0,0,4.0,455,4.0,Form Submitted on Website,0,0,0,0,0,0,0,Modified
9059,Landing Page Submission,Direct Traffic,1,0,0,2.0,74,2.0,Email Bounced,0,0,0,0,0,0,1,Modified
9060,API,Olark Chat,0,0,0,0.0,0,0.0,SMS Sent,0,0,0,0,0,0,0,Modified
9061,Landing Page Submission,Google,0,0,1,5.0,1283,1.67,Email Opened,0,0,0,0,0,0,0,Email Opened
9062,Landing Page Submission,Google,0,0,1,4.0,1944,2.0,SMS Sent,0,0,0,0,0,0,1,Modified
9063,Landing Page Submission,Organic Search,0,0,1,13.0,1226,6.5,SMS Sent,0,0,0,0,0,0,1,Modified


In [5]:
# Print the dataset structure: index range, column names, non-null counts and dtypes
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação dos dados

In [6]:
# Separate the predictors (every column except the target) from the
# 'Converted' target column
X = df_leads.drop('Converted', axis='columns')
y = df_leads.loc[:, 'Converted']

In [7]:
# Column labels grouped by dtype: numeric predictors and object (string) predictors
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

In [8]:
# Load the previously saved preprocessor from disk instead of rebuilding it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load .pkl files that come from a trusted source.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
import joblib
preprocessor = joblib.load('./preprocessor_dataset_leads.pkl')

In [9]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [10]:
# Apply the preprocessor: it is (re-)fitted on the training split only and the
# fitted transformation is then reused on the test split.
# The outputs expose .toarray() — presumably SciPy sparse matrices — and are
# converted to dense arrays.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split cell feeds arrays (not DataFrames) to the preprocessor.
train_encoded = preprocessor.fit_transform(X_train)
X_train = train_encoded.toarray()

test_encoded = preprocessor.transform(X_test)
X_test = test_encoded.toarray()

In [11]:
# Report the (rows, columns) shape of both splits after preprocessing
print(
    f'Conjunto de treinamento: {X_train.shape}',
    f'Conjunto de teste: {X_test.shape}',
    sep='\n',
)

Conjunto de treinamento: (7259, 68)
Conjunto de teste: (1815, 68)


### Treinamento do Modelo

In [12]:
# Build the StackingClassifier

# Meta-model: combines the base models' outputs into the final prediction
lr_model = LogisticRegression(random_state=51)

# Base models
tree_model = DecisionTreeClassifier(random_state=51)
svc_model = SVC(kernel='linear')
# SGD shuffles the training data between epochs, so it is seeded like the other
# estimators; without a seed the fitted weights (and therefore every metric
# reported below) change from run to run.
sgd_model = SGDClassifier(penalty='elasticnet', random_state=51)

# Create the StackingClassifier object.
# The estimator names below are the keys later used with `named_estimators_`.
stacking_model = StackingClassifier(
  estimators=[
    ('sgd classifier', sgd_model),
    ('svc', svc_model),
    ('decision tree', tree_model)
  ],
  final_estimator=lr_model,
  # passthrough=False: the meta-model sees only the base estimators' outputs (vanilla stacking)
  # passthrough=True: the meta-model sees the base estimators' outputs + the original features
  passthrough=False
)

In [13]:
# Fit the base estimators and the meta-model on the training split
stacking_model.fit(X=X_train, y=y_train)

### Avaliação do Modelo

In [14]:
# Predict the class label of every row in the test split
y_pred = stacking_model.predict(X=X_test)

In [15]:
# Display the predicted labels
y_pred

array([1, 0, 0, ..., 0, 0, 1], shape=(1815,))

In [16]:
# Score the test-set predictions with each classification metric, in order:
# accuracy, precision, recall, F1
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [17]:
# Print the four metrics, one per line
print(
    f'Acurácia: {accuracy}',
    f'Precisão: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    sep='\n',
)

Acurácia: 0.8016528925619835
Precisão: 0.746031746031746
Recall: 0.7014925373134329
F1-Score: 0.7230769230769231


In [18]:
# Confusion matrix of true vs. predicted labels, rendered as a heatmap
conf_matriz = confusion_matrix(y_test, y_pred)

fig = px.imshow(
    conf_matriz,
    labels={'x': 'Predição', 'y': 'Real', 'color': 'Contagem'},
    x=['Not Converted', 'Converted'],
    y=['Not Converted', 'Converted'],
    color_continuous_scale='Viridis',
)

# Write each cell's count on top of its tile, then hide the colour bar
fig.update_traces(
    text=conf_matriz,
    texttemplate="%{z}",
).update_layout(coloraxis_showscale=False)

fig.show()

In [19]:
# Collect one importance vector per fitted base estimator of the stacking model.
#
# Linear models expose `coef_` (unbounded scale) while tree models expose
# `feature_importances_` (normalised by scikit-learn to sum to 1). Each vector
# is therefore rescaled to sum to 1 before being stored, so that a later
# average across estimators is not dominated by whichever model happens to
# have the largest raw magnitudes.

importances = []

for estimator in stacking_model.estimators_:
  # Linear models expose coefficients; row 0 is used (binary problem: single row)
  if hasattr(estimator, 'coef_'):
    raw_importance = np.abs(estimator.coef_[0])
    print(f'Coeficientes do modelo {type(estimator).__name__}')
  elif hasattr(estimator, 'feature_importances_'):
    raw_importance = np.abs(estimator.feature_importances_)
    print(f'Feature Importances do modelo {type(estimator).__name__}')
  # Neither coef_ nor feature_importances_ is available: report and skip
  else:
    print(f'Não foi possível calcular a importância para {type(estimator).__name__}')
    continue

  # Rescale to a common scale; an all-zero vector is kept as-is to avoid 0/0
  total = raw_importance.sum()
  importances.append(raw_importance / total if total > 0 else raw_importance)


Coeficientes do modelo SGDClassifier
Coeficientes do modelo SVC
Feature Importances do modelo DecisionTreeClassifier


In [20]:
# Average the per-estimator importance vectors feature by feature
importance_mean = np.asarray(importances).mean(axis=0)

In [21]:
# Build the list of feature names matching the columns of the transformed matrix:
# numeric column names followed by the names generated by the 'cat' transformer.
# NOTE(review): assumes the preprocessor emits the numeric columns first and the
# 'cat' transformer's columns afterwards — confirm against the preprocessor definition.
encoded_cat_names = (
    preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_features)
)
features_names = [*numeric_features.tolist(), *encoded_cat_names.tolist()]


In [22]:
# Pair every feature name with its averaged importance in a two-column frame
df_feature_importances = pd.DataFrame(
    dict(Feature=features_names, Importance=importance_mean)
)

In [23]:
# Order the rows from least to most important (ascending is the default)
df_feature_importances = df_feature_importances.sort_values('Importance')

In [24]:
# Horizontal bar chart ranking the features by averaged importance.
# NOTE(review): the title string contains a typo ('algortimos'); it is left
# unchanged here because it is user-facing output.
fig = px.bar(
    df_feature_importances,
    x="Importance",
    y="Feature",
    orientation='h',
    title='Importância das features dos algortimos base',
)

# Tall canvas so that every feature label gets its own readable row
fig.update_layout(width=1000, height=1280)
fig.show()

### Propriedades do Modelo

In [35]:
# Show how the stack reaches a decision on a single example

# Take test row 7 and add a leading axis so it is a 1-row, 2-D input
X_sample = X_test[7][np.newaxis, :]

# Individual predictions of each fitted base estimator, looked up by the
# names given when the StackingClassifier was built
fitted_base = stacking_model.named_estimators_
sgd_pred = fitted_base['sgd classifier'].predict(X_sample)
svc_pred = fitted_base['svc'].predict(X_sample)
tree_pred = fitted_base['decision tree'].predict(X_sample)

# Final prediction produced by the full stacking model
stacking_pred = stacking_model.predict(X_sample)

In [36]:
# Print each base estimator's vote next to the stack's final answer
print(
    f'Predição do SGD: {sgd_pred[0]}',
    f'Predição do SVC: {svc_pred[0]}',
    f'Predição da Árvore de Decisão: {tree_pred[0]}',
    f'Predição final do Stacking (Logistic Regression): {stacking_pred[0]}',
    sep='\n',
)


Predição do SGD: 0
Predição do SVC: 0
Predição da Árvore de Decisão: 1
Predição final do Stacking (Logistic Regression): 0
