In [3]:
import pandas as pd 
import plotly.express as px 
import numpy as np 
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Carregar os dados

In [5]:
# Load the cleaned leads dataset.
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash path was only valid on Windows.
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [6]:
# Preview the first ten rows to sanity-check the load.
df_leads.head(n=10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [7]:
# Print the column dtypes and non-null counts of the loaded frame.
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

### Preparação dos Dados

In [8]:
# Separate the target column ('Converted') from the predictor columns.
y = df_leads['Converted']
X = df_leads.drop(columns='Converted')

In [10]:
# Group the predictor column names by dtype: numeric vs. object (string) columns.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

In [12]:
import joblib

# Load the preprocessing object persisted on disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
preprocessor_path = 'preprocessor__dataset_leads.pkl'
preprocessor = joblib.load(preprocessor_path)

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [14]:
def _to_dense(matrix):
    """Return `matrix` as a dense array, converting only if it is sparse.

    The preprocessor may return either a sparse matrix or a dense array
    depending on the density of its output, so `.toarray()` cannot be called
    unconditionally.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Fit the preprocessor on the training split only and reuse the fitted
# transformation on the test split.
# NOTE: X_train / X_test are overwritten here, so re-running this cell requires
# re-running the train/test split cell first.
X_train = _to_dense(preprocessor.fit_transform(X_train))
X_test = _to_dense(preprocessor.transform(X_test))

In [15]:
# Report the shape of the transformed train and test matrices.
for split_matrix in (X_train, X_test):
    print(split_matrix.shape)

(7259, 68)
(1815, 68)


### Treinamento do Modelo

In [33]:
# Build the stacking classifier

# Base learners: their outputs become inputs of the meta-model.
sgd_model = SGDClassifier(penalty='elasticnet', random_state=51)
svc_model = SVC(kernel='linear')
tree_model = DecisionTreeClassifier(random_state=51)

base_estimators = [
    ('sgd', sgd_model),
    ('svc', svc_model),
    ('decision tree', tree_model),
]

# Meta-model that combines the base learners' outputs.
lr_model = LogisticRegression(random_state=51)

# passthrough=False: the meta-model is trained only on the base learners' outputs.
# passthrough=True: the meta-model is trained on those outputs plus the original features.
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=lr_model,
    passthrough=True,
)


In [34]:
# Train the stacking model on the preprocessed training split.
stacking_model.fit(X=X_train, y=y_train)

0,1,2
,estimators,"[('sgd', ...), ('svc', ...), ...]"
,final_estimator,LogisticRegre...ndom_state=51)
,cv,
,stack_method,'auto'
,n_jobs,
,passthrough,True
,verbose,0

0,1,2
,loss,'hinge'
,penalty,'elasticnet'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,51
,solver,'lbfgs'
,max_iter,100


### Avaliação do modelo

In [35]:
# Predict the class of every row in the held-out test split.
y_pred = stacking_model.predict(X=X_test)

In [36]:
# Score the test predictions with each classification metric, in the order
# accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [37]:
# Show the results, one metric per line.
print(
    f'Acuracia: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    sep='\n',
)

Acuracia: 0.7944903581267218
Precision: 0.7454545454545455
Recall: 0.673134328358209
F1-Score: 0.7074509803921568


In [38]:
# Confusion-matrix heatmap: the y axis is the true class, the x axis the predicted class.
class_labels = ['Not Converted', 'Converted']
conf_matrix = confusion_matrix(y_test, y_pred)

fig = px.imshow(
    conf_matrix,
    x=class_labels,
    y=class_labels,
    labels={'x': 'Predição', 'y': 'Real', 'color': 'Contagem'},
    color_continuous_scale='Viridis',
)

# Write each cell's count on its tile and hide the colour bar.
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)
fig.show()

In [39]:
# Compute feature importances from the fitted base estimators of the stacking model.
#
# Each estimator contributes one vector. Every vector is rescaled to sum to 1
# before being stored: absolute linear coefficients are unbounded, whereas
# scikit-learn's tree importances are normalized, so averaging the raw vectors
# would let the linear models dominate the result.

importances = []

for estimator in stacking_model.estimators_:
    # Linear models expose coefficients
    if hasattr(estimator, 'coef_'):
        raw_importance = np.abs(estimator.coef_[0])
        print(f'Coeficiente do modelo {type(estimator).__name__}')
    # Tree-based models expose feature importances
    elif hasattr(estimator, 'feature_importances_'):
        raw_importance = np.abs(estimator.feature_importances_)
        print(f'Feature importances do modelo {type(estimator).__name__}')
    # Neither coefficients nor feature importances are available
    else:
        print(f'Não foi possivel calcular a importancia para {type(estimator).__name__}')
        continue

    total_importance = raw_importance.sum()
    # An all-zero vector is kept as-is to avoid dividing by zero.
    if total_importance > 0:
        raw_importance = raw_importance / total_importance
    importances.append(raw_importance)
        

Coeficiente do modelo SGDClassifier
Coeficiente do modelo SVC
Feature importances do modelo DecisionTreeClassifier


In [40]:
# Average the per-estimator importance vectors, feature by feature.
importancia_media = np.asarray(importances).mean(axis=0)

In [41]:
# Build the feature labels: numeric column names followed by the names
# generated by the preprocessor's 'cat' transformer.
categorical_transformer = preprocessor.named_transformers_['cat']
numeric_names = numeric_features.to_list()
categorical_names = categorical_transformer.get_feature_names_out(categorical_features).tolist()
features_names = numeric_names + categorical_names

In [42]:
# Pair every feature label with its averaged importance.
df_feature_importance = pd.DataFrame(
    {
        'Feature': features_names,
        'Importance': importancia_media,
    }
)

In [43]:
# Order the rows from least to most important (sort_values is ascending by default).
df_feature_importance = df_feature_importance.sort_values('Importance')

In [44]:
# Horizontal bar chart with one bar per feature.
fig = px.bar(
    df_feature_importance,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Importancia das features',
)
# Fixed figure size, tall enough to list every feature label.
fig.update_layout(width=1000, height=1280)
fig.show()

### Propriedades do modelo

In [47]:
# Show how the ensemble reaches its decision for one specific test row.

# Reshape the selected row to 2-D (1, n_features), the layout predict() expects.
sample_index = 9
X_sample = X_test[sample_index].reshape(1, -1)

# Individual predictions of the fitted base estimators
fitted_base_models = stacking_model.named_estimators_
sgd_pred = fitted_base_models['sgd'].predict(X_sample)
svc_pred = fitted_base_models['svc'].predict(X_sample)
tree_pred = fitted_base_models['decision tree'].predict(X_sample)

# Final prediction of the stacking model
stacking_pred = stacking_model.predict(X_sample)

In [46]:
# Print each base estimator's prediction next to the stacked prediction.
print(
    f"Predição do SGD: {sgd_pred[0]}",
    f"Predição do SVC: {svc_pred[0]}",
    f"Predição da Decision Tree: {tree_pred[0]}",
    f"Predição final da Stacking: {stacking_pred[0]}",
    sep='\n',
)


Predição do SGD: 0
Predição do SVC: 0
Predição da Decision Tree: 1
Predição final da Stacking: 0
