In [27]:
import joblib
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### Carregar Dados

In [28]:
# Load the cleaned leads dataset. The path uses forward slashes so it resolves
# on Windows, Linux and macOS alike (a backslash path only works on Windows).
df_leads = pd.read_csv('./datasets/leads_cleaned.csv')

In [29]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df_leads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9074 entries, 0 to 9073
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Lead Origin                             9074 non-null   object 
 1   Lead Source                             9074 non-null   object 
 2   Do Not Email                            9074 non-null   int64  
 3   Do Not Call                             9074 non-null   int64  
 4   Converted                               9074 non-null   int64  
 5   TotalVisits                             9074 non-null   float64
 6   Total Time Spent on Website             9074 non-null   int64  
 7   Page Views Per Visit                    9074 non-null   float64
 8   Last Activity                           9074 non-null   object 
 9   Search                                  9074 non-null   int64  
 10  Newspaper Article                       9074 non-null   int6

In [30]:
# Preview the first 10 rows.
df_leads.head(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
0,API,Olark Chat,0,0,0,0.0,0,0.0,Page Visited on Website,0,0,0,0,0,0,0,Modified
1,API,Organic Search,0,0,0,5.0,674,2.5,Email Opened,0,0,0,0,0,0,0,Email Opened
2,Landing Page Submission,Direct Traffic,0,0,1,2.0,1532,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
3,Landing Page Submission,Direct Traffic,0,0,0,1.0,305,1.0,Unreachable,0,0,0,0,0,0,0,Modified
4,Landing Page Submission,Google,0,0,1,2.0,1428,1.0,Converted to Lead,0,0,0,0,0,0,0,Modified
5,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
6,Landing Page Submission,Google,0,0,1,2.0,1640,2.0,Email Opened,0,0,0,0,0,0,0,Modified
7,API,Olark Chat,0,0,0,0.0,0,0.0,Olark Chat Conversation,0,0,0,0,0,0,0,Modified
8,Landing Page Submission,Direct Traffic,0,0,0,2.0,71,2.0,Email Opened,0,0,0,0,0,0,1,Email Opened
9,API,Google,0,0,0,4.0,58,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened


In [31]:
# Preview the last 10 rows.
df_leads.tail(10)

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Search,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,A free copy of Mastering The Interview,Last Notable Activity
9064,Landing Page Submission,Google,0,0,0,2.0,870,2.0,Email Opened,0,0,0,0,0,0,0,Email Opened
9065,Landing Page Submission,Google,0,0,1,8.0,1016,4.0,Email Opened,0,0,0,0,0,0,0,Email Opened
9066,Landing Page Submission,Direct Traffic,0,0,0,2.0,1770,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9067,API,Direct Traffic,0,0,1,13.0,1409,2.6,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9068,Landing Page Submission,Direct Traffic,0,0,1,5.0,210,2.5,SMS Sent,0,0,0,0,0,0,0,Modified
9069,Landing Page Submission,Direct Traffic,1,0,1,8.0,1845,2.67,Email Marked Spam,0,0,0,0,0,0,0,Email Marked Spam
9070,Landing Page Submission,Direct Traffic,0,0,0,2.0,238,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9071,Landing Page Submission,Direct Traffic,1,0,0,2.0,199,2.0,SMS Sent,0,0,0,0,0,0,1,SMS Sent
9072,Landing Page Submission,Google,0,0,1,3.0,499,3.0,SMS Sent,0,0,0,0,0,0,0,SMS Sent
9073,Landing Page Submission,Direct Traffic,0,0,1,6.0,1279,3.0,SMS Sent,0,0,0,0,0,0,1,Modified


### Preparação dos Dados

In [32]:
# Separate the feature columns from the binary target column 'Converted'.
y = df_leads['Converted']
X = df_leads.drop(columns='Converted')

In [33]:
# Load the persisted preprocessing object (presumably produced by a separate
# data-preparation step of this project — confirm its origin).
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary
# code, so only load artifacts that come from a trusted source.
# joblib is imported together with the other dependencies at the top of the notebook.
preprocessor = joblib.load('preprocessor__dataset_leads.pkl')

In [34]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [35]:
def _to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array, whether it is sparse or already dense."""
    # Sparse matrices expose .toarray(); anything else is converted with np.asarray.
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)


# Fit the preprocessor on the training split only and reuse that fitted state
# for the test split, so nothing from the test rows influences the fitting.
# The transformer output may be sparse or dense depending on its configuration
# and on the data density, hence the helper instead of a bare .toarray().
# NOTE(review): X_train / X_test are overwritten here, so this cell is presumably
# not safe to re-run on its own — re-run from the train/test split cell.
X_train = _to_dense(preprocessor.fit_transform(X_train))
X_test = _to_dense(preprocessor.transform(X_test))

In [36]:
# Show the (rows, features) shape of each preprocessed split, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(7259, 68)
(1815, 68)


### Treinamento do Modelo

In [86]:
# Base estimators of the ensemble. Each one gets a fixed seed so that a
# "Restart & Run All" reproduces the same fitted models and metrics.
lr_model = LogisticRegression(random_state=51)
# Soft voting needs class probabilities, so SVC must be created with
# probability=True. Those probabilities come from an internal cross-validated
# calibration that shuffles the data; without random_state the resulting
# probabilities (and therefore the soft vote) can change between runs.
svc_model = SVC(probability=True, kernel='linear', random_state=51)
tree_model = DecisionTreeClassifier(random_state=51)

voting_model = VotingClassifier(
    estimators=[
        ('logistic regression', lr_model),
        ('svc', svc_model),
        ('decision tree', tree_model)
    ],
    # 'hard' -> the final class is the majority of the estimators' predicted labels
    # 'soft' -> the final class is the argmax of the averaged class probabilities
    voting='soft'
)

In [87]:
# Train the voting ensemble on the preprocessed training split.
voting_model.fit(X_train, y_train)

0,1,2
,estimators,"[('logistic regression', ...), ('svc', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,51
,solver,'lbfgs'
,max_iter,100

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Análise dos Resultados

In [88]:
# Predict class labels for the test set with the fitted ensemble.
y_pred = voting_model.predict(X_test)

In [89]:
# Display the predicted labels (one entry per test row).
y_pred

array([1, 0, 0, ..., 0, 0, 1], shape=(1815,))

In [90]:
# Score the test-set predictions with each metric, in the order
# accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [91]:
# Report the test-set metrics, one per line.
metric_report = (
    ('Acuracia', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
)
for metric_label, metric_value in metric_report:
    print(f'{metric_label}: {metric_value}')

Acuracia: 0.7862258953168044
Precision: 0.7203125
Recall: 0.6880597014925374
F1-Score: 0.7038167938931298


In [92]:
# Confusion matrix of the test-set predictions, drawn as a heatmap with the
# true class on the y axis and the predicted class on the x axis.
conf_matrix = confusion_matrix(y_test, y_pred)
class_labels = ('Not Converted', 'Converted')

fig = px.imshow(
    conf_matrix,
    labels={'x': 'Predicao', 'y': 'Real', 'color': 'Contagem'},
    x=list(class_labels),
    y=list(class_labels),
    color_continuous_scale='Viridis',
)

# Write each cell's count on top of the heatmap and hide the colour bar.
fig.update_traces(text=conf_matrix, texttemplate="%{z}")
fig.update_layout(coloraxis_showscale=False)
fig.show()

In [93]:
# Collect one importance vector per fitted base estimator.
# Linear models expose signed coefficients (coef_), trees expose
# feature_importances_. These live on different scales, so each vector is
# normalised to sum to 1 before being stored; otherwise a later average would be
# dominated by whichever estimator happens to have the largest magnitudes.
importances = []

for estimator in voting_model.estimators_:
    if hasattr(estimator, 'coef_'):
        # Binary problem: the first row of coef_ holds the per-feature weights.
        raw_importance = np.abs(estimator.coef_[0])
    elif hasattr(estimator, 'feature_importances_'):
        raw_importance = np.abs(estimator.feature_importances_)
    else:
        print(f"Não foi póssivel carregar a importancia do estimador {type(estimator).__name__}")
        continue

    total_importance = raw_importance.sum()
    # Guard against an all-zero vector to avoid dividing by zero.
    if total_importance > 0:
        raw_importance = raw_importance / total_importance
    importances.append(raw_importance)
        

In [94]:
# Average the per-estimator importance vectors feature by feature.
importance_mean = np.asarray(importances).mean(axis=0)

In [95]:
# Output feature names reported by the fitted preprocessor; used to label the importances.
feature_names = preprocessor.get_feature_names_out()

In [96]:
# Pair every feature name with its averaged importance, ordered from the least
# to the most important.
df_feature_names = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importance_mean})
    .sort_values(by='Importance', ascending=True)
)

In [97]:
# Horizontal bar chart with one bar per feature, using the averaged importance.
fig = px.bar(
    df_feature_names,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Importancia das Features',
)
# Fixed 1000x1280 canvas for the chart.
fig.update_layout(height=1280, width=1000)
fig.show()

### Propriedades do Modelo

In [82]:
# Take a single test row and ask each fitted base estimator for its class label.
X_sample = X_test[7].reshape(1, -1)
lr_pred = voting_model.named_estimators_['logistic regression'].predict(X_sample)
svc_pred = voting_model.named_estimators_['svc'].predict(X_sample)
tree_pred = voting_model.named_estimators_['decision tree'].predict(X_sample)

# voting_model is configured with voting='soft', so voting_model.predict() would
# return the soft-voting result here. To illustrate hard voting, compute the
# majority class among the three individual predictions explicitly.
individual_votes = np.concatenate([lr_pred, svc_pred, tree_pred]).astype(int)
voting_pred = np.array([np.bincount(individual_votes).argmax()])

In [85]:
# Print each base estimator's predicted class for the sample, followed by the
# ensemble value stored in voting_pred.
print(f"Predição do Regressao Logisitca: {lr_pred[0]}")
print(f"Predição do SVC: {svc_pred[0]}")
print(f"Predição da Decision Tree: {tree_pred[0]}")
print(f"Predição final da Hard Voting: {voting_pred[0]}")

Predição do Regressao Logisitca: 0
Predição do SVC: 0
Predição da Decision Tree: 1
Predição final da Hard Voting: 0


In [98]:
# Evidence for soft voting: class probabilities of each base estimator for one
# test row, plus the ensemble's probabilities and final label.
X_sample = X_test[7].reshape(1, -1)

base_estimators = voting_model.named_estimators_
lr_pred_proba = base_estimators['logistic regression'].predict_proba(X_sample)
svc_pred_proba = base_estimators['svc'].predict_proba(X_sample)
tree_pred_proba = base_estimators['decision tree'].predict_proba(X_sample)

voting_pred_proba = voting_model.predict_proba(X_sample)
voting_pred = voting_model.predict(X_sample)

In [101]:
# Print the class probabilities of each base estimator for the sample, then the
# ensemble's probabilities and its final predicted class.
print(f"Probabilidade do Regressao Logisitca: {lr_pred_proba[0]}")
print(f"Probabilidade do SVC: {svc_pred_proba[0]}")
print(f"Probabilidade da Decision Tree: {tree_pred_proba[0]}")
print(f"Probabilidade final da Soft Voting: {voting_pred_proba[0]}")
print(f"Predição final da Soft Voting (Votacao Ponderada): {voting_pred[0]}")

Probabilidade do Regressao Logisitca: [0.67050795 0.32949205]
Probabilidade do SVC: [0.69460835 0.30539165]
Probabilidade da Decision Tree: [0. 1.]
Probabilidade final da Soft Voting: [0.45503877 0.54496123]
Predição final da Soft Voting (Votacao Ponderada): 1
