# Notebook para servir predições do modelo

## Bibliotecas

In [32]:
import pandas as pd 
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import joblib 
from sqlalchemy import create_engine, text
import sqlalchemy
import os
from dotenv import load_dotenv

from sklearn.metrics import silhouette_score

import mlflow
import mlflow.sklearn

from datetime import datetime

In [2]:
# Location of the .env file that holds the database credentials.
# The DOTENV_PATH environment variable overrides the developer-machine default,
# so the notebook can run on other machines without editing this cell.
path_dotenv = os.environ.get(
    'DOTENV_PATH',
    r'C:\Users\erico\Documents\projeto-clusterizacao\customer-segmentation\env\.env',
)
load_dotenv(path_dotenv)

True

In [3]:
# Utils

def make_monitoring(X, model, experiment_name, model_name, run_name, score,
                    tracking_uri='http://127.0.0.1:5000'):
    """Log one scoring run (parameters, feature list, metric and model) to MLflow.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix used for the prediction; only its column names are logged.
    model : estimator
        Fitted estimator; must expose ``get_params()``.
    experiment_name : str
        MLflow experiment that receives the run.
    model_name : str
        Artifact path under which the model is logged.
    run_name : str
        Display name of the MLflow run.
    score : float
        Value logged as the ``silhouette_score`` metric.
    tracking_uri : str, optional
        MLflow tracking server URI. Defaults to the local server this notebook
        originally targeted.

    Returns
    -------
    None
    """
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    # The context manager ends the run on exit (including on error), so no
    # explicit mlflow.end_run() call is needed afterwards.
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(model.get_params())
        mlflow.log_param('Features', X.columns.to_list())
        mlflow.log_metric('silhouette_score', score)
        mlflow.sklearn.log_model(model, model_name)

    print('silhouette_score:', score)
    print('Iniciando monitoramento do experimento no Mlflow...')

    return None

## Extração

In [4]:
# Create the MySQL connection engine.
# URL.create escapes the password, so characters such as '@', ':' or '/' cannot
# corrupt the connection string. Indexing os.environ raises a clear KeyError
# when MYSQL_PASSWORD is not set, instead of a TypeError from str + None.
engine = create_engine(
    sqlalchemy.engine.URL.create(
        drivername='mysql+pymysql',
        username='admin',
        password=os.environ['MYSQL_PASSWORD'],
        host='localhost',
        port=3306,
        database='Bank_Credit_Card',
    )
)

In [5]:
# Read the whole customer table into a DataFrame.
query = 'SELECT * FROM customer_credit_card'
# The context manager returns the connection to the pool once the read is done;
# a bare engine.connect() passed inline would never be closed.
with engine.connect() as connection:
    df = pd.read_sql_query(sql=text(query), con=connection)

## Pré-processamento

In [6]:
def preprocessing_data(df:pd.DataFrame) ->pd.DataFrame:
    """Clean the raw customer table and return the model's feature matrix.

    Side effects on ``df`` (the caller's frame is modified in place):
      * missing ``MINIMUM_PAYMENTS`` / ``CREDIT_LIMIT`` values become ``0.0``;
      * ``CUST_ID`` becomes the index.

    The function can be called again on an already processed frame: the index
    step is skipped when ``CUST_ID`` is no longer a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer table; must contain ``MINIMUM_PAYMENTS`` and ``CREDIT_LIMIT``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df`` but without the
        ``CLUSTER_KMEANS_PCA`` and ``SEGMENTATION`` columns, if present.
    """
    # Missing values are replaced with 0.0.
    for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
        df.loc[df[column].isnull(), column] = 0.0

    # Use CUST_ID as the index so the high-cardinality id is not used as a
    # feature. Guarded so that re-running the cell does not raise KeyError.
    if 'CUST_ID' in df.columns:
        df.set_index('CUST_ID', inplace=True)

    # Drop the columns produced by a previous scoring run. errors='ignore'
    # keeps this working on a table that has never been scored.
    X = df.drop(columns=['CLUSTER_KMEANS_PCA', 'SEGMENTATION'], errors='ignore')

    return X


In [7]:
# Build the feature matrix. preprocessing_data also modifies df in place
# (fills missing values and sets CUST_ID as the index).
X = preprocessing_data(df)

In [8]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0_level_0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
CUST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
C15490,8419.74,1.0,3050.48,1475.56,1574.92,2127.49,1.0,1.0,1.0,0.25,5,67,12300.0,3298.74,4239.3,0.0,12
C17237,2980.05,0.818182,26784.6,26514.3,270.3,0.0,0.583333,0.5,0.083333,0.0,0,33,30000.0,30029.0,11853.8,0.75,12
C10529,2643.34,1.0,26402.4,22257.4,4145.0,0.0,1.0,1.0,0.333333,0.0,0,114,16500.0,24529.3,534.032,1.0,12
C14918,4929.76,1.0,4939.1,4939.1,0.0,2365.84,0.666667,0.666667,0.0,0.25,7,309,6000.0,3952.19,5234.37,0.0,12
C10534,2549.13,1.0,13771.6,4503.31,9268.32,0.0,1.0,0.5,1.0,0.0,0,83,14500.0,16826.4,451.636,0.25,12


## Predição

In [9]:
# Path of the serialized model. The MODEL_PATH environment variable overrides
# the developer-machine default.
model_path = os.environ.get(
    'MODEL_PATH',
    r'C:\Users\erico\Documents\projeto-clusterizacao\customer-segmentation\models\model.pkl',
)
# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files from a trusted source.
# The with-block closes the file handle, which the inline open() never did.
with open(model_path, 'rb') as model_file:
    model = joblib.load(model_file)

In [10]:
# Display the loaded model.
model

In [12]:
# Assign every customer to a cluster.
df['CLUSTER_KMEANS_PCA'] = model.predict(X)
# Refresh the segment label right away: the charts further down read
# df['SEGMENTATION'], which would otherwise still hold the labels loaded from
# the database rather than the ones implied by the predictions above.
df['SEGMENTATION'] = df['CLUSTER_KMEANS_PCA'].map({2:'Vip', 3:'Plus', 1:'Mid', 0:'Low',})

# NOTE(review): the silhouette is computed on the raw feature matrix, while the
# model exposes 'scaler' and 'PCA' steps before 'kmeans'. Confirm this is the
# same feature space used for the metric at training time.
score = silhouette_score(X, df['CLUSTER_KMEANS_PCA'])

experiment_name = 'Segmentação em produção'
model_name = 'Kmeans produção'
# Timestamp the run name so each scoring run can be told apart in MLflow.
date = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
run_name = model_name + ' - ' + date

In [13]:
# Log this scoring run (parameters, features, metric and model) to MLflow.
make_monitoring(
    X=X,
    model=model,
    experiment_name=experiment_name,
    model_name=model_name,
    run_name=run_name,
    score=score,
)

silhouette_score: 0.05271308224052359
Iniciando monitoramento do experimento no Mlflow...




In [15]:
# Number of customers assigned to each cluster.
df['CLUSTER_KMEANS_PCA'].value_counts()

1    3906
3    3292
0    1246
2     506
Name: CLUSTER_KMEANS_PCA, dtype: int64

In [16]:
# Take the k-means centroids back through the pipeline in reverse order:
# first the inverse of the 'PCA' step, then the inverse of the 'scaler' step,
# and label the result with the feature names.
pipeline_steps = model.named_steps
centroids_kmeans_space = pipeline_steps['kmeans'].cluster_centers_
centroids_before_pca = pipeline_steps['PCA'].inverse_transform(centroids_kmeans_space)
cluster_centroids = pd.DataFrame(
    data=pipeline_steps['scaler'].inverse_transform(centroids_before_pca),
    columns=X.columns,
)


In [17]:
# Display the centroids, one row per cluster.
cluster_centroids

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,4323.978714,0.968937,829.292701,638.575353,190.70229,4148.49596,0.160248,0.137041,0.064058,0.441263,13.204102,7.859752,7043.255465,3862.662483,2112.064527,-0.056467,11.441095
1,1110.521381,0.832008,-218.486699,-215.280944,-2.975929,865.49109,0.328533,0.083135,0.228706,0.14255,3.142373,1.500175,3268.596057,531.434248,549.366987,0.111668,11.369538
2,3394.202657,1.069061,6314.8702,4096.018188,2219.473625,1309.419444,1.209553,0.723716,0.968896,0.087394,3.203392,72.396071,9682.672679,6840.652341,2004.218125,0.346933,12.162653
3,774.393001,0.866541,1695.719092,990.535858,705.636492,-138.38815,0.695961,0.288057,0.545614,0.017804,-0.388638,24.040837,4178.361664,1561.095805,506.279931,0.25328,11.621555


## Dataviz

In [18]:
# Bar chart with the number of customers in each segment.
segment_counts = df['SEGMENTATION'].value_counts()
fig = px.bar(segment_counts, title="Número de clientes segmentados por grupo")
fig.update_layout(
    xaxis_title="Segmentação",
    yaxis_title="Número de clientes",
    showlegend=False,
)
fig.show()

In [31]:
# Radar chart overlaying the average feature profile of every cluster.
fig = go.Figure()
angles = list(X.columns)
# Repeat the first axis at the end so that theta has the same length as r once
# each polygon is closed below.
closed_angles = angles + angles[:1]

layoutdict = dict(
    radialaxis=dict(
        visible=True,
        range=[0, 1]
    )
)

# The radial axis is fixed to [0, 1], but the features are in raw units, so
# min-max scale each feature before averaging. Constant columns get a span of 1
# to avoid dividing by zero.
feature_min = df[angles].min()
feature_span = (df[angles].max() - feature_min).replace(0, 1)
features_scaled = (df[angles] - feature_min) / feature_span

unique_clusters = df['CLUSTER_KMEANS_PCA'].unique()
for i in unique_clusters:
    in_cluster = df['CLUSTER_KMEANS_PCA'] == i
    # Select features by name (X.columns) rather than by position, so the
    # chart does not depend on the label columns being the last two of df.
    data = features_scaled.loc[in_cluster].mean().tolist()
    data.append(data[0])
    fig.add_trace(go.Scatterpolar(
        r=data,
        theta=closed_angles,
        fill='toself',
        name="SEGMENTATION " + df.loc[in_cluster, 'SEGMENTATION'].unique()[0]
    ))

fig.update_layout(
    polar=layoutdict,
    showlegend=True,
    height=700,
    width=1400
)
fig.show()

In [44]:
# One radar chart per segment, laid out on a two-column grid.
segments = df['SEGMENTATION'].unique()
n_cols = 2
n_rows = max(1, -(-len(segments) // n_cols))  # ceiling division
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=segments,
    specs=[[{'type': 'polar'} for _ in range(n_cols)] for _ in range(n_rows)],
)

angles = list(X.columns)
# Repeat the first axis at the end so that theta has the same length as r once
# each polygon is closed below.
closed_angles = angles + angles[:1]
layoutdict = dict(radialaxis=dict(visible=True, range=[0, 1], tickfont=dict(size=6)))

# The radial axis is fixed to [0, 1], but the features are in raw units, so
# min-max scale each feature before averaging. Constant columns get a span of 1
# to avoid dividing by zero.
feature_min = df[angles].min()
feature_span = (df[angles].max() - feature_min).replace(0, 1)
features_scaled = (df[angles] - feature_min) / feature_span

for position, segment in enumerate(segments):
    # Fill the grid left to right, top to bottom (plotly rows/cols are 1-based).
    row, col = divmod(position, n_cols)
    data = features_scaled.loc[df['SEGMENTATION'] == segment].mean().tolist()
    data.append(data[0])

    fig.add_trace(go.Scatterpolar(
        r=data,
        theta=closed_angles,
        fill='toself',
        name="Segmentation: " + segment
    ), row=row + 1, col=col + 1)

# update_polars applies the axis settings to every polar subplot; setting
# layout.polar through update_layout would only style the first one.
fig.update_polars(**layoutdict)
fig.update_layout(
    showlegend=True,
    height=800,
    width=1400,
    template="plotly"
)

fig.show()

## Carga

In [22]:
# Preview the scored table before loading it back into the database.
df.head()

Unnamed: 0_level_0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,CLUSTER_KMEANS_PCA,SEGMENTATION
CUST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C15490,8419.74,1.0,3050.48,1475.56,1574.92,2127.49,1.0,1.0,1.0,0.25,5,67,12300.0,3298.74,4239.3,0.0,12,2,Vip
C17237,2980.05,0.818182,26784.6,26514.3,270.3,0.0,0.583333,0.5,0.083333,0.0,0,33,30000.0,30029.0,11853.8,0.75,12,2,Vip
C10529,2643.34,1.0,26402.4,22257.4,4145.0,0.0,1.0,1.0,0.333333,0.0,0,114,16500.0,24529.3,534.032,1.0,12,2,Vip
C14918,4929.76,1.0,4939.1,4939.1,0.0,2365.84,0.666667,0.666667,0.0,0.25,7,309,6000.0,3952.19,5234.37,0.0,12,2,Vip
C10534,2549.13,1.0,13771.6,4503.31,9268.32,0.0,1.0,0.5,1.0,0.0,0,83,14500.0,16826.4,451.636,0.25,12,2,Vip


In [23]:
# Add the segmentation column: translate each k-means cluster id into its
# segment label.
segment_by_cluster = {0: 'Low', 1: 'Mid', 2: 'Vip', 3: 'Plus'}
df['SEGMENTATION'] = df['CLUSTER_KMEANS_PCA'].map(segment_by_cluster)

In [24]:
# Order the rows by segment label in descending alphabetical order.
df_load = df.sort_values(by='SEGMENTATION', ascending=False)

In [25]:
# Turn CUST_ID back into a regular column so that to_sql(index=False) persists it.
# Guarded so that re-running this cell does not add a spurious 'index' column,
# which would otherwise be written to the database.
if 'CUST_ID' not in df_load.columns:
    df_load = df_load.reset_index()

In [26]:
# Preview the frame that will be written to the database.
df_load.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,CLUSTER_KMEANS_PCA,SEGMENTATION
0,C15490,8419.74,1.0,3050.48,1475.56,1574.92,2127.49,1.0,1.0,1.0,0.25,5,67,12300.0,3298.74,4239.3,0.0,12,2,Vip
1,C18125,530.574,1.0,5221.42,2202.78,3018.64,0.0,1.0,1.0,1.0,0.0,0,135,3000.0,4590.88,191.304,0.583333,12,2,Vip
2,C12829,1141.35,1.0,6003.86,3474.35,2529.51,0.0,1.0,0.916667,1.0,0.0,0,110,2500.0,6848.24,287.309,0.083333,12,2,Vip
3,C11356,2415.41,1.0,3313.65,1938.44,1375.21,843.735,1.0,0.666667,1.0,0.416667,10,55,8500.0,858.811,582.753,0.0,12,2,Vip
4,C14494,3277.09,1.0,3936.51,3249.14,687.37,517.563,1.0,0.666667,0.916667,0.166667,2,57,10000.0,2183.77,812.767,0.0,12,2,Vip


In [27]:
# Load the scored frame into MySQL through pandas' SQL integration.
# if_exists='replace' drops and recreates the customer_credit_card table.
df_load.to_sql(
    'customer_credit_card',
    con=engine,
    if_exists='replace',
    index=False,
)

8950

In [28]:
# Close the database connections held by the engine.
engine.dispose()