# 0.0 Imports

In [8]:
import os
import sqlite3
from urllib.parse import quote_plus

import numpy   as np
import pandas  as pd
import umap.umap_ as umap
import regex as re
import s3fs

from sklearn             import cluster as c
from sklearn             import metrics as m
from sklearn             import preprocessing as pp
from sklearn             import decomposition as dd
from sklearn             import ensemble as en
from scipy.cluster       import hierarchy as hc
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sqlalchemy          import create_engine

# 8.0 Cluster 

In [11]:
# df8  : per-customer table that later receives the cluster labels.
# df82 : matrix handed to the linkage and silhouette computations below.
df8, df82 = (pd.read_csv(csv_path) for csv_path in ('model/df8', 'model/df82'))

## 8.1 Cluster creation

In [7]:
# Build the hierarchical linkage (Ward criterion) over the df82 matrix.
hc1 = hc.linkage(df82, method='ward')

# Cut the tree so that at most 10 flat clusters are produced.
labels = hc.fcluster(hc1, t=10, criterion='maxclust')

# Attach one label per row to the customer table.
df8['cluster'] = labels

# Silhouette score of the resulting partition, measured with Euclidean distance.
ss_value = m.silhouette_score(df82, labels, metric='euclidean')
print('SS value: {}'.format(ss_value))

SS value: 0.6450212205923372


## 8.2 Cluster profile

In [25]:
# Attach the label of each customer. Re-assigning it here keeps this cell
# re-runnable even after df8['cluster'] has been relabelled further down.
df8['cluster'] = labels

# One pass over the groups: customer count, revenue/volume totals and the mean
# recency/frequency of every cluster, ordered from highest to lowest revenue.
df_cluster = (
    df8.groupby('cluster')
       .agg(
           customers=('customer_id', 'count'),
           gross_revenue=('gross_revenue', 'sum'),
           items=('items', 'sum'),
           products=('products', 'sum'),
           invoices=('total_invoices', 'sum'),
           returns_qtt=('returns_qtt', 'sum'),
           avg_recency_days=('recency_days', 'mean'),
           avg_frequency=('frequency', 'mean'),
       )
       .sort_values('gross_revenue', ascending=False)
       .reset_index()
)

# Share of customers and of revenue held by each cluster. These are computed
# while the columns are still numeric, i.e. before any display formatting.
customer_share = 100 * df_cluster['customers'] / df_cluster['customers'].sum()
revenue_share = 100 * df_cluster['gross_revenue'] / df_cluster['gross_revenue'].sum()

# Totals are shown as whole numbers; the mean recency is truncated to whole
# days and the mean frequency is rounded to two decimals.
df_cluster = df_cluster.astype({
    'items': int,
    'products': int,
    'invoices': int,
    'returns_qtt': int,
    'avg_recency_days': int,
})
df_cluster['avg_frequency'] = df_cluster['avg_frequency'].astype(float).round(2)

# Renumber the clusters 1..k by revenue rank. Deriving the numbers from the
# row count (instead of assigning them row by row) works for any number of
# clusters; the previous hand-written version stopped at 11.
df_cluster['cluster'] = np.arange(1, len(df_cluster) + 1)

# Display formatting: from here on these columns hold strings, not numbers.
df_cluster['perc_customer'] = customer_share.map('{:,.2f} %'.format)
df_cluster['gross_perc'] = revenue_share.map('{:,.2f} %'.format)
df_cluster['gross_revenue'] = df_cluster['gross_revenue'].map('$ {:,.2f}'.format)

# Same column order as the original report.
df_cluster = df_cluster[[
    'cluster', 'customers', 'perc_customer', 'gross_revenue', 'gross_perc',
    'items', 'products', 'invoices', 'returns_qtt',
    'avg_recency_days', 'avg_frequency',
]]

df_cluster

Unnamed: 0,cluster,customers,perc_customer,gross_revenue,gross_perc,items,products,invoices,returns_qtt,avg_recency_days,avg_frequency
0,1,550,9.66 %,"$ 5,654,076.61",56.39 %,3250561,201908,7366,72580,44,0.21
1,2,819,14.38 %,"$ 1,432,428.18",14.29 %,727001,97768,3285,11821,67,0.25
2,3,1194,20.97 %,"$ 1,202,398.85",11.99 %,523178,100459,2804,6550,104,0.46
3,4,383,6.73 %,"$ 979,998.18",9.77 %,601463,61483,2535,8305,40,0.1
4,5,1050,18.44 %,"$ 479,853.66",4.79 %,207407,43740,1889,3046,132,0.57
5,6,326,5.73 %,"$ 86,580.39",0.86 %,50758,3769,353,693,155,0.98
6,7,309,5.43 %,"$ 83,215.82",0.83 %,27482,7375,391,301,181,0.84
7,8,356,6.25 %,"$ 71,615.04",0.71 %,20886,6751,411,506,179,0.92
8,9,494,8.68 %,"$ 34,508.00",0.34 %,9066,3832,520,52,193,0.98
9,10,213,3.74 %,"$ 2,800.02",0.03 %,388,325,214,4,188,1.0


In [26]:
# Relabel the clusters so that 1 is the cluster with the highest total gross
# revenue, 2 the second highest, and so on.
#
# The mapping is derived from the data instead of being typed in by hand:
#   * it stays correct when the linkage assigns different raw labels
#     (new data, different number of clusters);
#   * no label is silently collapsed into a catch-all value;
#   * the cell is idempotent: once relabelled, cluster 1 already has the
#     highest revenue, so running the cell again leaves the labels unchanged.
# NOTE(review): this assumes the previous hand-written mapping was meant to
# follow the revenue ranking shown in the cluster profile — confirm.
revenue_by_cluster = df8.groupby('cluster')['gross_revenue'].sum()

# mergesort is stable, so clusters with equal revenue keep a deterministic order.
ranked_labels = revenue_by_cluster.sort_values(ascending=False, kind='mergesort').index

cluster_rank = {old_label: rank for rank, old_label in enumerate(ranked_labels, start=1)}

# Every label present in df8 is a key of cluster_rank, so no NaN is introduced.
df8['cluster'] = df8['cluster'].map(cluster_rank)

# 9 Deploy to production

## 9.1 Insert into POSTGRES

In [27]:
# Store the invoice and item counts as integers.
for count_col in ('total_invoices', 'items'):
    df8[count_col] = df8[count_col].astype(int)

In [28]:
# Connection settings for the Postgres instance that receives the cluster table.
# Each value can be overridden through an environment variable. The password is
# read ONLY from the environment so that it never lives in the notebook.
# SECURITY: a password used to be hard-coded in this cell; treat it as leaked
# and rotate it.
# For a local run, set endpoint = 'sqlite:///insiders_db.sqlite' instead.
host = os.environ.get('INSIDERS_DB_HOST', 'insiders-database.cq1cjxmuhsns.us-east-2.rds.amazonaws.com')
port = os.environ.get('INSIDERS_DB_PORT', '5432')
database = os.environ.get('INSIDERS_DB_NAME', 'postgres')
user = os.environ.get('INSIDERS_DB_USER', 'eduardo')
pwd = os.environ.get('INSIDERS_DB_PASSWORD')

if not pwd:
    raise RuntimeError('Set the INSIDERS_DB_PASSWORD environment variable before running this cell.')

# Build the URL from the settings above (previously they were defined but the
# URL was a separate literal). User and password are URL-escaped so characters
# such as '@' or '/' cannot break the connection string.
endpoint = 'postgresql://{}:{}@{}:{}/{}'.format(
    quote_plus(user), quote_plus(pwd), host, port, database
)
conn = create_engine(endpoint)

# Reference only: DDL of the target table, kept for first-time setup.
#
#    CREATE TABLE insiders (
#        customer_id     INTEGER,
#        gross_revenue   REAL,
#        recency_days    INTEGER,
#        frequency       REAL,
#        total_invoices  INTEGER,
#        items           INTEGER,
#        products        INTEGER,
#        returns_qtt     INTEGER,
#        cluster         INTEGER
#    )

# Append the labelled customers to the 'insiders' table.
# NOTE(review): with if_exists='append' every re-run of this cell adds the same
# rows again — confirm that duplicates are acceptable or clear the table first.
df8.to_sql( 'insiders', con=conn, if_exists='append', index=False )

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f388e00b400>

In [32]:
# Query the database (disabled; uncomment the lines below to run it)
# query = """
#     SELECT * FROM insiders 
# """

# df = pd.read_sql_query( query, conn )

# df.head()