# Data Dictionary

|Attribute|Description
----------|-----------
InvoiceNo| Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
StockCode| Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
Description| Product (item) name. Nominal.
Quantity| The quantities of each product (item) per transaction. Numeric.
InvoiceDate| Invoice Date and time. Numeric, the day and time when each transaction was generated.
UnitPrice| Unit price. Numeric, Product price per unit in sterling.
CustomerID| Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
Country| Country name. Nominal, the name of the country where each customer resides.

# 0.0 Imports

In [14]:
import pandas as pd
import inflection
#from src import GeneralUtils  as gu
#import seaborn as sns
#import matplotlib.pyplot as plt
#import plotly.express as px
#import matplotlib.cm as cm
import numpy as np
#import scipy.stats as st
#from scipy.stats import kstest

import umap.umap_ as um
#from sklearn.decomposition import PCA
import sklearn.metrics as mt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sklearn.preprocessing as pp
import sklearn.manifold as man

from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans, AgglomerativeClustering
import sklearn.mixture as mix
from scipy.cluster.hierarchy import dendrogram, ward, linkage, fcluster

import dotenv
import datetime as dt
import sqlite3
from sqlalchemy import create_engine
import pickle
import s3fs

In [15]:
def recency_feat(df_feature_, df_feature_transaction_):
    """Compute each customer's recency in whole days.

    Recency is the gap between the latest ``invoice_date`` found in
    ``df_feature_`` (the reference date) and the customer's own latest
    ``invoice_date`` in ``df_feature_transaction_``.

    Parameters
    ----------
    df_feature_ : pandas.DataFrame
        Frame whose ``invoice_date`` column supplies the reference date.
    df_feature_transaction_ : pandas.DataFrame
        Frame with ``customer_id`` and ``invoice_date`` columns; one output
        row is produced per distinct ``customer_id`` found in it.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``recency`` (integer number of days).
    """
    reference_date = df_feature_['invoice_date'].max()

    # Aggregate only the date column: taking the max of every column is
    # wasted work and can fail on columns whose values cannot be ordered.
    last_purchase = df_feature_transaction_.groupby('customer_id')['invoice_date'].max()

    recency = (reference_date - last_purchase).dt.days.rename('recency').reset_index()

    return recency


def frequency_feat(df_feature_, df_feature_transaction_):
    """Compute each customer's purchase frequency.

    Frequency is the number of distinct invoices of a customer divided by
    the number of days between that customer's first invoice and the latest
    ``invoice_date`` in ``df_feature_`` (both ends counted, hence the +1).

    Parameters
    ----------
    df_feature_ : pandas.DataFrame
        Frame whose ``invoice_date`` column supplies the reference (latest) date.
    df_feature_transaction_ : pandas.DataFrame
        Frame with ``customer_id``, ``invoice_date`` and ``invoice_no`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``max_``, ``min_``, ``days_``, ``buy_`` and
        ``frequency`` (in that order), one row per customer.
    """
    max_date = df_feature_['invoice_date'].max()

    aux = (
        df_feature_transaction_
        .groupby('customer_id')
        .agg(
            max_=('invoice_date', 'max'),
            min_=('invoice_date', 'min'),
            buy_=('invoice_no', 'nunique'),
        )
        .reset_index()
    )

    # Derive the day span from the already aggregated first-purchase date in
    # one vectorised step instead of calling a Python lambda once per group.
    # Inserted at position 3 so the column order stays max_, min_, days_, buy_.
    aux.insert(3, 'days_', (max_date - aux['min_']).dt.days + 1)

    aux['frequency'] = aux['buy_'] / aux['days_']

    return aux


def total_spent_feat(df_feature_):
    """Compute the total amount spent by each customer.

    Side effect: a ``gross_revenue`` column (``quantity * unit_price``) is
    added to ``df_feature_`` in place. ``avg_ticket_feat`` reads that column,
    so the mutation is kept on purpose.

    Parameters
    ----------
    df_feature_ : pandas.DataFrame
        Frame with ``customer_id``, ``quantity`` and ``unit_price`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``total_spent``, one row per customer.
    """
    df_feature_['gross_revenue'] = df_feature_['quantity'] * df_feature_['unit_price']

    # The previous version also merged the totals back onto a local copy of
    # the frame and then discarded it; that dead merge has been removed.
    monetary = (
        df_feature_
        .groupby('customer_id')['gross_revenue']
        .sum()
        .rename('total_spent')
        .reset_index()
    )

    return monetary


def transactions_feat(df_features_):
    """Count the distinct invoice numbers of each customer.

    Returns a frame with the columns ``customer_id`` and ``transactions``.
    """
    invoices_per_customer = df_features_.groupby('customer_id')['invoice_no'].nunique()
    transactions = invoices_per_customer.rename('transactions').reset_index()

    return transactions


def avg_ticket_feat(df_features_):
    """Average invoice value per customer.

    Expects a ``gross_revenue`` column: the revenue is first summed per
    (customer, invoice) pair and those invoice totals are then averaged for
    each customer. The averaged value is returned as ``avg_ticket``.
    """
    summed_per_invoice = df_features_.groupby(['customer_id', 'invoice_no']).sum(numeric_only=True)

    invoice_totals = summed_per_invoice.gross_revenue.reset_index()
    invoice_totals = invoice_totals.rename(columns={'gross_revenue': 'avg_ticket'})

    avg_ticket = invoice_totals.groupby('customer_id').mean(numeric_only=True).reset_index()

    return avg_ticket


def product_variety_feat(df_features_):
    """Count the distinct product descriptions of each customer.

    Returns a frame with the columns ``customer_id`` and ``product_variety``.
    """
    descriptions_per_customer = df_features_.groupby('customer_id')['description'].nunique()
    product_variety = descriptions_per_customer.rename('product_variety').reset_index()

    return product_variety


def avg_basket_feat(df_features_):
    """Average basket size per customer.

    The quantity is summed per (customer, invoice) pair and those invoice
    totals are then averaged for each customer; the result column is named
    ``avg_basket``.
    """
    basket_rows = df_features_[['customer_id', 'invoice_no', 'quantity']]

    quantity_per_invoice = basket_rows.groupby(['customer_id', 'invoice_no']).sum().reset_index()
    mean_per_customer = quantity_per_invoice.groupby('customer_id').mean(numeric_only=True).reset_index()

    avg_basket = mean_per_customer.rename(columns={'quantity': 'avg_basket'})

    return avg_basket


def avg_basket_variety_feat(df_features_):
    """Average number of distinct stock codes per invoice, per customer.

    Distinct ``stock_code`` values are counted per (customer, invoice) pair
    and those counts are then averaged for each customer; the result column
    is named ``avg_basket_variety``.
    """
    variety_rows = df_features_[['customer_id', 'invoice_no', 'stock_code']]

    codes_per_invoice = variety_rows.groupby(['customer_id', 'invoice_no']).nunique().reset_index()
    mean_per_customer = codes_per_invoice.groupby('customer_id').mean(numeric_only=True).reset_index()

    avg_basket_var = mean_per_customer.rename(columns={'stock_code': 'avg_basket_variety'})

    return avg_basket_var


def cancellations_feat(df_features_):
    """Sum the absolute quantity of each customer's rows.

    Despite the output column name, the value is a quantity total (the sum of
    ``abs(quantity)``), not a count of cancelled invoices.

    Parameters
    ----------
    df_features_ : pandas.DataFrame
        Frame with ``customer_id`` and ``quantity`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``number_of_cancellations``.
    """
    # Vectorised abs + grouped sum. The previous groupby.apply also ran abs()
    # on the grouping column, which fails for non-numeric ids, is slow, and
    # relies on behaviour that newer pandas versions deprecate.
    cancelled_units = df_features_['quantity'].abs().groupby(df_features_['customer_id']).sum()

    cancellations = cancelled_units.rename('number_of_cancellations').reset_index()

    return cancellations


def quantity_feat(df_features_):
    """Sum the quantity of each customer.

    Returns a frame with the columns ``customer_id`` and ``total_quantity``.
    """
    numeric_totals = df_features_.groupby('customer_id').sum(numeric_only=True)
    quantity = numeric_totals.quantity.rename('total_quantity').reset_index()

    return quantity

In [16]:
# Parse the .env file once and reuse the mapping (it used to be read from
# disk separately for each key).
aws_env = dotenv.dotenv_values('../vars/.env')
secret = aws_env['SECRET_KEY']
key = aws_env['ACCESS_KEY']

# anon=False: connect with the explicit key/secret pair instead of anonymously.
fs = s3fs.S3FileSystem(anon=False, key=key, secret=secret)

# 0.1 Load Data

In [17]:
# Raw e-commerce export, read straight from the S3 bucket with the
# credentials loaded from the .env file.
path_s3 = 's3://insider-data-bucket/'
df_raw = pd.read_csv(
    path_s3 + 'Ecommerce.csv',
    encoding='ISO-8859-1',
    storage_options={'key': key, 'secret': secret},
)
# Local alternative:
#df_raw = pd.read_csv('../data/raw/Ecommerce.csv', encoding='ISO-8859-1')

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 1.0. Data Description

In [18]:
# Discard the 'Unnamed: 8' column (presumably an empty trailing CSV column — confirm).
df_description = df_raw.drop(columns=['Unnamed: 8'])

## 1.1. Renaming Columns

In [19]:
def snakecase(x):
    """Return the column name *x* converted by ``inflection.underscore``."""
    return inflection.underscore(x)


new_cols = [snakecase(column_name) for column_name in df_description.columns]

df_description.columns = new_cols

In [20]:
# Normalise product descriptions to lower case.
df_description = df_description.assign(description=df_description['description'].str.lower())

## 1.4.  Handling Missing Values

**I will add a custom customer ID for each unique invoice number with a missing customer ID**

Issue: Doing this, these new customers' IDs will only have one transaction, but this might be better than removing all this data.

In [21]:
# Rows without a customer id: every distinct invoice among them gets its own
# synthetic customer id so the rows can be kept.
df_missing = df_description[df_description['customer_id'].isna()]
missing_invoice_no = df_missing[['invoice_no']].drop_duplicates()

# Synthetic ids start at 19000, as before, unless a real id already reaches
# that range; then they start just above the largest real id to avoid collisions.
max_known_id = df_description['customer_id'].max()
first_synthetic_id = 19000 if pd.isna(max_known_id) else max(19000, int(max_known_id) + 1)

# assign() builds a new frame instead of writing a column into a derived slice.
missing_invoice_no = missing_invoice_no.assign(
    customer_id=np.arange(first_synthetic_id, first_synthetic_id + len(missing_invoice_no))
)

# After the merge, customer_id_x holds the original ids and customer_id_y the
# synthetic ones; original ids win wherever they exist.
df_description = df_description.merge(missing_invoice_no, on='invoice_no', how='left')
df_description['customer_id'] = df_description['customer_id_x'].combine_first(df_description['customer_id_y'])
df_clean = df_description.drop(columns=['customer_id_x', 'customer_id_y'])

## 1.5. Fixing Data Types

In [22]:
# Parse the invoice timestamps and store customer ids as integers.
df_clean['invoice_date'] = pd.to_datetime(df_clean['invoice_date'])
df_clean['customer_id'] = df_clean['customer_id'].astype(int)

## 1.6. Descriptive Statistics

### 1.6.1 Numerical Attributes

In [23]:
# Split the columns into numeric attributes and everything that is neither
# numeric nor a timestamp.
numeric_dtypes = ['float64', 'int64']
num_att = df_clean.select_dtypes(include=numeric_dtypes)
cat_att = df_clean.select_dtypes(exclude=numeric_dtypes + ['datetime64[ns]'])

### 1.6.2 Categorical Attributes

**Invoice numbers containing at least one letter**

In [24]:
# Rows whose invoice number contains a 'C' (cancellations). The mask is built
# from df_clean itself rather than from another frame that merely shares its
# index; na=False keeps rows without an invoice number out of the selection.
letter_invoices = df_clean.loc[df_clean['invoice_no'].str.contains('C', na=False)]

# Both figures are row counts.
print('Total Number of Cancellation invoices:', len(letter_invoices))
print('Total Number of Negative quantites:', len(letter_invoices.query('quantity < 0')))

Total Number of Cancellation invoices: 9288
Total Number of Negative quantites: 9288


![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 2.0. Variables Filtering

In [25]:
#df_filter = df_clean.copy()

## 2.1. Numerical

Judging by their descriptions, the rows with a negative quantity that are not cancellations appear to be mistakes or issues with the product. Since these rows don't represent actual sales, we will remove them from the dataset.

In [26]:
# Rows with a non-positive quantity on a purely numeric invoice number are
# not cancellations, so they are treated as adjustments and dropped.
# Explicit masks spell out the grouping that the query string left to the
# parser's precedence of `<=` and `&`.
is_adjustment = (df_clean['quantity'] <= 0) & df_clean['invoice_no'].str.isdigit()

# Unit prices below 0.004 are treated as effectively zero and dropped as well.
has_real_price = df_clean['unit_price'] >= 0.004

# Boolean indexing already returns a new frame, so no prior copy of df_clean is needed.
df_filter = df_clean[~is_adjustment & has_real_price]

**Unit prices very close to zero**

Half of the instances don't have a description, which makes it harder to define what these instances mean. Therefore, they will be removed (unit price < 0.004).

In [27]:
#df_filter = df_filter.query('unit_price >= 0.004')

## 2.2. Categorical

In [28]:
# Drop the stock codes that consist only of letters ...
df_filter = df_filter.query("~stock_code.isin(['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY', 'DCGSSGIRL', 'PADS', 'B', 'CRUK'])")

# ... and the rows whose country is not an actual country.
df_filter = df_filter.query("~country.isin(['European Community', 'Unspecified'])")

# removing European Community and Unspecified
#df_filter = df_filter.query("~country.isin(['European Community', 'Unspecified'])")

#country_dict = {'EIRE': 'Ireland', 'RSA': 'South Africa', 'USA': 'United States', 'Channel Islands': 'United Kingdom'}

#countries_to_modify = ['USA', 'Channel Islands', 'RSA', 'EIRE']

#df_filter.country = df_filter.country.apply(lambda x: country_dict[x] if x in countries_to_modify else x)

In [29]:
# Share and absolute number of rows removed by the filters (df_clean vs. df_filter)
print(f'Removed data: {1 - (df_filter.shape[0]/df_clean.shape[0]):.3%}')
print(f'Number of Elements Removed: {df_clean.shape[0] - df_filter.shape[0]}')

Removed data: 1.064%
Number of Elements Removed: 5765


In [30]:
# Row counts for the cancellation rows and for the purchase rows.
print('Number of rows Cancelation:', len(df_filter.query("quantity < 0 or invoice_no.str.contains('C')")))
print('Number of rows Transaction:', len(df_filter.query('quantity > 0')))

Number of rows Cancelation: 8730
Number of rows Transaction: 527414


![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 3.0. Feature Engineering

In [31]:
# All filtered rows (purchases and cancellations together).
df_feature = df_filter.copy()

# Purchases only: rows with a positive quantity.
df_feature_transaction = df_filter.query('quantity > 0').copy()

# Cancellations only: negative quantity or a 'C' in the invoice number.
df_feature_cancellation = df_filter.query("quantity < 0 or invoice_no.str.contains('C')").copy()

**Recency, Frequency and Monetary**

In [35]:
# Days between the latest invoice date in df_feature and each customer's latest purchase
recency = recency_feat(df_feature, df_feature_transaction)

# Distinct purchase invoices divided by the days elapsed since the customer's
# first purchase (counted up to the latest invoice date in df_feature)
frequency = frequency_feat(df_feature, df_feature_transaction)

# Total spent per customer. The call also adds a gross_revenue column to
# df_feature in place; avg_ticket_feat needs that column, so this must run first.
monetary = total_spent_feat(df_feature)

**Number of transactions, Variety, Average Ticket and Average Basket Size**

In [36]:
# Distinct invoice numbers per customer (df_feature still contains the cancellation rows)
transactions = transactions_feat(df_feature)

# Mean invoice value per customer; reads the gross_revenue column of df_feature
avg_ticket = avg_ticket_feat(df_feature)

# Distinct product descriptions per customer (purchase rows only)
product_variety = product_variety_feat(df_feature_transaction)

# Mean summed quantity per invoice, per customer
avg_basket = avg_basket_feat(df_feature)

# Mean number of distinct stock codes per invoice, per customer (purchase rows only)
avg_basket_variety = avg_basket_variety_feat(df_feature_transaction)

**Average Recency, Total Cancellations, Quantity** - <font color='red'>Average Recency Removed</font>

In [37]:
# Summed absolute quantity over each customer's cancellation rows
cancellations = cancellations_feat(df_feature_cancellation)

# Summed quantity per customer over all rows of df_feature
quantity = quantity_feat(df_feature)

In [38]:
# Build one row per customer and attach every engineered feature to it.
# Duplicates are dropped before the first merge so the recency table is joined
# onto one row per customer rather than onto every transaction row; no explicit
# copy is needed because drop_duplicates and merge both return new frames.
df_feature_new = (
    df_feature
    .drop_duplicates('customer_id')
    .merge(recency, how='left', on='customer_id')
    .merge(frequency[['customer_id', 'frequency']], how='left', on='customer_id')
    .merge(monetary, how='left', on='customer_id')
    .merge(transactions, how='left', on='customer_id')
    .merge(avg_ticket, how='left', on='customer_id')
    .merge(product_variety, how='left', on='customer_id')
    .merge(avg_basket, how='left', on='customer_id')
    .merge(avg_basket_variety, how='left', on='customer_id')
    .merge(cancellations, how='left', on='customer_id')
    .merge(quantity, how='left', on='customer_id')
)


Some customers only have cancellations; they need to be removed.

In [39]:
# Customer-level columns kept for the analysis.
columns = ['customer_id', 'recency', 'frequency', 'total_spent', 'transactions', 'total_quantity', 'avg_ticket', 'product_variety', 'avg_basket', 'avg_basket_variety', 'number_of_cancellations']

# Select first, then copy: this copies only the needed columns and yields an
# independent frame, so later column assignments do not act on a derived slice.
df_customers = df_feature_new[columns].copy()
#df_customers.head()

Null values in the number of cancellations belong to customers with no cancellations.

I want to focus on customers who have made purchases. Therefore, I will remove those with a zero or negative monetary value, which may mean they have more canceled orders than purchases.

In [40]:
# A missing cancellation figure is filled with 0.
df_customers = df_customers.fillna({'number_of_cancellations': 0})

# Keep only customers whose total spent is positive.
df_customers_clean = df_customers.query('(total_spent > 0)')

#print('Items Removed:', df_customers.shape[0] - df_customers_clean.shape[0])

In [41]:
# Exclude customer 16446 and every customer whose total quantity is zero.
df_eda = df_customers_clean.query('~(customer_id == 16446) & (total_quantity!=0)').copy()

**Removing bad users**

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 5.0. Data Preparation

In [42]:
#df_prep = df_eda.copy()

In [43]:
#std = Pipeline([('standard', pp.StandardScaler())])
#minmax = Pipeline([('minmax', pp.MinMaxScaler())])
#robust = Pipeline([('robust', pp.RobustScaler())])
#cols = ['recency', 'frequency', 'total_spent', 'total_quantity', 'transactions', 'avg_ticket', 'product_variety', 'avg_basket', 'number_of_cancellations']
#pass_cols = ['customer_id'] 
#
#preprocess = ColumnTransformer([('pass', 'passthrough', pass_cols), ('minmax', minmax, cols)])
##std_preprocess = ColumnTransformer([('pass', 'passthrough', pass_cols), ('standard', std, cols)])

**Using Min Max Scaler for all features.**

In [44]:
#df_prep_scaled
#df_prep_scaled = pd.DataFrame(preprocess.fit_transform(df_prep))
#cols = np.concatenate((pass_cols, cols), axis=0)
#df_prep_scaled.columns = cols

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 6.0. Feature Selection

In [45]:
# Features kept for the model. Left out relative to the customer table:
# transactions, avg_ticket, product_variety, avg_basket, avg_basket_variety.
columns_selected = [
    'customer_id',
    'recency',
    'frequency',
    'total_spent',
    'total_quantity',
    'number_of_cancellations',
]

In [46]:
# Independent frame holding only the selected features.
df_fselect = df_eda[columns_selected].copy()
#df_fselect_sc = df_prep_scaled.copy()[columns_selected]

# 7.0. Embedding Space

In [47]:
# The embedding works on the features only, so the identifier is dropped.
df_embed = df_fselect.drop(columns='customer_id').copy()

#df_embed_sc = df_fselect_sc.drop(columns=['customer_id']).copy()

## 7.4. Tree-Based Embedding

A tree model fitted with `total_spent` as the target organizes the customers in a way that lets me group the most valuable clients together.

In [48]:
X = df_embed.drop(columns='total_spent')
y = df_embed['total_spent']

# The model is not fitted here; a pre-trained one is loaded. Training code kept for reference:
# rf = RandomForestRegressor(n_estimators=100, random_state=42)
# rf.fit(X, y)

#rf = pickle.load(open('../model/rf_model.pkl', 'rb'))
# NOTE(security): unpickling executes arbitrary code, so only load artefacts
# from a bucket you control.
# The context manager closes the S3 file handle, which was previously left open.
with fs.open('s3://insider-data-bucket/rf_model.pkl', 'rb') as rf_file:
    rf = pickle.load(rf_file)

# Apply the fitted trees to the data: one column per tree holding the index
# of the leaf each customer ends up in.
df_leaf = pd.DataFrame(rf.apply(X))


In [49]:
# The reducer is not fitted here; a pre-trained one is loaded. Configuration kept for reference:
#umap = um.UMAP(n_neighbors=50, random_state=42, n_jobs=-1, n_epochs=500, n_components=4)
#umap = pickle.load(open('../model/umap_reducer.pkl', 'rb'))

# NOTE(security): unpickling executes arbitrary code, so only load artefacts
# from a bucket you control.
# The context manager closes the S3 file handle, which was previously left open.
with fs.open('s3://insider-data-bucket/umap_reducer.pkl', 'rb') as umap_file:
    umap = pickle.load(umap_file)

# Project the leaf-index table with the loaded reducer.
df_umap_rf = umap.transform(df_leaf)
df_embed_tree = pd.DataFrame(df_umap_rf)
#plt.figure(figsize=(12, 6))
#
#sns.scatterplot(x=df_umap_rf[:, 0], y=df_umap_rf[:, 1], size=0.1)


![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 8.0. Hyperparameter Fine-Tuning

In [50]:
#X = df_fselect_sc.copy().reset_index(drop=True).drop(columns=['customer_id'])
# Using the data of the embedding space build with umap and a tree model

#X = df_embed_sc.copy()

# Cluster on a copy of the tree-based embedding so the embedding frame stays untouched.
X_tree = df_embed_tree.copy(deep=True)

#X_umap = df_umap_sc.copy()
#
#X_tsne = df_tsne_sc.copy()

# Results

In [51]:
#clusters = np.arange(2, 30, 1)
#hierarch_perf = gu.hierarchical_performance(X=X_tree, clusters=clusters, plot=False, metric='euclidean')
#gmm_perf = gu.gmm_performance(X=X_tree, components=clusters, plot=False, metric='euclidean', covariance_type='full')
#kmeans_perf = gu.kmeans_performance(X=X_tree, clusters=clusters, plot=False, metric='euclidean')
#
#results = pd.concat([kmeans_perf, hierarch_perf, gmm_perf])
#results.columns = clusters
#
#results.style.highlight_max(color='lightgreen', axis=1)

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 8.0. Model Training

## 8.1. K-Means Clustering

In [52]:
# Final model: K-Means with a fixed seed, fitted on the tree-based embedding.
n_cluster = 10
kmeans = KMeans(
    n_clusters=n_cluster,
    init='random',
    n_init=10,
    max_iter=300,
    random_state=42,
)

# Fit and assign every customer to a cluster in one step.
labels = kmeans.fit_predict(X_tree)

# Silhouette score of the assignment, measured in the embedding space.
sil = mt.silhouette_score(X_tree, labels, metric='euclidean')

print(f'Silhouette Score: {sil}')

Silhouette Score: 0.6161004900932312


![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 9.0. Cluster Analysis

## 9.1. Visualization

In [53]:
# Selected features plus the cluster label of every customer (assign returns a new frame).
cluster_df = df_fselect.assign(cluster=labels)
#cluster_df_sc = df_fselect_sc.copy()
#cluster_df_sc['cluster'] = labels

## 9.2. Clusters Profile

In [54]:
# Profile of every cluster, computed in a single grouped aggregation. The
# previous version recomputed groupby().mean() over all columns once per
# metric, which also breaks as soon as a non-numeric column is present.
prof = (
    cluster_df
    .groupby('cluster')
    .agg(
        number_of_customers=('customer_id', 'count'),
        avg_recency=('recency', 'mean'),
        avg_frequency=('frequency', 'mean'),
        avg_spent=('total_spent', 'mean'),
        number_of_cancellations=('number_of_cancellations', 'mean'),
        avg_quantity=('total_quantity', 'mean'),
    )
    .reset_index()
)

# Share of all customers per cluster, in percent; inserted right after the
# customer count to keep the original column order.
prof.insert(2, 'percentual', ((prof['number_of_customers'] / cluster_df.shape[0]) * 100).round(2))


prof

Unnamed: 0,cluster,number_of_customers,percentual,avg_recency,avg_frequency,avg_spent,number_of_cancellations,avg_quantity
0,0,487,8.57,30.172485,0.052795,10708.476448,162.74538,6309.527721
1,1,189,3.33,178.597884,0.018227,8.208095,0.0,1.550265
2,2,782,13.77,183.681586,0.013896,274.779156,0.930946,91.512788
3,3,851,14.98,125.80141,0.018373,587.172456,6.065805,258.13866
4,4,482,8.49,56.149378,0.02984,2771.582656,16.775934,1523.890041
5,5,364,6.41,92.151099,0.013011,1484.395275,11.997253,755.623626
6,6,624,10.99,192.171474,0.01626,89.819247,0.225962,24.331731
7,7,401,7.06,47.673317,0.040373,1903.204065,13.690773,962.154613
8,8,929,16.36,101.769645,0.019284,1050.478967,5.080732,470.316469
9,9,571,10.05,123.022767,0.021182,350.855289,0.873905,164.488616


![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)


# 11.0. Deploy to Production

In [61]:
# Stamp every row with the (local, naive) time of this run as text.
cluster_df['last_training_time'] = f'{dt.datetime.now():%Y-%m-%d %H:%M:%S}'

In [76]:
# Database on AWS: the connection settings are read from the local .env file
variables_env = dotenv.dotenv_values('../vars/.env')

host = variables_env['HOST']
port = variables_env['PORT']
database =variables_env['DATABASE']
user=variables_env['USER']
pwd=variables_env['PASSWORD']

# NOTE(review): `database` is read above but never used; the URL below
# hard-codes "/postgres". Confirm which database name is intended.
# NOTE(review): user and password are interpolated as-is; presumably values
# containing URL-special characters would need escaping — confirm.
endpoint = f'postgresql+psycopg2://{user}:{pwd}@{host}:{port}/postgres'

engine = create_engine(endpoint)
con = engine.connect()

In [77]:
# Empty the insiders table before it is reloaded.
# NOTE(review): the variable name says "create" although the statement is a
# TRUNCATE, and it runs before the CREATE TABLE IF NOT EXISTS statement, so it
# presumably fails on a database where the table does not exist yet — confirm.
query_create_table_insiders = 'TRUNCATE TABLE insiders'
con.execute(query_create_table_insiders)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x16287ee2400>

In [78]:
# Table definition: the selected customer features, the cluster label and
# the training timestamp stored as text.
query_create_table_insiders = """
    CREATE TABLE IF NOT EXISTS insiders(
        customer_id                 INTEGER,
        recency                     INTEGER,
        frequency                   REAL,
        total_spent                 REAL,
        total_quantity              INTEGER,
        number_of_cancellations     INTEGER,
        cluster                     INTEGER,
        last_training_time          TEXT
    )"""

# Create the table only if it does not exist yet

con.execute(query_create_table_insiders)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x162884eb2b0>

In [79]:
# Insert data: append the clustered customers to the insiders table
cluster_df.to_sql('insiders', con=con, if_exists='append', index=False)

680

In [80]:
# Read back everything currently stored in the insiders table
test = pd.read_sql('SELECT * FROM insiders', con=con)

# Stack the stored rows on top of the freshly clustered ones and drop rows
# that are identical in every column except last_training_time, keeping the
# later (fresh) occurrence.
# NOTE(review): the same cluster_df rows were appended to the table just
# before this read; presumably any value that changes on the round trip
# through the REAL columns would not be recognised as a duplicate — confirm.
combined = pd.concat([test, cluster_df], axis=0)
cols = combined.drop(columns='last_training_time').columns.to_list()
combined = combined.drop_duplicates(subset=cols, keep='last')

# if_exists='replace' drops the existing table and recreates it from this frame
combined.to_sql('insiders', con=con, if_exists='replace', index=False)

# NOTE(review): the connection is only closed if every statement above succeeds
con.close()