In [None]:
# !gdown 1hqHARw2rrB_Y9gaz_wLuaSSd6DhoWqC0
# !gdown 1WAhgUE0ctA6C477rbrfiCs3nH1Xh3uHT

Downloading...
From: https://drive.google.com/uc?id=1hqHARw2rrB_Y9gaz_wLuaSSd6DhoWqC0
To: /content/mockup_ecomm_cust_profile-202303.csv
100% 577k/577k [00:00<00:00, 83.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WAhgUE0ctA6C477rbrfiCs3nH1Xh3uHT
To: /content/mockup_ecomm_cust_profile.csv
100% 369k/369k [00:00<00:00, 130MB/s]


# Dependency libraries

In [None]:
import pandas as pd;
import numpy as np;
import json;
import re;
import joblib;

from sklearn.cluster import KMeans;
from sklearn.metrics import silhouette_score;
from sklearn.decomposition import PCA;
from sklearn.ensemble import RandomForestClassifier;
from sklearn.preprocessing import MinMaxScaler;

# Customer Classification

In [None]:
class CustomerClassification:
    """K-Means customer segmentation: prepare features, explore k, train, label.

    The profile frames handled here are expected to carry the columns the
    code reads: ``id``, ``reg_date``, ``latest_purchase``, ``recent_flag``,
    ``pkt_sz``, ``gap`` and ``category_p`` (a JSON string holding a
    ``category_prob`` mapping). Every remaining column is fed to the
    min-max scaler, so it has to be numeric.

    All setters and pipeline steps return ``self`` so calls can be chained.

    Randomness is controlled through ``np.random.seed`` (global NumPy state),
    which the scikit-learn estimators fall back to because they are created
    without an explicit ``random_state``.
    """

    # Default seed passed to ``np.random.seed`` before each stochastic step.
    RAND_SEED = 3141592857

    # Raw customer profile registered through ``set_profile``.
    customer_profile: pd.DataFrame = None
    # Normalised feature table produced by ``prepare_trainset``.
    trainset: pd.DataFrame = None

    # Estimator created by ``train_model`` or loaded by ``import_model``.
    model: "KMeans" = None
    # Number of clusters used by ``train_model``.
    model_k: int = 4

    # Human readable names used when building segment labels.
    _FEATURE_LABELS = {
        'recent_flag__active': 'STATUS-ACTIVE',
        'recent_flag__inactive': 'STATUS-INACTIVE',
        'recent_flag__churn': 'STATUS-CHURN',
        'patron_length': 'PATRON-LEN',
        'pkt_sz': 'POCKET-SIZE',
        'dc_amnt': 'DISCOUNT',
        'gap': 'GAP-BTW-PURCHASE',
        'p_purchase': 'PURCHASE-PROB',
        'p_purchase_dc': 'DISCOUNT-PROB',
    }
    # Labels rendered as a percentage instead of a low/mid/high band.
    _STATUS_LABELS = ('STATUS-ACTIVE', 'STATUS-INACTIVE', 'STATUS-CHURN')
    # Fixed presentation order of the non-category features inside a label.
    _FEATURE_ORDER = (
        'recent_flag__active', 'recent_flag__inactive', 'recent_flag__churn',
        'patron_length', 'pkt_sz', 'dc_amnt', 'gap', 'p_purchase', 'p_purchase_dc',
    )

    # ------------------------------------------------------------------
    # Configuration and accessors
    # ------------------------------------------------------------------
    def set_profile(self, df: pd.DataFrame):
        """Register the raw customer profile used by ``prepare_trainset``."""
        self.customer_profile = df
        return self

    def set_model_k(self, k: int):
        """Set the number of clusters used by ``train_model``."""
        self.model_k = k
        return self

    def get_profile(self) -> pd.DataFrame:
        """Return an independent copy of the registered customer profile."""
        return self.customer_profile.copy()

    def get_trainset(self) -> pd.DataFrame:
        """Return an independent copy of the prepared training set."""
        return self.trainset.copy()

    # ------------------------------------------------------------------
    # Shared feature-engineering helpers (none of them mutates its input)
    # ------------------------------------------------------------------
    @staticmethod
    def _inside_iqr_fence(values: pd.Series) -> pd.Series:
        """Boolean mask of the values lying within 1.5 * IQR of the quartiles."""
        q1 = values.quantile(.25)
        q3 = values.quantile(.75)
        iqr = q3 - q1
        # Written as "not outside" so missing values stay flagged as inside,
        # exactly like a value-by-value comparison would treat them.
        return ~((values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr))

    @staticmethod
    def _encode_recent_flag(df: pd.DataFrame) -> pd.DataFrame:
        """Replace ``recent_flag`` by ``recent_flag__<value>`` indicator columns.

        The ``unknown`` level is dropped, so such rows end up with all
        indicators set to 0.
        """
        if df.empty:
            return df.copy()
        indexed = df.set_index('id')
        dummies = indexed['recent_flag'].str.get_dummies().add_prefix('recent_flag__')
        dummies = dummies.drop(columns=['recent_flag__unknown'], errors='ignore')
        return indexed.join(dummies).drop(columns=['recent_flag']).reset_index(names='id')

    @staticmethod
    def _explode_category_p(df: pd.DataFrame) -> pd.DataFrame:
        """Expand the JSON ``category_p`` column into one column per category.

        Each non-null cell is parsed and its ``category_prob`` mapping becomes
        a row of the expansion; the new columns are appended in sorted order.
        Every missing value left in the frame afterwards is filled with 0.
        """
        if df.empty:
            return df.copy()
        indexed = df.set_index('id')
        raw = indexed.loc[indexed['category_p'].notna(), 'category_p']
        records = [json.loads(item)['category_prob'] for item in raw]
        probs = pd.DataFrame(records, index=raw.index)
        probs = probs.reindex(sorted(probs.columns), axis='columns')
        return ( indexed.join(probs).drop(columns=['category_p'])
                .fillna(0).reset_index(names=['id']) )

    @staticmethod
    def _min_max_normalize(df: pd.DataFrame) -> pd.DataFrame:
        """Min-max scale every column except ``id``.

        A fresh ``MinMaxScaler`` is fitted on the frame it receives, so the
        scale is always relative to that frame only.
        """
        if df.empty:
            return df.copy()
        indexed = df.set_index('id')
        scaled = pd.DataFrame(MinMaxScaler().fit_transform(indexed),
                index=indexed.index, columns=indexed.columns)
        return scaled.reset_index(names=['id'])

    @classmethod
    def _describe_segment(cls, clustered: pd.DataFrame, k: int) -> pd.DataFrame:
        """Summary statistics of the dominant features of cluster ``k``.

        Features are ranked by median then mean, those with a median of at
        most 0.05 are discarded and the top six are kept. The result is
        ordered with the known features first (fixed order) followed by the
        ``CAT<n>`` columns.
        """
        desc = ( clustered[clustered['cluster'] == k]
                # ``id`` is an identifier, not a feature: left in, a numeric id
                # would take one of the six slots because it is not normalised.
                .drop(columns=['cluster', 'id'], errors='ignore')
                .describe().transpose()
                .sort_values(['50%', 'mean'], ascending=False) )
        desc = desc[desc['50%'] > 0.05].head(6)

        top = desc.index.to_list()
        ordered = [name for name in cls._FEATURE_ORDER if name in top]
        ordered += re.findall(r'\bCAT[0-9]*\b', ' '.join(top))
        return desc.reindex(ordered)

    @staticmethod
    def _band(value: float) -> str:
        """Map a normalised value to a low / mid / high band letter."""
        if value < .33:
            return 'L'
        if value > .67:
            return 'H'
        return 'M'

    @classmethod
    def _segment_label(cls, desc: pd.DataFrame) -> str:
        """Build the ``' | '``-separated label of a segment from its summary.

        Status features are shown as the mean in percent; every other feature
        is shown as two band letters, for its 25% and 75% quantiles.
        """
        parts = []
        for feature, stats in desc.iterrows():
            name = cls._FEATURE_LABELS.get(feature, feature)
            if name in cls._STATUS_LABELS:
                # Round after scaling to avoid output like 56.699999999999996%.
                percent = round(float(stats['mean']) * 100, 1)
                parts.append('{}:{}%'.format(name, percent))
            else:
                parts.append('{}:{}{}'.format(
                        name, cls._band(stats['25%']), cls._band(stats['75%'])))
        return ' | '.join(parts)

    # ------------------------------------------------------------------
    # Training-set preparation
    # ------------------------------------------------------------------
    def prepare_trainset(self):
        """Build ``self.trainset`` from the registered customer profile.

        Steps: drop the date columns, drop rows whose ``recent_flag`` is
        ``unknown``, remove rows where ``pkt_sz`` or ``gap`` is an IQR
        outlier, one-hot encode ``recent_flag``, expand ``category_p`` and
        min-max normalise the result.
        """
        prep = self.customer_profile.drop(columns=['reg_date', 'latest_purchase'])
        prep = prep[prep['recent_flag'] != 'unknown']

        # Both fences are computed before any outlier is removed.
        inliers = ( self._inside_iqr_fence(prep['pkt_sz'])
                & self._inside_iqr_fence(prep['gap']) )
        prep = prep[inliers]

        prep = self._encode_recent_flag(prep)
        prep = self._explode_category_p(prep)
        prep = self._min_max_normalize(prep)

        self.trainset = prep
        return self

    # ------------------------------------------------------------------
    # Model exploration
    # ------------------------------------------------------------------
    def explore_models(self, normalized: pd.DataFrame, rng: range = range(2, 8), rand_seed: int = None):
        """Fit one K-Means model per ``k`` in ``rng`` and collect its criteria.

        Returns a frame with the columns ``k``, ``model`` (the fitted
        estimator), ``wcss`` (its inertia) and ``silhouette_avg``. Both
        criteria describe the same seeded fit. ``normalized`` must hold an
        ``id`` column; it is left untouched.
        """
        if rand_seed is None:
            rand_seed = self.RAND_SEED
        features = normalized.set_index('id')

        def fit_model(k: int):
            # Re-seed before every fit so each k starts from the same state.
            np.random.seed(rand_seed)
            return KMeans(n_init='auto', n_clusters=k).fit(features)

        ks = list(rng)
        models = [fit_model(k) for k in ks]
        return pd.DataFrame({
            'k': ks,
            'model': models,
            'wcss': [model.inertia_ for model in models],
            # ``labels_`` belongs to the fit that produced ``inertia_``;
            # re-fitting here would score a different, unseeded clustering.
            'silhouette_avg': [silhouette_score(features, model.labels_) for model in models],
        })

    def explore_model__wcc_silhouette(self, k_metrix: pd.DataFrame):
        """Plot WCSS and average silhouette against ``k`` on twin y-axes."""
        from matplotlib import pyplot as plt

        fig, ax_wcss = plt.subplots()
        ax_wcss.set_xlabel('k')
        ax_wcss.set_ylabel('wcss', color='tab:blue')
        ax_wcss.plot(k_metrix['k'], k_metrix['wcss'], color='tab:blue')
        ax_wcss.tick_params(axis='y', labelcolor='tab:blue')

        ax_silhouette = ax_wcss.twinx()
        ax_silhouette.set_ylabel('silhouette', color='tab:brown')
        ax_silhouette.plot(k_metrix['k'], k_metrix['silhouette_avg'], color='tab:brown')
        ax_silhouette.tick_params(axis='y', labelcolor='tab:brown')

        # Title first so the layout pass can make room for it.
        plt.title('k-means: criteria')
        fig.tight_layout()
        plt.show()

    def explore_model__cluster_visual(self, df: pd.DataFrame, k_metrix: pd.DataFrame, rand_seed: int = None):
        """Scatter the 2-D PCA projection of ``df`` coloured by cluster, per k.

        One panel is drawn per row of ``k_metrix`` on a grid four panels
        wide. Neither ``df`` nor ``k_metrix`` is modified, but each model in
        ``k_metrix`` is re-fitted on ``df`` (seeded with ``rand_seed``).
        """
        from matplotlib import pyplot as plt

        if rand_seed is None:
            rand_seed = self.RAND_SEED

        features = df.set_index('id')
        pca = PCA(n_components=2)
        pca_xy = pd.DataFrame(pca.fit(features).transform(features), columns=['x', 'y'])

        n_cols = 4
        n_rows = max(1, -(-len(k_metrix) // n_cols))  # ceiling division
        # squeeze=False keeps ``axs`` two-dimensional even for a single row.
        fig, axs = plt.subplots(n_rows, n_cols, figsize=(12.8, n_rows * 2.4), squeeze=False)

        for position, (k, model) in enumerate(zip(k_metrix['k'], k_metrix['model'])):
            np.random.seed(rand_seed)
            clusters = model.fit_predict(features)
            ax = axs[position // n_cols, position % n_cols]
            ax.scatter(pca_xy['x'], pca_xy['y'], c=clusters, alpha=0.5)
            ax.set_title('Cluster @ k=' + str(k))
        fig.tight_layout()

    def explore_model__important_feature(self, clustered: pd.DataFrame, rand_seed: int = None) -> pd.DataFrame:
        """Rank features by how well they separate the clusters.

        A random forest is trained to predict the ``cluster`` column from all
        other columns (``id`` excluded) and its feature importances are
        returned, most important first, as columns ``feature`` and
        ``importance``. ``clustered`` is left untouched.
        """
        if rand_seed is None:
            rand_seed = self.RAND_SEED
        np.random.seed(rand_seed)

        indexed = clustered.set_index('id')
        features = indexed.drop(columns=['cluster'])
        forest = RandomForestClassifier()
        forest.fit(features, indexed['cluster'])

        importance = forest.feature_importances_
        order = importance.argsort()
        return pd.DataFrame({
            # Names are taken from the fitted matrix so they always line up
            # with the importances, wherever ``cluster`` sits in the input.
            'feature': features.columns[order],
            'importance': importance[order],
        }).sort_values('importance', ascending=False, ignore_index=True)

    # ------------------------------------------------------------------
    # Training, persistence and scoring
    # ------------------------------------------------------------------
    def train_model(self, rand_seed: int = None):
        """Fit a K-Means model with ``model_k`` clusters on the training set."""
        if rand_seed is None:
            rand_seed = self.RAND_SEED
        features = self.trainset.drop(columns='id')

        np.random.seed(rand_seed)
        self.model = KMeans(n_init='auto', n_clusters=self.model_k).fit(features)
        return self

    def export_model(self, filename: str):
        """Persist the current model to ``filename`` with joblib."""
        joblib.dump(self.model, filename)
        return self

    def import_model(self, filename: str):
        """Load a model previously written by ``export_model``.

        joblib files are pickle based: only load files from a trusted source.
        """
        self.model = joblib.load(filename)
        return self

    def execute_model(self, dataset: pd.DataFrame):
        """Cluster ``dataset`` and attach a descriptive segment to every row.

        Returns a new frame: ``dataset`` with ``category_p`` expanded into
        category columns (missing values filled with 0) plus ``segment_id``
        (cluster number) and ``segment`` (label built from the dominant
        features of the cluster). ``dataset`` itself is not modified.
        """
        result = self._explode_category_p(dataset)
        features = ( result.drop(columns=['reg_date', 'latest_purchase'])
                .pipe(self._encode_recent_flag)
                .pipe(self._min_max_normalize) )

        # NOTE(review): ``fit_predict`` re-fits ``self.model`` on this dataset,
        # so a trained or imported model only contributes its hyper-parameters.
        # Switching to ``predict`` needs the training scaler and column order
        # to be persisted with the model first - kept as is until then.
        np.random.seed(self.RAND_SEED)
        features['cluster'] = self.model.fit_predict(features.drop(columns=['id']))

        segment_label = {
            cluster: self._segment_label(self._describe_segment(features, cluster))
            for cluster in sorted(int(c) for c in features['cluster'].unique())
        }
        features['segment'] = features['cluster'].map(segment_label)

        by_id = features.set_index('id')
        result['segment_id'] = result['id'].map(by_id['cluster'].to_dict())
        result['segment'] = result['id'].map(by_id['segment'].to_dict())
        return result

# Example

## Sample of model development

In [None]:
# # Prepare train-dataset
# src:pd.DataFrame = pd.read_csv('mockup_ecomm_cust_profile-202303.csv');
# cc:CustomerClassification = ( CustomerClassification().set_profile(src)
#         .prepare_trainset() );
# prep = cc.get_trainset();

# # Generate model and k - criteria
# k_metrix = cc.explore_models( prep, range(2,20) );
# display( cc.explore_model__wcc_silhouette(k_metrix),
#     prep.boxplot(['patron_length', 'pkt_sz', 'dc_amnt', 'gap', 'p_purchase', 'p_purchase_dc']),
#     cc.explore_model__cluster_visual(prep,k_metrix) )

# # Define cluster characteristic - customer segment
# prep['cluster'] = k_metrix.set_index('k').at[4, 'model'].fit_predict(prep.drop(columns=['id']))
# display( cc.explore_model__important_feature(prep) )
# for x in prep['cluster'].drop_duplicates().sort_values():
#     segment_desc = ( prep[prep['cluster']==x].drop(columns=['cluster']).describe().transpose()
#             .sort_values(['50%', 'mean'], ascending=False) );
#     display('cluster: '+str(x), segment_desc)

## Sample of model testing

In [None]:
# src:pd.DataFrame = pd.read_csv('mockup_ecomm_cust_profile-202303.csv');
# tgt:pd.DataFrame = pd.read_csv('mockup_ecomm_cust_profile-202303.csv');
# cc:CustomerClassification = ( CustomerClassification().set_profile(src)
#         .prepare_trainset()
#         .set_model_k(4).train_model() );
# df = cc.execute_model(tgt)

## Sample of export, deploy, and execute

In [None]:
# src:pd.DataFrame = pd.read_csv('mockup_ecomm_cust_profile-202303.csv');
# tgt:pd.DataFrame = pd.read_csv('mockup_ecomm_cust_profile-202303.csv');
# ( CustomerClassification().set_profile(src).prepare_trainset()
#         .set_model_k(4).train_model()
#         .export_model('model.joblib') );

# df:pd.DataFrame = ( CustomerClassification().import_model('model.joblib')
#         .execute_model(tgt) )
# df.segment.drop_duplicates()