In [21]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import shap

## Data

In [10]:
# Load the per-visitor table; it already carries a `cluster` label column (see df.head() output).
df = pd.read_csv('ga_customers_clustered.csv')

In [11]:
# Preview the first rows to check the columns and the cluster label.
df.head()

Unnamed: 0,fullVisitorId,channelGrouping,weekend_prop,hour,sessionId,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,totals.hits,totals.pageviews,bounce_prop,trafficSource.medium,cluster
0,213131000000000.0,Direct,0.0,22.0,1,Chrome,desktop,0.0,Macintosh,14.0,13.0,0.0,(none),8
1,435324000000000.0,Referral,0.666667,21.0,3,Chrome,desktop,0.0,Macintosh,14.0,11.0,0.0,referral,7
2,562678000000000.0,Organic Search,0.0,14.0,2,Chrome,desktop,0.0,Macintosh,12.5,10.5,0.0,organic,8
3,585709000000000.0,Referral,0.0,20.0,1,Chrome,desktop,0.0,Linux,22.0,20.0,0.0,referral,8
4,670722000000000.0,Referral,0.0,17.0,2,Chrome,desktop,0.0,Linux,9.5,9.5,0.0,referral,8


## Profile by Cluster

In [27]:
def _total_abs_shap(shap_values, n_features):
    """Sum |SHAP| over all samples and all classes, giving one score per feature.

    Accepts both layouts produced by ``TreeExplainer.shap_values``:
    a list with one ``(n_samples, n_features)`` array per class, or a single
    array that is either ``(n_samples, n_features)`` or
    ``(n_samples, n_features, n_classes)``.
    """
    if isinstance(shap_values, list):
        # One matrix per class: add the per-feature totals of every class.
        return np.sum([np.abs(np.asarray(per_class)).sum(axis=0) for per_class in shap_values], axis=0)

    values = np.abs(np.asarray(shap_values))
    if values.ndim == 2:
        return values.sum(axis=0)
    if values.ndim == 3 and values.shape[1] == n_features:
        # Collapse the sample axis and the class axis, keep the feature axis.
        return values.sum(axis=(0, 2))
    raise ValueError(f'Unexpected SHAP values shape: {values.shape}')


def profile_clusters(df_profile, topn=7, min_f1=0.5):
    """Profile clusters through the features that best separate them.

    A LightGBM classifier is trained to predict the ``cluster`` column from all
    other columns. If the clusters are separable enough, SHAP values are used to
    rank the features, and the top ones are summarised per cluster.

    Parameters
    ----------
    df_profile : pandas.DataFrame
        Model-ready features plus a ``cluster`` label column. The frame is
        NOT modified (scaling for the plot is done on a separate copy).
    topn : int, default 7
        Maximum number of top-ranked features to include in the profile/plot.
    min_f1 : float, default 0.5
        The mean 5-fold weighted F1 score must exceed this value for the
        profiling to proceed.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        Grouped bar chart of the per-cluster mean of each top feature, computed
        on min-max scaled values so that features share one axis.
    profile : pandas.DataFrame
        Long table with columns ``cluster``, ``feature``, ``mean_value``
        (means of the unscaled values), ordered by cluster, then by importance.
    important_features : list
        All feature names in decreasing order of total absolute SHAP value.

    Raises
    ------
    ValueError
        If the mean cross-validated F1 score does not exceed ``min_f1``.
    """
    #-------------------------------Classification Model-------------------------------
    X = df_profile.drop(columns='cluster')
    y = df_profile['cluster']

    clf = LGBMClassifier(class_weight='balanced', colsample_bytree=0.6)
    scores = cross_val_score(clf, X, y, scoring='f1_weighted', cv=5)
    mean_f1 = scores.mean()
    print(f'F1 score is {mean_f1}')

    # Model quality check (written as "not >" so a NaN score is rejected too)
    if not mean_f1 > min_f1:
        raise ValueError("Clusters are not distinguishable. Can't profile. ")
    clf.fit(X, y)

    #-----------------------------SHAP Importance--------------------------------------
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X)

    # Aggregate over classes by position rather than by cluster label, so the
    # result does not depend on labels being exactly 0..K-1.
    total_importance = _total_abs_shap(shap_values, X.shape[1])

    # sorted() is stable, so ties keep the original column order.
    ranked = sorted(zip(X.columns, total_importance), key=lambda item: item[1], reverse=True)
    important_features = [name for name, _ in ranked]
    n_important_features = important_features[:topn]

    #----------------------------Output prep--------------------------------------------
    # DATAFRAME OUTPUT - one row per (cluster, top feature) with the raw mean.
    # groupby sorts the cluster labels, whatever values they take.
    cluster_means = df_profile.groupby('cluster')[n_important_features].mean()
    profile = pd.DataFrame(
        [(k, f, cluster_means.at[k, f]) for k in cluster_means.index for f in n_important_features],
        columns=['cluster', 'feature', 'mean_value'],
    )

    # PLOT OUTPUT
    # Scale only the plotted features, into a new frame, leaving the input untouched.
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(df_profile[n_important_features]),
        columns=n_important_features,
        index=df_profile.index,
    )
    scaled_means = scaled.groupby(df_profile['cluster'].to_numpy()).mean()

    cluster_names = [f'Cluster {k}' for k in scaled_means.index]  # "Cluster 0", "Cluster 1", ...
    data = [go.Bar(name=f, x=cluster_names, y=scaled_means[f]) for f in n_important_features]
    fig = go.Figure(data=data)
    fig.update_layout(
        barmode='group',
        xaxis_title='Cluster',
        yaxis_title='Mean of min-max scaled feature',
    )

    return fig, profile, important_features

In [28]:
# Columns holding string categories; they are one-hot encoded before modelling.
categorical = [
    'channelGrouping',
    'device.browser',
    'device.deviceCategory',
    'device.operatingSystem',
    'trafficSource.medium',
]

# The visitor id is an identifier, not a feature, so it is dropped.
df_profile = df.drop(columns='fullVisitorId')

# One-hot encode only when there is something to encode.
if len(categorical) > 0:
    df_profile = pd.get_dummies(df_profile, columns=categorical)

fig, profile, important_features = profile_clusters(df_profile)

F1 score is 0.9781903662766842



Setting feature_perturbation = "tree_path_dependent" because no background data was given.



In [29]:
# Render the grouped bar chart returned by profile_clusters.
fig.show()

In [18]:
# Spot-check a few rows; the fixed seed shows the same rows on every re-run,
# and the size is capped so a short table does not raise.
profile.sample(min(5, len(profile)), random_state=0)

Unnamed: 0,cluster,feature,mean_value
42,6,bounce_prop,0.300549
50,7,totals.pageviews,23.170485
29,4,totals.pageviews,21.571303
57,8,totals.pageviews,15.574065
31,4,sessionId,1.661972


In [20]:
# Full feature ranking returned by profile_clusters (most important first).
print('All the features in decreasing importance order\n\n', important_features)

All the features in decreasing importance order

 ['bounce_prop', 'totals.pageviews', 'totals.hits', 'sessionId', 'weekend_prop', 'hour', 'device.isMobile', 'device.deviceCategory_desktop', 'device.deviceCategory_mobile', 'channelGrouping_Referral', 'trafficSource.medium_referral', 'device.operatingSystem_Macintosh', 'trafficSource.medium_organic', 'channelGrouping_Organic Search', 'device.operatingSystem_Windows', 'device.operatingSystem_Linux', 'device.operatingSystem_iOS', 'device.operatingSystem_Chrome OS', 'channelGrouping_Direct', 'device.operatingSystem_Android', 'device.browser_Chrome', 'device.deviceCategory_tablet', 'device.browser_Safari', 'trafficSource.medium_(none)', 'trafficSource.medium_cpc', 'channelGrouping_Paid Search', 'device.browser_Firefox', 'channelGrouping_Display', 'device.browser_Internet Explorer', 'trafficSource.medium_cpm', 'device.browser_Edge', 'channelGrouping_Social', 'channelGrouping_(Other)', 'channelGrouping_Affiliates', 'device.browser_Amazon Silk'

## Profile by Feature

In [30]:
def profile_feature(df_profile, feature):
    """Plot how one feature is distributed across the clusters.

    A feature with more than two distinct values gets one box plot per
    cluster; otherwise (binary feature) a bar chart of the per-cluster mean
    is drawn. Returns the resulting plotly Figure.
    """
    cluster_ids = np.unique(df_profile.cluster)

    if df_profile[feature].nunique() <= 2:
        # Binary feature: the mean per cluster is the share of ones.
        labels = []
        means = []
        for cluster_id in cluster_ids:
            in_cluster = df_profile.cluster == cluster_id
            labels.append(f'Cluster {cluster_id}')
            means.append(df_profile.loc[in_cluster, feature].mean())
        return go.Figure([go.Bar(x=labels, y=means)])

    # Non-binary feature: one box per cluster.
    traces = []
    for cluster_id in cluster_ids:
        in_cluster = df_profile.cluster == cluster_id
        traces.append(
            go.Box(y=df_profile.loc[in_cluster, feature].values, name=f'Cluster {cluster_id}')
        )
    return go.Figure(data=traces)

In [32]:
# Inspect a single feature across clusters; 'bounce_prop' is the top-ranked one in the list above.
feature = 'bounce_prop'
profile_feature(df_profile, feature)