In [1]:
import pandas as pd
import numpy as np

from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
import warnings
# Silence every warning for the whole notebook session.
# NOTE(review): this also hides deprecation and convergence warnings raised
# while the models are fitted; narrow the filter if those matter.
warnings.filterwarnings("ignore")
# Only when no -W option was given on the command line (sys.warnoptions is empty):
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affects subprocesses
 
import shap
from dataset import engine, RFM, Churn, Engagement, RFM_engage, RFM_churn
from models import CLFSwitcher, Transform, Pipe, parameters
from sklearn.model_selection import GridSearchCV

In [2]:
# RFM dataset object built from the project `engine` (presumably a database
# engine - TODO confirm in the dataset module). It is combined with both
# `engage` and `churn` further down in this cell.
rfm = RFM(engine)

# Engagement dataset object; passed to RFM_engage together with `rfm`.
engage = Engagement(engine)

# Churn dataset object; passed to RFM_churn together with `rfm`.
churn = Churn(engine)


def train(data, cv=5, n_jobs=12):
    """Grid-search the project pipeline on ``data`` and return the best final step.

    The features and target are taken from ``Transform(data).get_Xy()``; the
    pipeline comes from ``Pipe(ct).get_pipeline()`` and is searched over the
    module-level ``parameters`` grid imported from ``models``.

    Parameters
    ----------
    data
        Dataset object accepted by ``Transform`` (e.g. the result of
        ``RFM_engage(...)`` or ``RFM_churn(...)``).
    cv : int, default 5
        Number of cross-validation folds passed to ``GridSearchCV``.
    n_jobs : int, default 12
        Number of parallel jobs passed to ``GridSearchCV``. Lower this (or use
        ``-1``) on machines with a different core count.

    Returns
    -------
    The last step (``[-1]``) of ``GridSearchCV.best_estimator_``, after its
    ``explain(data, ct)`` method has been called on it.
    """
    ct = Transform(data)
    X, y = ct.get_Xy()
    pipeline = Pipe(ct).get_pipeline()

    grid_search = GridSearchCV(
        pipeline,
        parameters,
        cv=cv,
        n_jobs=n_jobs,
        return_train_score=True,
        verbose=1,
    )
    grid_search.fit(X, y)

    # best_estimator_ is the refitted pipeline; indexing with [-1] picks its
    # final step (presumably the classifier wrapper that exposes explain()).
    best_est = grid_search.best_estimator_[-1]
    best_est.explain(data, ct)

    return best_est

# Customer segments to model; each one gets an engagement model and a churn model.
customer_lst = ["Hibernating", "At Risk", "Loyal Customers", "New Customers"]
# For a quick single-segment run use: customer_lst = ['Loyal Customers']

# Maps "engage <segment>" / "churn <segment>" to the estimator returned by train().
explained_dct = {}
for customer in customer_lst:
    engage_explain = train(RFM_engage(rfm, engage, customer))
    churn_explain = train(RFM_churn(rfm, churn, customer))
    # Store both results only after both trainings have finished.
    explained_dct.update(
        {
            f"engage {customer}": engage_explain,
            f"churn {customer}": churn_explain,
        }
    )

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [28]:
# Preview the engagement feature matrix. No customer segment is passed here,
# unlike in the training loop - presumably RFM_engage then uses a default
# segment; TODO confirm in the dataset module.
RFM_engage(rfm, engage).get_X()

Unnamed: 0,campaign_month,target_segment,budget,channel,goal,engage_month,feedback_score
10,9,Families,158.0,jkl,consideration,6,5.0
17,12,Young Adults,299.0,social,consideration,12,5.0
20,7,Families,387.0,social,conversion,11,1.0
24,8,Families,159.0,social,retention,4,3.0
28,9,Families,192.0,influencer,retention,6,2.0
...,...,...,...,...,...,...,...
7749,8,Families,51.0,influencer,retention,7,2.0
7751,1,Families,343.0,influencer,retention,4,4.0
7757,3,Young Adults,98.0,influencer,conversion,6,1.0
7780,1,Retirees,267.0,influencer,awareness,4,1.0


In [12]:
# Show the available estimators: one "engage <segment>" and one
# "churn <segment>" key per entry of customer_lst.
explained_dct.keys()

dict_keys(['engage Hibernating', 'churn Hibernating', 'engage At Risk', 'churn At Risk', 'engage Loyal Customers', 'churn Loyal Customers', 'engage New Customers', 'churn New Customers'])

In [79]:
def get_relation(
    explained_dct,
    est="engage New Customers",
    X_col="goal",
    y_col="action_type",
    y_val="converted",
    scale=10000,
    color="hsl(229, 70%, 50%)",
):
    """Summarise the mean scaled SHAP value per category of ``X_col``.

    Calls ``explained_dct[est].get_shap(X_col, y_col, y_val)`` and expects the
    returned frame to contain a ``"shap"`` column and a ``"<X_col>__<y_val>"``
    column. The two columns are looked up by name, so their position in the
    frame does not matter.

    Parameters
    ----------
    explained_dct : dict
        Mapping from estimator label to an object exposing ``get_shap``.
    est : str
        Key of the estimator to query in ``explained_dct``.
    X_col, y_col, y_val : str
        Forwarded unchanged to ``get_shap``; ``X_col`` and ``y_val`` also form
        the name of the grouping column, ``"<X_col>__<y_val>"``.
    scale : number, default 10000
        Factor applied to every SHAP value before averaging.
    color : str, default "hsl(229, 70%, 50%)"
        Value stored under ``"meanColor"`` in every record.

    Returns
    -------
    list of dict
        One record per distinct grouping value, ordered by that value, with
        the keys ``X_col``, ``"mean"`` and ``"meanColor"``. ``"mean"`` is the
        scaled mean converted with ``astype(int)``, i.e. truncated toward
        zero rather than rounded.

    Raises
    ------
    KeyError
        If ``est`` is not in ``explained_dct`` or a required column is missing.
    """
    group_col = "{}__{}".format(X_col, y_val)
    shap_frame = explained_dct[est].get_shap(X_col, y_col, y_val)

    # Work on a two-column copy so the frame returned by get_shap stays untouched.
    scaled = shap_frame[[group_col, "shap"]].assign(
        shap=lambda frame: frame["shap"] * scale
    )

    # groupby sorts by key, so the records come out ordered by category.
    summary = (
        scaled.groupby(group_col)["shap"]
        .mean()
        .rename("mean")
        .reset_index()
        .rename(columns={group_col: X_col})
    )
    summary["mean"] = summary["mean"].astype(int)
    summary["meanColor"] = color

    return summary.to_dict(orient="records")



# Compute the per-goal mean SHAP records for the default estimator
# ("engage New Customers") and display them as the cell output.
data = get_relation(explained_dct)
# data = bar_plot(data)  # disabled; bar_plot is not defined in the visible cells
data

[{'goal': 'awareness', 'mean': -1, 'meanColor': 'hsl(229, 70%, 50%)'},
 {'goal': 'consideration', 'mean': -2, 'meanColor': 'hsl(229, 70%, 50%)'},
 {'goal': 'conversion', 'mean': 7, 'meanColor': 'hsl(229, 70%, 50%)'},
 {'goal': 'retention', 'mean': -2, 'meanColor': 'hsl(229, 70%, 50%)'}]