# Imports

In [19]:
# import necessary python libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from scipy.stats import pearsonr
import datetime as dt
import seaborn as sns

# Load and Manipulate Data

##### Load Monthly Volume by Cohort Data

In [20]:
# Read the monthly volume-by-cohort extract, make sure 'Month' is a datetime
# column, and order the rows chronologically within each cohort.
df = (
    pd.read_csv('Volume by Cohort by Month.csv', parse_dates=['Month', 'Customer Cohort Month'])
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .sort_values(by=['Customer Cohort Month', 'Month'])
)
df

Unnamed: 0,Customer Cohort Month,Month,Volume
0,2016-06-01,2016-06-01,55000
1,2016-06-01,2016-07-01,107000
3,2016-06-01,2016-08-01,68000
6,2016-06-01,2016-09-01,21000
9,2016-06-01,2016-10-01,33000
...,...,...,...
3042,2023-02-01,2023-03-01,161000
3111,2023-02-01,2023-04-01,21000
3043,2023-03-01,2023-03-01,50000
3112,2023-03-01,2023-04-01,13000


##### Create Percent-Change-in-Volume Variable and Remove NaNs (also drops month-over-month changes of -25% or worse so that churn does not inflate the standard deviation)

In [21]:
# Row-over-row percent change in Volume within each cohort (rows are already
# ordered by cohort and month). The first row of every cohort has no
# predecessor and yields NaN, which dropna() removes; changes of -25% or
# worse are then filtered out.
df = (
    df.assign(**{'Volume % Change': df.groupby('Customer Cohort Month')['Volume'].pct_change()})
    .dropna()
    .loc[lambda frame: frame['Volume % Change'].gt(-.25)]
)
df

Unnamed: 0,Customer Cohort Month,Month,Volume,Volume % Change
1,2016-06-01,2016-07-01,107000,0.945455
9,2016-06-01,2016-10-01,33000,0.571429
19,2016-06-01,2016-12-01,32000,0.391304
25,2016-06-01,2017-01-01,36000,0.125000
31,2016-06-01,2017-02-01,444000,11.333333
...,...,...,...,...
3039,2022-11-01,2023-03-01,363000,0.025424
2897,2022-12-01,2023-01-01,33800000,20.487603
3040,2022-12-01,2023-03-01,599000,-0.009917
3041,2023-01-01,2023-03-01,70000,3.375000


##### Calculate Mean % Volume Change and Standard Deviation in Volume Change by Cohort

In [22]:
# Sample standard deviation of the percent change, one value per cohort.
# Cohorts with a single remaining observation come out as NaN.
cohort_std_dev = df.groupby(by='Customer Cohort Month')['Volume % Change'].agg('std')
cohort_std_dev

Customer Cohort Month
2016-06-01     1.793127
2016-07-01     1.294264
2016-08-01     0.707048
2016-09-01     1.443916
2016-10-01     0.470878
                ...    
2022-10-01     0.495802
2022-11-01     0.077175
2022-12-01    14.493936
2023-01-01          NaN
2023-02-01          NaN
Name: Volume % Change, Length: 80, dtype: float64

In [23]:
# Mean percent change in volume, one value per cohort.
cohort_mean = df.groupby(by='Customer Cohort Month')['Volume % Change'].agg('mean')
cohort_mean

Customer Cohort Month
2016-06-01     0.507414
2016-07-01     0.870366
2016-08-01     0.657658
2016-09-01     0.453245
2016-10-01     0.237167
                ...    
2022-10-01     0.278482
2022-11-01     0.094326
2022-12-01    10.238843
2023-01-01     3.375000
2023-02-01     2.577778
Name: Volume % Change, Length: 80, dtype: float64

#### Join to Create Mean and stDev by Cohort Data

In [24]:
# Combine the per-cohort statistics on the shared cohort index.
# Both inputs are Series named 'Volume % Change', so pandas suffixes them:
#   'Volume % Change_x' -> standard deviation (left input)
#   'Volume % Change_y' -> mean (right input)
df0 = pd.merge(
    cohort_std_dev,
    cohort_mean,
    on='Customer Cohort Month',
)
df0

Unnamed: 0_level_0,Volume % Change_x,Volume % Change_y
Customer Cohort Month,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-06-01,1.793127,0.507414
2016-07-01,1.294264,0.870366
2016-08-01,0.707048,0.657658
2016-09-01,1.443916,0.453245
2016-10-01,0.470878,0.237167
...,...,...
2022-10-01,0.495802,0.278482
2022-11-01,0.077175,0.094326
2022-12-01,14.493936,10.238843
2023-01-01,,3.375000


##### Load Customer-Level Volume Data and Create % Change in Volume Variable

In [25]:
# Read the customer-level extract, keep only the columns this analysis uses,
# and order rows by customer and then month so consecutive rows of a customer
# are consecutive observations.
df1 = (
    pd.read_csv('Clean Customer Data.csv', parse_dates=['Month', 'Customer Cohort Month'])
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .loc[:, ['CUSTOMER_ID-1', 'Month', 'Customer Cohort Month', 'Grand Total']]
    .sort_values(by=['CUSTOMER_ID-1', 'Month', 'Customer Cohort Month'])
)
df1

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total
6970,A00000001,2016-06-01,2016-06-01,9000
6935,A00000001,2016-07-01,2016-06-01,4000
6929,A00000001,2016-08-01,2016-06-01,11000
6948,A00000001,2016-09-01,2016-06-01,8000
6912,A00000001,2016-10-01,2016-06-01,12000
...,...,...,...,...
1145,,2023-02-01,2016-06-01,38000
1146,,2023-02-01,2016-06-01,4000
1147,,2023-02-01,2016-06-01,3000
1596,,2023-03-01,2016-06-01,1000


In [26]:

# Row-over-row percent change in 'Grand Total' for each customer.
# dropna() removes every row containing a NaN in any column: each customer's
# first observation (no predecessor) as well as rows with a missing customer id.
df1 = df1.assign(
    **{'Volume % Change': df1.groupby('CUSTOMER_ID-1')['Grand Total'].pct_change()}
).dropna()
df1

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total,Volume % Change
6935,A00000001,2016-07-01,2016-06-01,4000,-0.555556
6929,A00000001,2016-08-01,2016-06-01,11000,1.750000
6948,A00000001,2016-09-01,2016-06-01,8000,-0.272727
6912,A00000001,2016-10-01,2016-06-01,12000,0.500000
6915,A00000001,2016-11-01,2016-06-01,4000,-0.666667
...,...,...,...,...,...
7137,A00002445,2022-11-01,2022-10-01,1000,0.000000
7139,A00002445,2023-03-01,2022-10-01,2000,1.000000
3549,A00002446,2023-02-01,2023-01-01,3000,-0.857143
3552,A00002446,2023-03-01,2023-01-01,48000,15.000000


##### Merge stDev and Customer Level Data

In [27]:
# Attach the cohort-level statistics to every customer-month row:
# 'Volume % Change_x' is the cohort standard deviation and
# 'Volume % Change_y' is the cohort mean.
df2 = df1.merge(df0, on='Customer Cohort Month')
df2

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total,Volume % Change,Volume % Change_x,Volume % Change_y
0,A00000001,2016-07-01,2016-06-01,4000,-0.555556,1.793127,0.507414
1,A00000001,2016-08-01,2016-06-01,11000,1.750000,1.793127,0.507414
2,A00000001,2016-09-01,2016-06-01,8000,-0.272727,1.793127,0.507414
3,A00000001,2016-10-01,2016-06-01,12000,0.500000,1.793127,0.507414
4,A00000001,2016-11-01,2016-06-01,4000,-0.666667,1.793127,0.507414
...,...,...,...,...,...,...,...
9521,A00002391,2020-03-01,2019-09-01,1000,0.000000,0.000000,0.000000
9522,A00002391,2020-11-01,2019-09-01,1000,0.000000,0.000000,0.000000
9523,A00002391,2022-03-01,2019-09-01,1000,0.000000,0.000000,0.000000
9524,A00002391,2023-01-01,2019-09-01,1000,0.000000,0.000000,0.000000


##### Filter for Where Customer % Change in Volume is Within 1 stDev of Mean Volume Change for the Cohort

In [28]:
# Keep customer-months whose percent change lies in the closed interval
# [cohort mean - cohort std, cohort mean + cohort std].
# ('Volume % Change_y' is the mean, 'Volume % Change_x' the std.)
# Rows whose cohort std is NaN fail both comparisons and are excluded.
df3 = df2[
    df2['Volume % Change'].ge(df2['Volume % Change_y'] - df2['Volume % Change_x'])
    & df2['Volume % Change'].le(df2['Volume % Change_y'] + df2['Volume % Change_x'])
]
df4 = df3.sort_values(by=['CUSTOMER_ID-1', 'Month'])
df4

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total,Volume % Change,Volume % Change_x,Volume % Change_y
0,A00000001,2016-07-01,2016-06-01,4000,-0.555556,1.793127,0.507414
1,A00000001,2016-08-01,2016-06-01,11000,1.750000,1.793127,0.507414
2,A00000001,2016-09-01,2016-06-01,8000,-0.272727,1.793127,0.507414
3,A00000001,2016-10-01,2016-06-01,12000,0.500000,1.793127,0.507414
4,A00000001,2016-11-01,2016-06-01,4000,-0.666667,1.793127,0.507414
...,...,...,...,...,...,...,...
2509,A00002434,2021-11-01,2016-07-01,255000,0.976744,1.294264,0.870366
2510,A00002434,2021-12-01,2016-07-01,539000,1.113725,1.294264,0.870366
2511,A00002434,2022-01-01,2016-07-01,411000,-0.237477,1.294264,0.870366
2512,A00002434,2022-02-01,2016-07-01,311000,-0.243309,1.294264,0.870366


##### Find the First Month for Each Customer Where % Change in Volume Month-to-Month is Within Bounds (ramp month)

In [29]:
# Earliest in-bounds month for each (customer, cohort) pair.
df5 = (
    df4.groupby(['CUSTOMER_ID-1', 'Customer Cohort Month'])['Month']
    .agg('min')
    .reset_index()
)
# Ramp time = whole calendar months between the cohort month and that first
# in-bounds month. Subtracting monthly Periods gives month offsets; `.n` is
# the integer month count.
df5['Ramp_Time_In_Months'] = (
    df5['Month'].dt.to_period('M') - df5['Customer Cohort Month'].dt.to_period('M')
).apply(lambda month_offset: month_offset.n)
df5

Unnamed: 0,CUSTOMER_ID-1,Customer Cohort Month,Month,Ramp_Time_In_Months
0,A00000001,2016-06-01,2016-07-01,1
1,A00000002,2016-10-01,2016-11-01,1
2,"A00000002, A00000181",2019-03-01,2019-04-01,1
3,A00000003,2016-09-01,2016-10-01,1
4,A00000005,2016-09-01,2017-06-01,9
...,...,...,...,...
440,A00002391,2019-09-01,2020-02-01,5
441,A00002404,2022-12-01,2023-01-01,1
442,A00002409,2022-12-01,2023-01-01,1
443,A00002434,2016-07-01,2016-08-01,1


In [30]:
# 'Month' (the first in-bounds month) was only needed to derive
# Ramp_Time_In_Months. Rebinding with errors='ignore' instead of dropping
# in place keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError on the already-removed column.
df5 = df5.drop(columns=['Month'], errors='ignore')

##### Reload Original Customer Data for Cohort Weighted Averages

In [31]:
# Reload the initial customer data (unsorted and unfiltered) so the
# per-customer volume totals below are computed over every row.
df1 = (
    pd.read_csv('Clean Customer Data.csv', parse_dates=['Month', 'Customer Cohort Month'])
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .loc[:, ['CUSTOMER_ID-1', 'Month', 'Customer Cohort Month', 'Grand Total']]
)
df1

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total
0,A00001405,2021-08-01,2021-08-01,2000
1,A00000263,2020-05-01,2019-07-01,173000
2,A00000263,2020-01-01,2019-07-01,40000
3,A00000263,2020-06-01,2019-07-01,316000
4,A00000263,2020-12-01,2019-07-01,190000
...,...,...,...,...
10680,A00000201,2019-03-01,2019-01-01,1000
10681,A00000201,2020-02-01,2019-01-01,16000
10682,A00000201,2019-06-01,2019-01-01,8000
10683,A00000066,2017-07-01,2017-05-01,1000


In [32]:
# Total volume for each (customer, cohort) pair; used below as the weight of
# that customer's ramp time.
df2 = (
    df1.groupby(by=['CUSTOMER_ID-1', 'Customer Cohort Month'])['Grand Total']
    .agg('sum')
)
df2

CUSTOMER_ID-1         Customer Cohort Month
A00000001             2016-06-01                  350000
A00000002             2016-10-01                 4903000
A00000002, A00000181  2019-03-01                 3094000
A00000003             2016-09-01                94724000
A00000005             2016-09-01               465676000
                                                 ...    
A00002445             2022-10-01                    4000
A00002446             2023-01-01                   85000
A00002448             2023-03-01                    1000
A00002459             2023-04-01                    3000
A00002460             2023-04-01                    3000
Name: Grand Total, Length: 575, dtype: int64

##### Join Customer Volume Totals to Ramp Data and Calculate Weighted Average Ramp Time by Cohort

In [33]:
# Inner join: one row per customer that has a ramp time, carrying that
# customer's total volume alongside it.
df3 = df5.merge(df2, on=['CUSTOMER_ID-1', 'Customer Cohort Month'])
df3

Unnamed: 0,CUSTOMER_ID-1,Customer Cohort Month,Ramp_Time_In_Months,Grand Total
0,A00000001,2016-06-01,1,350000
1,A00000002,2016-10-01,1,4903000
2,"A00000002, A00000181",2019-03-01,1,3094000
3,A00000003,2016-09-01,1,94724000
4,A00000005,2016-09-01,9,465676000
...,...,...,...,...
440,A00002391,2019-09-01,5,7000
441,A00002404,2022-12-01,1,91000
442,A00002409,2022-12-01,1,16000
443,A00002434,2016-07-01,1,3809000


In [34]:
# Volume-weighted mean ramp time per cohort: each customer's ramp time is
# weighted by that customer's total volume ('Grand Total').
# The two columns np.average needs are selected before .apply so the grouping
# column is never handed to the lambda; pandas >= 2.2 deprecates apply
# operating on grouping columns.
# NOTE: np.average raises ZeroDivisionError if a cohort's weights sum to zero.
df4 = (
    df3.groupby('Customer Cohort Month')[['Ramp_Time_In_Months', 'Grand Total']]
    .apply(lambda cohort: np.average(cohort['Ramp_Time_In_Months'], weights=cohort['Grand Total']))
    .to_frame(name='Weighted Average Ramp')
)
df4
#df4.to_csv('WeightedAverageRampByCohort.csv')

Unnamed: 0_level_0,Weighted Average Ramp
Customer Cohort Month,Unnamed: 1_level_1
2016-06-01,9.946188
2016-07-01,1.169364
2016-08-01,13.478261
2016-09-01,14.846667
2016-10-01,1.087701
...,...
2022-08-01,2.872845
2022-09-01,2.178643
2022-10-01,1.267017
2022-11-01,2.000000


##### Load Cohort Volume Totals Data and Join to Weighted Average Ramp Data

In [35]:
# Reload the cohort-by-month volumes and total them per cohort.
df = pd.read_csv('Volume by Cohort by Month.csv', parse_dates=['Month', 'Customer Cohort Month'])
df['Month'] = pd.to_datetime(df['Month'])
# Sum only 'Volume'. A bare .sum() would also try to add up the datetime
# 'Month' column, which raises TypeError on pandas >= 2.0 (older versions
# silently dropped it). groupby already returns cohorts in sorted order, so
# no explicit sort is needed.
df = df.groupby('Customer Cohort Month')[['Volume']].sum()
df

Unnamed: 0_level_0,Volume
Customer Cohort Month,Unnamed: 1_level_1
2016-06-01,386718000
2016-07-01,4387000
2016-08-01,165000
2016-09-01,1216442000
2016-10-01,5840000
...,...
2022-12-01,36887000
2023-01-01,169000
2023-02-01,227000
2023-03-01,63000


##### Join Cohort Volume Totals to Weighted Average Ramp (the inner join ignores more recent cohorts that have no ramp estimate)

In [36]:
# Inner join on the cohort-month index: cohorts without a weighted average
# ramp value drop out of the result.
df5 = df.merge(df4, left_index=True, right_index=True)
df5

Unnamed: 0_level_0,Volume,Weighted Average Ramp
Customer Cohort Month,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-06-01,386718000,9.946188
2016-07-01,4387000,1.169364
2016-08-01,165000,13.478261
2016-09-01,1216442000,14.846667
2016-10-01,5840000,1.087701
...,...,...
2022-08-01,464000,2.872845
2022-09-01,1041000,2.178643
2022-10-01,4576000,1.267017
2022-11-01,1653000,2.000000


##### Export to .csv file

In [37]:
#df5.to_csv('Weighted Average Ramp Time.csv', header=True)

##### Calculate Weighted Average of Weighted Averages (a sanity check only; less accurate than replicating the analysis at the cohort level, rather than the customer level, and weighting the aggregate cohort ramp)

In [38]:
# Each cohort's share of total volume becomes its weight; the overall ramp
# is the sum of the cohort ramps scaled by those weights.
totalVol = df5['Volume'].sum()
df5['Weight'] = df5['Volume'].div(totalVol)
weightedAverageRamp = df5['Weighted Average Ramp'].mul(df5['Weight']).sum()
weightedAverageRamp

5.246237594224773