# Imports

In [22]:
# import necessary python libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from scipy.stats import pearsonr
import datetime as dt
import seaborn as sns

# Load and Manipulate Data

##### Load Monthly Volume by Cohort Data

In [23]:
# Read the monthly cohort volume export, parsing both date columns on load.
df = (
    pd.read_csv(
        'Non-Committed Volume by Cohort by Month.csv',
        parse_dates=['Month', 'Customer Cohort Month'],
    )
    # Explicit datetime conversion of Month (a no-op when parsing already succeeded).
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    # Order rows by cohort, then chronologically within each cohort; the
    # default integer index is kept (Month is not promoted to the index).
    .sort_values(by=['Customer Cohort Month', 'Month'])
)
df

Unnamed: 0,Customer Cohort Month,Month,Volume
1533,2018-01-31,2018-01-31,5996000
1543,2018-01-31,2018-02-28,7859000
1535,2018-01-31,2018-03-31,9836000
603,2018-01-31,2018-04-30,12033000
237,2018-01-31,2018-05-31,16466000
...,...,...,...
1050,2023-03-31,2023-04-30,1000
1344,2023-03-31,2023-05-31,9000
574,2023-04-30,2023-04-30,10000
695,2023-04-30,2023-05-31,20000


In [24]:
# Print a structural summary of the cohort frame: index, column dtypes, and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1923 entries, 1533 to 441
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Customer Cohort Month  1923 non-null   datetime64[ns]
 1   Month                  1923 non-null   datetime64[ns]
 2   Volume                 1923 non-null   int64         
dtypes: datetime64[ns](2), int64(1)
memory usage: 60.1 KB


##### Create Variable for Percent Change in Volume and Remove NaNs (also remove month-over-month changes of -25% or worse so that churn does not inflate the standard deviation)

In [25]:
# Month-over-month fractional change in Volume, computed separately within each cohort.
df = df.assign(
    **{'Volume % Change': df.groupby('Customer Cohort Month')['Volume'].pct_change()}
)
# Discard rows containing NaN (the first month of every cohort has no prior value).
df = df.dropna()
# Keep only rows whose change is strictly greater than -25%.
df = df.loc[df['Volume % Change'].gt(-.25)]
df

Unnamed: 0,Customer Cohort Month,Month,Volume,Volume % Change
1543,2018-01-31,2018-02-28,7859000,0.310707
1535,2018-01-31,2018-03-31,9836000,0.251559
603,2018-01-31,2018-04-30,12033000,0.223363
237,2018-01-31,2018-05-31,16466000,0.368404
763,2018-01-31,2018-06-30,14499000,-0.119458
...,...,...,...,...
1675,2022-12-31,2023-05-31,1000,0.000000
1351,2023-02-28,2023-03-31,94000,1.043478
1731,2023-02-28,2023-05-31,69000,2.833333
1344,2023-03-31,2023-05-31,9000,8.000000


##### Calculate Mean % Volume Change and Standard Deviation in Volume Change by Cohort

In [26]:
# Sample standard deviation of the % change per cohort.
# Cohorts with a single remaining observation produce NaN.
cohort_std_dev = (
    df.groupby('Customer Cohort Month')['Volume % Change']
    .agg('std')
)
cohort_std_dev

Customer Cohort Month
2018-01-31     0.447788
2018-02-28     1.298205
2018-03-31     1.431845
2018-04-30     0.682445
2018-05-31     1.525897
                ...    
2022-11-30     2.862110
2022-12-31    14.510923
2023-02-28     1.265619
2023-03-31          NaN
2023-04-30          NaN
Name: Volume % Change, Length: 62, dtype: float64

In [27]:
# Average % change per cohort.
cohort_mean = (
    df.groupby('Customer Cohort Month')['Volume % Change']
    .agg('mean')
)
cohort_mean

Customer Cohort Month
2018-01-31     0.134000
2018-02-28     0.296320
2018-03-31     0.849591
2018-04-30     0.437500
2018-05-31     0.758974
                ...    
2022-11-30     3.313889
2022-12-31    10.260772
2023-02-28     1.938406
2023-03-31     8.000000
2023-04-30     1.000000
Name: Volume % Change, Length: 62, dtype: float64

#### Join to Create Mean and stDev by Cohort Data

In [28]:
# Combine the two per-cohort series into one frame keyed by cohort.
# Both series share the name 'Volume % Change', so the suffixes disambiguate:
#   'Volume % Change_x' -> standard deviation, 'Volume % Change_y' -> mean.
df0 = pd.merge(
    cohort_std_dev,
    cohort_mean,
    on='Customer Cohort Month',
    suffixes=('_x', '_y'),
)
df0

Unnamed: 0_level_0,Volume % Change_x,Volume % Change_y
Customer Cohort Month,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-31,0.447788,0.134000
2018-02-28,1.298205,0.296320
2018-03-31,1.431845,0.849591
2018-04-30,0.682445,0.437500
2018-05-31,1.525897,0.758974
...,...,...
2022-11-30,2.862110,3.313889
2022-12-31,14.510923,10.260772
2023-02-28,1.265619,1.938406
2023-03-31,,8.000000


##### Load Customer Level volume Data and Create % Change in Volume Variable

In [29]:
# Read the customer-level export, parsing both date columns on load.
df1 = pd.read_csv(
    'Clean Non-Committed Customer Data.csv',
    parse_dates=['Month', 'Customer Cohort Month'],
)
# Explicit datetime conversion of Month (a no-op when parsing already succeeded).
df1['Month'] = pd.to_datetime(df1['Month'])
# Keep the four columns used below and order each customer's rows chronologically.
df1 = (
    df1.loc[:, ['CUSTOMER_ID-1', 'Month', 'Customer Cohort Month', 'Grand Total']]
    .sort_values(by=['CUSTOMER_ID-1', 'Month', 'Customer Cohort Month'])
)
df1

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total
1677,A00000001,2018-01-31,2018-01-31,5000
5521,A00000001,2018-02-28,2018-01-31,4000
3022,A00000001,2018-03-31,2018-01-31,13000
937,A00000001,2018-04-30,2018-01-31,6000
239,A00000001,2018-05-31,2018-01-31,5000
...,...,...,...,...
1659,A00002499,2023-05-31,2023-05-31,1000
4701,A00002500,2023-05-31,2023-05-31,1000
2288,A00002503,2023-05-31,2023-05-31,1000
2614,A00002507,2023-05-31,2023-05-31,8000


In [30]:

# Month-over-month fractional change in Grand Total for each customer, then
# discard rows containing NaN (each customer's first month has no prior value).
df1 = df1.assign(
    **{'Volume % Change': df1.groupby('CUSTOMER_ID-1')['Grand Total'].pct_change()}
).dropna()
df1

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total,Volume % Change
5521,A00000001,2018-02-28,2018-01-31,4000,-0.200000
3022,A00000001,2018-03-31,2018-01-31,13000,2.250000
937,A00000001,2018-04-30,2018-01-31,6000,-0.538462
239,A00000001,2018-05-31,2018-01-31,5000,-0.166667
5094,A00000001,2018-06-30,2018-01-31,8000,0.600000
...,...,...,...,...,...
3431,A00002418,2023-04-30,2023-02-28,6000,-0.857143
1095,A00002419,2023-03-31,2023-02-28,4000,3.000000
5203,A00002425,2023-05-31,2023-04-30,20000,5.666667
4024,A00002431,2023-04-30,2023-03-31,1000,-0.947368


##### Merge stDev and Customer Level Data

In [31]:
# Attach each cohort's std dev (_x) and mean (_y) to every customer-month row.
df2 = df1.merge(df0, on='Customer Cohort Month')
df2

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total,Volume % Change,Volume % Change_x,Volume % Change_y
0,A00000001,2018-02-28,2018-01-31,4000,-0.200000,0.447788,0.134000
1,A00000001,2018-03-31,2018-01-31,13000,2.250000,0.447788,0.134000
2,A00000001,2018-04-30,2018-01-31,6000,-0.538462,0.447788,0.134000
3,A00000001,2018-05-31,2018-01-31,5000,-0.166667,0.447788,0.134000
4,A00000001,2018-06-30,2018-01-31,8000,0.600000,0.447788,0.134000
...,...,...,...,...,...,...,...
5964,A00001720,2023-01-31,2022-12-31,2000,-0.500000,14.510923,10.260772
5965,A00001720,2023-02-28,2022-12-31,1000,-0.500000,14.510923,10.260772
5966,A00002425,2023-05-31,2023-04-30,20000,5.666667,,1.000000
5967,A00002431,2023-04-30,2023-03-31,1000,-0.947368,,8.000000


##### Filter for Where Customer % Change in Volume is Within 1 stDev of Mean Volume Change for the Cohort

In [32]:
# Keep customer-months whose % change lies in [mean - std, mean + std] for the
# cohort (bounds inclusive). Rows whose cohort std is NaN never satisfy the test.
df3 = df2[
    df2['Volume % Change'].between(
        df2['Volume % Change_y'] - df2['Volume % Change_x'],
        df2['Volume % Change_y'] + df2['Volume % Change_x'],
    )
]
# Chronological order per customer.
df4 = df3.sort_values(by=['CUSTOMER_ID-1', 'Month'])
df4

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total,Volume % Change,Volume % Change_x,Volume % Change_y
0,A00000001,2018-02-28,2018-01-31,4000,-0.200000,0.447788,0.134000
3,A00000001,2018-05-31,2018-01-31,5000,-0.166667,0.447788,0.134000
5,A00000001,2018-07-31,2018-01-31,10000,0.250000,0.447788,0.134000
6,A00000001,2018-08-31,2018-01-31,7000,-0.300000,0.447788,0.134000
7,A00000001,2018-09-30,2018-01-31,8000,0.142857,0.447788,0.134000
...,...,...,...,...,...,...,...
5963,A00001718,2023-01-31,2022-12-31,2000,1.000000,14.510923,10.260772
5964,A00001720,2023-01-31,2022-12-31,2000,-0.500000,14.510923,10.260772
5965,A00001720,2023-02-28,2022-12-31,1000,-0.500000,14.510923,10.260772
5824,A00002408,2023-04-30,2023-02-28,2000,1.000000,1.265619,1.938406


##### Find the First Month for Each Customer Where % Change in Volume Month-to-Month is Within Bounds (ramp month)

In [33]:
# Earliest in-bounds month for every (customer, cohort) pair.
df5 = (
    df4.groupby(['CUSTOMER_ID-1', 'Customer Cohort Month'])['Month']
    .agg('min')
    .reset_index()
)
# Whole calendar months between the cohort month and that earliest month:
# subtracting monthly periods yields month offsets, whose .n is the month count.
df5['Ramp_Time_In_Months'] = (
    df5['Month'].dt.to_period('M') - df5['Customer Cohort Month'].dt.to_period('M')
).map(lambda month_offset: month_offset.n)
df5

Unnamed: 0,CUSTOMER_ID-1,Customer Cohort Month,Month,Ramp_Time_In_Months
0,A00000001,2018-01-31,2018-02-28,1
1,A00000002,2018-01-31,2018-02-28,1
2,A00000003,2018-01-31,2018-03-31,2
3,A00000005,2018-01-31,2018-03-31,2
4,A00000006,2018-01-31,2018-03-31,2
...,...,...,...,...
338,A00001704,2022-12-31,2023-02-28,2
339,A00001718,2022-12-31,2023-01-31,1
340,A00001720,2022-12-31,2023-01-31,1
341,A00002408,2023-02-28,2023-04-30,2


In [34]:
# Remove the ramp-month date column from df5.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'Month' is already gone no longer
# raises KeyError.
df5 = df5.drop(columns=['Month'], errors='ignore')

##### Reload Original Customer Data for Cohort Weighted Averages

In [35]:
# Reload the initial customer-level data (unsorted, without the % change column).
df1 = (
    pd.read_csv(
        'Clean Non-Committed Customer Data.csv',
        parse_dates=['Month', 'Customer Cohort Month'],
    )
    # Explicit datetime conversion of Month (a no-op when parsing already succeeded).
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    # Keep only the columns needed for the per-customer totals.
    [['CUSTOMER_ID-1', 'Month', 'Customer Cohort Month', 'Grand Total']]
)
df1

Unnamed: 0,CUSTOMER_ID-1,Month,Customer Cohort Month,Grand Total
0,A00000091,2019-11-30,2019-11-30,1000
1,A00000006,2020-06-30,2018-01-31,295000
2,A00000042,2019-10-31,2018-03-31,1000
3,A00000025,2020-05-31,2018-01-31,1000
4,A00000150,2021-02-28,2018-07-31,277000
...,...,...,...,...
6434,A00001448,2023-01-31,2021-11-30,11000
6435,A00000317,2023-02-28,2020-03-31,64000
6436,A00000065,2022-05-31,2018-03-31,1000
6437,A00001419,2023-01-31,2021-07-31,106000


In [36]:
# Total Grand Total per (customer, cohort) across all months; the result is a
# Series indexed by both keys and is used as the weight for the ramp average.
df2 = (
    df1.groupby(by=['CUSTOMER_ID-1', 'Customer Cohort Month'])['Grand Total']
    .agg('sum')
)
df2

CUSTOMER_ID-1  Customer Cohort Month
A00000001      2018-01-31                  206000
A00000002      2018-01-31                 4589000
A00000003      2018-01-31                48052000
A00000005      2018-01-31               595682000
A00000006      2018-01-31                 9929000
                                          ...    
A00002499      2023-05-31                    1000
A00002500      2023-05-31                    1000
A00002503      2023-05-31                    1000
A00002507      2023-05-31                    8000
A00002509      2023-05-31                    4000
Name: Grand Total, Length: 465, dtype: int64

##### Join Customer Volume Totals to Ramp Data and Calculate Weighted Average Ramp Time by Cohort

In [37]:
# Pair each customer's ramp time with that customer's total volume.
# The keys are columns in df5 and index levels in df2.
df3 = df5.merge(df2, on=['CUSTOMER_ID-1', 'Customer Cohort Month'])
df3

Unnamed: 0,CUSTOMER_ID-1,Customer Cohort Month,Ramp_Time_In_Months,Grand Total
0,A00000001,2018-01-31,1,206000
1,A00000002,2018-01-31,1,4589000
2,A00000003,2018-01-31,2,48052000
3,A00000005,2018-01-31,2,595682000
4,A00000006,2018-01-31,2,9929000
...,...,...,...,...
338,A00001704,2022-12-31,2,11000
339,A00001718,2022-12-31,1,3000
340,A00001720,2022-12-31,1,7000
341,A00002408,2023-02-28,2,7000


In [38]:
# Volume-weighted average ramp time per cohort: each customer's ramp time is
# weighted by that customer's Grand Total.
# The two columns the lambda needs are selected explicitly so that apply() does
# not operate on the grouping column (deprecated behaviour in pandas >= 2.2).
df4 = (
    df3.groupby('Customer Cohort Month')[['Ramp_Time_In_Months', 'Grand Total']]
    .apply(lambda grp: np.average(grp['Ramp_Time_In_Months'], weights=grp['Grand Total']))
    # Name the result directly instead of copying column 0 and dropping it in place.
    .rename('Weighted Average Ramp')
    .to_frame()
)
df4
# Optional export (disabled):
#df4.to_csv('WeightedAverageRampByCohort.csv')

Unnamed: 0_level_0,Weighted Average Ramp
Customer Cohort Month,Unnamed: 1_level_1
2018-01-31,1.927711
2018-02-28,2.985846
2018-03-31,2.186537
2018-04-30,3.063492
2018-05-31,2.966102
2018-06-30,2.326849
2018-07-31,1.136865
2018-08-31,2.556167
2018-09-30,6.69775
2018-10-31,2.955831


##### Load Cohort Volume Totals Data and Join to Weighted Average Ramp Data

In [39]:
# Reload the cohort-by-month volume data and total it per cohort.
df = pd.read_csv('Non-Committed Volume by Cohort by Month.csv', parse_dates=['Month', 'Customer Cohort Month'])
df['Month'] = pd.to_datetime(df['Month'])
# Sum Volume only. Selecting the column explicitly avoids asking pandas to sum
# the datetime 'Month' column, which pandas 1.x silently dropped but
# pandas >= 2.0 rejects with a TypeError. groupby() already returns the cohorts
# in sorted order, so no prior sort_values() is needed.
df = df.groupby('Customer Cohort Month')[['Volume']].sum()
df

Unnamed: 0_level_0,Volume
Customer Cohort Month,Unnamed: 1_level_1
2018-01-31,1678394000
2018-02-28,24661000
2018-03-31,1233000
2018-04-30,63000
2018-05-31,60000
...,...
2023-01-31,26000
2023-02-28,227000
2023-03-31,91000
2023-04-30,30000


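##### Join Cohort Volume Totals to Weighted Average Ramp (the inner join ignores the more recent cohorts, which have no ramp estimate)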

In [40]:
# Index-on-index inner join: only cohorts present in both the volume totals and
# the weighted-average ramp table are kept.
df5 = df.merge(df4, how='inner', left_index=True, right_index=True)
df5

Unnamed: 0_level_0,Volume,Weighted Average Ramp
Customer Cohort Month,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-31,1678394000,1.927711
2018-02-28,24661000,2.985846
2018-03-31,1233000,2.186537
2018-04-30,63000,3.063492
2018-05-31,60000,2.966102
2018-06-30,83910000,2.326849
2018-07-31,1785988000,1.136865
2018-08-31,33624000,2.556167
2018-09-30,69685000,6.69775
2018-10-31,8243000,2.955831


##### Export to .csv file

In [41]:
# Export is currently disabled; uncomment the line below to write df5 to CSV with a header row.
#df5.to_csv('Weighted Average Ramp Time (Non-Committed).csv', header=True)

##### Calculate Volume-Weighted Average of the Cohort Weighted Averages (a sense check only; less accurate than repeating the analysis at the cohort level, rather than the customer level, and weighting the aggregate cohort ramp)

In [42]:
# Each cohort's share of the total volume becomes its weight.
totalVol = df5['Volume'].sum()
df5['Weight'] = df5['Volume'].div(totalVol)
# Overall ramp: sum of each cohort's weighted-average ramp times its weight.
weightedAverageRamp = df5['Weighted Average Ramp'].mul(df5['Weight']).sum()
weightedAverageRamp

1.7899662142008748