# Imports

In [None]:
# import necessary python libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from scipy.stats import pearsonr
import datetime as dt
import seaborn as sns

# Load Data

##### Monthly Ocrolus Volume and Active Company Counts

In [None]:
# Read the monthly volume / active-company-count extract and make sure both
# date columns are datetimes ('Month' is additionally parsed on read).
data = pd.read_csv('Monthly Volume and Company Count.csv', parse_dates=['Month'])
for date_col in ('Month', 'EOMONTH'):
    data[date_col] = pd.to_datetime(data[date_col])
data.head()

##### Customer Level Ocrolus Data

In [None]:
# Read the customer-level extract and convert both date columns to datetimes.
data1 = pd.read_csv('Clean Customer Data.csv', parse_dates=['Month'])
data1 = data1.assign(
    Month=pd.to_datetime(data1['Month']),
    EOMONTH=pd.to_datetime(data1['EOMONTH']),
)
data1.head()

##### Monthly Correlation Metrics

In [None]:
# Read the monthly correlation metrics and convert the date columns.
data2 = pd.read_csv('Monthly Correlation Metrics.csv', parse_dates=['Month'])
for date_col in ('Month', 'EOMONTH'):
    data2[date_col] = pd.to_datetime(data2[date_col])
# Sort chronologically, then relabel every row with the *next* row's month so
# the metrics act as a one-row lag when joined on 'Month'.
# The last row ends up with a missing (NaT) month.
data2 = data2.sort_values('Month')
data2['Month'] = data2['Month'].shift(periods=-1)
data2

##### Monthly Ocrolus S&M Spend

In [None]:
# Read total S&M spend; values are scaled by 1000
# (presumably the file is in thousands -- TODO confirm units).
data3 = pd.read_csv('P.Oakley Sales and Marketing Spend.csv', parse_dates=['Month'])
data3['Total S_M Spend'] = data3['Total S_M Spend'].map(float) * 1000
data3['Month'] = pd.to_datetime(data3['Month'])
# Sort by month, then relabel each row with the next row's month (one-row lag).
data3 = data3.sort_values('Month')
data3['Month'] = data3['Month'].shift(periods=-1)
data3 = data3.set_index('Month')
# Single-column frame used for the joins further down.
d3 = data3[['Total S_M Spend']]
d3.head()

##### Existing Account Monthly Ocrolus S&M Spend

In [None]:
# Read existing-account S&M spend; both expense columns are scaled by 1000
# (presumably the file is in thousands -- TODO confirm units).
data4 = pd.read_csv('Existing S&M Spend.csv', parse_dates=['Date'])
for spend_col in ('Sales Team Farmer Expense', 'Account Management'):
    data4[spend_col] = data4[spend_col].map(float) * 1000
data4['Date'] = pd.to_datetime(data4['Date'])
# Sort by date, then relabel each row with the next row's date (one-row lag).
data4 = data4.sort_values('Date')
data4['Date'] = data4['Date'].shift(periods=-1)
# Index by the shifted date and discard the stray 'Unnamed: 3' column.
data4 = data4.set_index('Date').drop(columns=['Unnamed: 3'])
data4.head()

##### Combined Dataset: Monthly Ocrolus Volume and Active Company Counts + Monthly Correlation Metrics (1M Lag)

In [None]:
# Inner-join volume/company counts with the (shifted) correlation metrics on
# 'Month', convert the date columns, index by month, drop rows containing any
# null, sort chronologically, and derive average volume per active customer.
df = pd.merge(data, data2, left_on='Month', right_on="Month")
for date_col in ('Month', 'EOMONTH_x', 'EOMONTH_y'):
    df[date_col] = pd.to_datetime(df[date_col])
df = df.set_index('Month').dropna().sort_index()
df['Avg Vol Per Customer'] = df['Volume'] / df['Active Customer Count']
df

In [None]:
# Basic info on dataframe structure (printed summary of the combined dataset `df`)
df.info()

##### Combined Dataset With Volume % Change Variable 

In [None]:
# Create a new dataframe with a month-over-month volume percent change variable.
# Work on an independent copy: a plain `dfg = df` only aliases `df`, so adding
# the column and dropping the first (NaN) row would silently alter `df`,
# which later cells reuse for their own joins and regressions.
dfg = df.copy()
dfg['Volume_Change'] = dfg['Volume'].pct_change() * 100
# The first row has no previous month, so its percent change is NaN.
dfg = dfg.dropna()
dfg

##### Combined Dataset With S&M Spend and US Prime Rate

In [None]:
# Create dataframe with Total S&M Spend, US Prime Rate, Volume, and Volume % Change
ust = pd.merge(d3, df, left_index=True, right_index=True, how='inner')
# Percent change is recomputed on the joined rows; the first row becomes NaN
# and is removed by the dropna below (along with any other row holding a null).
ust['Volume_Change'] = ust['Volume'].pct_change() * 100
ust.dropna(inplace=True)
# Explicit copy so the assignment below writes to an independent frame rather
# than a column-subset of the merged one (avoids SettingWithCopyWarning).
ust = ust[['Volume_Change', 'Volume', 'Total S_M Spend', 'US_Prime_Rate']].copy()
# Scale the rate by 100 (fraction -> percent, as in the other rate columns).
ust['US_Prime_Rate'] = ust['US_Prime_Rate'] * 100
ust

##### Combined Dataset with Existing Account Related S&M and US Prime Rate

In [None]:
# Create dataframe with Existing Account Related S&M Spend, US Prime Rate, Volume, and Volume % Change
esm = pd.merge(data4, df, left_index=True, right_index=True, how='inner')
# Percent change is recomputed on the joined rows; the first row becomes NaN
# and is removed by the dropna below (along with any other row holding a null).
esm['Volume_Change'] = esm['Volume'].pct_change() * 100
esm.dropna(inplace=True)
esm['Total Existing Account S&M'] = esm['Sales Team Farmer Expense'] + esm['Account Management']
# Explicit copy so the assignment below writes to an independent frame rather
# than a column-subset of the merged one (avoids SettingWithCopyWarning).
esm = esm[['Volume_Change', 'Volume', 'Sales Team Farmer Expense', 'Account Management',
           'Total Existing Account S&M', 'US_Prime_Rate']].copy()
# Scale the rate by 100 (fraction -> percent, as in the other rate columns).
esm['US_Prime_Rate'] = esm['US_Prime_Rate'] * 100
esm

In [None]:
# Create Percent Change dataframe: month-over-month % change of volume, existing
# account S&M spend (total and by component) and the US Prime Rate.
cesm = pd.merge(data4, df, left_index=True, right_index=True, how='inner')
# Compute the volume change from this frame's own 'Volume' column. Using the
# already-trimmed `esm` frame here would shift the NaN onto a later row and
# cost an extra observation after dropna.
cesm['Volume_Change'] = cesm['Volume'].pct_change() * 100
cesm['Total Existing Account S&M'] = cesm['Sales Team Farmer Expense'] + cesm['Account Management']
cesm['Total Existing Account S&M % Change'] = cesm['Total Existing Account S&M'].pct_change() * 100
cesm['US_Prime_Rate % Change'] = cesm['US_Prime_Rate'].pct_change() * 100
cesm['Sales Team Farmer Expense % Change'] = cesm['Sales Team Farmer Expense'].pct_change() * 100
cesm['Account Management % Change'] = cesm['Account Management'].pct_change() * 100
# The first row has no previous month, so every % change in it is NaN.
cesm.dropna(inplace=True)
cesm = cesm[['Volume_Change', 'Total Existing Account S&M % Change', 'US_Prime_Rate % Change',
             'Sales Team Farmer Expense % Change', 'Account Management % Change']]
cesm

##### Create dataframe of monthly volume by cohort month

In [None]:
# Read the cohort volume table and index it by month.
dc = pd.read_csv('Cohort Correlations v1.csv', parse_dates=['Month'])
dc = dc.assign(Month=pd.to_datetime(dc['Month'])).set_index('Month')

# Initial Linear Regressions

##### Simple Least Squares Regression of Volume % Change to Inspect General Growth Trend

In [None]:
# Linear regression: Dependent variable = Ocrolus Volume Percent Change; Independent variable = Time (in months) - to inspect general growth trend
# Time is expressed as whole months elapsed since the first observation.
# Feeding the raw datetime index converted to integers (nanosecond timestamps,
# ~1e18) makes the design matrix so ill-conditioned that the constant term is
# numerically lost by the default pseudo-inverse solver.
month_index = dfg.index
months_elapsed = (month_index.year - month_index.year[0]) * 12 + (month_index.month - month_index.month[0])
X = pd.DataFrame({'Months Elapsed': np.asarray(months_elapsed, dtype=float)}, index=month_index)
y = dfg['Volume_Change']
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

##### Simple Least Squares Regression of Volume to Inspect General Growth Trend

In [None]:
# Linear regression: Dependent variable = Ocrolus Volume; Independent variable = Time (in months) - to inspect general growth trend
# Time is expressed as whole months elapsed since the first observation.
# Feeding the raw datetime index converted to integers (nanosecond timestamps,
# ~1e18) makes the design matrix so ill-conditioned that the constant term is
# numerically lost by the default pseudo-inverse solver.
month_index = df.index
months_elapsed = (month_index.year - month_index.year[0]) * 12 + (month_index.month - month_index.month[0])
X = pd.DataFrame({'Months Elapsed': np.asarray(months_elapsed, dtype=float)}, index=month_index)
y = df['Volume']
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

# Signal Decomposition Analysis

In [None]:
# Perform a classical additive seasonal decomposition of monthly volume into
# trend + seasonal + residual (statsmodels `seasonal_decompose`, which is a
# moving-average decomposition, not Holt-Winters exponential smoothing).
# No `period` is passed, so it has to be derived from the frequency of df's
# datetime index. extrapolate_trend='freq' extrapolates the trend at both ends,
# so the trend and residual components should contain no NaN values.
result = seasonal_decompose(df['Volume'], model='additive', extrapolate_trend='freq')

In [None]:
# Pull the three components out of the decomposition result
trend, seasonal, residual = result.trend, result.seasonal, result.resid

In [None]:
# Display the trend component of the decomposition
trend

In [None]:
# Display the seasonal component of the decomposition
seasonal

In [None]:
# Display the residual component of the decomposition
residual

In [None]:
# Join the residual component of volume with the combined dataset, then scale
# every rate column by 100 so the rates read as percentages.
corrdf = pd.merge(residual.dropna(), df, left_index=True, right_index=True, how='inner')
rate_columns = [
    'Fed_Funds_Target_Rate',
    'LIBOR___3_Month',
    'United_States_Treasury_Constant_Maturity___1_Year',
    'United_States_Treasury_Constant_Maturity___5_Year',
    'United_States_Treasury_Constant_Maturity___10_Year',
    'United_States_Treasury_Constant_Maturity___30_Year',
    'US_Prime_Rate',
]
for rate_col in rate_columns:
    corrdf[rate_col] = corrdf[rate_col] * 100
corrdf.head()

In [None]:
# Join the trend component of volume with the existing-account S&M dataset (esm)
# on their shared date index, keeping only dates present in both.
trd = pd.merge(
    left=trend.dropna(),
    right=esm,
    how='inner',
    left_index=True,
    right_index=True,
)
trd

In [None]:
# Join the trend component of volume with monthly Total S&M spend (d3)
# on their shared date index, keeping only dates present in both.
srd = pd.merge(
    left=trend.dropna(),
    right=d3,
    how='inner',
    left_index=True,
    right_index=True,
)
srd

In [None]:
# Pearson correlation: residual volume component vs. Fed Funds Target Rate
resid_values = residual.dropna()
rate_values = corrdf['Fed_Funds_Target_Rate']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the fed funds target rate: {correlation:.6f}")
print(f"P-value: {p_value:.6f}")

In [None]:
# Pearson correlation: residual volume component vs. US Prime Rate
resid_values = residual.dropna()
rate_values = corrdf['US_Prime_Rate']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the US prime rate: {correlation:.6f}")
print(f"P-value: {p_value:.6f}")

In [None]:
# Pearson correlation: residual volume component vs. LIBOR 3 Month rate
resid_values = residual.dropna()
rate_values = corrdf['LIBOR___3_Month']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the LIBOR 3 Month Rate: {correlation:.3f}")
print(f"P-value: {p_value:.5f}")

In [None]:
# Pearson correlation: residual volume component vs. 1-year Treasury constant maturity rate
resid_values = residual.dropna()
rate_values = corrdf['United_States_Treasury_Constant_Maturity___1_Year']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the US Treasury Constant Maturity 1 Year: {correlation:.3f}")
print(f"P-value: {p_value:.5f}")

In [None]:
# Pearson correlation: residual volume component vs. 5-year Treasury constant maturity rate
resid_values = residual.dropna()
rate_values = corrdf['United_States_Treasury_Constant_Maturity___5_Year']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the US Treasury Constant Maturity 5 Year: {correlation:.3f}")
print(f"P-value: {p_value:.5f}")

In [None]:
# Pearson correlation: residual volume component vs. 10-year Treasury constant maturity rate
resid_values = residual.dropna()
rate_values = corrdf['United_States_Treasury_Constant_Maturity___10_Year']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the US Treasury Constant Maturity 10 Year: {correlation:.3f}")
print(f"P-value: {p_value:.5f}")

In [None]:
# Pearson correlation: residual volume component vs. 30-year Treasury constant maturity rate
resid_values = residual.dropna()
rate_values = corrdf['United_States_Treasury_Constant_Maturity___30_Year']
correlation, p_value = pearsonr(resid_values, rate_values)
print(f"Correlation between residual component and the US Treasury Constant Maturity 30 Year: {correlation:.3f}")
print(f"P-value: {p_value:.5f}")

In [None]:
# Calculate correlation between trend component and Account Management S&M spend (1M lag)
correlation, p_value = pearsonr(trd['trend'], trd['Account Management'])
# The label names the column actually tested ('Account Management'), not the
# total existing-account S&M spend.
print(f"Correlation between trend component and the Account Management Spend: {correlation:.3f}")
print(f"P-value: {p_value:.10f}")
# Note: Total Existing S&M spend was tried as well as just sales team farmer expense and lowered correlation

# Regression Analyses

##### Linear Regression Showing Impact of US Prime Rate on Residual Component of Ocrolus Volume (Signal Decomposed Volume)

In [None]:
# Simple OLS: residual Ocrolus volume (seasonality and trend removed) ~ const + US Prime Rate
y = residual.dropna()
X = sm.add_constant(corrdf['US_Prime_Rate'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key takeaways:
    # The US Prime Rate has a significant negative correlation with the residual component of Ocrolus Volume
    # US Prime Rate does not have a significant correlation with overall volume (trend in overall platform growth clouds relationship between US Prime Rate and Volume)

##### Linear regression showing relationship between Account Management Spend and the underlying trend in Ocrolus Volume

In [None]:
# Simple OLS: trend Ocrolus volume (seasonality and residual removed) ~ const + Account Management expenses
y = trd['trend']
X = sm.add_constant(trd['Account Management'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key takeaways:
    # The spend on account management has a significant positive correlation with the trend component of Ocrolus Volume

##### Linear regression showing relationship between Total S_M Spend and the underlying trend in Ocrolus Volume

In [None]:
# Simple OLS: trend Ocrolus volume (seasonality and residual removed) ~ const + Total S_M spend
y = srd['trend']
X = sm.add_constant(srd['Total S_M Spend'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key takeaways:
    # The spend on sales and marketing has a significant positive correlation with the trend component of Ocrolus Volume

##### Linear Regression Showing Impact of US Prime Rate on Ocrolus Volume

In [None]:
# Simple OLS: Ocrolus volume ~ const + US Prime Rate
y = ust['Volume']
X = sm.add_constant(ust['US_Prime_Rate'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of US Prime Rate on Percent Change in Ocrolus Volume

In [None]:
# Simple OLS: Ocrolus volume % change ~ const + US Prime Rate
y = ust['Volume_Change']
X = sm.add_constant(ust['US_Prime_Rate'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of S&M Spend and US Prime Rate on Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume ~ const + Total S&M Spend + US Prime Rate
predictors = ['Total S_M Spend', 'US_Prime_Rate']
y = ust['Volume']
X = sm.add_constant(ust[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key Takeaways:
    # Volume is dependent on Total S&M Spend and US Prime Rate (this includes the impact on volume from new customers)

##### Linear Regression Showing Impact of Existing Account Related S&M Spend on Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume ~ const + Sales Team Farmer expense + Account Management expense
predictors = ['Sales Team Farmer Expense', 'Account Management']
y = esm['Volume']
X = sm.add_constant(esm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of Total Existing Account Related S&M Spend on Ocrolus Volume

In [None]:
# Simple OLS: Ocrolus volume ~ const + total S&M spend related to existing accounts
y = esm['Volume']
X = sm.add_constant(esm['Total Existing Account S&M'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of Existing Account Related S&M Spend on % Change in Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume % change ~ const + Sales Team Farmer expense + Account Management expense
predictors = ['Sales Team Farmer Expense', 'Account Management']
y = esm['Volume_Change']
X = sm.add_constant(esm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of Account Management Expense on Volume % Change

In [None]:
# Simple OLS: Ocrolus volume % change ~ const + Account Management expense
y = esm['Volume_Change']
X = sm.add_constant(esm['Account Management'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key Takeaways:
    # Existing Account related S&M spend is slightly more related to % volume change than nominal volume amount
    # Although significant at 0.1 alpha level when included with sales team farmer expenses, account management expenses alone are not significant

##### Linear Regression Showing Impact of Total Existing Account Related S&M Spend on % Change in Ocrolus Volume

In [None]:
# Simple OLS: Ocrolus volume % change ~ const + total S&M spend related to existing accounts
y = esm['Volume_Change']
X = sm.add_constant(esm['Total Existing Account S&M'])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of Existing Account Related S&M Spend and US Prime Rate on Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume ~ const + Account Management expense + US Prime Rate
predictors = ['Account Management', 'US_Prime_Rate']
y = esm['Volume']
X = sm.add_constant(esm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Note: Adding Sales Farmer expense lowers significance

##### Linear Regression Showing Impact of Total Existing Account S&M Spend and US Prime Rate on Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume ~ const + Total Existing Account S&M spend + US Prime Rate
predictors = ['Total Existing Account S&M', 'US_Prime_Rate']
y = esm['Volume']
X = sm.add_constant(esm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key Takeaways:
    # All variables in the model are significant at the 0.1 alpha level
    # This indicates that the US Prime Rate has a significant negative correlation with the portion of volume not impacted by existing account S&M spend

##### Linear Regression Showing Impact of Existing Account Related S&M Spend and US Prime Rate on % Change in Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume % change ~ const + Sales Team Farmer expense + Account Management expense + US Prime Rate
predictors = ['Sales Team Farmer Expense', 'Account Management', 'US_Prime_Rate']
y = esm['Volume_Change']
X = sm.add_constant(esm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

##### Linear Regression Showing Impact of Total Existing Account S&M Spend and US Prime Rate on % Change in Ocrolus Volume

In [None]:
# Multiple OLS: Ocrolus volume % change ~ const + Total Existing Account S&M spend + US Prime Rate
predictors = ['Total Existing Account S&M', 'US_Prime_Rate']
y = esm['Volume_Change']
X = sm.add_constant(esm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key Takeaways:
    # Model is not significant when looking at % change in volume (yet is when looking at nominal volume)

##### Linear regression showing relationship between % Change variables and % Change Volume

In [None]:
# Multiple OLS: Ocrolus volume % change ~ const + Total Existing Account S&M spend % change + US Prime Rate % change
predictors = ['Total Existing Account S&M % Change', 'US_Prime_Rate % Change']
y = cesm['Volume_Change']
X = sm.add_constant(cesm[predictors])
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())
# Key Takeaways:
    # Model is not significant when looking at % change in volume (yet is when looking at nominal volume)

##### Regression of each cohort's time-series volume on the existing S&M spend (1-month lag)

In [None]:
# Per-cohort simple OLS: cohort monthly volume ~ const + Account Management spend.
em = esm['Account Management']
# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 (and copied the frame every iteration).
cohort_rows = []
# NOTE(review): the last two columns of `dc` are skipped -- presumably they are
# not usable cohort columns; confirm against the CSV layout.
for i in range(len(dc.columns)-2):
    # Column i from row i onward, i.e. excluding months before the cohort date
    newdf = dc.iloc[i:,i]
    # Create dataframes with monthly Existing Account Management spend and monthly volume per cohort (excluding months before cohort date)
    cdf = pd.merge(em, newdf, left_index=True, right_index=True, how='inner')
    X = cdf.iloc[:,0]
    y = cdf.iloc[:,1]
    X = sm.add_constant(X)
    model = sm.OLS(y, X)
    results = model.fit()
    # Position 0 is the constant, position 1 the spend coefficient; .iloc makes
    # the positional lookup explicit (integer [] on a labelled Series is deprecated).
    cohort_rows.append({
        'Cohort': cdf.columns[1],
        'R-squared': results.rsquared,
        'P-value': results.pvalues.iloc[1],
    })
# Explicit columns keep the expected schema even if no cohort was processed.
results_df = pd.DataFrame(cohort_rows, columns=['Cohort', 'R-squared', 'P-value'])

In [None]:
# Lift pandas' display limits so the full table prints, round the p-values to
# 6 decimals, and print the per-cohort results.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df['P-value'] = results_df['P-value'].round(decimals=6)
print(results_df)
# key takeaways:
    # the majority of cohorts with sufficient volume sample size show existing account management S&M spend to be significant in describing a portion of cohort volume over time (removes for new customer growth)
    # total existing account S&M spend worsened statistical significance (limiting spend to just account management variable yields better results)

##### Regression including account management spend and US Prime rate to explain cohort volume over time

In [None]:
# Per-cohort multiple OLS: cohort monthly volume ~ const + US Prime Rate + Account Management spend.
em1 = esm[['US_Prime_Rate', 'Account Management']]
# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 (and copied the frame every iteration).
cohort_rows1 = []
# NOTE(review): the last two columns of `dc` are skipped -- presumably they are
# not usable cohort columns; confirm against the CSV layout.
for i in range(len(dc.columns)-2):
    # Column i from row i onward, i.e. excluding months before the cohort date
    newdf = dc.iloc[i:,i]
    # Create dataframes with monthly Existing Account Management spend, US prime rate, and monthly volume per cohort (excluding months before cohort date)
    cdf1 = pd.merge(em1, newdf, left_index=True, right_index=True, how='inner')
    X = cdf1.iloc[:,0:2]
    y = cdf1.iloc[:,2]
    X = sm.add_constant(X)
    model = sm.OLS(y, X)
    results = model.fit()
    # Positions: 0 = constant, 1 = US Prime Rate, 2 = Account Management;
    # .iloc makes the positional lookup explicit (integer [] on a labelled Series is deprecated).
    cohort_rows1.append({
        'Cohort': cdf1.columns[2],
        'R-squared': results.rsquared,
        'US Prime Rate P-value': results.pvalues.iloc[1],
        'Acc Mgmt Spend P-value': results.pvalues.iloc[2],
    })
# Explicit columns keep the expected schema even if no cohort was processed.
results_df1 = pd.DataFrame(
    cohort_rows1,
    columns=['Cohort', 'R-squared', 'US Prime Rate P-value', 'Acc Mgmt Spend P-value'],
)

In [None]:
# Lift pandas' display limits so the full table prints, round both p-value
# columns to 6 decimals, and print the per-cohort results.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
for pvalue_col in ('US Prime Rate P-value', 'Acc Mgmt Spend P-value'):
    results_df1[pvalue_col] = results_df1[pvalue_col].round(decimals=6)
print(results_df1)
# Key takeaways:
    # including US Prime rate and account management spend yields less significant results on a cohort by cohort basis (although some are significant)

# Commentary

##### The trend component of Ocrolus's volume is highly correlated with Total S&M and Existing Account Management spend

##### The residual component of Ocrolus's volume has a significant negative correlation with the US Prime Rate

##### On a cohort to cohort basis, existing account S&M spend is highly significant in explaining a portion of volume (when volume growth from new users is not considered)

##### On a cohort to cohort basis, using both existing account S&M spend and US Prime rate is shown to have mixed results in terms of its significance in predicting cohort volume (some cohorts are significant while some are not) - This could be due to intra-cohort volatility (at large there are no outliers, but within individual cohorts there may be, as some cohorts have few customers)

##### There is a significant relationship between Ocrolus's volume and total S&M spend (not just the portion related to existing account management)

##### The US Prime Rate and total S&M spend explain a large portion of the variance in Ocrolus's volume

##### The US Prime Rate and total S&M spend related to existing accounts explain a large portion of the variance in Ocrolus's volume

##### The models are not significant when predicting % changes in volume