In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.linear_model import LinearRegression
import scipy.stats as stats
import datetime as dt
import decimal
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import RandomForestRegressor as rfr
import sklearn.metrics as sm
import math

import sys
sys.path.insert(0, "../src/")
import util as util

%autosave 5

In [None]:
# Load the raw exercise export; the later cells derive everything from this frame.
df = pd.read_csv('../Data/EvolyticsDataScienceExercise.csv')

# Per-product revenue columns. Values that do not parse as numbers become NaN,
# which sum(axis=1) skips. Building a new frame with .apply (instead of
# assigning into a slice of df) avoids the SettingWithCopy warning.
revenueCols = [col for col in df.columns if 'purchase_revenue' in str(col)]
revenuedf = df[revenueCols].apply(pd.to_numeric, errors='coerce')
df['orderRevenue'] = revenuedf.sum(axis=1)

# Revenue from products whose name contains 'HIKE', summed over the four
# product slots. Vectorised replacement for the former row-by-row iloc loop:
#   * a missing product name no longer raises TypeError ('HIKE' in NaN),
#   * a missing/unparseable revenue counts as 0 instead of raising or poisoning the sum,
#   * the column is float from the start (no float-into-int assignment).
hikeRevenue = pd.Series(0.0, index=df.index)
for slot in range(1, 5):
    isHike = df[f'purchase_product_{slot}'].astype(str).str.contains('HIKE', regex=False)
    slotRevenue = pd.to_numeric(df[f'purchase_revenue_product_{slot}'], errors='coerce').fillna(0.0)
    hikeRevenue = hikeRevenue + slotRevenue.where(isHike, 0.0)
df['hikeRevenue'] = hikeRevenue

In [None]:
# Date/time-like columns; the categorical detection further down skips any
# column named here.
datetimelikecolumns = [
    'date',
    'min_timestamp_date',
    'min_timestamp_time',
    'max_timestamp',
    'date.1',
    'upsell_timestamp_date',
    'upsell_timestamp_time',
]
# Names of the columns to one-hot encode; populated by a later cell.
catagoricalCols = list()
# Number of rows in the raw frame.
n = len(df)

In [None]:
# Columns removed before modelling: timestamps, the purchase target and
# columns derived from it, and the raw per-product fields that were already
# folded into orderRevenue / hikeRevenue.
dropColumns = ['date','min_timestamp_date','min_timestamp_time','Recipe','purchase_flag','upgrade_and_purchase','hit either yes or no','max_timestamp','purchase_product_1', 
               'purchase_product_2', 'purchase_product_3', 
               'purchase_product_4', 'purchase_units_product_1','purchase_units_product_2', 'purchase_units_product_3', 'purchase_units_product_4',
               'purchase_revenue_product_1','purchase_revenue_product_2','purchase_revenue_product_3','purchase_revenue_product_4','IPD vs NonIPD',
               'date.1','upsell_timestamp_date','upsell_timestamp_time']
masterDF = df.drop(columns=dropColumns)

# Rebuilt from scratch so this cell is idempotent: appending to a list created
# in another cell duplicated every name on re-run, and the second encoding
# pass then failed on the already-dropped column.
catagoricalCols = []
for col in masterDF.columns:
    col = str(col)
    if col in datetimelikecolumns:
        continue
    values = masterDF[col]
    nDistinct = len(set(values))  # computed once instead of twice
    # True as soon as one value parses; util.tryParse is assumed to return
    # (success, value) and to be side-effect free — TODO confirm.
    hasParsable = any(util.tryParse(x)[0] for x in values)
    # A column is categorical when it is user_State, has no parseable value at
    # all, or has between 3 and 30 distinct values.
    if col == 'user_State' or not hasParsable or 2 < nDistinct <= 30:
        catagoricalCols.append(col)

# One-hot encode every categorical column in a single concat (no in-place
# drops). Resulting column order: untouched columns first, then each
# column's dummies in catagoricalCols order.
# NOTE(review): dummies are un-prefixed, so two categorical columns sharing a
# level value would yield duplicate column names — confirm this cannot happen.
dummyFrames = [pd.get_dummies(masterDF[catcol]) for catcol in catagoricalCols]
masterDF = pd.concat([masterDF.drop(columns=catagoricalCols)] + dummyFrames, axis=1)

In [None]:
# 80/20 split with purchase_flag as the target. The seed is fixed so that the
# split, and every metric computed from it, is reproducible across kernel
# restarts.
XTrain, XTest, purchaseTrain, purchaseTest = train_test_split(
    masterDF, df['purchase_flag'], test_size=0.2, random_state=42
)

# Identifier and revenue targets travel with the split rows but are not features.
nonFeatureCols = ['source_visitor_id', 'orderRevenue', 'hikeRevenue']

idTrain = XTrain['source_visitor_id']
idTest = XTest['source_visitor_id']
revenueTrain = XTrain['orderRevenue']
hikingTrain = XTrain['hikeRevenue']
revenueTest = XTest['orderRevenue']
hikingTest = XTest['hikeRevenue']

XTrain = XTrain.drop(columns=nonFeatureCols)
XTest = XTest.drop(columns=nonFeatureCols)

# Conversion Rate Experiments

## Training

In [None]:
# Depth and per-split feature count are both capped at floor(sqrt(#features)).
sqrtFeatureCount = int(np.sqrt(len(XTrain.columns)))
# random_state pins the bootstrap/feature sampling so the fitted forest, its
# importances and all downstream predictions are reproducible.
conversionForest = rfc(max_depth=sqrtFeatureCount, max_features=sqrtFeatureCount, random_state=42)
conversionForest.fit(XTrain, purchaseTrain)
purchasePredict = conversionForest.predict(XTrain)
# Accuracy on the training rows (in-sample).
sm.accuracy_score(purchaseTrain, purchasePredict)

## Testing

In [None]:
# Accuracy of the conversion forest on the held-out rows.
purchaseTestPredict = conversionForest.predict(XTest)
sm.accuracy_score(y_true=purchaseTest, y_pred=purchaseTestPredict)

## Importances

In [None]:
# Feature names are passed in training-column order alongside the fitted forest.
conversionFeatureNames = [name for name in XTrain.columns]
util.displayFeatureImportances(conversionFeatureNames, conversionForest)

In [None]:
# Number of rows where both flags are set.
bothFlagsSet = masterDF['no_thanks_flag'] & masterDF['yes_upgrade_flag']
sum(bothFlagsSet)

In [None]:
# Number of rows where neither flag is set.
noThanksUnset = masterDF['no_thanks_flag'].eq(False)
yesUpgradeUnset = masterDF['yes_upgrade_flag'].eq(False)
sum(noThanksUnset & yesUpgradeUnset)

In [None]:
# Total number of rows, for comparison with the two flag counts.
len(masterDF)

## Experimentation

In [None]:
# Distinct visitor ids; used as the sample size in the standard errors below.
nUsers = len(set(masterDF['source_visitor_id']))

In [None]:
# Control scenario: predict purchase_flag for every row with the flags left as observed.
controlFeatures = masterDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
controlConversionDF = masterDF.copy()
controlConversionDF['purchase_flag'] = conversionForest.predict(controlFeatures)
controlConversionStatsDF = util.conversionPerUser(controlConversionDF)
# Mean conversion rate with its standard error over nUsers.
controlRates = controlConversionStatsDF['conversionRate']
print(f"Conversion Rate per User: {controlRates.mean()} +/- {controlRates.std()/np.sqrt(nUsers)}")

In [None]:
# Scenario: every row forced to no_thanks_flag=1 / yes_upgrade_flag=0, then re-predicted.
noThankYouDF = masterDF.assign(no_thanks_flag=1, yes_upgrade_flag=0)
noThankYouFeatures = noThankYouDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
noThankYouDF['purchase_flag'] = conversionForest.predict(noThankYouFeatures)
noThankYouStatsDF = util.conversionPerUser(noThankYouDF)
# Mean conversion rate with its standard error over nUsers.
noThankYouRates = noThankYouStatsDF['conversionRate']
print(f"Conversion Rate per User: {noThankYouRates.mean()} +/- {noThankYouRates.std()/np.sqrt(nUsers)}")

In [None]:
# Scenario: every row forced to no_thanks_flag=0 / yes_upgrade_flag=1, then re-predicted.
yesUpgradeDF = masterDF.assign(no_thanks_flag=0, yes_upgrade_flag=1)
yesUpgradeFeatures = yesUpgradeDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
yesUpgradeDF['purchase_flag'] = conversionForest.predict(yesUpgradeFeatures)
yesUpgradeStatsDF = util.conversionPerUser(yesUpgradeDF)
# Mean conversion rate with its standard error over nUsers.
yesUpgradeRates = yesUpgradeStatsDF['conversionRate']
print(f"Conversion Rate per User: {yesUpgradeRates.mean()} +/- {yesUpgradeRates.std()/np.sqrt(nUsers)}")

In [None]:
# Pairwise independent-samples t-tests on the per-user conversion rates.
# NOTE(review): all three samples are predictions for the same users under
# different flag settings; a paired test may be more appropriate — confirm.
conversionComparisons = [
    ('No vs control t-test', controlConversionStatsDF, noThankYouStatsDF),
    ('Yes vs control t-test', controlConversionStatsDF, yesUpgradeStatsDF),
    ('Yes vs No t-test', noThankYouStatsDF, yesUpgradeStatsDF),
]
for heading, firstStats, secondStats in conversionComparisons:
    print(heading)
    print(stats.ttest_ind(firstStats['conversionRate'], secondStats['conversionRate']))

# Revenue Experiments

## Overall Revenue

### Training

In [None]:
# Depth and per-split feature count are both capped at floor(sqrt(#features)).
sqrtFeatureCount = int(np.sqrt(len(XTrain.columns)))
# random_state pins the bootstrap/feature sampling so the fitted forest and
# every revenue prediction derived from it are reproducible.
revenueForest = rfr(max_depth=sqrtFeatureCount, max_features=sqrtFeatureCount, random_state=42)
revenueForest.fit(XTrain, revenueTrain)
revenuePredict = revenueForest.predict(XTrain)
# In-sample RMSE, a dollar amount.
math.sqrt(sm.mean_squared_error(revenueTrain, revenuePredict))

### Testing

In [None]:
# Held-out RMSE of the order-revenue forest, a dollar amount.
revenueTestPredict = revenueForest.predict(XTest)
revenueTestMSE = sm.mean_squared_error(revenueTest, revenueTestPredict)
math.sqrt(revenueTestMSE)

### Importances

In [None]:
# Feature names are passed in training-column order alongside the revenue forest.
revenueFeatureNames = [name for name in XTrain.columns]
util.displayFeatureImportances(revenueFeatureNames, revenueForest)

### Experimentation

In [None]:
# Row count of masterDF; used as the sample size in the standard errors below.
nOrders = len(masterDF)

In [None]:
# Control scenario: predicted order revenue with the flags left as observed.
controlRevenueDF = masterDF.copy()
controlRevenueFeatures = masterDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
controlRevenue = revenueForest.predict(controlRevenueFeatures)
# Mean prediction with its standard error over nOrders.
print(f"Control Revenue per Order: {controlRevenue.mean()} +/- {controlRevenue.std()/np.sqrt(nOrders)}")

In [None]:
# Scenario: every row forced to no_thanks_flag=1 / yes_upgrade_flag=0.
noThankYouDF = masterDF.assign(no_thanks_flag=1, yes_upgrade_flag=0)
noThankYouRevenueFeatures = noThankYouDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
noThankYouRevenue = revenueForest.predict(noThankYouRevenueFeatures)
# Mean prediction with its standard error over nOrders.
print(f"No Thank You Revenue per Order: {noThankYouRevenue.mean()} +/- {noThankYouRevenue.std()/np.sqrt(nOrders)}")

In [None]:
# Scenario: every row forced to no_thanks_flag=0 / yes_upgrade_flag=1.
yesUpgradeDF = masterDF.assign(no_thanks_flag=0, yes_upgrade_flag=1)
yesUpgradeRevenueFeatures = yesUpgradeDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
yesUpgradeRevenue = revenueForest.predict(yesUpgradeRevenueFeatures)
# Mean prediction with its standard error over nOrders.
print(f"Yes Upgrade Revenue per Order: {yesUpgradeRevenue.mean()} +/- {yesUpgradeRevenue.std()/np.sqrt(nOrders)}")

In [None]:
# Pairwise independent-samples t-tests on the predicted order revenue.
# NOTE(review): each array holds predictions for the same masterDF rows in the
# same order, so the samples are paired; ttest_rel may be the better fit — confirm.
revenueComparisons = [
    ('No vs control t-test', controlRevenue, noThankYouRevenue),
    ('Yes vs control t-test', controlRevenue, yesUpgradeRevenue),
    ('Yes vs No t-test', noThankYouRevenue, yesUpgradeRevenue),
]
for heading, firstSample, secondSample in revenueComparisons:
    print(heading)
    print(stats.ttest_ind(firstSample, secondSample))

## Hiking Revenue

### Training

In [None]:
# Depth and per-split feature count are both capped at floor(sqrt(#features)).
sqrtFeatureCount = int(np.sqrt(len(XTrain.columns)))
# random_state pins the bootstrap/feature sampling so the fitted forest and
# every hiking-revenue prediction derived from it are reproducible.
hikingForest = rfr(max_depth=sqrtFeatureCount, max_features=sqrtFeatureCount, random_state=42)
hikingForest.fit(XTrain, hikingTrain)
hikingPredict = hikingForest.predict(XTrain)
# In-sample RMSE, a dollar amount.
math.sqrt(sm.mean_squared_error(hikingTrain, hikingPredict))

### Testing

In [None]:
# Held-out RMSE of the hiking-revenue forest, a dollar amount.
hikingTestPredict = hikingForest.predict(XTest)
hikingTestMSE = sm.mean_squared_error(hikingTest, hikingTestPredict)
math.sqrt(hikingTestMSE)

### Importances

In [None]:
# Feature names are passed in training-column order alongside the hiking forest.
hikingFeatureNames = [name for name in XTrain.columns]
util.displayFeatureImportances(hikingFeatureNames, hikingForest)

### Experimentation

In [None]:
# Row count of masterDF; used as the sample size in the standard errors below.
nOrders = len(masterDF)

In [None]:
# Control scenario: predicted hiking revenue with the flags left as observed.
controlHikingDF = masterDF.copy()
controlHikingFeatures = masterDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
controlHiking = hikingForest.predict(controlHikingFeatures)
# Mean prediction with its standard error over nOrders.
print(f"Control Hiking Revenue per Order: {controlHiking.mean()} +/- {controlHiking.std()/np.sqrt(nOrders)}")

In [None]:
# Scenario: every row forced to no_thanks_flag=1 / yes_upgrade_flag=0.
noThankYouDF = masterDF.assign(no_thanks_flag=1, yes_upgrade_flag=0)
noThankYouHikingFeatures = noThankYouDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1)
noThankYouHiking = hikingForest.predict(noThankYouHikingFeatures)
# Mean prediction with its standard error over nOrders.
print(f"No Thank You Hiking Revenue per Order: {noThankYouHiking.mean()} +/- {noThankYouHiking.std()/np.sqrt(nOrders)}")

In [None]:
# Scenario: every row forced to no_thanks_flag=0 / yes_upgrade_flag=1, then
# hiking revenue is re-predicted.
yesUpgradeDF = masterDF.copy()
yesUpgradeDF['no_thanks_flag'] = 0
yesUpgradeDF['yes_upgrade_flag'] = 1
yesUpgradeHiking = hikingForest.predict(yesUpgradeDF.drop(['source_visitor_id','orderRevenue','hikeRevenue'],axis=1))
# Label corrected: it previously read "Yes Upgrade Revenue per Order", which
# made this hiking figure indistinguishable from the overall-revenue one and
# disagreed with the other two hiking scenario labels.
print(f"Yes Upgrade Hiking Revenue per Order: {np.mean(yesUpgradeHiking)} +/- {np.std(yesUpgradeHiking)/np.sqrt(nOrders)}")

In [None]:
# Pairwise independent-samples t-tests on the predicted hiking revenue.
# NOTE(review): each array holds predictions for the same masterDF rows in the
# same order, so the samples are paired; ttest_rel may be the better fit — confirm.
hikingComparisons = [
    ('No vs control t-test', controlHiking, noThankYouHiking),
    ('Yes vs control t-test', controlHiking, yesUpgradeHiking),
    ('Yes vs No t-test', noThankYouHiking, yesUpgradeHiking),
]
for heading, firstSample, secondSample in hikingComparisons:
    print(heading)
    print(stats.ttest_ind(firstSample, secondSample))