In [1]:
# https://towardsdatascience.com/data-driven-growth-with-python-part-1-know-your-metrics-812781e66a5b
from __future__ import division
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb


# Load the raw Online Retail transactions from the project's data directory.
# The file is decoded with the 'unicode_escape' codec.
tx_data = pd.read_csv(
    '../data/OnlineRetail.csv',
    encoding='unicode_escape',
)

In [2]:
# Preview the first five raw transaction rows.
tx_data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# Our "North Star" metric is Revenue = Active Customer Count * Order Count * Average Revenue per Order.

In [4]:
# Basic cleaning and feature engineering on the raw transactions.

# Parse InvoiceDate from strings (e.g. '12/1/2010 8:26') into datetimes.
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])

# Integer YYYYMM key (e.g. 201012) for easy monthly reporting and plotting.
# Built with the vectorized .dt accessor instead of a per-row Python lambda.
tx_data['InvoiceYearMonth'] = (
    100 * tx_data['InvoiceDate'].dt.year + tx_data['InvoiceDate'].dt.month
)

# Revenue of each row = unit price * quantity.
tx_data['Revenue'] = tx_data['UnitPrice'] * tx_data['Quantity']

# Total revenue per month, as a flat frame with columns
# InvoiceYearMonth and Revenue.
tx_revenue = tx_data.groupby(['InvoiceYearMonth'])['Revenue'].sum().reset_index()

In [5]:
# Preview the first five months of aggregated revenue.
tx_revenue.head(n=5)

Unnamed: 0,InvoiceYearMonth,Revenue
0,201012,748957.02
1,201101,560000.26
2,201102,498062.65
3,201103,683267.08
4,201104,493207.121


In [6]:
# TODO: visualize the monthly revenue computed above.

In [7]:
# Monthly Revenue Growth Rate.
# pct_change() gives the fractional change of each month's revenue relative
# to the previous row; the first row has no predecessor and is NaN.
# Store the result as a new MonthlyGrowth column.
tx_revenue['MonthlyGrowth'] = tx_revenue['Revenue'].pct_change(periods=1)

In [None]:
# Monthly Active Customers: the goal is to see which months had the most customers.

# Work on UK transactions only, with the row index renumbered from 0.
tx_uk = tx_data.loc[tx_data['Country'] == 'United Kingdom'].reset_index(drop=True)


# TODO: add a visualization for this step.

# Monthly aggregates over the UK transactions, each keyed by InvoiceYearMonth.

# Monthly Active Customers: number of distinct CustomerID values per month.
tx_monthly_active = tx_uk.groupby('InvoiceYearMonth')['CustomerID'].agg('nunique').reset_index()

# Monthly Order Count, computed here as the total Quantity per month.
# NOTE(review): this sums units sold rather than counting distinct orders — confirm this is intended.
tx_monthly_sales = tx_uk.groupby('InvoiceYearMonth')['Quantity'].agg('sum').reset_index()

# Average Revenue Per Order, computed here as the mean Revenue of the rows in each month.
# NOTE(review): the mean is taken over individual rows, not over whole orders — confirm this is intended.
tx_monthly_order_avg = tx_uk.groupby('InvoiceYearMonth')['Revenue'].agg('mean').reset_index()

# New Customer Ratio: label every UK transaction by whether it falls in the
# customer's first purchase month ('New') or in a later month ('Existing').

# First purchase timestamp per customer (columns: CustomerID, MinPurchaseDate).
tx_min_purchase = (
    tx_uk.groupby('CustomerID')['InvoiceDate'].min()
    .reset_index()
    .rename(columns={'InvoiceDate': 'MinPurchaseDate'})
)

# Integer YYYYMM key of the first purchase (100 * year + month), so it can be
# compared directly with InvoiceYearMonth. Uses the vectorized .dt accessor
# instead of a per-row Python lambda.
tx_min_purchase['MinPurchaseYearMonth'] = (
    100 * tx_min_purchase['MinPurchaseDate'].dt.year
    + tx_min_purchase['MinPurchaseDate'].dt.month
)

# Attach each customer's first purchase date/month to their transactions.
# This is pd.merge's default inner join, so only rows whose CustomerID is
# present in tx_min_purchase are kept.
tx_uk = pd.merge(tx_uk, tx_min_purchase, on='CustomerID')

# Create the UserType column: default every row to 'New', then overwrite with
# 'Existing' wherever the invoice month is later than the first purchase month.
tx_uk['UserType'] = 'New'
tx_uk.loc[tx_uk['InvoiceYearMonth'] > tx_uk['MinPurchaseYearMonth'], 'UserType'] = 'Existing'
