In [None]:
import pandas as pd
import numpy as np
import pymc3 as pm
import arviz as az
from arviz.plots.plot_utils import xarray_var_iter
import theano
import theano.tensor as tt
from sklearn.preprocessing import PowerTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr

%matplotlib inline

RANDOM_SEED = 42

In [None]:
#Load the raw StockX sales export; the path is relative to the notebook's working directory
stockx_data = pd.read_csv('StockX-Data-Contest-2019-3.csv')

In [None]:
#Seeing how many rows/columns we're working with (prints the (rows, columns) tuple)
print(stockx_data.shape)

In [None]:
#Getting first glimpse at dataset: show the first 10 rows
stockx_data.head(10)

In [None]:
#Count missing values in each column
stockx_data.isna().sum()

In [None]:
#Inspect the dtype of every column before any cleaning is applied
stockx_data.dtypes

In [None]:
def cleanCurrency(series):
    """
    Convert a series of USD-formatted strings (e.g. '$1,097') to floats.

    Input: a dataframe series for currency (USD), holding strings
    Output: cleaned series as a float
    Raises: ValueError if a value is still non-numeric after the dollar
            sign and thousands separators are removed
    """
    #Replaces dollar sign and comma with empty string.
    #regex=False is required: as a regular expression '$' is the end-of-string
    #anchor, so a regex replace would leave the dollar sign in place and the
    #float conversion below would fail.
    series = series.str.replace('$', '', regex=False)
    series = series.str.replace(',', '', regex=False)
    series = series.astype(float)
    return series

In [None]:
#Strip currency formatting so both price columns hold plain floats
for price_col in ('Sale Price', 'Retail Price'):
    stockx_data[price_col] = cleanCurrency(stockx_data[price_col])

#Profit is the resale (Sale) price minus the Retail price
stockx_data['Profit'] = stockx_data['Sale Price'].sub(stockx_data['Retail Price'])

In [None]:
#Cleaning Sneaker Name column by removing hyphens (literal match, no regex needed)
stockx_data['Sneaker Name'] = stockx_data['Sneaker Name'].str.replace('-', '', regex=False)

#Making brand name a little more descriptive.
#The negative lookahead skips values that already carry the suffix, so re-running
#this cell cannot produce 'Yeezy (Adidas) (Adidas)', which would silently empty
#the later equality filters on Brand.
stockx_data['Brand'] = stockx_data['Brand'].str.replace(r'Yeezy(?! \(Adidas\))', 'Yeezy (Adidas)', regex=True)
stockx_data['Brand'] = stockx_data['Brand'].str.replace(r'Off-White(?! \(Nike\))', 'Off-White (Nike)', regex=True)

#Getting only 'object' dtype columns and then stripping leading/trailing white spaces
stockx_data_obj = stockx_data.select_dtypes(['object'])
stockx_data[stockx_data_obj.columns] = stockx_data_obj.apply(lambda x: x.str.strip())

In [None]:
#Ensure date fields are the right dtype
stockx_data['Order Date'] = pd.to_datetime(stockx_data['Order Date'])
stockx_data['Release Date'] = pd.to_datetime(stockx_data['Release Date'])

#Calculate Duration of Days from Release Date to Order Date.
#Subtracting two datetime columns yields Timedelta values; .dt.days converts them
#to a whole-day count so the column is numeric and matches its '(days)' label.
stockx_data['Duration (days)'] = (stockx_data['Order Date'] - stockx_data['Release Date']).dt.days

In [None]:
#Univariate stats on the numeric measures (summary table from DataFrame.describe)
stockx_data.describe()

In [None]:
#Earliest and latest order dates covered by the data
print(stockx_data['Order Date'].min(), stockx_data['Order Date'].max())

In [None]:
#Fit a power transform on Profit and keep the result next to the raw column.
#The values are reshaped into a single-column 2-D array before being handed to the transformer.
transformer = PowerTransformer()
profit_2d = stockx_data['Profit'].values.reshape(-1, 1)
stockx_data['Profit_transformed'] = transformer.fit_transform(profit_2d)

In [None]:
#Count sales per Brand (largest first) and draw the counts as a bar chart
brand_group = (
    stockx_data.groupby('Brand')
    .size()
    .reset_index(name='values')
    .sort_values('values', ascending=False)
)
brand_group['values'] = brand_group['values'].astype(int)
plt.figure(figsize=(16, 8))
plt.rcParams.update({'font.size': 10})
ax = sns.barplot(x='Brand', y='values', data=brand_group, hue='Brand', palette="icefire")
ax.set(ylabel='Number of Shoes')
plt.show()

In [None]:
# plt.figure(figsize=(16, 8))
# plt.rcParams.update({'font.size': 10})
# ax = sns.boxplot(x=brand_group['Brand'], y=brand_group['values'], data=brand_group, hue=['Brand'], palette="icefire")
# ax.set(ylabel='Number of Shoes')
# plt.show()

In [None]:
#Count sales per (Brand, Sneaker Name) pair to see how sales are distributed across models
sneaker_group = (
    stockx_data.groupby(['Brand', 'Sneaker Name'])
    .size()
    .reset_index(name='values')
    .sort_values('values', ascending=False)
)
sneaker_group['values'] = sneaker_group['values'].astype(int)
plt.figure(figsize=(20, 8))
plt.rcParams.update({'font.size': 10})
ax = sns.barplot(x='Sneaker Name', y='values', data=sneaker_group, hue='Brand', palette="icefire")
ax.set(ylabel='Number of Shoes')
#Sneaker names are long, so the tick labels are turned vertical
plt.xticks(rotation=90)
plt.show()

In [None]:
#Ten best-selling (Brand, Sneaker Name) pairs; the frame is already sorted by count
sneaker_group.head(10)

In [None]:
#Count sales per (Buyer Region, Brand) pair to see how sales are distributed geographically
region_group = (
    stockx_data.groupby(['Buyer Region', 'Brand'])
    .size()
    .reset_index(name='values')
    .sort_values(by='values', ascending=False)
)
region_group['values'] = region_group['values'].astype(int)
plt.figure(figsize=(20, 8))
plt.rcParams.update({'font.size': 10})
ax = sns.barplot(x='Buyer Region', y='values', data=region_group, hue='Brand', palette="icefire")
ax.set(ylabel='Number of Shoes')
#Region names are turned vertical so they do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
#Ten largest (Buyer Region, Brand) counts; the frame is already sorted by count
region_group.head(10)

In [None]:
#Sales per (Order Date, Brand) pair, used for the top panel
order_group = (
    stockx_data.groupby(['Order Date', 'Brand'])
    .size()
    .reset_index(name='values')
    .sort_values(by='values', ascending=False)
)
order_group['values'] = order_group['values'].astype(int)

#Top panel: sales volume over time; bottom panel: Profit over time, both split by Brand
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 12))
plt.rcParams.update({'font.size': 10})
ax1 = sns.lineplot(x='Order Date', y='values', data=order_group, hue='Brand', palette="icefire", ax=ax1)
ax2 = sns.lineplot(x='Order Date', y='Profit', data=stockx_data, hue='Brand', palette="icefire", ax=ax2)
ax1.set(ylabel='Number of Shoes')
plt.show()

In [None]:
#Pairwise relationships across stockx_data columns, coloured by Brand, with KDEs on the diagonal
sns.pairplot(stockx_data, diag_kind='kde', hue='Brand', palette='icefire'); 

In [None]:
#Split the sales into one frame per brand for the group comparison that follows
is_yeezy = stockx_data['Brand'].eq('Yeezy (Adidas)')
is_offwhite = stockx_data['Brand'].eq('Off-White (Nike)')
yeezy = stockx_data.loc[is_yeezy]
offwhite = stockx_data.loc[is_offwhite]

In [None]:
#Side-by-side histograms (with KDE overlay) of raw Profit, one panel per brand
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

profit_panels = (
    (ax1, yeezy, 'midnightblue', 'Profit Distribution for Yeezy (Adidas) Products'),
    (ax2, offwhite, 'maroon', 'Profit Distribution for Off-White (Nike) Products'),
)
for panel, frame, shade, heading in profit_panels:
    #The raw array (not the Series) is passed, so no axis label is taken from the column name
    sns.histplot(frame['Profit'].values, kde=True, color=shade, ax=panel)
    panel.set_title(heading)

plt.show()

In [None]:
#Side-by-side histograms (with KDE overlay) of Retail Price, one panel per brand
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

retail_panels = (
    (ax1, yeezy, 'midnightblue', 'Retail Price Distribution for Yeezy (Adidas) Products'),
    (ax2, offwhite, 'maroon', 'Retail Price Distribution for Off-White (Nike) Products'),
)
for panel, frame, shade, heading in retail_panels:
    sns.histplot(frame['Retail Price'], kde=True, color=shade, ax=panel)
    panel.set_title(heading)

plt.show()

In [None]:
#Side-by-side histograms (with KDE overlay) of the power-transformed Profit, one panel per brand
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

transformed_panels = (
    (ax1, yeezy, 'midnightblue', 'Profit Distribution for Yeezy (Adidas) Products (transformed)'),
    (ax2, offwhite, 'maroon', 'Profit Distribution for Off-White (Nike) Products (transformed)'),
)
for panel, frame, shade, heading in transformed_panels:
    sns.histplot(frame['Profit_transformed'], kde=True, color=shade, ax=panel)
    panel.set_title(heading)

plt.show()

In [None]:
#Observed mean Profit per brand, displayed as a (yeezy, offwhite) tuple
yeezy['Profit'].mean(), offwhite['Profit'].mean()

In [None]:
#Observed mean of the power-transformed Profit per brand, displayed as a (yeezy, offwhite) tuple
yeezy['Profit_transformed'].mean(), offwhite['Profit_transformed'].mean()

In [None]:
#One Student-T likelihood per brand, each fitted to that brand's raw Profit column.
#Each group gets its own location (mu), scale (sigma) and degrees-of-freedom (nu) prior.
#NOTE(review): the Uniform bounds on mu (0-1000 and 0-500) and HalfNormal(sd=10) on sigma
#are hard-coded -- confirm they are wide enough for dollar-valued Profit.
with pm.Model() as model:
    #Off-White group
    mu_offwhite = pm.Uniform('mu_offwhite', 0, 1000)
    sigma_offwhite = pm.HalfNormal('sigma_offwhite', sd=10)
    nu_offwhite = pm.Exponential('nu_offwhite', 1/30)
    y_offwhite = pm.StudentT('y_offwhite', mu=mu_offwhite, sd=sigma_offwhite, nu=nu_offwhite,
                             observed=offwhite['Profit'])

    #Yeezy group
    mu_yeezy = pm.Uniform('mu_yeezy', 0, 500)
    sigma_yeezy = pm.HalfNormal('sigma_yeezy', sd=10)
    nu_yeezy = pm.Exponential('nu_yeezy', 1/30)
    y_yeezy = pm.StudentT('y_yeezy', mu=mu_yeezy, sd=sigma_yeezy, nu=nu_yeezy,
                        observed=yeezy['Profit'])
    
#Render the model graph as the cell output
pm.model_to_graphviz(model)

In [None]:
#Draw posterior samples: 10000 kept draws after 2000 tuning steps, seeded for reproducibility.
#A MAP estimate is deliberately not computed here: pm.sample is not given a `start`
#value, so the optimisation result would go unused and only add run time.
with model:
    trace = pm.sample(10000, tune=2000, target_accept=.9, return_inferencedata=True, random_seed=RANDOM_SEED)

In [None]:
#Trace plots for the location/scale parameters, with each posterior mean passed as a reference line
var_names = ['mu_offwhite', 'sigma_offwhite', 'mu_yeezy', 'sigma_yeezy']
posterior_means = trace.posterior[var_names].mean(dim=("chain", "draw"))
lines = list(xarray_var_iter(posterior_means))
az.plot_trace(trace, var_names=var_names, lines=lines);

In [None]:
#Same trace plot for the two degrees-of-freedom parameters
nu_names = ['nu_offwhite', 'nu_yeezy']
nu_means = trace.posterior[nu_names].mean(dim=("chain", "draw"))
lines = list(xarray_var_iter(nu_means))
az.plot_trace(trace, var_names=nu_names, lines=lines);

In [None]:
#Draw samples from the priors (seeded) and wrap them in an ArviZ InferenceData
#object so the prior plots below can read idata_prior.prior[...]
with model:
    prior_checks = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    idata_prior = az.from_pymc3(prior=prior_checks)

In [None]:
#KDE (with rug and quartile markers) of the prior draws for each group's mean
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,8))

mu_panels = (
    (ax1, 'mu_yeezy', 'midnightblue', 'Prior Distribution for mu_yeezy'),
    (ax2, 'mu_offwhite', 'maroon', 'Prior Distribution for mu_offwhite'),
)
for panel, prior_name, shade, heading in mu_panels:
    az.plot_dist(idata_prior.prior[prior_name], kind='kde', rug=True,
                 quantiles=[.25, .5, .75], color=shade, ax=panel)
    panel.set_title(heading)

plt.show()

In [None]:
#KDE (with rug and quartile markers) of the prior draws for each group's scale
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,8))

sigma_panels = (
    (ax1, 'sigma_yeezy', 'midnightblue', 'Prior Distribution for sigma_yeezy'),
    (ax2, 'sigma_offwhite', 'maroon', 'Prior Distribution for sigma_offwhite'),
)
for panel, prior_name, shade, heading in sigma_panels:
    az.plot_dist(idata_prior.prior[prior_name], kind='kde', rug=True,
                 quantiles=[.25, .5, .75], color=shade, ax=panel)
    panel.set_title(heading)

plt.show()

In [None]:
#Summary table for every variable in the trace
az.summary(trace)

In [None]:
#Posterior-predictive draws for the parameters in var_names plus both observed variables
ppc_vars = var_names + ['y_offwhite', 'y_yeezy']
with model:
    ppc = pm.sample_posterior_predictive(trace, var_names=ppc_vars, random_seed=RANDOM_SEED)


In [None]:
#Posterior-predictive check plot, built from the ppc draws wrapped as InferenceData
az.plot_ppc(az.from_pymc3(posterior_predictive=ppc, model=model));

In [None]:
#Histogram of the mean of every posterior-predictive draw, per brand,
#with the observed mean Profit marked by a red vertical line
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

ppc_panels = (
    (ax1, 'y_yeezy', yeezy, 'midnightblue', 'Posterior Predictive Check for y_yeezy'),
    (ax2, 'y_offwhite', offwhite, 'maroon', 'Posterior Predictive Check for y_offwhite'),
)
for ax, rv_name, frame, shade, heading in ppc_panels:
    draw_means = [draw.mean() for draw in ppc[rv_name]]
    ax.hist(draw_means, bins=19, alpha=0.5, color=shade)
    ax.axvline(frame['Profit'].mean(), color='r')
    ax.set(xlabel='x', ylabel='')
    ax.set_title(heading)

In [None]:
#Observed standard deviation of Yeezy Profit
yeezy['Profit'].std()

In [None]:
#Derived comparison quantities, registered on the existing model.
#NOTE(review): the effect size divides by a pooled standard deviation computed from the
#observed Profit columns, not from the sigma_* parameters -- confirm this is intended.
pooled_sd = np.sqrt((yeezy['Profit'].std() ** 2 + offwhite['Profit'].std() ** 2) / 2)
with model:
    diff_means = pm.Deterministic('means difference', mu_yeezy - mu_offwhite)
    diff_std = pm.Deterministic('stds difference', sigma_yeezy - sigma_offwhite)
    effect_size = pm.Deterministic('effect size (cohens d)', diff_means / pooled_sd)
    

In [None]:
#Sample again now that the Deterministic variables are on the model.
#This rebinds `trace`, replacing the earlier 10000-draw result.
with model:
    trace = pm.sample(5000, return_inferencedata=True, random_seed=RANDOM_SEED)

In [None]:
#Posterior plots for each group's mean and scale parameter
pm.plot_posterior(trace, var_names=['mu_yeezy', 'mu_offwhite', 'sigma_yeezy', 'sigma_offwhite'], color='#87ceeb');

In [None]:
#Posterior plots for the derived comparison quantities, with 0 as the reference value
pm.plot_posterior(trace, var_names=['means difference', 'stds difference', 'effect size (cohens d)'], ref_val=0, color='#87ceeb');

In [None]:
#Summary table restricted to the derived comparison quantities
az.summary(trace, var_names=['means difference', 'stds difference', 'effect size (cohens d)'])