In [2]:
# Common packages
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import seaborn as sns

# Chapter-specific packages
import random # For functions sample() and shuffle()
# To rescale numeric variables
from sklearn.preprocessing import MinMaxScaler
# To one-hot encode cat. variables
from sklearn.preprocessing import OneHotEncoder

In [12]:

##### Loading the data #####
hist_data_df = pd.read_csv('chap9-historical_data.csv')
exp_data_df = pd.read_csv('chap9-experimental_data.csv')

### Minor data formatting

# Reformating categorical and id variables
hist_data_df['tier'] = pd.Categorical(hist_data_df.tier, categories=[3,2,1], ordered = True)
hist_data_df['ID'] = hist_data_df.ID.astype(str)
exp_data_df['tier'] = pd.Categorical(exp_data_df.tier, categories=[3,2,1], ordered = True)
exp_data_df['ID'] = exp_data_df.ID.astype(str)


In [4]:

def no_strat_assgnt(df, Nexp, k):
    temp = pd.DataFrame({'ID': df.ID.unique()})
    temp = temp.sample(Nexp)
    grp = list(range(k)) * int(Nexp / k)
    random.shuffle(grp)
    temp['grp'] = grp
    return temp

no_strat_assgnt(hist_data_df, 2000, k=4)

Unnamed: 0,ID,grp
1669,1670,2
1460,1461,1
4598,4599,3
668,669,3
3388,3389,0
...,...,...
3722,3723,1
938,939,0
2467,2468,3
4565,4566,2


In [14]:
hist_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   ID          175000 non-null  object  
 1   period      175000 non-null  int64   
 2   month       175000 non-null  int64   
 3   sq_ft       175000 non-null  float64 
 4   tier        175000 non-null  category
 5   avg_review  175000 non-null  float64 
 6   BPday       175000 non-null  float64 
dtypes: category(1), float64(3), int64(2), object(1)
memory usage: 8.2+ MB


In [25]:
def strat_prep_fun(df):
    temp = df.copy()

    temp = temp.groupby(['ID','tier'], observed=False).agg(
        sq_ft = ('sq_ft','mean'),
        avg_review = ('avg_review','mean'),
        BPday = ('BPday','mean')
    )
    temp = temp.dropna().reset_index()

    num_df = temp.copy().select_dtypes('float64')
    cat_df = temp.copy().select_dtypes('category')

    scaler = MinMaxScaler()
    scaler.fit(num_df)
    num_np = scaler.transform(num_df)
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(cat_df)
    cat_np = enc.transform(cat_df).toarray()

    data_np = np.concatenate((num_df, cat_np), axis=1)
    del num_df, num_np, cat_df, enc, scaler
    return data_np

prepared = strat_prep_fun(hist_data_df)

prepared

array([[821.67548629,   9.39342726,  45.93411704,   0.        ,
          1.        ,   0.        ],
       [977.68632137,  10.        ,  47.02031206,   0.        ,
          1.        ,   0.        ],
       [772.24643725,   5.05391337,  36.03493243,   1.        ,
          0.        ,   0.        ],
       ...,
       [931.23973925,   7.82448952,  42.00859311,   0.        ,
          0.        ,   1.        ],
       [792.20446113,   4.60368051,  32.83121812,   0.        ,
          0.        ,   1.        ],
       [859.06503328,   7.91434225,  42.09540755,   0.        ,
          0.        ,   1.        ]])