# Non-Bayesian recommendation list analysis

This notebook contains non-Bayesian analyses of the recommendation list inputs and outputs, for comparison purposes and also to analyze the control bias.

## Setup

In [1]:
import pickle

In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.special import logit, expit

In [4]:
with open('data/profile-data.pkl', 'rb') as f:
    profiles = pickle.load(f)
profiles.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5000 entries, (AZ, 34891) to (GR-I, 875836)
Data columns (total 11 columns):
count         5000 non-null int32
linked        5000 non-null int32
ambiguous     5000 non-null int32
male          5000 non-null int32
female        5000 non-null int32
dcknown       5000 non-null int32
dcyes         5000 non-null int32
PropDC        4980 non-null float64
Known         5000 non-null int32
PropFemale    5000 non-null float64
PropKnown     5000 non-null float64
dtypes: float64(3), int32(8)
memory usage: 486.9+ KB


In [5]:
with open('data/rec-data.pkl', 'rb') as f:
    recs = pickle.load(f)
recs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 18847 entries, (AZ, als, 34891) to (GR-I, wrls, 875836)
Data columns (total 11 columns):
ambiguous     18847 non-null int32
female        18847 non-null int32
male          18847 non-null int32
unknown       18847 non-null int32
Total         18847 non-null int64
Known         18847 non-null int32
PropKnown     18847 non-null float64
PropFemale    18771 non-null float64
dcknown       18847 non-null int64
dcyes         18847 non-null int32
PropDC        18769 non-null float64
dtypes: float64(3), int32(6), int64(2)
memory usage: 1.4+ MB


## Proportion Regression

In [54]:
def prop_regress(recs, key=None):
    if key is None:
        key = recs.name  # compatible with groupby
    else:
        recs = recs.loc[key]
    ds, algo = key
    prof = profiles.loc[ds]
    prof, recs = prof.align(recs)
    x = prof['PropFemale']
    y = recs['PropFemale']
    x = sm.add_constant(x)
    ols = sm.OLS(y, x).fit()
    return pd.DataFrame({
        'Slope': ols.params.loc['PropFemale'],
        'Intercept': ols.params.loc['const'],
        'R2': ols.rsquared,
        'Model': ols
    }, index=[0])

In [55]:
ols = prop_regress(recs, ('GR-I', 'wrls'))
ols

Unnamed: 0,Slope,Intercept,R2,Model
0,0.910866,0.029189,0.78157,<statsmodels.regression.linear_model.Regressio...


In [56]:
models = recs.groupby(['Set', 'Algorithm']).apply(prop_regress)
models

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Slope,Intercept,R2,Model
Set,Algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AZ,als,0,0.101738,0.328481,0.043429,<statsmodels.regression.linear_model.Regressio...
AZ,bpr-imp,0,0.622284,0.141704,0.447074,<statsmodels.regression.linear_model.Regressio...
AZ,item-item,0,,,,<statsmodels.regression.linear_model.Regressio...
AZ,item-item-imp,0,,,,<statsmodels.regression.linear_model.Regressio...
AZ,user-user,0,,,,<statsmodels.regression.linear_model.Regressio...
AZ,user-user-imp,0,,,,<statsmodels.regression.linear_model.Regressio...
AZ,wrls-imp,0,,,,<statsmodels.regression.linear_model.Regressio...
BX-E,item-item,0,0.127185,0.381523,0.06757,<statsmodels.regression.linear_model.Regressio...
BX-E,user-user,0,,,,<statsmodels.regression.linear_model.Regressio...
BX-I,bpr,0,0.824394,0.086882,0.60218,<statsmodels.regression.linear_model.Regressio...


## Log Odds Regression

In [74]:
def odds_regress(recs, key=None):
    if key is None:
        key = recs.name  # compatible with groupby
    else:
        recs = recs.loc[key]
    ds, algo = key
    prof = profiles.loc[ds]
    prof, recs = prof.align(recs)
    x = logit((prof['female'] + 1) / (prof['Known'] + 2))
    y = (recs['female'] + 1) / (recs['Known'] + 2)
    x = sm.add_constant(x)
    glm = sm.GLM(y, x, family=sm.families.Binomial()).fit()
    return glm

In [76]:
glm = odds_regress(recs, ('GR-I', 'wrls'))
glm.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,1000.0
Model:,GLM,Df Residuals:,998.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-395.56
Date:,"Fri, 16 Aug 2019",Deviance:,69.639
Time:,16:01:49,Pearson chi2:,68.9
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0804,0.072,-1.112,0.266,-0.222,0.061
0,0.8770,0.069,12.645,0.000,0.741,1.013


In [77]:
lo_models = recs.groupby(['Set', 'Algorithm']).apply(odds_regress)
lo_models

Set   Algorithm    
AZ    als              <statsmodels.genmod.generalized_linear_model.G...
      bpr-imp          <statsmodels.genmod.generalized_linear_model.G...
      item-item        <statsmodels.genmod.generalized_linear_model.G...
      item-item-imp    <statsmodels.genmod.generalized_linear_model.G...
      user-user        <statsmodels.genmod.generalized_linear_model.G...
      user-user-imp    <statsmodels.genmod.generalized_linear_model.G...
      wrls-imp         <statsmodels.genmod.generalized_linear_model.G...
BX-E  item-item        <statsmodels.genmod.generalized_linear_model.G...
      user-user        <statsmodels.genmod.generalized_linear_model.G...
BX-I  bpr              <statsmodels.genmod.generalized_linear_model.G...
      item-item        <statsmodels.genmod.generalized_linear_model.G...
      user-user        <statsmodels.genmod.generalized_linear_model.G...
      wrls             <statsmodels.genmod.generalized_linear_model.G...
GR-E  item-item        <statsmo