In [7]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycodamath as coda
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [8]:
print('\nExercise 8.4')

# Load the protein table (semicolon-separated) with one row per country.
data = pd.read_csv('../data/protein.csv', sep=';', index_col='Country')
# Five-part composition used as the response.
response = data[['Fish', 'Milk', 'Eggs', 'Red meat', 'White meat']]
# The last two columns of the table are kept as covariates; only 'label' is used below.
covariates = data[data.columns[-2:]]

# Sign matrix with one row per balance; columns follow the order of `response`:
#   row 0: Fish, Milk, Eggs, Red meat  vs  White meat
#   row 1: Fish, Milk, Eggs            vs  Red meat
#   row 2: Fish, Milk                  vs  Eggs
#   row 3: Fish                        vs  Milk
balances = [[1, 1, 1, 1, -1], [1, 1, 1, -1, 0], [1, 1, -1, 0, 0], [1, -1, 0, 0, 0]]
# NOTE(review): presumably turns the sign matrix into a normalised basis -- confirm in pycodamath.
psi = coda.extra.norm(balances)
ilr = response.coda.ilr(psi)
ilr = ilr.rename(columns={0: 'ilr0', 1: 'ilr1', 2: 'ilr2', 3: 'ilr3'})
# Group label, truncated to its first five characters.
ilr['covariates'] = covariates['label'].str[:5]
print(ilr)

# Regress every ilr coordinate on the group label and keep the coefficient at
# position 1 of the fitted parameters (the entry right after the first one,
# which statsmodels uses for the intercept by default).
# `.iloc[1]` makes the positional access explicit: `params` is a label-indexed
# Series, and `params[1]` relies on the deprecated integer-as-position fallback
# that emits a FutureWarning.
beta = [ols(part + '~ covariates', data=ilr).fit().params.iloc[1] for part in ilr.columns[:-1]]
# One-row frame: index 'beta', one column per ilr coordinate.
beta = pd.DataFrame(beta, columns=['beta'], index=ilr.columns[:-1]).T
# Multiply the ilr coefficients by the basis to get one coefficient per original part.
clrbeta = pd.DataFrame(np.dot(beta, psi), index=['clrbeta'], columns=response.columns)
print(clrbeta.iloc[0].sort_values())
# New grouping for the balances below: red meat on its own, fish with milk, and eggs with white meat.

# Second sign matrix (columns: Fish, Milk, Eggs, Red meat, White meat):
#   row 0: Fish, Milk, Eggs, White meat  vs  Red meat
#   row 1: Eggs, White meat              vs  Fish, Milk
#   row 2: Milk                          vs  Fish
#   row 3: White meat                    vs  Eggs
balances = [[1, 1, 1, -1, 1], [-1, -1, 1, 0, 1], [-1, 1, 0, 0, 0], [0, 0, -1, 0, 1]]
# NOTE(review): presumably turns the sign matrix into a normalised basis -- confirm in pycodamath.
psi = coda.extra.norm(balances)
ilr = response.coda.ilr(psi)
ilr = ilr.rename(columns={0: 'ilr0', 1: 'ilr1', 2: 'ilr2', 3: 'ilr3'})
# Group label, truncated to its first five characters.
ilr['covariates'] = covariates['label'].str[:5]

# Fit one OLS model per ilr coordinate and report the group effect.
for part in ilr.columns[:-1]:
    model = ols(part+' ~ covariates', data=ilr).fit()
    print(part)
    # `.iloc[1]` is explicit positional access; `params[1]` / `pvalues[1]` on these
    # label-indexed Series rely on a deprecated fallback and emit a FutureWarning.
    print('beta', model.params.iloc[1].round(2))
    # sqrt(F) only gives the magnitude of t (the sign is lost), and only matches
    # |t| when the model has a single slope term -- TODO confirm that 'covariates'
    # has exactly two levels.
    print('t value', np.sqrt(model.fvalue).round(2))
    print('p value', model.pvalues.iloc[1].round(4))
    print()

# Next, relabel the samples and run the analysis again to look for a significant split.
# For example, try labeling the Nordic countries as "North" and everything else as "South".


Exercise 8.4
                    ilr0      ilr1      ilr2      ilr3 covariates
Country                                                          
Albania         0.190091 -2.036355  0.801354 -2.683816      South
Austria        -0.710825 -0.394585  0.332897 -1.590129      North
Belgium        -0.120771 -0.586243  0.630460 -0.960338      North
Bulgaria       -0.524209 -0.979703  0.554634 -1.367498      South
Czechoslovakia -0.718635 -0.741284  0.473420 -1.295831      North
Denmark        -0.075487 -0.075869  1.181775 -0.655022      North
West Germany   -0.508507 -0.283773  0.602851 -0.509503      North
Finland         0.483652 -0.140085  1.342668 -1.244253      North
France         -0.083845 -0.798566  0.948375 -0.869705      South
Greece          0.805076 -0.373746  1.054755 -0.772830      South
Hungary        -1.402063 -0.828572 -0.433261 -2.457973      North
Ireland        -0.221831 -0.666629  0.385267 -1.740838      North
Italy           0.131066 -0.486646  0.698819 -0.985438      So

  beta = [ols(part + '~ covariates', data=ilr).fit().params[1] for part in ilr.columns[:-1]]
  beta = [ols(part + '~ covariates', data=ilr).fit().params[1] for part in ilr.columns[:-1]]
  beta = [ols(part + '~ covariates', data=ilr).fit().params[1] for part in ilr.columns[:-1]]
  beta = [ols(part + '~ covariates', data=ilr).fit().params[1] for part in ilr.columns[:-1]]
  print('beta', model.params[1].round(2))
  print('p value', model.pvalues[1].round(4))
  print('beta', model.params[1].round(2))
  print('p value', model.pvalues[1].round(4))
  print('beta', model.params[1].round(2))
  print('p value', model.pvalues[1].round(4))
  print('beta', model.params[1].round(2))
  print('p value', model.pvalues[1].round(4))
