In [1]:
import numpy as np
import math
import copy
from   more_itertools import powerset
from operator import itemgetter
#import pandas as pd
from   ucimlrepo import fetch_ucirepo
from   sklearn.model_selection import train_test_split


In [2]:
# Load and split the data
wine_quality = fetch_ucirepo(id=186)
wine_subset  = wine_quality['data']['original'][wine_quality['data']['original']['color'] == 'white']
X = np.array(wine_subset[['fixed_acidity', 'volatile_acidity', 'citric_acid','residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']])
y = np.array(wine_subset['quality'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
player_names = ['fixed_acidity', 'volatile_acidity', 'citric_acid','residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']
N = set([i for i in range(X.shape[1])])

In [4]:
# Standardise the training data
X_mu, X_sigma, y_mu, y_sigma = X_train.mean(axis=0), X_train.std(axis=0), y_train.mean(axis=0), y_train.std(axis=0)

X_train = (X_train - X_mu) / X_sigma
y_train = (y_train - y_mu) / y_sigma

X_test = (X_test - X_mu) / X_sigma

$$X^TXw = X^Ty$$ 
$$w=(X^TX)^{-1}X^Ty$$

In [5]:
w = np.matmul(np.matmul(np.linalg.inv(np.matmul(X_train.T, X_train)), X_train.T), y_train)

$$R^2  = \frac{1}{N}w^TX^TXw$$
$$R^2_i = \frac{1}{N}w_i (X^TXw)$$

In [6]:
R2  = np.matmul(w.T,np.matmul(np.matmul(X_train.T, X_train), w))/len(y_train)
R2i = np.matmul(np.diag(w),np.matmul(np.matmul(X_train.T, X_train), w))/len(y_train)
print(R2)
print(R2i) 

0.2843126626414035
[-5.15764443e-03  4.48079867e-02  1.16066061e-04 -3.99818409e-02
  1.29593436e-04  2.77336079e-03  1.86662385e-03  1.27245440e-01
  1.04933860e-02  5.15467153e-03  1.36865019e-01]


$$\hat{y} = Xw$$
$$MSE = \frac{1}{N}\sum(y-\hat{y})^2$$

In [7]:
yh  = np.matmul(X_test, w)
MSE = np.matmul((y_test - yh).T, (y_test - yh)) / len(y_test)

In [8]:
w

array([ 0.04371106, -0.2193717 , -0.00827766,  0.41280525, -0.00065067,
        0.09626305, -0.01148575, -0.42336479,  0.10171807,  0.08312901,
        0.3170512 ])

In [11]:
def predict(X_vector):
    y_standard = np.matmul(X_vector, w)
    y_actual   = (y_standard * y_sigma) + y_mu
    return y_actual

#E_f = np.mean(predict(X_test)) # should be same as sampling?
E_f = predict(np.mean(X_test, axis=0))

def value(X_vector, S, X_sample):
    X_local = copy.deepcopy(X_sample)
    X_local[:, list(S)] = X_vector[list(S)]
    y_actual = predict(X_local)
    return np.mean(y_actual) - E_f

def sample_v1(X_dist, sample_size):
    s_idx    = np.random.randint(low=0, high=len(X_dist), size=sample_size)
    return   X_dist[s_idx]

def sample_v2(X_dist, sample_size):
    return np.array([X_dist[np.random.randint(low=0, high=len(X_dist), size=sample_size),j] for j in N] ).transpose()

def sample_v3(X_dist, dummy):
    return np.mean(X_dist, axis=0).reshape(1,11)
    
   
def marginal(X_vector, j, S, sample_size):
    X_sample = sample_v3(X_test, sample_size)
    return value(X_vector, S.union(j), X_sample) - value(X_vector, S, X_sample)

def gamma(N,S):
    return math.factorial(len(S)) * math.factorial(len(N) - len(S) - 1) / math.factorial(len(N))

def phi(X_Vector, j, N, sample_size):
    players = N - j
    return np.sum([gamma(N, S) * marginal(X_vector, j, set(S), sample_size) for S in powerset(players)])


In [12]:
X_vector = X_test[0]
phi_T = E_f
phi_i = [phi(X_vector,{player}, N,1000) * y_sigma for player in N]
phi_i_sorted = sorted(enumerate(phi_i), key=itemgetter(1))
phi_i_sorted.reverse()
for player, p_i in phi_i_sorted:
    print(player_names[player].ljust(20), round(p_i,3))
print('Total phi:    ', round(sum(phi_i),3))
print('Total Reward: ', round(E_f + sum(phi_i),3) )
print('Expected:     ', round(E_f,3))
print('Predicted:    ', round(predict(X_vector),3))
print('Delta:        ', round(predict(X_vector) - (E_f + sum(phi_i)),3)  )


residual_sugar       0.293
alcohol              0.087
free_sulfur_dioxide  0.086
sulphates            0.057
density              0.017
chlorides            -0.0
total_sulfur_dioxide -0.002
citric_acid          -0.004
volatile_acidity     -0.027
fixed_acidity        -0.033
pH                   -0.05
Total phi:     0.424
Total Reward:  6.318
Expected:      5.893
Predicted:     6.372
Delta:         0.054


In [90]:
z= np.mean(X_test, axis=0).reshape(1,11)
z


array([[-0.06071995, -0.05394598,  0.06096112, -0.05766376,  0.00878483,
         0.06399892,  0.04271784, -0.07157771, -0.03416091,  0.00292092,
         0.02209133]])

In [38]:
np.array([X_test[np.random.randint(low=0, high=len(X_test), size=100),j] for j in N] ).transpose()

array([[-1.02448031e+00,  2.03383533e-01, -5.23883069e-01, ...,
        -5.94636337e-01,  2.02701757e+00, -3.43509781e-02],
       [-1.02448031e+00, -1.17466521e+00, -6.07395717e-01, ...,
        -5.28042506e-01,  1.93263316e-03,  4.00055080e-01],
       [-7.87618831e-01, -9.19126252e-02,  1.44218115e-01, ...,
        -1.28479522e-01,  9.70451518e-01,  1.86617554e+00],
       ...,
       [ 2.17314967e+00,  2.03383533e-01,  9.79344596e-01, ...,
        -1.26057464e+00, -3.50256052e-01,  2.11052895e+00],
       [-7.70343903e-02, -1.90344678e-01, -7.74421013e-01, ...,
        -1.28479522e-01,  6.18262833e-01, -1.06606538e+00],
       [-5.50757351e-01, -9.19126252e-02, -5.23883069e-01, ...,
         1.53636625e+00,  8.99798045e-02,  7.42505344e-02]])

In [44]:
sample_old(X_test,20)

array([[ 9.88842271e-01, -4.26965987e-02, -2.73345125e-01,
        -1.06072546e+00, -3.08977247e-01, -9.65201063e-01,
        -1.49780567e+00, -6.85348457e-01, -1.19398081e+00,
         6.18262833e-01,  1.55701671e-01],
       [ 2.78257830e-01,  3.01815586e-01,  1.14636989e+00,
         1.39127817e+00,  4.71031035e-01,  1.13377087e+00,
         1.40265402e+00,  1.23435272e+00, -5.94636337e-01,
         1.93263316e-03, -9.84614239e-01],
       [-5.50757351e-01,  3.01815586e-01, -5.23883069e-01,
         1.84736700e-01,  1.49851154e-01,  7.73947112e-01,
         7.36974747e-01, -2.78239415e-01,  5.37458786e-01,
        -3.50256052e-01,  4.81506217e-01],
       [-7.87618831e-01,  2.03383533e-01, -1.06319829e-01,
        -1.00234442e+00,  3.79265355e-01, -4.85436049e-01,
         1.06981438e+00, -4.66899703e-01,  7.37240278e-01,
        -5.26350395e-01, -3.33005148e-01],
       [-4.32326611e-01, -1.61760944e+00, -3.56857773e-01,
        -8.75852172e-01, -3.54860087e-01,  2.27321278e+00,
  

In [45]:
sample(X_test,20)