In [1]:
from scipy.stats import chi2_contingency
from scipy import stats
import scipy
import numpy as np
import random
from scipy.stats import norm
from sklearn.metrics.pairwise import rbf_kernel, linear_kernel, pairwise_kernels
from sklearn.kernel_approximation import Nystroem

from osfs_util import *

In [14]:
def fast_cor(X, y):
    """
    As the dependency is a single variable there 
    is no need to loop through
    we can use a kernel trick/approximation to solve it quickly.
    
    We do not care if things are perfectly negatively correlated here
    """
    X_all = np.hstack([X, y.reshape(-1, 1)]).T
    
    # calculate correlation
    if X_all.shape[0] < 1000:
        K = pairwise_kernels(X_all, metric='cosine')
    else:
        K = Nystroem('cosine').fit_transform(X_all)
    
    #k_I = np.eye(K.shape[0])
    #cov = K.dot(np.linalg.pinv(k_I - K))
    d_inv = np.sqrt(np.diag(np.diag(K)))
    corr = d_inv.dot(K).dot(d_inv)
    return corr

# calculate the partial correlation 
# using the info from wikipedia
def fast_partial_cor(X, y):
    """
    Calculate the partial correlation
    """
    cor_f = fast_cor(X, y)
    # now take out the last row and column...
    cor_dim = cor_f.shape[0]-1
    cor_x = cor_f[:, :cor_dim][:cor_dim, :]
    cor_z = cor_f[cor_dim, :cor_dim].flatten()
    
    # calculate new correlation matrix
    cor_m = np.ones((cor_x.shape[1], cor_x.shape[1]))
    y_reshape = y.reshape(-1, 1)
    tri_idx = np.triu_indices(cor_m.shape[1], 0)
    for i, j in zip(tri_idx[0].flatten(), tri_idx[1].flatten()):
        corr = (cor_x[i, j] - cor_z[i]*cor_z[j])
        corr = corr/(np.sqrt(1-(cor_z[i]*cor_z[i]))*np.sqrt(1-(cor_z[j]*cor_z[j])))
        cor_m[i, j] = corr
        cor_m[j, i] = corr
    return cor_m #, cor_x 

def fast_fisher_test(X, y):
    """
    Claculate score between x, y, and z
    """
    #cor_m = partial_cor(X, y)
    cor_m = fast_partial_cor(X, y)
    #print(cor_m.shape)
    print(cor_m)
    #cor_m[cor_m == 0] = np.finfo(float).eps
    #cor_v = cor_m[0, 1]
    z_score = 0.5*np.log((1+cor_m)/(1-cor_m))
    z_n = len(set(list(y)))
    N = X.shape[0]
    test_stat = np.sqrt(N - z_n -3) * np.abs(z_score)
    p_val = 1-scipy.stats.norm.cdf(test_stat)
    np.fill_diagonal(p_val, float("inf"))
    return test_stat, p_val

In [16]:
X = np.random.normal(size=(10, 4))
y = np.random.choice([0, 1], replace=True, size=10)

In [17]:
fast_partial_cor(X, y)

array([[ 1.        , -0.35155508, -0.21116138, -0.32335698],
       [-0.35155508,  1.        , -0.360369  ,  0.70616073],
       [-0.21116138, -0.360369  ,  1.        , -0.06567142],
       [-0.32335698,  0.70616073, -0.06567142,  1.        ]])

In [18]:
partial_cor(X, y)

array([[ 1.        , -0.61291546,  0.33684095, -0.48963989],
       [-0.61291546,  1.        , -0.44922903,  0.70374419],
       [ 0.33684095, -0.44922903,  1.        , -0.1607842 ],
       [-0.48963989,  0.70374419, -0.1607842 ,  1.        ]])

In [20]:
fisher_test(X, y)

[[ 1.         -0.61291546  0.33684095 -0.48963989]
 [-0.61291546  1.         -0.44922903  0.70374419]
 [ 0.33684095 -0.44922903  1.         -0.1607842 ]
 [-0.48963989  0.70374419 -0.1607842   1.        ]]


  z_score = 0.5*np.log((1+cor_m)/(1-cor_m))


(array([[        inf,  1.59560841,  0.78379744,  1.19760795],
        [ 1.59560841,         inf,  1.08166202,  1.95584424],
        [ 0.78379744,  1.08166202,         inf,  0.36267146],
        [ 1.19760795,  1.95584424,  0.36267146,         inf]]),
 array([[        inf,  0.05528812,  0.21657949,  0.11553484],
        [ 0.05528812,         inf,  0.13970137,  0.02524175],
        [ 0.21657949,  0.13970137,         inf,  0.35842516],
        [ 0.11553484,  0.02524175,  0.35842516,         inf]]))

In [21]:
fast_fisher_test(X, y)

[[ 1.         -0.35155508 -0.21116138 -0.32335698]
 [-0.35155508  1.         -0.360369    0.70616073]
 [-0.21116138 -0.360369    1.         -0.06567142]
 [-0.32335698  0.70616073 -0.06567142  1.        ]]




(array([[ 41.84795378,   0.82112222,   0.47938306,   0.74995832],
        [  0.82112222,          inf,   0.8436906 ,   1.96658601],
        [  0.47938306,   0.8436906 ,          inf,   0.1470574 ],
        [  0.74995832,   1.96658601,   0.1470574 ,  40.29802957]]),
 array([[        inf,  0.20578833,  0.31583307,  0.22663991],
        [ 0.20578833,         inf,  0.19942116,  0.02461548],
        [ 0.31583307,  0.19942116,         inf,  0.44154336],
        [ 0.22663991,  0.02461548,  0.44154336,         inf]]))