This notebook implements the model for epistasis defined in the research paper 'Computational Framework for Statistical Epistasis Supports XOR Penetrance Function in a Living System'.

In [1]:
# importing all the required packages
import pandas as pd
import numpy as np
import scipy.stats

In [2]:
# Load the input table; its final column holds the phenotype values (BMI in this case)
data = pd.read_csv("BMIwTail.csv")

# Phenotype vector: the last column of the table
phenotype = data.iloc[:, -1]

# Significance level used for the critical value below
alpha = 0.05

# Number of samples, i.e. number of rows in the table
n = len(data)

# Degrees of freedom for the t distribution: number of samples minus 4
df = n - 4

# Critical value: the (1 - alpha/2) quantile of Student's t with df degrees of freedom
critical_value = scipy.stats.t.ppf(1 - alpha / 2, df)


Below is the implementation of the algorithm for the partial correlation of the gene-gene interaction term used to detect epistasis. The interaction term can be encoded in two different ways: first, the traditional Cartesian product of the two SNPs, and second, the XOR penetrance function.

In [3]:
# Algorithm for partial correlation of interaction of genes for Epistasis
def epistatis(snp1, snp2, phenotype):
    """Return the interaction coefficient of a SNP pair against a phenotype.

    The interaction term is encoded with the XOR penetrance function,
    ``(snp1 % 2 + snp2 % 2) % 2``. All vectors are mean-centered (which
    removes the intercept), the interaction term is then orthogonalized
    against ``snp1`` and against the part of ``snp2`` that is orthogonal to
    ``snp1``, and the result is the dot product of that residual with the
    centered phenotype divided by the residual's squared norm.

    Parameters:
        snp1 - First snp in the pair to be checked for interaction
        snp2 - Second snp in the pair to be checked for interaction
        phenotype - Phenotype vector

    All three arguments must support ``.mean()``, ``.dot()`` and elementwise
    arithmetic; the caller in this notebook passes pandas Series columns.

    Returns:
        A scalar ``v.dot(phenotype_tilde) / v.dot(v)`` where ``v`` is the
        residualized interaction vector.

    NOTE(review): the original comment calls the return value the "partial
    correlation coefficient", while the formula has the form of a
    least-squares slope and the caller prints it as "Beta Coefficient(r)".
    Confirm which quantity is intended before using it as a correlation.
    NOTE(review): there is no guard against zero denominators (e.g. a
    constant SNP column) -- presumably this yields NaN/inf; verify.
    """
    # removing the intercept by mean centering
    snp1_tilde = snp1 - snp1.mean()
    snp2_tilde = snp2 - snp2.mean()
    phenotype_tilde = phenotype - phenotype.mean()

    # defining the interaction vector - either using cartesian product or XOR
    interaction_vector = (snp1 % 2 + snp2 % 2) % 2  # using XOR penetrance
    # interaction_vector = snp1.mul(snp2)  # using cartesian product

    # mean centering the interaction vector
    interaction_vector_tilde = interaction_vector - interaction_vector.mean()

    # Each dot product is needed several times below, so compute it once.
    s1_s1 = snp1_tilde.dot(snp1_tilde)
    s1_s2 = snp1_tilde.dot(snp2_tilde)
    s2_s2 = snp2_tilde.dot(snp2_tilde)
    s1_int = snp1_tilde.dot(interaction_vector_tilde)
    int_s2 = interaction_vector_tilde.dot(snp2_tilde)

    # x: projection of the interaction term onto snp1; u: what is left over
    x = (s1_int / s1_s1) * snp1_tilde
    u = interaction_vector_tilde - x
    # z: the part of snp2 orthogonal to snp1; y: coefficient of u along z
    y = (int_s2 * s1_s1 - s1_s2 * s1_int) / (s2_s2 * s1_s1 - s1_s2 * s1_s2)
    z = snp2_tilde - (s1_s2 / s1_s1) * snp1_tilde
    # v: interaction term with both snp1 and snp2 components removed
    v = u - (y * z)
    r = v.dot(phenotype_tilde) / v.dot(v)

    return r

In [None]:
# applying the above defined method to the dataset

# list of tuples: (two-sided p_value, name of first locus, name of second locus)
p_value_locus = []
# every column except the last one (the phenotype) is treated as a SNP
n_snps = data.shape[1] - 1
for i in range(n_snps):
    # the first SNP does not change inside the inner loop, so look it up once
    interacting_snp_1 = data.iloc[:, i]
    for j in range(i + 1, n_snps):
        interacting_snp_2 = data.iloc[:, j]
        r = epistatis(interacting_snp_1, interacting_snp_2, phenotype)

        # performing Fisher's ttest
        # NOTE(review): sqrt(1 - r*r) is only defined for |r| < 1 and
        # epistatis() does not bound its result -- confirm r stays in range.
        t_test = np.sqrt(df) * r / np.sqrt(1 - r * r)
        # critical_value is the two-sided (1 - alpha/2) quantile, so the
        # p-value must count both tails to be consistent with it;
        # sf(|t|) alone is the one-tailed probability (half of this value).
        p_val = 2 * scipy.stats.t.sf(abs(t_test), df)
        p_value_locus.append((p_val, data.columns[i], data.columns[j]))
        # printing the interacting pairs with ttest values greater than the selected critical value
        if abs(t_test) > critical_value:
            print(" Interacting SNP 1 = {0} \t  Interacting SNP 2 ={1} \t  Beta Coefficient(r) = {2}  \t t_test = {3} \t p_val = {4} ".format(data.columns[i], data.columns[j], r, t_test, p_val))

In [None]:
# performing fdr correction to correct for multiple tests

import statsmodels.stats.multitest as ssm

# gather the p_value (first field of each tuple) of every pairwise combination
all_p_values = [entry[0] for entry in p_value_locus]

accept, corrected_pvals = ssm.fdrcorrection(np.asarray(all_p_values).flatten(), alpha=0.001)

# walk the tested pairs together with their adjusted p-values and
# print the pairs that remain significant after the correction
for (raw_p, locus_a, locus_b), adjusted_p in zip(p_value_locus, corrected_pvals):
    if adjusted_p < 0.001:
        print(" p-value = {0} \t adjusted p-value = {1} \t loci 1 = {2} \t loci 2 = {3} ".format( raw_p,  adjusted_p, locus_a, locus_b))