In [None]:
# importing all the required packages
import pandas as pd
import numpy as np
import scipy.stats
import math
from time import process_time_ns
import statsmodels.stats.multitest as ssm
import statsmodels.api as sm

Simulated XOR Interactions

In [None]:
# Load the simulated XOR dataset. The last column is taken as the phenotype;
# the pairwise loops in this notebook treat every other column as a SNP.
data = pd.read_csv("simulatedXOR.csv")
phenotype = data.iloc[:,-1] # extracting the phenotype

In [None]:
# defining and initializing required variables - Dataframes are the same size so only need to run this once

# defining the significance level (used two-sided below: alpha/2 in each tail)
alpha = .05

# storing the count of number of samples (rows of `data`) in the variable n
n = data.shape[0]

# defining degrees of freedom where n is number of samples
# (presumably 4 = intercept + two SNP terms + interaction term - TODO confirm)
# NOTE(review): `df` here means degrees of freedom, not a DataFrame.
df = n-4

# defining the critical value: the 1 - alpha/2 quantile of the t distribution with df degrees of freedom
critical_value = scipy.stats.t.ppf(q=1-alpha/2,df=df)

In [None]:
# Algorithm for Interaction Coefficient for Pairwise XOR Epistasis
def epistatis(snp1, snp2, phenotype, interaction="xor"):
    """Estimate and t-test the interaction coefficient (beta_3) of one SNP pair.

    The interaction term is mean centred and orthogonalised against the two
    mean-centred SNP vectors; projecting the centred phenotype on what remains
    gives the interaction coefficient without fitting a full regression.

    Parameters for the method:
        snp1 - First snp in the pair to be checked for interaction
        snp2 - Second snp in the pair to be checked for interaction
        phenotype - Phenotype vector
        interaction - How the interaction term is built: "xor" (default here)
            uses (snp1 % 2 + snp2 % 2) % 2, "cartesian" uses snp1 * snp2.
        The three vectors are used through pandas Series methods
        (mean, dot, mul), which is what the notebook passes in.

    Returns:
        (b3, t_test, p_val) - interaction coefficient, its t statistic and
        the two-sided p value on len(snp1) - 4 degrees of freedom. If the
        computation raises, the error is printed and (0, 0, 1) is returned.

    Raises:
        ValueError - if `interaction` is neither "xor" nor "cartesian".

    NOTE(review): zero-variance inputs are not expected to reach the fallback,
    because numpy division by zero yields nan/inf rather than raising.
    """
    if interaction not in ("xor", "cartesian"):
        raise ValueError('interaction must be "xor" or "cartesian", got {!r}'.format(interaction))

    try:
        # Degrees of freedom come from the input itself instead of a
        # notebook-level variable, so the t statistic and the p value always
        # agree and the function does not depend on which cells ran before.
        dof = snp1.shape[0] - 4

        # removing the intercept by mean centering
        snp1_tilde = snp1 - snp1.mean()
        snp2_tilde = snp2 - snp2.mean()
        phenotype_tilde = phenotype - phenotype.mean()

        # defining the interaction vector - either using XOR penetrance or cartesian product
        if interaction == "xor":
            interaction_vector = (snp1 % 2 + snp2 % 2) % 2
        else:
            interaction_vector = snp1.mul(snp2)

        # mean centering the interaction vector
        interaction_vector_tilde = interaction_vector - interaction_vector.mean()

        snp1_sq = snp1_tilde.dot(snp1_tilde)

        # q2 is the part of snp2 that is orthogonal to snp1
        q2 = snp2_tilde - ((snp1_tilde.dot(snp2_tilde) / snp1_sq) * snp1_tilde)
        q2_sq = q2.dot(q2)

        # v is the interaction term with its snp1 and q2 components removed
        v = interaction_vector_tilde - (snp1_tilde.dot(interaction_vector_tilde) / snp1_sq) * snp1_tilde
        v = v - ((interaction_vector_tilde.dot(q2) / q2_sq) * q2)
        v_sq = v.dot(v)

        b3 = v.dot(phenotype_tilde) / v_sq  # interaction coefficient, referred as beta_3 in the paper

        # residual calculation: phenotype minus its components along snp1, q2 and v
        residual = phenotype_tilde - (snp1_tilde.dot(phenotype_tilde) / snp1_sq) * snp1_tilde
        residual = residual - (phenotype_tilde.dot(q2) / q2_sq) * q2
        residual = residual - b3 * v

        t_test = np.sqrt(dof) * np.sqrt(v_sq) * b3 / (np.sqrt(residual.dot(residual)))
        p_val = scipy.stats.t.sf(abs(t_test), dof) * 2
    except Exception as e:
        print("Error pair detected with error: ", e)
        b3 = 0
        t_test = 0
        p_val = 1

    return b3, t_test, p_val # returning the interaction coefficient, t test value, and p value

In [None]:
# Score every unordered SNP pair of the simulated XOR dataset, assuming an XOR interaction term.
# All columns of `data` except the last one are visited as SNPs.

p_value_locus_original = []  # one (p_val, SNP 1 name, SNP 2 name) tuple per pair
n_snps = data.shape[1] - 1
for i in range(n_snps):
    for j in range(i + 1, n_snps):
        interacting_snp_1 = data.iloc[:, i]
        interacting_snp_2 = data.iloc[:, j]

        # Sandra's implementation
        b3, t_test, p_val = epistatis(interacting_snp_1, interacting_snp_2, phenotype)
        p_value_locus_original.append((p_val, data.columns[i], data.columns[j]))

        # one report line per pair
        print(" Interacting SNP 1 = {0} \t  Interacting SNP 2 ={1} \t  Beta Coefficient(r) = {2}  \t t_test = {3} \t p_val = {4} ".format(data.columns[i], data.columns[j], b3, t_test, p_val))

In [None]:
# Convert the list of (p_val, SNP 1, SNP 2) tuples to a DataFrame with one row per SNP pair
df_2way_XOR = pd.DataFrame(p_value_locus_original, columns=['p_val','Interacting SNP 1', 'Interacting SNP 2'])

# Write the DataFrame to a CSV file (row index omitted)
df_2way_XOR.to_csv('simXORpvals_inhouse_XOR.csv', index=False)

In [None]:
# Nick's implementation of PLINK - Linear/Logistic Regression-based Test - Can only assume a Cartesian Interaction
def regression_based_test(interacting_snp_1, interacting_snp_2, phenotype, trait_type='quantitative', set_by='all', epi1=0.0001, epi2=0.01, vif=None):
    """Fit phenotype ~ const + gA + gB + gA*gB with statsmodels and return the fit.

    Parameters:
        interacting_snp_1 - First SNP vector (column 'gA' of the design matrix)
        interacting_snp_2 - Second SNP vector (column 'gB')
        phenotype - Response vector
        trait_type - 'quantitative' fits sm.OLS, 'case_control' fits sm.Logit
        set_by, epi1, epi2, vif - accepted but not used by this implementation

    Returns:
        The statsmodels results object from model.fit(); the interaction
        term is the column named 'gAgB' (e.g. results.pvalues['gAgB']).

    Raises:
        ValueError - if trait_type is not one of the two supported values
        (previously this surfaced as an unbound-variable error at fit time).
    """
    if trait_type not in ('quantitative', 'case_control'):
        raise ValueError("trait_type must be 'quantitative' or 'case_control', got {!r}".format(trait_type))

    # Design matrix: both main effects plus their element-wise product
    X = pd.DataFrame({'gA': interacting_snp_1, 'gB': interacting_snp_2, 'gAgB': interacting_snp_1.mul(interacting_snp_2)})
    design = sm.add_constant(X)

    if trait_type == 'quantitative':
        model = sm.OLS(phenotype, design)
    else:
        model = sm.Logit(phenotype, design)

    return model.fit()



# Run the regression-based test on every unordered SNP pair and keep the
# p value of the interaction column 'gAgB' together with the two column names.
p_value_locus_regression = []
n_snps = data.shape[1] - 1  # the last column of `data` is not visited as a SNP
for i in range(n_snps):
    for j in range(i + 1, n_snps):
        interacting_snp_1 = data.iloc[:, i]
        interacting_snp_2 = data.iloc[:, j]
        regression_results = regression_based_test(interacting_snp_1, interacting_snp_2, phenotype)
        p_value_locus_regression.append(
            (regression_results.pvalues['gAgB'], data.columns[i], data.columns[j])
        )

In [None]:
# p_values for locus regression: prints the whole list of (p_val, SNP 1, SNP 2) tuples
print(p_value_locus_regression)

In [None]:
# Convert the list of (p_val, SNP 1, SNP 2) tuples from the regression-based test to a DataFrame
df_2way_reg = pd.DataFrame(p_value_locus_regression, columns=['p_val','Interacting SNP 1', 'Interacting SNP 2'])

# Write the DataFrame to a CSV file (row index omitted)
df_2way_reg.to_csv('simXORpvals_standardregression.csv', index=False)

In [None]:
# Algorithm for Interaction Coefficient for Pairwise Cartesian Epistasis
def epistatis(snp1, snp2, phenotype, interaction="cartesian"):
    """Estimate and t-test the interaction coefficient (beta_3) of one SNP pair.

    The interaction term is mean centred and orthogonalised against the two
    mean-centred SNP vectors; projecting the centred phenotype on what remains
    gives the interaction coefficient without fitting a full regression.

    Parameters for the method:
        snp1 - First snp in the pair to be checked for interaction
        snp2 - Second snp in the pair to be checked for interaction
        phenotype - Phenotype vector
        interaction - How the interaction term is built: "cartesian" (default
            here) uses snp1 * snp2, "xor" uses (snp1 % 2 + snp2 % 2) % 2.
        The three vectors are used through pandas Series methods
        (mean, dot, mul), which is what the notebook passes in.

    Returns:
        (b3, t_test, p_val) - interaction coefficient, its t statistic and
        the two-sided p value on len(snp1) - 4 degrees of freedom. If the
        computation raises, the error is printed and (0, 0, 1) is returned.

    Raises:
        ValueError - if `interaction` is neither "xor" nor "cartesian".

    NOTE(review): zero-variance inputs are not expected to reach the fallback,
    because numpy division by zero yields nan/inf rather than raising.
    """
    if interaction not in ("xor", "cartesian"):
        raise ValueError('interaction must be "xor" or "cartesian", got {!r}'.format(interaction))

    try:
        # Degrees of freedom come from the input itself instead of a
        # notebook-level variable, so the t statistic and the p value always
        # agree and the function does not depend on which cells ran before.
        dof = snp1.shape[0] - 4

        # removing the intercept by mean centering
        snp1_tilde = snp1 - snp1.mean()
        snp2_tilde = snp2 - snp2.mean()
        phenotype_tilde = phenotype - phenotype.mean()

        # defining the interaction vector - either using cartesian product or XOR penetrance
        if interaction == "cartesian":
            interaction_vector = snp1.mul(snp2)
        else:
            interaction_vector = (snp1 % 2 + snp2 % 2) % 2

        # mean centering the interaction vector
        interaction_vector_tilde = interaction_vector - interaction_vector.mean()

        snp1_sq = snp1_tilde.dot(snp1_tilde)

        # q2 is the part of snp2 that is orthogonal to snp1
        q2 = snp2_tilde - ((snp1_tilde.dot(snp2_tilde) / snp1_sq) * snp1_tilde)
        q2_sq = q2.dot(q2)

        # v is the interaction term with its snp1 and q2 components removed
        v = interaction_vector_tilde - (snp1_tilde.dot(interaction_vector_tilde) / snp1_sq) * snp1_tilde
        v = v - ((interaction_vector_tilde.dot(q2) / q2_sq) * q2)
        v_sq = v.dot(v)

        b3 = v.dot(phenotype_tilde) / v_sq  # interaction coefficient, referred as beta_3 in the paper

        # residual calculation: phenotype minus its components along snp1, q2 and v
        residual = phenotype_tilde - (snp1_tilde.dot(phenotype_tilde) / snp1_sq) * snp1_tilde
        residual = residual - (phenotype_tilde.dot(q2) / q2_sq) * q2
        residual = residual - b3 * v

        t_test = np.sqrt(dof) * np.sqrt(v_sq) * b3 / (np.sqrt(residual.dot(residual)))
        p_val = scipy.stats.t.sf(abs(t_test), dof) * 2
    except Exception as e:
        print("Error pair detected with error: ", e)
        b3 = 0
        t_test = 0
        p_val = 1

    return b3, t_test, p_val # returning the interaction coefficient, t test value, and p value

In [None]:
# Score every unordered SNP pair of the simulated XOR dataset, assuming a Cartesian interaction term.
# All columns of `data` except the last one are visited as SNPs.

p_value_locus_original = []  # one (p_val, SNP 1 name, SNP 2 name) tuple per pair
n_snps = data.shape[1] - 1
for i in range(n_snps):
    for j in range(i + 1, n_snps):
        interacting_snp_1 = data.iloc[:, i]
        interacting_snp_2 = data.iloc[:, j]

        # Sandra's implementation
        b3, t_test, p_val = epistatis(interacting_snp_1, interacting_snp_2, phenotype)
        p_value_locus_original.append((p_val, data.columns[i], data.columns[j]))

        # one report line per pair
        print(" Interacting SNP 1 = {0} \t  Interacting SNP 2 ={1} \t  Beta Coefficient(r) = {2}  \t t_test = {3} \t p_val = {4} ".format(data.columns[i], data.columns[j], b3, t_test, p_val))

In [None]:
# Convert the list of (p_val, SNP 1, SNP 2) tuples to a DataFrame with one row per SNP pair
df_2way_Cart = pd.DataFrame(p_value_locus_original, columns=['p_val','Interacting SNP 1', 'Interacting SNP 2'])

# Write the DataFrame to a CSV file (row index omitted)
df_2way_Cart.to_csv('simXORpvals_inhouse_Cartesian.csv', index=False)

Simulated Cartesian Interactions

In [None]:
# Load the simulated Cartesian dataset. This rebinds `data` and `phenotype`,
# so every cell below works on this dataset instead of the simulated XOR one.
data = pd.read_csv("simulatedCartesian.csv")
phenotype = data.iloc[:,-1] # extracting the phenotype (last column)

In [None]:
# defining and initializing required variables (recomputed for the dataset currently bound to `data`)

# defining the significance level (used two-sided below: alpha/2 in each tail)
alpha = .05

# storing the count of number of samples (rows of `data`) in the variable n
n = data.shape[0]

# defining degrees of freedom where n is number of samples
# (presumably 4 = intercept + two SNP terms + interaction term - TODO confirm)
# NOTE(review): `df` here means degrees of freedom, not a DataFrame.
df = n-4

# defining the critical value: the 1 - alpha/2 quantile of the t distribution with df degrees of freedom
critical_value = scipy.stats.t.ppf(q=1-alpha/2,df=df)

In [None]:
# Algorithm for Interaction Coefficient for Pairwise Cartesian Epistasis
def epistatis(snp1, snp2, phenotype, interaction="cartesian"):
    """Estimate and t-test the interaction coefficient (beta_3) of one SNP pair.

    The interaction term is mean centred and orthogonalised against the two
    mean-centred SNP vectors; projecting the centred phenotype on what remains
    gives the interaction coefficient without fitting a full regression.

    Parameters for the method:
        snp1 - First snp in the pair to be checked for interaction
        snp2 - Second snp in the pair to be checked for interaction
        phenotype - Phenotype vector
        interaction - How the interaction term is built: "cartesian" (default
            here) uses snp1 * snp2, "xor" uses (snp1 % 2 + snp2 % 2) % 2.
        The three vectors are used through pandas Series methods
        (mean, dot, mul), which is what the notebook passes in.

    Returns:
        (b3, t_test, p_val) - interaction coefficient, its t statistic and
        the two-sided p value on len(snp1) - 4 degrees of freedom. If the
        computation raises, the error is printed and (0, 0, 1) is returned.

    Raises:
        ValueError - if `interaction` is neither "xor" nor "cartesian".

    NOTE(review): zero-variance inputs are not expected to reach the fallback,
    because numpy division by zero yields nan/inf rather than raising.
    """
    if interaction not in ("xor", "cartesian"):
        raise ValueError('interaction must be "xor" or "cartesian", got {!r}'.format(interaction))

    try:
        # Degrees of freedom come from the input itself instead of a
        # notebook-level variable, so the t statistic and the p value always
        # agree and the function does not depend on which cells ran before.
        dof = snp1.shape[0] - 4

        # removing the intercept by mean centering
        snp1_tilde = snp1 - snp1.mean()
        snp2_tilde = snp2 - snp2.mean()
        phenotype_tilde = phenotype - phenotype.mean()

        # defining the interaction vector - either using cartesian product or XOR penetrance
        if interaction == "cartesian":
            interaction_vector = snp1.mul(snp2)
        else:
            interaction_vector = (snp1 % 2 + snp2 % 2) % 2

        # mean centering the interaction vector
        interaction_vector_tilde = interaction_vector - interaction_vector.mean()

        snp1_sq = snp1_tilde.dot(snp1_tilde)

        # q2 is the part of snp2 that is orthogonal to snp1
        q2 = snp2_tilde - ((snp1_tilde.dot(snp2_tilde) / snp1_sq) * snp1_tilde)
        q2_sq = q2.dot(q2)

        # v is the interaction term with its snp1 and q2 components removed
        v = interaction_vector_tilde - (snp1_tilde.dot(interaction_vector_tilde) / snp1_sq) * snp1_tilde
        v = v - ((interaction_vector_tilde.dot(q2) / q2_sq) * q2)
        v_sq = v.dot(v)

        b3 = v.dot(phenotype_tilde) / v_sq  # interaction coefficient, referred as beta_3 in the paper

        # residual calculation: phenotype minus its components along snp1, q2 and v
        residual = phenotype_tilde - (snp1_tilde.dot(phenotype_tilde) / snp1_sq) * snp1_tilde
        residual = residual - (phenotype_tilde.dot(q2) / q2_sq) * q2
        residual = residual - b3 * v

        t_test = np.sqrt(dof) * np.sqrt(v_sq) * b3 / (np.sqrt(residual.dot(residual)))
        p_val = scipy.stats.t.sf(abs(t_test), dof) * 2
    except Exception as e:
        print("Error pair detected with error: ", e)
        b3 = 0
        t_test = 0
        p_val = 1

    return b3, t_test, p_val # returning the interaction coefficient, t test value, and p value

In [None]:
# Score every unordered SNP pair of the simulated Cartesian dataset, assuming a Cartesian interaction term.
# All columns of `data` except the last one are visited as SNPs.

p_value_locus_original = []  # one (p_val, SNP 1 name, SNP 2 name) tuple per pair
n_snps = data.shape[1] - 1
for i in range(n_snps):
    for j in range(i + 1, n_snps):
        interacting_snp_1 = data.iloc[:, i]
        interacting_snp_2 = data.iloc[:, j]

        # Sandra's implementation
        b3, t_test, p_val = epistatis(interacting_snp_1, interacting_snp_2, phenotype)
        p_value_locus_original.append((p_val, data.columns[i], data.columns[j]))

        # one report line per pair
        print(" Interacting SNP 1 = {0} \t  Interacting SNP 2 ={1} \t  Beta Coefficient(r) = {2}  \t t_test = {3} \t p_val = {4} ".format(data.columns[i], data.columns[j], b3, t_test, p_val))

In [None]:
# Convert the list of (p_val, SNP 1, SNP 2) tuples to a DataFrame with one row per SNP pair
# NOTE(review): the variable is named df_2way_XOR although the output file name
# indicates Cartesian-term results on the simulated Cartesian data; it also rebinds
# the name used earlier for the simulated-XOR results - confirm the intended name.
df_2way_XOR = pd.DataFrame(p_value_locus_original, columns=['p_val','Interacting SNP 1', 'Interacting SNP 2'])

# Write the DataFrame to a CSV file (row index omitted)
df_2way_XOR.to_csv('simCartpvals_inhouse_Cart.csv', index=False)

In [None]:
# Nick's implementation of PLINK - Linear/Logistic Regression-based Test - Can only assume a Cartesian Interaction
def regression_based_test(interacting_snp_1, interacting_snp_2, phenotype, trait_type='quantitative', set_by='all', epi1=0.0001, epi2=0.01, vif=None):
    """Fit phenotype ~ const + gA + gB + gA*gB with statsmodels and return the fit.

    Parameters:
        interacting_snp_1 - First SNP vector (column 'gA' of the design matrix)
        interacting_snp_2 - Second SNP vector (column 'gB')
        phenotype - Response vector
        trait_type - 'quantitative' fits sm.OLS, 'case_control' fits sm.Logit
        set_by, epi1, epi2, vif - accepted but not used by this implementation

    Returns:
        The statsmodels results object from model.fit(); the interaction
        term is the column named 'gAgB' (e.g. results.pvalues['gAgB']).

    Raises:
        ValueError - if trait_type is not one of the two supported values
        (previously this surfaced as an unbound-variable error at fit time).
    """
    if trait_type not in ('quantitative', 'case_control'):
        raise ValueError("trait_type must be 'quantitative' or 'case_control', got {!r}".format(trait_type))

    # Design matrix: both main effects plus their element-wise product
    X = pd.DataFrame({'gA': interacting_snp_1, 'gB': interacting_snp_2, 'gAgB': interacting_snp_1.mul(interacting_snp_2)})
    design = sm.add_constant(X)

    if trait_type == 'quantitative':
        model = sm.OLS(phenotype, design)
    else:
        model = sm.Logit(phenotype, design)

    return model.fit()



# Run the regression-based test on every unordered SNP pair and keep the
# p value of the interaction column 'gAgB' together with the two column names.
p_value_locus_regression = []
n_snps = data.shape[1] - 1  # the last column of `data` is not visited as a SNP
for i in range(n_snps):
    for j in range(i + 1, n_snps):
        interacting_snp_1 = data.iloc[:, i]
        interacting_snp_2 = data.iloc[:, j]
        regression_results = regression_based_test(interacting_snp_1, interacting_snp_2, phenotype)
        p_value_locus_regression.append(
            (regression_results.pvalues['gAgB'], data.columns[i], data.columns[j])
        )

In [None]:
# p_values for locus regression: prints the whole list of (p_val, SNP 1, SNP 2) tuples
print(p_value_locus_regression)

In [None]:
# Convert the list of (p_val, SNP 1, SNP 2) tuples from the regression-based test to a DataFrame
df_2way_reg = pd.DataFrame(p_value_locus_regression, columns=['p_val','Interacting SNP 1', 'Interacting SNP 2'])

# Write the DataFrame to a CSV file (row index omitted)
df_2way_reg.to_csv('simCartpvals_standardregression.csv', index=False)

In [None]:
# Algorithm for Interaction Coefficient for Pairwise XOR Epistasis
def epistatis(snp1, snp2, phenotype, interaction="xor"):
    """Estimate and t-test the interaction coefficient (beta_3) of one SNP pair.

    The interaction term is mean centred and orthogonalised against the two
    mean-centred SNP vectors; projecting the centred phenotype on what remains
    gives the interaction coefficient without fitting a full regression.

    Parameters for the method:
        snp1 - First snp in the pair to be checked for interaction
        snp2 - Second snp in the pair to be checked for interaction
        phenotype - Phenotype vector
        interaction - How the interaction term is built: "xor" (default here)
            uses (snp1 % 2 + snp2 % 2) % 2, "cartesian" uses snp1 * snp2.
        The three vectors are used through pandas Series methods
        (mean, dot, mul), which is what the notebook passes in.

    Returns:
        (b3, t_test, p_val) - interaction coefficient, its t statistic and
        the two-sided p value on len(snp1) - 4 degrees of freedom. If the
        computation raises, the error is printed and (0, 0, 1) is returned.

    Raises:
        ValueError - if `interaction` is neither "xor" nor "cartesian".

    NOTE(review): zero-variance inputs are not expected to reach the fallback,
    because numpy division by zero yields nan/inf rather than raising.
    """
    if interaction not in ("xor", "cartesian"):
        raise ValueError('interaction must be "xor" or "cartesian", got {!r}'.format(interaction))

    try:
        # Degrees of freedom come from the input itself instead of a
        # notebook-level variable, so the t statistic and the p value always
        # agree and the function does not depend on which cells ran before.
        dof = snp1.shape[0] - 4

        # removing the intercept by mean centering
        snp1_tilde = snp1 - snp1.mean()
        snp2_tilde = snp2 - snp2.mean()
        phenotype_tilde = phenotype - phenotype.mean()

        # defining the interaction vector - either using XOR penetrance or cartesian product
        if interaction == "xor":
            interaction_vector = (snp1 % 2 + snp2 % 2) % 2
        else:
            interaction_vector = snp1.mul(snp2)

        # mean centering the interaction vector
        interaction_vector_tilde = interaction_vector - interaction_vector.mean()

        snp1_sq = snp1_tilde.dot(snp1_tilde)

        # q2 is the part of snp2 that is orthogonal to snp1
        q2 = snp2_tilde - ((snp1_tilde.dot(snp2_tilde) / snp1_sq) * snp1_tilde)
        q2_sq = q2.dot(q2)

        # v is the interaction term with its snp1 and q2 components removed
        v = interaction_vector_tilde - (snp1_tilde.dot(interaction_vector_tilde) / snp1_sq) * snp1_tilde
        v = v - ((interaction_vector_tilde.dot(q2) / q2_sq) * q2)
        v_sq = v.dot(v)

        b3 = v.dot(phenotype_tilde) / v_sq  # interaction coefficient, referred as beta_3 in the paper

        # residual calculation: phenotype minus its components along snp1, q2 and v
        residual = phenotype_tilde - (snp1_tilde.dot(phenotype_tilde) / snp1_sq) * snp1_tilde
        residual = residual - (phenotype_tilde.dot(q2) / q2_sq) * q2
        residual = residual - b3 * v

        t_test = np.sqrt(dof) * np.sqrt(v_sq) * b3 / (np.sqrt(residual.dot(residual)))
        p_val = scipy.stats.t.sf(abs(t_test), dof) * 2
    except Exception as e:
        print("Error pair detected with error: ", e)
        b3 = 0
        t_test = 0
        p_val = 1

    return b3, t_test, p_val # returning the interaction coefficient, t test value, and p value

In [None]:
# Score every unordered SNP pair of the simulated Cartesian dataset, assuming an XOR interaction term.
# All columns of `data` except the last one are visited as SNPs.

p_value_locus_original = []  # one (p_val, SNP 1 name, SNP 2 name) tuple per pair
n_snps = data.shape[1] - 1
for i in range(n_snps):
    for j in range(i + 1, n_snps):
        interacting_snp_1 = data.iloc[:, i]
        interacting_snp_2 = data.iloc[:, j]

        # Sandra's implementation
        b3, t_test, p_val = epistatis(interacting_snp_1, interacting_snp_2, phenotype)
        p_value_locus_original.append((p_val, data.columns[i], data.columns[j]))

        # one report line per pair
        print(" Interacting SNP 1 = {0} \t  Interacting SNP 2 ={1} \t  Beta Coefficient(r) = {2}  \t t_test = {3} \t p_val = {4} ".format(data.columns[i], data.columns[j], b3, t_test, p_val))

In [None]:
# Convert the list of (p_val, SNP 1, SNP 2) tuples to a DataFrame with one row per SNP pair
# NOTE(review): the variable is named df_2way_Cart although the output file name
# indicates XOR-term results on the simulated Cartesian data; it also rebinds
# the name used earlier for the simulated-XOR dataset results - confirm the intended name.
df_2way_Cart = pd.DataFrame(p_value_locus_original, columns=['p_val','Interacting SNP 1', 'Interacting SNP 2'])

# Write the DataFrame to a CSV file (row index omitted)
df_2way_Cart.to_csv('simCartpvals_inhouse_XOR.csv', index=False)