In [1]:
#Load required modules
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score
import pandas as pd

In [28]:
def parse_fitness_helper(i, j, ij, pred_tol = 0.1, class_tol = 0.1):
    pred_ij = i*j
    pred_match = np.isclose(pred_ij, ij, pred_tol)
    if (pred_match):
        return('independent')
    i_j_match = np.isclose(i,j,class_tol)
    i_ij_match = np.isclose(i,ij,class_tol)
    j_ij_match = np.isclose(j,ij,class_tol)

    if (i_j_match):
        if (j_ij_match and i_ij_match):
            return('coequal')
        elif (j < ij):
            return('synergistic')
    elif (i < j):
        if (j <= ij):
            return('masking')
        elif ((i <= ij) and (ij <= j)):
            return('suppressive')
    elif ((ij < i) and (i <= j)):
        return('antagonistic')

    return('unknown')

def parse_fitness(i, j, ij, pred_tol = 0.1, class_tol = 0.1):
    classification = parse_fitness_helper(i, j, ij, pred_tol, class_tol)
    if (classification == 'unknown'):
        classification = parse_fitness_helper(j, i, ij, pred_tol, class_tol)
    return(classification)

In [3]:
def import_fitness_data(filename):
    data = pd.read_csv(filename, sep="\t", header=0)
    return(data)
    

data = import_fitness_data('/home/dikshant/Documents/jensn lab/yeast/Data File S1. Raw genetic interaction datasets: Pair-wise interaction format/SGA_ExE.txt')
single_mutant_data = import_fitness_data('/home/dikshant/Documents/jensn lab/yeast/Data File S1. Raw genetic interaction datasets: Pair-wise interaction format/strain_ids_and_single_mutant_fitness.csv')
#single_mutant_data
data

Unnamed: 0,Query Strain ID,Query allele name,Array Strain ID,Array allele name,Arraytype/Temp,Genetic interaction score (ε),P-value,Query single mutant fitness (SMF),Array SMF,Double mutant fitness,Double mutant fitness standard deviation
0,YAL001C_tsq508,tfc3-g349e,YBL023C_tsa111,mcm2-1,TSA30,-0.0348,5.042000e-03,0.8285,0.9254,0.7319,0.0102
1,YAL001C_tsq508,tfc3-g349e,YBL026W_tsa1065,lsm2-5001,TSA30,-0.3529,3.591000e-06,0.8285,0.9408,0.4266,0.0790
2,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa274,stu1-5,TSA30,0.0126,4.625000e-01,0.8285,0.8925,0.7520,0.1338
3,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa454,stu1-8,TSA30,0.0043,4.998000e-01,0.8285,0.7988,0.6661,0.0831
4,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa643,stu1-6,TSA30,-0.1601,5.140000e-02,0.8285,0.7683,0.4764,0.1395
5,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa822,stu1-12,TSA30,0.0863,1.678000e-01,0.8285,0.9003,0.8322,0.0901
6,YAL001C_tsq508,tfc3-g349e,YBL034C_tsa950,stu1-7,TSA30,-0.1294,1.931000e-02,0.8285,0.6690,0.4249,0.0482
7,YAL001C_tsq508,tfc3-g349e,YBL035C_tsa365,pol12-ts,TSA30,-0.0741,1.807000e-01,0.8285,0.6586,0.4715,0.0792
8,YAL001C_tsq508,tfc3-g349e,YBL040C_tsa1072,erd2-5001,TSA30,0.0275,1.846000e-01,0.8285,0.6953,0.6036,0.0226
9,YAL001C_tsq508,tfc3-g349e,YBL041W_tsa1064,pre7-5001,TSA30,-0.1119,1.460000e-01,0.8285,0.8102,0.5593,0.1195


In [4]:
# build matrix
from itertools import chain

def search_list(mylist, search):
    idx = [i for i, s in enumerate(mylist) if search in s]
    return(idx)

def search_gene(mylist, gene):
    cats = ['_', '-']
    idxs = []
    for cat in cats:
        search = gene + cat
        indices = search_list(mylist, gene) #[i for i, s in enumerate(mylist) if search in s]
        idxs.append(indices)
    
    idxs = list(chain.from_iterable(idxs))
    return(idxs)

def append_list(mylist, add):
    mylist.append(add)
    mylist = list(chain.from_iterable(mylist))
    return(mylist)

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def extract_single_mutant_fitness(df, gene, gene_column = 'Allele/Gene name', 
                                  fitness_column = 'Single mutant fitness (26°)'):
    idxs = search_gene(df[gene_column], gene)
    mean = np.mean(df[fitness_column][idxs])
    return(mean)

print(extract_single_mutant_fitness(single_mutant_data, 'rpc82'))

def extract_double_mutant_fitness(df, query_gene, array_gene, 
                                 query_column = 'Query allele name', array_column = 'Array allele name',
                                 fitness_column = 'Double mutant fitness'):
    q_idxs = search_gene(df[query_column], query_gene)
    a_idxs = search_gene(df[array_column], array_gene)
    
    dbl_mutant_idxs = intersection(q_idxs, a_idxs)
    dbl_mutant_idxs = np.unique(dbl_mutant_idxs)
    mean = np.mean(df[fitness_column][dbl_mutant_idxs])
    return(mean)
    
extract_double_mutant_fitness(data,'tfc3','stu1')

0.9281333333333333


0.63032

In [24]:
reference_genes = ['tfc3', 'arh1']
query_genes = ['mcm2', 'lsm2', 'stu1']

def build_interaction_mtx(reference_genes, query_genes, double_mutant_df, single_mutant_df):
    m = len(reference_genes)
    n = len(query_genes)
    mtx = np.zeros([n,m])
    
    for i in range(0,m):
        gene_i = reference_genes[i]
        for j in range(0,n):
            gene_j = query_genes[j]
            fitness = 0
            fitness = extract_double_mutant_fitness(double_mutant_df, gene_i, gene_j)
            mtx[j,i] = fitness
    return(mtx)

dbl_mutant_fit = build_interaction_mtx(reference_genes, query_genes, data, single_mutant_data)
print(dbl_mutant_fit)
print(dbl_mutant_fit[1][0])

[[0.7319  0.9397 ]
 [0.4266  0.9522 ]
 [0.63032 0.9461 ]]
0.4266


In [29]:
def build_fitness_mtx(reference_genes, query_genes, double_mutant_df, single_mutant_df):
    m = len(reference_genes)
    n = len(query_genes)
    interaction = pd.DataFrame(index=query_genes, columns=reference_genes)
    
    double_mutant_fitness = build_interaction_mtx(reference_genes, query_genes, double_mutant_df, single_mutant_df)
    
    for i in range(0,m):
        gene_i = reference_genes[i]
        reference_fitness = extract_single_mutant_fitness(single_mutant_df, gene_i)
        for j in range(0,n):
            gene_j = query_genes[j]
            query_fitness = extract_single_mutant_fitness(single_mutant_df, gene_j)
            dbl_fitness = double_mutant_fitness[j][i]
            interact_label = parse_fitness(reference_fitness, query_fitness, dbl_fitness)
            interaction[gene_i][gene_j] = interact_label
    
    
    print(interaction)
    # fitness = extract_single_mutant_fitness(single_mutant_df, gene_i)

build_fitness_mtx(reference_genes, query_genes, data, single_mutant_data)

         tfc3         arh1
mcm2  unknown  independent
lsm2  unknown  independent
stu1  unknown  independent
