# Create randomized gene combinations based on a co-occurrence matrix
* created by Jie

In [None]:
import tensorflow as tf 
import numpy as np 
import pandas as pd
import csv 
import os 
import matplotlib.pyplot as plt
%matplotlib inline
from platform import python_version
import ast

## Load gene-pair statistics calculated from the co-occurrence matrix

In [2]:
# Load the per-pair co-occurrence statistics. The converter runs
# ast.literal_eval on every "Gene pair" cell so the column holds parsed
# Python literals instead of raw strings.
matrix = pd.read_csv('data/gene_overexpression_cooccurence_pair_04042023.csv',converters={"Gene pair": ast.literal_eval})
display(matrix)

Unnamed: 0,Gene pair,Co-occurrence,weight factor
0,"(Pex13, Pex29)",0,1.000000
1,"(Pex2, Vps1)",0,1.000000
2,"(Pex10, Vps1)",0,1.000000
3,"(Pex8, Vps1)",0,1.000000
4,"(Pex29, Vps1)",0,1.000000
...,...,...,...
295,"(Pex30, Rtn1)",29,0.033333
296,"(Pex31, Sei1)",30,0.032258
297,"(Pex4, Pex5)",37,0.026316
298,"(Pex1, Pex5)",38,0.025641


In [3]:
# check type: inspect the value at row 1, first column to confirm the
# "Gene pair" entries were parsed by the converter rather than left as strings
type(matrix.iloc[1,0])

tuple

In [6]:
# candidate gene pairs to sample from
pairs = matrix['Gene pair']

In [10]:
# one sampling weight per gene pair (same row order as `pairs`)
weights = matrix['weight factor']

In [11]:
# normalize the weights so they sum to 1 and can be used as sampling probabilities
weights = weights/sum(weights)

In [18]:
def create_gene_combins(pair_pool,pair_n,weights,combin_n=30,replace=False):
    """
    Build random gene combinations by sampling gene pairs and merging them.

    For every combination, ``pair_n`` pairs are drawn from ``pair_pool`` with
    probability proportional to ``weights``; the genes of the drawn pairs are
    then merged into one tuple with duplicates removed.

    pair_pool: all the pairs of genes (any sequence of gene tuples, e.g. a
        list or a pandas Series)
    pair_n: number of pairs we pick to create a new gene combination
    weights: weight for each pair, the smaller the occurrence, the greater the
        weight; normalized internally so they do not have to sum to 1.
        ``None`` samples uniformly.
    combin_n: how many gene_combins (samples) to create
    replace: whether the same pair may be drawn more than once within a
        single combination

    Returns a list of ``combin_n`` tuples of unique gene names, ordered by
    first appearance among the sampled pairs. Because pairs can share genes,
    a tuple may hold fewer than ``2 * pair_n`` genes.

    Raises ValueError (from ``np.random.choice``) when the weights do not
    match the pool or when ``pair_n`` exceeds the number of pairs that can be
    drawn without replacement.
    """
    # Materialize the pool so pairs can be looked up by position regardless of
    # the container type (or index labels) the caller passed in.
    pool = list(pair_pool)

    probs = None
    if weights is not None:
        probs = np.asarray(weights, dtype=float)
        probs = probs / probs.sum()

    gene_combins = []
    for _ in range(combin_n):
        # Sample positions rather than the pairs themselves: np.random.choice
        # only accepts 1-D input, and a list of tuples is converted to 2-D.
        picked = np.random.choice(len(pool), size=pair_n, p=probs, replace=replace)
        genes = (gene for idx in picked for gene in pool[idx])
        # dict.fromkeys drops duplicates while keeping first-seen order, so
        # the result is reproducible for a given random state.
        gene_combins.append(tuple(dict.fromkeys(genes)))

    return gene_combins
    

In [19]:
# create 100 combinations, each merged from 7 weighted-sampled gene pairs
gene_combins = create_gene_combins(pairs,7,weights,combin_n=100)

In [20]:
# display newly created gene combinations, one tuple per line
for i in gene_combins:
    print(i)

('Rtn1', 'Pex22', 'Pex3', 'Pex19', 'Pex34', 'Pex13', 'Pex10', 'Vps1', 'Pex17', 'Pex32', 'Pex8')
('Rtn1', 'Pex2', 'Pex29', 'Pex11', 'Pex30', 'Vps1', 'Pex6', 'Pex17', 'Pex32', 'Pex15', 'Pex8')
('Pex32', 'Pex2', 'Pex22', 'Pex3', 'Pex19', 'Pex13', 'Pex29', 'Vps1', 'Pex6', 'Pex5', 'Pex15')
('Pex28', 'Rtn1', 'Pex2', 'Pex14', 'Pex34', 'Pex19', 'Pex29', 'Pex10', 'Vps1', 'Pex8')
('Pex28', 'Pex2', 'Pex14', 'Pex3', 'Pex19', 'Sei1', 'Pex13', 'Pex29', 'Vps1', 'Pex6', 'Pex32')
('Pex28', 'Pex34', 'Pex13', 'Pex29', 'Pex10', 'Pex12', 'Pex22', 'Pex31', 'Pex8')
('Pex28', 'Pex1', 'Pex29', 'Pex11', 'Pex30', 'Pex10', 'Vps1', 'Pex6', 'Pex12', 'Pex32', 'Pex8')
('Pex2', 'Pex14', 'Sei1', 'Pex34', 'Pex19', 'Pex13', 'Pex29', 'Vps1', 'Pex6', 'Pex12')
('Pex28', 'Pex32', 'Pex2', 'Pex34', 'Pex29', 'Pex11', 'Pex10', 'Vps1', 'Pex6', 'Pex17', 'Pex22', 'Pex31')
('Pex28', 'Rtn1', 'Pex14', 'Pex3', 'Pex19', 'Pex34', 'Pex1', 'Pex13', 'Pex4', 'Pex29', 'Pex17', 'Pex5')
('Rtn1', 'Pex2', 'Pex14', 'Sei1', 'Pex1', 'Pex3', 'Pex4', 

## Check for duplicates
Make sure the generated gene combinations have not already been tested in experiments.

In [3]:
# Load the table of previously recorded gene combinations.
old_data=pd.read_csv('data/overexpressed_jb_update_04042023_nodup.csv',sep=',')
display(old_data)

# Drop the first column and the last three columns by position (in the table
# displayed above: `index` and the three `capacity_*` columns), then replace
# missing values with 0.
old_data = old_data.drop(old_data.columns[[0,-3,-2,-1]],axis = 1)
old_data = old_data.fillna(0)
display(old_data)

Unnamed: 0,index,Pex1,Pex2,Pex3,Pex4,Pex5,Pex6,Pex8,Pex10,Pex11,...,Pex30,Pex31,Pex32,Pex34,Sei1,Rtn1,Vps1,capacity_mean,capacity_std,capacity_instances
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0.735,0.2192,"(0.89, 0.58)"
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0.970,0.0000,"(0.97,)"
2,2,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,0,0.985,0.2192,"(0.83, 1.14)"
3,3,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,0,0.790,0.0000,"(0.79,)"
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.040,0.0000,"(1.04,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,105,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1.215,0.0600,"(1.17,1.26)"
106,106,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.150,0.0700,
107,107,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.990,0.0900,
108,108,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.050,0.0900,


Unnamed: 0,Pex1,Pex2,Pex3,Pex4,Pex5,Pex6,Pex8,Pex10,Pex11,Pex12,...,Pex22,Pex28,Pex29,Pex30,Pex31,Pex32,Pex34,Sei1,Rtn1,Vps1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Append one indicator row per generated combination below the existing rows.
new_data = old_data.copy()
for gene_comb in gene_combins:
    # initiate: add an all-zero row; its width is taken from the table itself
    # (not a hard-coded 25) so the cell keeps working if columns change
    new_data.loc[len(new_data)]= [0] * new_data.shape[1]
    # flag every gene that belongs to this combination
    for col in gene_comb:
        new_data.loc[new_data.index[-1],col] = 1
    


In [25]:
# check duplicates: show every row that occurs more than once (keep=False
# marks all copies); an empty result means the table has no repeated rows
new_data[new_data.duplicated(keep=False)]

Unnamed: 0,Pex1,Pex2,Pex3,Pex4,Pex5,Pex6,Pex8,Pex10,Pex11,Pex12,...,Pex22,Pex28,Pex29,Pex30,Pex31,Pex32,Pex34,Sei1,Rtn1,Vps1


## Save the new gene combinations

In [32]:
# Save only the rows appended after the existing data. Slicing from
# len(old_data) tracks however many combinations were generated, instead of
# assuming exactly 100.
new_data.iloc[len(old_data):,:].to_csv('new_tests_recommends_04122023.csv')