In [1]:
import pandas as pd
import numpy as np
import pickle

import os

# add the parent directory to the module search path
import sys
sys.path.append('..')

In [2]:
def get_module_dict(tissue, sign, base_dir=None):
    """Map every module of one tissue/sign network to the genes it contains.

    Two files are read, both located relative to ``base_dir``:

    * ``raw_data/B6 BTBR F2 {tissue} WGCNA modules {sign} beta12 mod10.xlsx``
      -- its index holds the module name (color) and its first remaining
      column holds the gene names.
    * ``cleaned/{tissue}_rna.csv`` -- its columns (after the index column)
      are taken as the full set of genes for the tissue.

    Parameters
    ----------
    tissue : str
        Tissue name substituted into both file names.
    sign : str
        Network type substituted into the Excel file name. When it equals
        ``'SIGNED'`` an extra, empty ``'all'`` entry is added to the result.
    base_dir : str, optional
        Directory containing the ``raw_data`` and ``cleaned`` folders.
        Defaults to the current working directory.

    Returns
    -------
    dict
        ``{module name: list of gene names}``. Genes that are columns of the
        CSV but belong to no module are collected under ``'grey'``, listed in
        CSV column order.

    Raises
    ------
    AssertionError
        If the Excel file already contains a ``'grey'`` module, or if a
        module gene is not a column of the CSV file.
    """
    if base_dir is None:
        base_dir = os.getcwd()

    modules_path = os.path.join(
        base_dir,
        'raw_data',
        'B6 BTBR F2 {} WGCNA modules {} beta12 mod10.xlsx'.format(tissue, sign),
    )
    modules_df = pd.read_excel(modules_path, index_col=0)

    # The index repeats the module name once per gene; keep each name once.
    module_names = modules_df.index.unique().to_list()
    # 'grey' is filled in below from the left-over genes, so it must not
    # already be present as a module of its own.
    assert 'grey' not in module_names, "'grey' module already present in {}".format(modules_path)

    module_dict = {}
    for module in module_names:
        # Select with a list so the result is always a DataFrame; a scalar
        # label yields a Series when the module has a single row, and the
        # two-axis .iloc below would then fail.
        module_dict[module] = modules_df.loc[[module]].iloc[:, 0].to_list()

    rna_path = os.path.join(base_dir, 'cleaned', '{}_rna.csv'.format(tissue))
    rna_df = pd.read_csv(rna_path, index_col=0)

    # Every gene that was assigned to some module.
    module_genes = set()
    for genes in module_dict.values():
        module_genes.update(genes)

    # Every module gene has to be one of the CSV columns.
    measured_genes = set(rna_df.columns)
    unknown_genes = module_genes - measured_genes
    assert not unknown_genes, (
        '{} module gene(s) are not columns of {}'.format(len(unknown_genes), rna_path)
    )

    # Left-over genes, in column order (de-duplicated) so that the result --
    # and anything serialized from it -- is identical from run to run.
    module_dict['grey'] = [
        gene for gene in dict.fromkeys(rna_df.columns) if gene not in module_genes
    ]

    # NOTE(review): signed networks get an empty 'all' entry; the reason is
    # not visible in this notebook -- confirm with downstream consumers.
    if sign == 'SIGNED':
        module_dict['all'] = []

    return module_dict

In [3]:
tissues = ['islet', 'liver', 'adipose', 'kidney', 'gastroc']
signs = ['SIGNED', 'UNSIGNED']

# One {module: genes} mapping for every (tissue, sign) combination.
module_dict = {
    (tissue, sign): get_module_dict(tissue, sign)
    for tissue in tissues
    for sign in signs
}

# For each (tissue, sign) pair, report the module count and the total gene count.
for pair, modules_for_pair in module_dict.items():
    print(pair, len(modules_for_pair))
    # Flatten the genes of every module of this pair into one list.
    flat_genes = [
        gene
        for gene_list in modules_for_pair.values()
        for gene in gene_list
    ]
    print(len(flat_genes))
    # A gene may belong to one module only.
    assert len(flat_genes) == len(set(flat_genes))

# Write the nested dictionary to disk.
with open('module_dict.pickle', 'wb') as pickle_file:
    pickle.dump(module_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

('islet', 'SIGNED') 40
31463
('islet', 'UNSIGNED') 71
31463
('liver', 'SIGNED') 46
31463
('liver', 'UNSIGNED') 89
31463
('adipose', 'SIGNED') 59
31463
('adipose', 'UNSIGNED') 73
31463
('kidney', 'SIGNED') 58
31463
('kidney', 'UNSIGNED') 98
31463
('gastroc', 'SIGNED') 36
31463
('gastroc', 'UNSIGNED') 57
31463
