In [1]:
import anndata 
import pandas as pd 
import numpy as np 
import re

### This notebook builds and saves the table of matching bulk and pseudobulk tissue names

In [3]:
# Load the grouped bulk table with its first CSV column as the row index, so
# that `bulk.index` carries the tissue labels that are matched against the
# pseudobulk tissue names below. Without index_col=0 pandas assigns a default
# integer RangeIndex and the name comparison finds no matches on a fresh run.
# NOTE(review): assumes the tissue labels are the first column of
# bulk_grouped.csv (as for the pseudobulk CSV) - confirm.
bulk = pd.read_csv('../../data/bulk/bulk_grouped.csv', index_col=0)

In [9]:
# Row labels of the bulk table as a plain NumPy array.
bulks = bulk.index.to_numpy()

In [10]:
# Display the labels taken from the bulk table's index.
bulks

array(['Adipose Tissue', 'Muscle', 'Blood Vessel', 'Heart', 'Uterus',
       'Vagina', 'Breast', 'Skin', 'Salivary Gland', 'Brain',
       'Adrenal Gland', 'Thyroid', 'Lung', 'Spleen', 'Pancreas',
       'Esophagus', 'Stomach', 'Colon', 'Small Intestine', 'Prostate',
       'Testis', 'Nerve', 'Blood', 'Pituitary', 'Ovary', 'Liver',
       'Kidney', 'Cervix Uteri', 'Fallopian Tube', 'Bladder'],
      dtype=object)

In [11]:
# Load the summed pseudobulk table; the first column becomes the row index.
# A comma separator and a header on the first row are read_csv defaults.
sum_pseudo = pd.read_csv(
    '../../data/pseudobulk/sum_pseudobulk.csv',
    index_col=0,
)

In [12]:
# Display the pseudobulk row labels; the ones shown look like '<tissue>_<cell type>'.
sum_pseudo.index

Index(['Tongue_epithelial cell', 'Tongue_basal cell', 'Tongue_immune cell',
       'Tongue_fibroblast', 'Tongue_vein endothelial cell',
       'Tongue_pericyte cell', 'Tongue_endothelial cell of lymphatic vessel',
       'Tongue_endothelial cell of artery',
       'Tongue_capillary endothelial cell', 'Tongue_tongue muscle cell',
       ...
       'Liver_liver dendritic cell', 'Liver_nk cell', 'Liver_fibroblast',
       'Liver_hepatocyte', 'Liver_intrahepatic cholangiocyte', 'Liver_t cell',
       'Liver_neutrophil', 'Liver_plasma cell', 'Liver_erythrocyte',
       'Liver_endothelial cell'],
      dtype='object', length=474)

In [13]:
# Hand-maintained list of pseudobulk tissue names (multi-word names use
# underscores), kept in alphabetical order.
# NOTE(review): presumably these are the tissue prefixes of the labels in
# sum_pseudo.index - confirm the list is complete.
mylist = [
    'Bladder', 'Blood', 'Bone_Marrow', 'Eye',
    'Fat', 'Heart', 'Kidney', 'Large_Intestine',
    'Liver', 'Lung', 'Lymph_Node', 'Mammary',
    'Muscle', 'Pancreas', 'Prostate', 'Salivary_Gland',
    'Skin', 'Small_Intestine', 'Spleen', 'Thymus',
    'Tongue', 'Trachea', 'Uterus', 'Vasculature',
]

# Array form so the names can be compared with np.isin and boolean masks.
pseudo_samples = np.array(mylist)

In [14]:
# Pseudobulk tissue names that also occur verbatim among the bulk labels.
in_bulk = np.isin(pseudo_samples, bulks)
matches = pseudo_samples[in_bulk]
matches

array(['Bladder', 'Blood', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle',
       'Pancreas', 'Prostate', 'Skin', 'Spleen', 'Uterus'], dtype='<U15')

In [15]:
# Pseudobulk tissue names with no identically spelled bulk label.
non_matches = pseudo_samples[np.isin(pseudo_samples, bulks, invert=True)]
non_matches

array(['Bone_Marrow', 'Eye', 'Fat', 'Large_Intestine', 'Lymph_Node',
       'Mammary', 'Salivary_Gland', 'Small_Intestine', 'Thymus', 'Tongue',
       'Trachea', 'Vasculature'], dtype='<U15')

In [16]:
#Bone_Marrow is present in pseudobulk, but not bulk.

# Identify which organism parts are in the bulk but not in the pseudobulk, so we can drop them from the bulk for a fairer comparison


In [52]:
# Bulk labels that also occur verbatim among the pseudobulk tissue names,
# sorted alphabetically.
bulk_matches = np.sort(bulks[np.isin(bulks, pseudo_samples)])
bulk_matches

array(['Bladder', 'Blood', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle',
       'Pancreas', 'Prostate', 'Skin', 'Spleen', 'Uterus'], dtype=object)

In [53]:
# Bulk labels with no identically spelled pseudobulk tissue name.
bulk_non_matches = bulks[np.isin(bulks, pseudo_samples, invert=True)]
bulk_non_matches

array(['Adipose Tissue', 'Blood Vessel', 'Vagina', 'Breast',
       'Salivary Gland', 'Brain', 'Adrenal Gland', 'Thyroid', 'Esophagus',
       'Stomach', 'Colon', 'Small Intestine', 'Testis', 'Nerve',
       'Pituitary', 'Ovary', 'Cervix Uteri', 'Fallopian Tube'],
      dtype=object)

In [54]:
# Some tissues exist in both datasets under different labels (e.g. "Adipose
# Tissue" in the bulk is "Fat" in the pseudobulk), so the exact-name comparison
# misses them. Add their bulk labels so they are kept when the bulk is subset.
# The order here must mirror the order in which the corresponding pseudobulk
# names are appended to `pseudo_matches`, because the two arrays are later
# paired position by position.
renamed_bulk_tissues = [
    'Adipose Tissue',
    'Blood Vessel',
    'Breast',
    'Salivary Gland',
    'Colon',
    'Small Intestine',
]

# Rebuild from `bulks` / `pseudo_samples` instead of appending to the previous
# value of `bulk_matches`, so re-running this cell cannot duplicate entries.
bulk_matches = np.append(
    np.sort(bulks[np.isin(bulks, pseudo_samples)]),
    renamed_bulk_tissues,
)







In [55]:
# Display the bulk labels selected so far (exact matches plus the manually added ones).
bulk_matches

array(['Bladder', 'Blood', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle',
       'Pancreas', 'Prostate', 'Skin', 'Spleen', 'Uterus',
       'Adipose Tissue', 'Blood Vessel', 'Breast', 'Salivary Gland',
       'Colon', 'Small Intestine'], dtype=object)

### Find the names for the pseudobulks now

In [56]:
# Pseudobulk tissue names that also occur verbatim among the bulk labels,
# sorted alphabetically.
pseudo_matches = np.sort(pseudo_samples[np.isin(pseudo_samples, bulks)])

In [57]:
# Display the pseudobulk names that matched a bulk label exactly.
pseudo_matches

array(['Bladder', 'Blood', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle',
       'Pancreas', 'Prostate', 'Skin', 'Spleen', 'Uterus'], dtype='<U15')

In [58]:
# Pseudobulk tissue names with no identically spelled bulk label.
pseudo_non_matches = pseudo_samples[np.isin(pseudo_samples, bulks, invert=True)]
pseudo_non_matches

array(['Bone_Marrow', 'Eye', 'Fat', 'Large_Intestine', 'Lymph_Node',
       'Mammary', 'Salivary_Gland', 'Small_Intestine', 'Thymus', 'Tongue',
       'Trachea', 'Vasculature'], dtype='<U15')

In [59]:
# Pseudobulk names for tissues that exist in the bulk under a different label.
# The order must mirror the bulk labels appended to `bulk_matches`
# ('Adipose Tissue', 'Blood Vessel', 'Breast', 'Salivary Gland', 'Colon',
# 'Small Intestine'), because the two arrays are later paired position by
# position.
renamed_pseudo_tissues = [
    'Fat',
    'Vasculature',
    'Mammary',
    'Salivary_Gland',
    'Large_Intestine',
    'Small_Intestine',
]

# Rebuild from `pseudo_samples` / `bulks` instead of appending to the previous
# value of `pseudo_matches`, so re-running this cell cannot duplicate entries
# and break the alignment with `bulk_matches`.
pseudo_matches = np.append(
    np.sort(pseudo_samples[np.isin(pseudo_samples, bulks)]),
    renamed_pseudo_tissues,
)

Our goal here is to trim the bulks so that they contain the same types of samples as the pseudobulk.

# Save dataset


In [65]:
# Pair the two name arrays position by position: row i maps a bulk label to
# the pseudobulk name at the same position.
df = pd.DataFrame(dict(bulk_names=bulk_matches, pseudo_names=pseudo_matches))

In [68]:
# Write the name table to disk; the integer row index is written as the
# first, unnamed column (the to_csv default, spelled out here).
df.to_csv(
    '../../data/sample_names/bulk_pseudo.csv',
    index=True,
)