In [87]:
import pandas as pd
import json
import requests
from collections import defaultdict
import matplotlib.pyplot as plt

#### Get GTEX data

In [88]:
URL_ENDPOINT = "https://gtexportal.org/rest/v1/dataset/tissueInfo?datasetId=gtex_v8"

response = requests.get(URL_ENDPOINT)
response.raise_for_status()
data = response.json()

In [89]:
len(data['tissueInfo'])

54

In [90]:
data['tissueInfo']

[{'colorHex': 'FF6600',
  'colorRgb': '255,102,0',
  'datasetId': 'gtex_v8',
  'eGeneCount': 15607,
  'expressedGeneCount': 28830,
  'hasEGenes': True,
  'hasSGenes': True,
  'mappedInHubmap': False,
  'rnaSeqAndGenotypeSampleCount': 581,
  'rnaSeqSampleCount': 663,
  'rnaSeqSampleCountFemale': 218,
  'rnaSeqSampleCountMale': 445,
  'sGeneCount': 5113,
  'samplingSite': "Subcutaneous tissue beneath the leg's skin sample.",
  'tissueSite': 'Adipose Tissue',
  'tissueSiteDetail': 'Adipose - Subcutaneous',
  'tissueSiteDetailAbbr': 'ADPSBQ',
  'tissueSiteDetailId': 'Adipose_Subcutaneous',
  'uberonId': '0002190'},
 {'colorHex': 'FFAA00',
  'colorRgb': '255,170,0',
  'datasetId': 'gtex_v8',
  'eGeneCount': 12482,
  'expressedGeneCount': 28881,
  'hasEGenes': True,
  'hasSGenes': True,
  'mappedInHubmap': False,
  'rnaSeqAndGenotypeSampleCount': 469,
  'rnaSeqSampleCount': 541,
  'rnaSeqSampleCountFemale': 170,
  'rnaSeqSampleCountMale': 371,
  'sGeneCount': 4210,
  'samplingSite': 'Adipose

All fields present for each entry.

In [91]:
column_dict = {}
for tissue_data in data['tissueInfo']:
    #print(json.dumps(tissue_data))
    #list_data.extend(tissue_data.keys())
    for k in tissue_data.keys():
        if k not in column_dict.keys():
            column_dict[k] = 0
        column_dict[k] += 1
    #list_data = list(set(list_data))

print(column_dict)


{'colorHex': 54, 'colorRgb': 54, 'datasetId': 54, 'eGeneCount': 54, 'expressedGeneCount': 54, 'hasEGenes': 54, 'hasSGenes': 54, 'mappedInHubmap': 54, 'rnaSeqAndGenotypeSampleCount': 54, 'rnaSeqSampleCount': 54, 'rnaSeqSampleCountFemale': 54, 'rnaSeqSampleCountMale': 54, 'sGeneCount': 54, 'samplingSite': 54, 'tissueSite': 54, 'tissueSiteDetail': 54, 'tissueSiteDetailAbbr': 54, 'tissueSiteDetailId': 54, 'uberonId': 54}


#### Create a GTEX dataframe

In [92]:
gtex_df = pd.DataFrame(columns=column_dict.keys())
for i, tissue_data in enumerate(data['tissueInfo']):
    for col in gtex_df.columns:
        gtex_df.loc[i, col] = tissue_data[col]
gtex_df['uberonId'] = 'UBERON_'+ gtex_df['uberonId']

In [93]:
gtex_df

Unnamed: 0,colorHex,colorRgb,datasetId,eGeneCount,expressedGeneCount,hasEGenes,hasSGenes,mappedInHubmap,rnaSeqAndGenotypeSampleCount,rnaSeqSampleCount,rnaSeqSampleCountFemale,rnaSeqSampleCountMale,sGeneCount,samplingSite,tissueSite,tissueSiteDetail,tissueSiteDetailAbbr,tissueSiteDetailId,uberonId
0,FF6600,2551020,gtex_v8,15607.0,28830,True,True,False,581,663,218.0,445.0,5113.0,Subcutaneous tissue beneath the leg's skin sam...,Adipose Tissue,Adipose - Subcutaneous,ADPSBQ,Adipose_Subcutaneous,UBERON_0002190
1,FFAA00,2551700,gtex_v8,12482.0,28881,True,True,False,469,541,170.0,371.0,4210.0,Adipose tissue on the large fold of parietal p...,Adipose Tissue,Adipose - Visceral (Omentum),ADPVSC,Adipose_Visceral_Omentum,UBERON_0010414
2,33DD33,5122151,gtex_v8,8123.0,28235,True,True,False,233,258,101.0,157.0,2369.0,"Left, followed by the right if necessary for s...",Adrenal Gland,Adrenal Gland,ADRNLG,Adrenal_Gland,UBERON_0002369
3,FF5555,2558585,gtex_v8,12493.0,28025,True,True,False,387,432,153.0,279.0,3740.0,Ascending aorta or other thoracic regions (non...,Blood Vessel,Artery - Aorta,ARTAORT,Artery_Aorta,UBERON_0001496
4,FFAA99,255170153,gtex_v8,6296.0,28462,True,True,True,213,240,94.0,146.0,2140.0,"Left and right, noncalcific regions only.",Blood Vessel,Artery - Coronary,ARTCRN,Artery_Coronary,UBERON_0001621
5,FF0000,25500,gtex_v8,15008.0,27217,True,True,False,584,663,209.0,454.0,4791.0,Left Tibial. Peripheral tibial artery from gas...,Blood Vessel,Artery - Tibial,ARTTBL,Artery_Tibial,UBERON_0007610
6,AA0000,17000,gtex_v8,,28949,False,False,False,21,21,7.0,14.0,,"Central posterior urinary bladder, trimming fr...",Bladder,Bladder,BLDDER,Bladder,UBERON_0001255
7,EEEE00,2382380,gtex_v8,3726.0,28196,True,True,False,129,152,45.0,107.0,892.0,Amygdala (sampled at Miami Brain Bank and pres...,Brain,Brain - Amygdala,BRNAMY,Brain_Amygdala,UBERON_0001876
8,EEEE00,2382380,gtex_v8,5640.0,28921,True,True,False,147,176,48.0,128.0,1238.0,Anterior cingulate cortex (sampled at Miami Br...,Brain,Brain - Anterior cingulate cortex (BA24),BRNACC,Brain_Anterior_cingulate_cortex_BA24,UBERON_0009835
9,EEEE00,2382380,gtex_v8,8362.0,29230,True,True,False,194,246,63.0,183.0,1809.0,Caudate (sampled at Miami Brain Bank and prese...,Brain,Brain - Caudate (basal ganglia),BRNCDT,Brain_Caudate_basal_ganglia,UBERON_0001873


In [94]:
from pyld import jsonld
import json

#### Get RUI data

In [95]:
# URL = 'https://github.com/hubmapconsortium/ccf-ui/blob/main/projects/ccf-eui/src/assets/gtex/data/rui_locations.jsonld'
#RUI_LOCATION_ENDPOINT = "https://github.com/hubmapconsortium/ccf-ui/blob/main/projects/ccf-eui/src/assets/gtex/data/rui_locations.jsonld"
RUI_LOCATION_ENDPOINT ="https://raw.githubusercontent.com/hubmapconsortium/ccf-ui/main/projects/ccf-eui/src/assets/gtex/data/rui_locations.jsonld"


response = requests.get(RUI_LOCATION_ENDPOINT)
response.raise_for_status()
rui_location_data = jsonld.expand(response.json())

In [96]:
len(rui_location_data)

14

In [12]:
rui_location_data[0]

{'@id': 'https://gtexportal.org/home/eqtls/tissue?tissueName=Kidney_Cortex#FDonors',
 '@type': ['http://purl.org/ccf/latest/ccf-entity.owl#Donor'],
 'http://purl.org/ccf/latest/ccf-entity.owl#consortium_name': [{'@value': 'GTEx'}],
 'http://www.w3.org/2000/01/rdf-schema#comment': [{'@value': 'Entered 9/17/2021, Kristin Ardlie, GTEx Project'}],
 'http://www.w3.org/2000/01/rdf-schema#label': [{'@value': 'Females (n=18), Mean Age 56.8 (range 30-69)'}],
 'http://www.w3.org/2000/01/rdf-schema#seeAlso': [{'@id': 'https://gtexportal.org/home/eqtls/tissue?tissueName=Kidney_Cortex'}],
 'http://purl.org/ccf/latest/ccf-entity.owl#provider_name': [{'@value': 'GTEx Project'}],
 'http://purl.org/ccf/latest/ccf-entity.owl#provider_uuid': [{'@value': '083882bb-6cc6-4c12-a205-eac37c1a2640'}],
 '@reverse': {'http://purl.org/ccf/latest/ccf-entity.owl#has_donor': [{'@id': 'https://gtexportal.org/home/eqtls/tissue?tissueName=Kidney_Cortex#FTissueBlocks',
    '@type': ['http://purl.org/ccf/latest/ccf-entity

In [97]:
def get_fields(data):

    dictionary = {}


    owl_url = 'http://purl.org/ccf/latest/ccf-entity.owl#'
    owl_url_2 = 'http://purl.org/ccf/latest/ccf.owl#'
    rdf_url = 'http://www.w3.org/2000/01/rdf-schema#'
    reverse = data['@reverse']
    has_donor = reverse[owl_url+'has_donor'][0]
    rui_tree = has_donor[owl_url+'has_spatial_entity'][0]['@reverse'][owl_url_2+'has_placement_source'][0]
    
    dictionary['tissue']                = data['@id'].split('=')[-1].split('#')[0].replace('_',' ')
    dictionary['consortium_name']   = data[owl_url + 'consortium_name'][0]['@value']
    dictionary['age']             = data[rdf_url+'label'][0]['@value']
    dictionary['provider_uuid']     = data[owl_url+'provider_uuid'][0]['@value']
    dictionary['sex']               = data[owl_url+'sex'][0]['@value']

    dictionary['comment']           = has_donor[owl_url+'has_dataset'][0][rdf_url+'comment'][0]['@value']
    dictionary['registered']        = has_donor[owl_url+'has_dataset'][0][rdf_url+'label'][0]['@value']

    dictionary['rui_location']      = has_donor[rdf_url+'comment'][0]['@value']
    dictionary['ccf_annotations']   = [v['@id'].split('/')[-1] for v in  has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'ccf_annotation']]
    dictionary['creation_date']     = has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'creation_date'][0]['@value']

    dictionary['x_rotation']        = rui_tree[owl_url_2+'has_x_rotation'][0]['@value']
    dictionary['x_scaling']         = rui_tree[owl_url_2+'has_x_scaling'][0]['@value']
    dictionary['x_translation']     = rui_tree[owl_url_2+'has_x_translation'][0]['@value']

    dictionary['y_rotation']        = rui_tree[owl_url_2+'has_y_rotation'][0]['@value']
    dictionary['y_scaling']         = rui_tree[owl_url_2+'has_y_scaling'][0]['@value']
    dictionary['y_translation']     = rui_tree[owl_url_2+'has_y_translation'][0]['@value']

    dictionary['z_rotation']        = rui_tree[owl_url_2+'has_z_rotation'][0]['@value']
    dictionary['z_scaling']         = rui_tree[owl_url_2+'has_z_scaling'][0]['@value']
    dictionary['z_translation']     = rui_tree[owl_url_2+'has_z_translation'][0]['@value']
    
    dictionary['x_dimension']       = has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'has_x_dimension'][0]['@value']
    dictionary['y_dimension']       = has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'has_y_dimension'][0]['@value']
    dictionary['z_dimension']       = has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'has_z_dimension'][0]['@value']

    dictionary['sample_type']       = has_donor[owl_url+'sample_type'][0]['@value']
    dictionary['section_count']       = has_donor[owl_url+'section_count'][0]['@value']
    dictionary['section_size']       = has_donor[owl_url+'section_size'][0]['@value']
    dictionary['section_units']       = has_donor[owl_url+'section_units'][0]['@value']

    try:
        dictionary['slice_count']     = has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'ccf_slice_count'][0]['@value'] or "unknown"
    except:
        dictionary['slice_count']     = "unknown"

    try:
        dictionary['slice_thickness']       = has_donor[owl_url+'has_spatial_entity'][0][owl_url_2+'ccf_slice_thickness'][0]['@value']
    except:
        dictionary['slice_thickness']       = "unknown"
        
    return dictionary    


In [98]:
df = None
for i,data in enumerate(rui_location_data):
    r_data = get_fields(data)
    print(f'{i+1}')
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(columns=r_data.keys())
    
    df.loc[len(df)] = list(r_data.values())


1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [100]:
df

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,creation_date,...,z_translation,x_dimension,y_dimension,z_dimension,sample_type,section_count,section_size,section_units,slice_count,slice_thickness
0,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",2021-09-17,...,40.782,20,10,4,Tissue Block,1,0.11,millimeter,2,4000
1,Heart Atrial Appendage,GTEx,"Females (n=119), Mean Age 54.7 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",2021-09-17,...,80.223,10,10,8,Tissue Block,1,0.11,millimeter,2,4000
2,Heart Left Ventricle,GTEx,"Females (n=122), Mean Age 52.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_000207...",2021-09-17,...,83.766,10,10,8,Tissue Block,1,0.11,millimeter,2,4000
3,Spleen,GTEx,"Females (n=86), Mean Age 49.5 (range 23-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002106, UBERON_000124...",2021-09-17,...,37.735,10,10,8,Tissue Block,1,0.11,millimeter,2,4000
4,Colon Sigmoid,GTEx,"Females (n=113), Mean Age 51.1 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",2021-09-17,...,180.201,10,10,4,Tissue Block,1,0.11,millimeter,2,2000
5,Colon Transverse,GTEx,"Females (n=136), Mean Age 48.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",2021-09-17,...,108.882,10,10,4,Tissue Block,1,0.11,millimeter,2,2000
6,Lung,GTEx,"Females (n=166), Mean Age 53.4 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",2021-09-22,...,71.67,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown
7,Heart Atrial Appendage,GTEx,"Males (n=253), Mean Age 55.6 (range 20-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",2021-09-17,...,90.093,10,10,8,Tissue Block,1,0.11,millimeter,2,4000
8,Heart Left Ventricle,GTEx,"Males (n=253), Mean Age 55.6 (range 20-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_000207...",2021-09-17,...,60.132,10,10,8,Tissue Block,1,0.11,millimeter,2,4000
9,Spleen,GTEx,"Males (n=141), Mean Age 49.8 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002106, fma15828, fma...",2021-09-17,...,42.816,10,10,8,Tissue Block,1,0.11,millimeter,2,4000


In [101]:
df.columns

Index(['tissue', 'consortium_name', 'age', 'provider_uuid', 'sex', 'comment',
       'registered', 'rui_location', 'ccf_annotations', 'creation_date',
       'x_rotation', 'x_scaling', 'x_translation', 'y_rotation', 'y_scaling',
       'y_translation', 'z_rotation', 'z_scaling', 'z_translation',
       'x_dimension', 'y_dimension', 'z_dimension', 'sample_type',
       'section_count', 'section_size', 'section_units', 'slice_count',
       'slice_thickness'],
      dtype='object')

### Unrolled the ccf_annotations (containing list of UBERON ID) and created new rows belonging to each unique UBERON ID

In [121]:
rui_df = pd.DataFrame(columns=[*df.columns[:9], 'uberonId', *df.columns[9:]])
count=0
for i, data in df.iterrows():
    print(f"{i} : {data['tissue']}\t\t\tUBERON IDS = {len(data['ccf_annotations'])}")
    count+=len(data['ccf_annotations'])
    for uberon in data['ccf_annotations']:
        rui_df.loc[len(rui_df)] = [*data.values[:9],  uberon , *data.values[9:] ]

print(f"Total Uberon IDs : {count}")

0 : Kidney Cortex			UBERON IDS = 9
1 : Heart Atrial Appendage			UBERON IDS = 3
2 : Heart Left Ventricle			UBERON IDS = 4
3 : Spleen			UBERON IDS = 5
4 : Colon Sigmoid			UBERON IDS = 4
5 : Colon Transverse			UBERON IDS = 4
6 : Lung			UBERON IDS = 6
7 : Heart Atrial Appendage			UBERON IDS = 3
8 : Heart Left Ventricle			UBERON IDS = 4
9 : Spleen			UBERON IDS = 5
10 : Kidney Cortex			UBERON IDS = 9
11 : Colon Sigmoid			UBERON IDS = 4
12 : Colon Transverse			UBERON IDS = 4
13 : Lung			UBERON IDS = 9
Total Uberon IDs : 73


In [122]:
rui_df

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,uberonId,...,z_translation,x_dimension,y_dimension,z_dimension,sample_type,section_count,section_size,section_units,slice_count,slice_thickness
0,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0013702,...,40.782,20,10,4,Tissue Block,1,0.11,millimeter,2,4000
1,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0002113,...,40.782,20,10,4,Tissue Block,1,0.11,millimeter,2,4000
2,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0004538,...,40.782,20,10,4,Tissue Block,1,0.11,millimeter,2,4000
3,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0002015,...,40.782,20,10,4,Tissue Block,1,0.11,millimeter,2,4000
4,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0000362,...,40.782,20,10,4,Tissue Block,1,0.11,millimeter,2,4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0002048,...,78.402,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown
69,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0002168,...,78.402,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown
70,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0008952,...,78.402,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown
71,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",fma7386,...,78.402,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown


# EDA

In [123]:
gtex_df.columns

Index(['colorHex', 'colorRgb', 'datasetId', 'eGeneCount', 'expressedGeneCount',
       'hasEGenes', 'hasSGenes', 'mappedInHubmap',
       'rnaSeqAndGenotypeSampleCount', 'rnaSeqSampleCount',
       'rnaSeqSampleCountFemale', 'rnaSeqSampleCountMale', 'sGeneCount',
       'samplingSite', 'tissueSite', 'tissueSiteDetail',
       'tissueSiteDetailAbbr', 'tissueSiteDetailId', 'uberonId'],
      dtype='object')

In [124]:
rui_df['uberonId'].value_counts()

UBERON_0013702    14
UBERON_0000059     4
UBERON_0001155     4
UBERON_0000948     4
UBERON_0002048     4
UBERON_0002015     2
UBERON_0001159     2
UBERON_0004200     2
UBERON_0000362     2
UBERON_0002080     2
UBERON_0001004     2
UBERON_0001284     2
UBERON_0002084     2
UBERON_0001157     2
UBERON_0001225     2
UBERON_0002106     2
UBERON_0001248     2
UBERON_0002189     2
fma15828           2
UBERON_0002078     2
fma15837           2
UBERON_0002113     2
UBERON_0004538     2
UBERON_0003406     1
UBERON_0008952     1
UBERON_0002185     1
fma7385            1
UBERON_0002168     1
fma7386            1
fma7426            1
Name: uberonId, dtype: int64

In [125]:
gtex_df['uberonId'].value_counts()

UBERON_0002037        2
UBERON_0002038        1
UBERON_0002369        1
UBERON_0000996        1
UBERON_0006469        1
UBERON_0000945        1
UBERON_0003889        1
UBERON_0006920        1
UBERON_0000458        1
UBERON_0000007        1
UBERON_0000992        1
UBERON_0001293        1
UBERON_0002106        1
UBERON_0001873        1
UBERON_0001954        1
UBERON_0036149        1
UBERON_0006566        1
UBERON_0013756        1
UBERON_0006330        1
UBERON_0001874        1
UBERON_0001114        1
UBERON_0000473        1
UBERON_0010414        1
UBERON_0012249        1
UBERON_0009835        1
UBERON_0008367        1
UBERON_0001157        1
UBERON_0007610        1
UBERON_0001876        1
UBERON_0002190        1
UBERON_EFO_0002009    1
UBERON_0004648        1
UBERON_0001898        1
UBERON_0004550        1
UBERON_0002367        1
UBERON_0001323        1
UBERON_0001621        1
UBERON_0009834        1
UBERON_0001225        1
UBERON_0002046        1
UBERON_0008952        1
UBERON_0004264  

In [126]:
gtex_df['tissueSite'].value_counts()

Brain              13
Blood Vessel        4
Skin                3
Esophagus           3
Adipose Tissue      2
Colon               2
Kidney              2
Cervix Uteri        2
Heart               2
Thyroid             1
Pituitary           1
Prostate            1
Spleen              1
Small Intestine     1
Nerve               1
Adrenal Gland       1
Bladder             1
Uterus              1
Fallopian Tube      1
Testis              1
Liver               1
Pancreas            1
Vagina              1
Stomach             1
Blood               1
Lung                1
Ovary               1
Salivary Gland      1
Breast              1
Muscle              1
Name: tissueSite, dtype: int64

In [127]:
rui_df['tissue'].value_counts()

Kidney Cortex             18
Lung                      15
Spleen                    10
Colon Sigmoid              8
Colon Transverse           8
Heart Left Ventricle       8
Heart Atrial Appendage     6
Name: tissue, dtype: int64

#### Creating new column 'tissueSite' in RUI dataframe that relates to the 'tissueSite' column in GTE_X_V8

In [128]:
rui_df['tissueSite'] = rui_df['tissue'].str.replace(" ", "_") + '_'
rui_df['tissueSite'] = rui_df['tissueSite'].str.split("_").str.get(0)

In [129]:
rui_df['tissueSite'].value_counts()

Kidney    18
Colon     16
Lung      15
Heart     14
Spleen    10
Name: tissueSite, dtype: int64

In [130]:
gtex_df['tissueSite'].value_counts()

Brain              13
Blood Vessel        4
Skin                3
Esophagus           3
Adipose Tissue      2
Colon               2
Kidney              2
Cervix Uteri        2
Heart               2
Thyroid             1
Pituitary           1
Prostate            1
Spleen              1
Small Intestine     1
Nerve               1
Adrenal Gland       1
Bladder             1
Uterus              1
Fallopian Tube      1
Testis              1
Liver               1
Pancreas            1
Vagina              1
Stomach             1
Blood               1
Lung                1
Ovary               1
Salivary Gland      1
Breast              1
Muscle              1
Name: tissueSite, dtype: int64

In [131]:
rui_df

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,uberonId,...,x_dimension,y_dimension,z_dimension,sample_type,section_count,section_size,section_units,slice_count,slice_thickness,tissueSite
0,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0013702,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
1,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0002113,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
2,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0004538,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
3,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0002015,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
4,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0000362,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0002048,...,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown,Lung
69,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0002168,...,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown,Lung
70,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0008952,...,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown,Lung
71,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",fma7386,...,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown,Lung


In [132]:
#gtex_df['uberonId'].value_counts()
list(gtex_df['uberonId'].unique())

['UBERON_0002190',
 'UBERON_0010414',
 'UBERON_0002369',
 'UBERON_0001496',
 'UBERON_0001621',
 'UBERON_0007610',
 'UBERON_0001255',
 'UBERON_0001876',
 'UBERON_0009835',
 'UBERON_0001873',
 'UBERON_0002037',
 'UBERON_0001870',
 'UBERON_0009834',
 'UBERON_0001954',
 'UBERON_0001898',
 'UBERON_0001882',
 'UBERON_0001874',
 'UBERON_0006469',
 'UBERON_0002038',
 'UBERON_0008367',
 'UBERON_EFO_0000572',
 'UBERON_EFO_0002009',
 'UBERON_0012249',
 'UBERON_0000458',
 'UBERON_0001159',
 'UBERON_0001157',
 'UBERON_0004550',
 'UBERON_0006920',
 'UBERON_0004648',
 'UBERON_0003889',
 'UBERON_0006631',
 'UBERON_0006566',
 'UBERON_0001225',
 'UBERON_0001293',
 'UBERON_0001114',
 'UBERON_0008952',
 'UBERON_0006330',
 'UBERON_0011907',
 'UBERON_0001323',
 'UBERON_0000992',
 'UBERON_0001150',
 'UBERON_0000007',
 'UBERON_0002367',
 'UBERON_0036149',
 'UBERON_0004264',
 'UBERON_0001211',
 'UBERON_0002106',
 'UBERON_0000945',
 'UBERON_0000473',
 'UBERON_0002046',
 'UBERON_0000995',
 'UBERON_0000996',
 'UB

In [133]:
list(rui_df['uberonId'].unique())

['UBERON_0013702',
 'UBERON_0002113',
 'UBERON_0004538',
 'UBERON_0002015',
 'UBERON_0000362',
 'UBERON_0004200',
 'UBERON_0001225',
 'UBERON_0001284',
 'UBERON_0002189',
 'UBERON_0000948',
 'UBERON_0002084',
 'UBERON_0002078',
 'UBERON_0002080',
 'UBERON_0002106',
 'UBERON_0001248',
 'fma15837',
 'fma15828',
 'UBERON_0000059',
 'UBERON_0001155',
 'UBERON_0001157',
 'UBERON_0001159',
 'UBERON_0001004',
 'UBERON_0002048',
 'UBERON_0003406',
 'UBERON_0002185',
 'fma7426',
 'UBERON_0002168',
 'UBERON_0008952',
 'fma7386',
 'fma7385']

In [134]:
common_uberon = list(set(rui_df['uberonId'].unique()).intersection(set(gtex_df['uberonId'].unique())))
common_uberon

['UBERON_0008952',
 'UBERON_0001159',
 'UBERON_0002106',
 'UBERON_0001157',
 'UBERON_0001225']

In [135]:
common_tissue = list(set(rui_df['tissueSite'].unique()).intersection(set(gtex_df['tissueSite'].unique())))
common_tissue

['Colon', 'Heart', 'Lung', 'Kidney', 'Spleen']

In [136]:
rui_hl = set(rui_df[rui_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].unique())
rui_df[rui_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].unique()


array(['UBERON_0013702', 'UBERON_0000948', 'UBERON_0002084',
       'UBERON_0002078', 'UBERON_0002080', 'UBERON_0001004',
       'UBERON_0002048', 'UBERON_0003406', 'UBERON_0002185', 'fma7426',
       'UBERON_0002168', 'UBERON_0008952', 'fma7386', 'fma7385'],
      dtype=object)

In [137]:
rui_df[rui_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].value_counts()

UBERON_0013702    6
UBERON_0002048    4
UBERON_0000948    4
UBERON_0002084    2
UBERON_0001004    2
UBERON_0002080    2
UBERON_0002078    2
UBERON_0008952    1
UBERON_0003406    1
UBERON_0002185    1
UBERON_0002168    1
fma7386           1
fma7426           1
fma7385           1
Name: uberonId, dtype: int64

In [138]:
gtex_hl = set(gtex_df[gtex_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].unique())
gtex_df[gtex_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].unique()

array(['UBERON_0006631', 'UBERON_0006566', 'UBERON_0008952'], dtype=object)

In [139]:
rui_hl = set(rui_df[rui_df['tissueSite'].isin(['Lung'])]['uberonId'].unique())
rui_df[rui_df['tissueSite'].isin(['Lung'])]['uberonId'].unique()

array(['UBERON_0013702', 'UBERON_0001004', 'UBERON_0002048',
       'UBERON_0003406', 'UBERON_0002185', 'fma7426', 'UBERON_0002168',
       'UBERON_0008952', 'fma7386', 'fma7385'], dtype=object)

In [140]:
gtex_hl.intersection(rui_hl)

{'UBERON_0008952'}

### Lung UBERON mismatches

In [141]:
#gtex_df[gtex_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].value_counts()
gtex_l = set(gtex_df[gtex_df['tissueSite'] == 'Lung']['uberonId'].unique())
gtex_df[gtex_df['tissueSite'] == 'Lung']['uberonId'].unique()

array(['UBERON_0008952'], dtype=object)

In [142]:
rui_l = set(rui_df[rui_df['tissueSite'] == 'Lung']['uberonId'].unique())
rui_df[rui_df['tissueSite'] == 'Lung']['uberonId'].unique()

array(['UBERON_0013702', 'UBERON_0001004', 'UBERON_0002048',
       'UBERON_0003406', 'UBERON_0002185', 'fma7426', 'UBERON_0002168',
       'UBERON_0008952', 'fma7386', 'fma7385'], dtype=object)

In [143]:
gtex_l.intersection(rui_l)

{'UBERON_0008952'}

### Heart UBERON Mismatches

In [144]:
rui_h = set(rui_df[rui_df['tissueSite'] == 'Heart']['uberonId'].unique())
rui_df[rui_df['tissueSite'] == 'Heart']['uberonId'].unique()

array(['UBERON_0013702', 'UBERON_0000948', 'UBERON_0002084',
       'UBERON_0002078', 'UBERON_0002080'], dtype=object)

In [145]:
#gtex_df[gtex_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].value_counts()
gtex_h = set(gtex_df[gtex_df['tissueSite'] == 'Heart']['uberonId'].unique())
gtex_df[gtex_df['tissueSite'] == 'Heart']['uberonId'].unique()

array(['UBERON_0006631', 'UBERON_0006566'], dtype=object)

In [146]:
gtex_h.intersection(rui_h)

set()

In [147]:
gtex_df[gtex_df['tissueSite'].isin(['Heart', 'Lung'])]['uberonId'].value_counts()

UBERON_0006566    1
UBERON_0008952    1
UBERON_0006631    1
Name: uberonId, dtype: int64

In [148]:
rui_df[rui_df['uberonId'].isin(common_uberon)]

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,uberonId,...,x_dimension,y_dimension,z_dimension,sample_type,section_count,section_size,section_units,slice_count,slice_thickness,tissueSite
6,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0001225,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
17,Spleen,GTEx,"Females (n=86), Mean Age 49.5 (range 23-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002106, UBERON_000124...",UBERON_0002106,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Spleen
24,Colon Sigmoid,GTEx,"Females (n=113), Mean Age 51.1 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",UBERON_0001157,...,10,10,4,Tissue Block,1,0.11,millimeter,2,2000,Colon
28,Colon Transverse,GTEx,"Females (n=136), Mean Age 48.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",UBERON_0001159,...,10,10,4,Tissue Block,1,0.11,millimeter,2,2000,Colon
43,Spleen,GTEx,"Males (n=141), Mean Age 49.8 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002106, fma15828, fma...",UBERON_0002106,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Spleen
53,Kidney Cortex,GTEx,"Males (n=55), Mean Age 55.8 (range 27-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0001225,...,20,10,4,Tissue Block,1,0.11,millimeter,2,4000,Kidney
59,Colon Sigmoid,GTEx,"Males (n=55), Mean Age 55.8 (range 27-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",UBERON_0001157,...,10,10,4,Tissue Block,1,0.11,millimeter,2,2000,Colon
63,Colon Transverse,GTEx,"Males (n=232), Mean Age 50.1 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",UBERON_0001159,...,10,10,4,Tissue Block,1,0.11,millimeter,2,2000,Colon
70,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0008952,...,10,10,10,Tissue Block,1,0.11,millimeter,unknown,unknown,Lung


In [149]:
gtex_df[gtex_df['uberonId'].isin(common_uberon)]

Unnamed: 0,colorHex,colorRgb,datasetId,eGeneCount,expressedGeneCount,hasEGenes,hasSGenes,mappedInHubmap,rnaSeqAndGenotypeSampleCount,rnaSeqSampleCount,rnaSeqSampleCountFemale,rnaSeqSampleCountMale,sGeneCount,samplingSite,tissueSite,tissueSiteDetail,tissueSiteDetailAbbr,tissueSiteDetailId,uberonId
25,EEBB77,238187119,gtex_v8,10550,28454,True,True,True,318,373,133,240,3269,"Sigmoid colon, Obtain muscularis only; discard...",Colon,Colon - Sigmoid,CLNSGM,Colon_Sigmoid,UBERON_0001159
26,CC9955,20415385,gtex_v8,11686,29574,True,True,True,368,406,147,259,3459,"Transverse, Full thickness: mucosa and muscularis",Colon,Colon - Transverse,CLNTRN,Colon_Transverse,UBERON_0001157
33,22FFDD,34255221,gtex_v8,1260,29263,True,True,True,73,85,19,66,547,Left kidney cortex.,Kidney,Kidney - Cortex,KDNCTX,Kidney_Cortex,UBERON_0001225
36,99FF00,1532550,gtex_v8,14113,30049,True,True,True,515,578,183,395,4774,"Inferior segment of left upper lobe, 1 cm belo...",Lung,Lung,LUNG,Lung,UBERON_0008952
47,778855,11913685,gtex_v8,10783,29856,True,True,True,227,241,87,154,2837,"Central region, 5 mm below capsule.",Spleen,Spleen,SPLEEN,Spleen,UBERON_0002106


In [150]:
list(gtex_df[gtex_df['uberonId'].isin(common_uberon)]['samplingSite'])

['Sigmoid colon, Obtain muscularis only; discard mucosa.',
 'Transverse, Full thickness: mucosa and muscularis',
 'Left kidney cortex.',
 'Inferior segment of left upper lobe, 1 cm below the pleural surface.',
 'Central region, 5 mm below capsule.']

#### Inner join of GTEX_V8 and RUI Locations dataframes : Primary Key = UBERON ID

In [151]:
uberon_inner_join_df = pd.merge(rui_df,gtex_df, left_on='uberonId', right_on='uberonId', how='inner', suffixes= ('_rui','_gtex'))

In [152]:
uberon_inner_join_df.head()

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,uberonId,...,rnaSeqAndGenotypeSampleCount,rnaSeqSampleCount,rnaSeqSampleCountFemale,rnaSeqSampleCountMale,sGeneCount,samplingSite,tissueSite_gtex,tissueSiteDetail,tissueSiteDetailAbbr,tissueSiteDetailId
0,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0001225,...,73,85,19,66,547,Left kidney cortex.,Kidney,Kidney - Cortex,KDNCTX,Kidney_Cortex
1,Kidney Cortex,GTEx,"Males (n=55), Mean Age 55.8 (range 27-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0001225,...,73,85,19,66,547,Left kidney cortex.,Kidney,Kidney - Cortex,KDNCTX,Kidney_Cortex
2,Spleen,GTEx,"Females (n=86), Mean Age 49.5 (range 23-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002106, UBERON_000124...",UBERON_0002106,...,227,241,87,154,2837,"Central region, 5 mm below capsule.",Spleen,Spleen,SPLEEN,Spleen
3,Spleen,GTEx,"Males (n=141), Mean Age 49.8 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002106, fma15828, fma...",UBERON_0002106,...,227,241,87,154,2837,"Central region, 5 mm below capsule.",Spleen,Spleen,SPLEEN,Spleen
4,Colon Sigmoid,GTEx,"Females (n=113), Mean Age 51.1 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000059, UBERON_000115...",UBERON_0001157,...,368,406,147,259,3459,"Transverse, Full thickness: mucosa and muscularis",Colon,Colon - Transverse,CLNTRN,Colon_Transverse


In [153]:
uberon_inner_join_df.to_csv('gtex_vs_rui.csv', index=None)

In [154]:
#tissue_inner_join_df = pd.merge(rui_df,gtex_df, on=['tissueSite'], how='inner', suffixes= ('_rui','_gtex'))
tissue_inner_join_df = pd.merge(rui_df,gtex_df, left_on='tissueSite', right_on='tissueSite', how='inner', suffixes= ('_rui','_gtex'))

In [155]:
tissue_inner_join_df

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,uberonId_rui,...,rnaSeqAndGenotypeSampleCount,rnaSeqSampleCount,rnaSeqSampleCountFemale,rnaSeqSampleCountMale,sGeneCount,samplingSite,tissueSiteDetail,tissueSiteDetailAbbr,tissueSiteDetailId,uberonId_gtex
0,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0013702,...,73,85,19,66,547,Left kidney cortex.,Kidney - Cortex,KDNCTX,Kidney_Cortex,UBERON_0001225
1,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0013702,...,4,4,1,3,,Left kidney medulla adjacent to Cortex.,Kidney - Medulla,KDNMDL,Kidney_Medulla,UBERON_0001293
2,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0002113,...,73,85,19,66,547,Left kidney cortex.,Kidney - Cortex,KDNCTX,Kidney_Cortex,UBERON_0001225
3,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0002113,...,4,4,1,3,,Left kidney medulla adjacent to Cortex.,Kidney - Medulla,KDNMDL,Kidney_Medulla,UBERON_0001293
4,Kidney Cortex,GTEx,"Females (n=18), Mean Age 56.8 (range 30-69)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 9/17/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0002113, UBERON_000453...",UBERON_0004538,...,73,85,19,66,547,Left kidney cortex.,Kidney - Cortex,KDNCTX,Kidney_Cortex,UBERON_0001225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0002048,...,515,578,183,395,4774,"Inferior segment of left upper lobe, 1 cm belo...",Lung,LUNG,Lung,UBERON_0008952
117,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0002168,...,515,578,183,395,4774,"Inferior segment of left upper lobe, 1 cm belo...",Lung,LUNG,Lung,UBERON_0008952
118,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",UBERON_0008952,...,515,578,183,395,4774,"Inferior segment of left upper lobe, 1 cm belo...",Lung,LUNG,Lung,UBERON_0008952
119,Lung,GTEx,"Males (n=349), Mean Age 53.5 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0001004, UBERON_000204...",fma7386,...,515,578,183,395,4774,"Inferior segment of left upper lobe, 1 cm belo...",Lung,LUNG,Lung,UBERON_0008952


In [156]:
gtex_df[gtex_df['tissueSite']=='Heart']

Unnamed: 0,colorHex,colorRgb,datasetId,eGeneCount,expressedGeneCount,hasEGenes,hasSGenes,mappedInHubmap,rnaSeqAndGenotypeSampleCount,rnaSeqSampleCount,rnaSeqSampleCountFemale,rnaSeqSampleCountMale,sGeneCount,samplingSite,tissueSite,tissueSiteDetail,tissueSiteDetailAbbr,tissueSiteDetailId,uberonId
31,9900FF,1530255,gtex_v8,10991,27818,True,True,True,372,429,136,293,3055,"Right atrial appendage, tip (if fatty or disco...",Heart,Heart - Atrial Appendage,HRTAA,Heart_Atrial_Appendage,UBERON_0006631
32,660099,1020153,gtex_v8,9642,26037,True,True,True,386,432,138,294,2357,"Anterior left ventricle, 1 cm above apex and 1...",Heart,Heart - Left Ventricle,HRTLV,Heart_Left_Ventricle,UBERON_0006566


In [157]:
gtex_df[gtex_df['tissueSite']=='Lung']

Unnamed: 0,colorHex,colorRgb,datasetId,eGeneCount,expressedGeneCount,hasEGenes,hasSGenes,mappedInHubmap,rnaSeqAndGenotypeSampleCount,rnaSeqSampleCount,rnaSeqSampleCountFemale,rnaSeqSampleCountMale,sGeneCount,samplingSite,tissueSite,tissueSiteDetail,tissueSiteDetailAbbr,tissueSiteDetailId,uberonId
36,99FF00,1532550,gtex_v8,14113,30049,True,True,True,515,578,183,395,4774,"Inferior segment of left upper lobe, 1 cm belo...",Lung,Lung,LUNG,Lung,UBERON_0008952


In [158]:
rui_df[rui_df['tissueSite'] == 'Heart']

Unnamed: 0,tissue,consortium_name,age,provider_uuid,sex,comment,registered,rui_location,ccf_annotations,uberonId,...,x_dimension,y_dimension,z_dimension,sample_type,section_count,section_size,section_units,slice_count,slice_thickness,tissueSite
9,Heart Atrial Appendage,GTEx,"Females (n=119), Mean Age 54.7 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",UBERON_0013702,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
10,Heart Atrial Appendage,GTEx,"Females (n=119), Mean Age 54.7 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",UBERON_0000948,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
11,Heart Atrial Appendage,GTEx,"Females (n=119), Mean Age 54.7 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",UBERON_0002084,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
12,Heart Left Ventricle,GTEx,"Females (n=122), Mean Age 52.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_000207...",UBERON_0013702,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
13,Heart Left Ventricle,GTEx,"Females (n=122), Mean Age 52.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_000207...",UBERON_0000948,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
14,Heart Left Ventricle,GTEx,"Females (n=122), Mean Age 52.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_000207...",UBERON_0002078,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
15,Heart Left Ventricle,GTEx,"Females (n=122), Mean Age 52.9 (range 21-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Female,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_000207...",UBERON_0002080,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
35,Heart Atrial Appendage,GTEx,"Males (n=253), Mean Age 55.6 (range 20-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",UBERON_0013702,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
36,Heart Atrial Appendage,GTEx,"Males (n=253), Mean Age 55.6 (range 20-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",UBERON_0000948,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
37,Heart Atrial Appendage,GTEx,"Males (n=253), Mean Age 55.6 (range 20-70)",083882bb-6cc6-4c12-a205-eac37c1a2640,Male,Data/Assay Types: XXX,"Registered 5/18/2021, Kristin Ardlie, GTEx Pro...","1.08 x 4.06 x 0.11 millimeter, 0.11 millimeter...","[UBERON_0013702, UBERON_0000948, UBERON_0002084]",UBERON_0002084,...,10,10,8,Tissue Block,1,0.11,millimeter,2,4000,Heart
