In [335]:
## Finding MARKER GENES from GTEx

In [336]:
# Import modules

import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
import os

In [337]:
# Getting the dataset to analyze
# The path is relative, so the file is looked up in the notebook's working directory.

GTEx_file = "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"

In [338]:
# Load the tab-separated table into a DataFrame.
# The first two lines of the file are skipped; the third line is used as the header row.

df = pd.read_table(GTEx_file, skiprows=2)
df

Unnamed: 0,Name,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
0,ENSG00000223972.5,DDX11L1,0.000000,0.000000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.166403,0.000000,0.000000,0.000000,0.00000
1,ENSG00000227232.5,WASH7P,4.064030,3.371110,2.68549,4.04762,3.90076,3.639630,5.16375,1.43859,...,5.932980,6.13265,4.193780,5.926310,3.062480,4.702530,6.272550,7.190010,5.745540,2.64743
2,ENSG00000278267.1,MIR6859-1,0.000000,0.000000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
3,ENSG00000243485.5,MIR1302-2HG,0.000000,0.000000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.054223,0.000000,0.000000,0.000000,0.00000
4,ENSG00000237613.2,FAM138A,0.000000,0.000000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56195,ENSG00000198695.2,MT-ND6,3517.220000,3598.090000,6637.79000,6374.49000,4972.73000,4635.000000,8275.20000,9631.77000,...,8919.460000,8113.22000,9764.490000,5477.360000,9628.750000,1676.590000,2520.410000,3644.000000,2237.430000,960.29100
56196,ENSG00000210194.1,MT-TE,11.903800,11.203400,24.41390,22.65940,20.58800,18.257600,27.50560,40.58860,...,32.932100,31.33400,35.456000,24.934900,33.496300,2.871770,5.795670,10.370100,5.724510,3.49736
56197,ENSG00000198727.2,MT-CYB,23839.100000,25511.700000,28799.40000,12919.40000,15943.20000,13051.500000,23663.20000,32931.80000,...,19058.200000,17223.10000,24521.300000,11578.600000,27193.100000,15337.800000,19868.500000,15025.000000,12803.400000,2305.49000
56198,ENSG00000210195.2,MT-TT,0.616205,0.697191,1.50928,0.00000,0.00000,0.544659,0.00000,3.44476,...,0.884701,1.11364,0.815455,0.783492,0.803858,0.443781,0.000000,0.000000,0.000000,0.00000


In [339]:
# Remove the mitochondrial genes (symbols starting with "MT-") from the dataset

# Cast to str first so the .str accessor can be applied to every row
df['Description'] = df['Description'].astype(str)
is_mitochondrial = df['Description'].str.startswith('MT-')
df_without_mt = df[~is_mitochondrial]
df_without_mt


Unnamed: 0,Name,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
0,ENSG00000223972.5,DDX11L1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.166403,0.00000,0.00000,0.00000,0.00000
1,ENSG00000227232.5,WASH7P,4.06403,3.37111,2.68549,4.04762,3.90076,3.63963,5.16375,1.43859,...,5.93298,6.13265,4.19378,5.92631,3.06248,4.702530,6.27255,7.19001,5.74554,2.64743
2,ENSG00000278267.1,MIR6859-1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
3,ENSG00000243485.5,MIR1302-2HG,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.054223,0.00000,0.00000,0.00000,0.00000
4,ENSG00000237613.2,FAM138A,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56158,ENSG00000223484.7_PAR_Y,TRPC6P,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
56159,ENSG00000124334.17_PAR_Y,IL9R,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
56160,ENSG00000185203.12_PAR_Y,WASIR1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
56161,ENSG00000182484.15_PAR_Y,WASH6P,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000


In [340]:
# Delete all genes that are considered ribosomal.
# Matching is by exact gene symbol against the "Description" column.

delete_genes = ["FAU", "MRPL13", "RPL10", "RPL10A", "RPL10L", "RPL11", "RPL12", "RPL13", "RPL13A", "RPL14",
    "RPL15", "RPL17", "RPL18", "RPL18A", "RPL19", "RPL21", "RPL22", "RPL22L1", "RPL23", "RPL23A",
    "RPL24", "RPL26", "RPL26L1", "RPL27", "RPL27A", "RPL28", "RPL29", "RPL3", "RPL30", "RPL31",
    "RPL32", "RPL34", "RPL35", "RPL35A", "RPL36", "RPL36A", "RPL36AL", "RPL37", "RPL37A", "RPL38",
    "RPL39", "RPL3L", "RPL4", "RPL41", "RPL5", "RPL6", "RPL7", "RPL7A", "RPL8", "RPL9", "RPLP0",
    "RPLP1", "RPLP2", "RPS10", "RPS11", "RPS12", "RPS13", "RPS15", "RPS15A", "RPS16", "RPS17",
    "RPS18", "RPS19", "RPS2", "RPS20", "RPS21", "RPS23", "RPS24", "RPS25", "RPS26", "RPS27",
    "RPS27A", "RPS27L", "RPS28", "RPS29", "RPS3", "RPS3A", "RPS4X", "RPS4Y1", "RPS5", "RPS6",
    "RPS7", "RPS8", "RPS9", "RPSA", "RSL24D1", "RSL24D1P11", "UBA52"]

# Select the gene-symbol column by name rather than by position, so the filter
# cannot silently run against a different column if the column layout changes.
is_ribosomal = df_without_mt["Description"].isin(delete_genes)
df_no_mt_ribo = df_without_mt[~is_ribosomal]
df_no_mt_ribo

Unnamed: 0,Name,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
0,ENSG00000223972.5,DDX11L1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.166403,0.00000,0.00000,0.00000,0.00000
1,ENSG00000227232.5,WASH7P,4.06403,3.37111,2.68549,4.04762,3.90076,3.63963,5.16375,1.43859,...,5.93298,6.13265,4.19378,5.92631,3.06248,4.702530,6.27255,7.19001,5.74554,2.64743
2,ENSG00000278267.1,MIR6859-1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
3,ENSG00000243485.5,MIR1302-2HG,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.054223,0.00000,0.00000,0.00000,0.00000
4,ENSG00000237613.2,FAM138A,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56158,ENSG00000223484.7_PAR_Y,TRPC6P,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
56159,ENSG00000124334.17_PAR_Y,IL9R,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
56160,ENSG00000185203.12_PAR_Y,WASIR1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
56161,ENSG00000182484.15_PAR_Y,WASH6P,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000


In [341]:
# Checking if the change worked - expecting *not* to find the gene
# The gene-symbol column is addressed by name: a positional lookup would silently
# inspect an expression column (and always report False) once "Description" has
# been moved into the index, whereas this raises a KeyError in that situation.

gene_name = "RPL22"
is_in_description = gene_name in df_no_mt_ribo["Description"].values
is_in_description

# Indeed got "False"

False

In [342]:
# Organizing the dataframe to the wanted index (gene symbol, i.e. "Description").
# The frame is rebound to a new object instead of being mutated with inplace=True,
# and the guard turns a re-run of this cell into a no-op instead of a KeyError
# (after the first run "Description" is the index and no longer a column).

if "Description" in df_no_mt_ribo.columns:
    df_no_mt_ribo = df_no_mt_ribo.set_index("Description")
df_no_mt_ribo

Unnamed: 0_level_0,Name,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DDX11L1,ENSG00000223972.5,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.166403,0.00000,0.00000,0.00000,0.00000
WASH7P,ENSG00000227232.5,4.06403,3.37111,2.68549,4.04762,3.90076,3.63963,5.16375,1.43859,1.69285,...,5.93298,6.13265,4.19378,5.92631,3.06248,4.702530,6.27255,7.19001,5.74554,2.64743
MIR6859-1,ENSG00000278267.1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
MIR1302-2HG,ENSG00000243485.5,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.054223,0.00000,0.00000,0.00000,0.00000
FAM138A,ENSG00000237613.2,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRPC6P,ENSG00000223484.7_PAR_Y,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
IL9R,ENSG00000124334.17_PAR_Y,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
WASIR1,ENSG00000185203.12_PAR_Y,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
WASH6P,ENSG00000182484.15_PAR_Y,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000


In [343]:
# Dropping unnecessary columns (only "Name" is removed; the remaining columns are kept as-is)

df_filtered = df_no_mt_ribo.drop(columns=["Name"])
df_filtered

Unnamed: 0_level_0,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DDX11L1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.166403,0.00000,0.00000,0.00000,0.00000
WASH7P,4.06403,3.37111,2.68549,4.04762,3.90076,3.63963,5.16375,1.43859,1.69285,1.566050,...,5.93298,6.13265,4.19378,5.92631,3.06248,4.702530,6.27255,7.19001,5.74554,2.64743
MIR6859-1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
MIR1302-2HG,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.024264,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.054223,0.00000,0.00000,0.00000,0.00000
FAM138A,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRPC6P,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
IL9R,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
WASIR1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000
WASH6P,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000


In [344]:
# Build one sub-dataframe per fat tissue.
# Each sub-dataframe keeps a single fat tissue and drops the other two.

fat_tissue_columns = ['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)', 'Breast - Mammary Tissue']

df_only_adipose_subcutaneous, df_only_adipose_visceral, df_only_breast = (
    df_filtered.drop(columns=[other for other in fat_tissue_columns if other != kept])
    for kept in fat_tissue_columns
)


In [353]:
# Creating a function to get marker genes for each sample in a dictionary

def get_marker_genes(df, FC_THRESH=2, MAX_EXP_THRESH=1e-4, PN=1e-9):
    """Find, for every column of ``df``, the rows that are markers of that column.

    Each column is first divided by its own sum. A row is a marker of a column
    when its normalised value in that column is more than ``FC_THRESH`` times
    its largest normalised value among all the other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table with one row per gene and one column per tissue/sample.
        The caller's frame is not modified.
    FC_THRESH : float, default 2
        Minimum ratio (column value / max of the other columns) for a row to
        be reported.
    MAX_EXP_THRESH : float, default 1e-4
        Rows whose largest normalised value across all columns does not exceed
        this threshold are ignored.
    PN : float, default 1e-9
        Small constant added to both numerator and denominator of the ratio,
        which avoids division by zero.

    Returns
    -------
    dict
        Maps each column name to a ``pandas.Series`` of ratios (indexed like
        ``df``) containing only the rows whose ratio exceeds ``FC_THRESH``.
        Columns without any marker map to an empty Series.
    """
    # Normalise every column so that its values sum to 1
    normalised = df / df.sum(axis=0)

    # The expression filter does not depend on the tissue being examined,
    # so it is evaluated once here instead of once per loop iteration.
    is_expressed = normalised.max(axis=1) > MAX_EXP_THRESH
    expressed = normalised[is_expressed]

    marker_genes_dict = dict()

    # Loop through each tissue/sample
    for sample in expressed.columns:
        sample_expression = expressed[sample]
        other_columns = expressed.columns[expressed.columns != sample]
        other_samples_max_expression = expressed[other_columns].max(axis=1)

        # Ratio of expression in the current sample vs. the highest other sample
        tissue_ratio = (sample_expression + PN) / (other_samples_max_expression + PN)
        marker_genes_dict[sample] = tissue_ratio[tissue_ratio > FC_THRESH]

    return marker_genes_dict

# Preview: marker genes of every tissue column in df_filtered (displayed only, the result is not stored)
get_marker_genes(df_filtered)

{'Adipose - Subcutaneous': Description
 LEP    3.311776
 dtype: float64,
 'Adipose - Visceral (Omentum)': Description
 ITLN1    6.112849
 IL6      2.245161
 MT1A     2.960495
 dtype: float64,
 'Adrenal Gland': Description
 DHCR24      2.416061
 MCOLN3      2.060277
 HSD3B2    233.794937
 SOAT1       6.311243
 EPHX1       2.404092
              ...    
 LONP1       2.706349
 CTSA        2.226805
 C2CD2       4.027418
 RGN         2.156162
 ZNF275      2.127335
 Length: 69, dtype: float64,
 'Artery - Aorta': Description
 ITGA10      2.061956
 FRZB        2.223676
 IGFBP2      2.022894
 LMCD1       2.264796
 CYTL1       5.214830
 MRAP2       2.555021
 SMOC2       2.221285
 ELN         2.426664
 CRISPLD1    3.281175
 ITGA8       2.057680
 ANO1        2.397138
 FBLN5       2.575412
 CDH13       2.112110
 MYH10       2.701660
 dtype: float64,
 'Artery - Coronary': Series([], dtype: float64),
 'Artery - Tibial': Description
 ID2        2.506518
 RHOB       2.128685
 FHL5       3.565790
 HEY2 

In [346]:
# Run the marker-gene search on each of the 3 sub-dataframes

only_adipose_visceral, only_adipose_subcutaneous, only_breast = (
    get_marker_genes(sub_frame)
    for sub_frame in (df_only_adipose_visceral, df_only_adipose_subcutaneous, df_only_breast)
)

In [347]:
# Making a list for each fat tissue, with gene_name and exp_value.
# get_marker_genes returns one Series per tissue column, so taking every value of
# the dictionary would mix in the marker genes of all the non-fat tissues as well.
# Only the Series of the fat tissue itself is kept here (wrapped in a list so the
# next cell can still build a DataFrame whose columns are the gene names).

vis = [only_adipose_visceral['Adipose - Visceral (Omentum)']]
sub = [only_adipose_subcutaneous['Adipose - Subcutaneous']]
bre = [only_breast['Breast - Mammary Tissue']]

In [348]:
# Keep only the gene names: they are the column labels of the frame built from each list of Series

df_split_vis = pd.DataFrame(vis)
lst_vis = list(df_split_vis.columns)

df_split_sub = pd.DataFrame(sub)
lst_sub = list(df_split_sub.columns)

df_split_bre = pd.DataFrame(bre)
lst_bre = list(df_split_bre.columns)


In [351]:
# Combine visceral (vis) and subcutaneous (sub), and delete the genes that also appear in breast (bre)

union_vis_sub = list(set(lst_sub + lst_vis))
sorted_lst = sorted(union_vis_sub)

# Genes of the vis/sub union that are also present in the breast list.
# Built from the sorted union so the order is deterministic between runs.
breast_genes = set(lst_bre)
intersect_bre_union = [gene for gene in sorted_lst if gene in breast_genes]

# Set membership keeps the filtering linear; the original order of lst_vis / lst_sub is preserved.
shared_with_breast = set(intersect_bre_union)
vis_genes_not_in_intersection = [gene for gene in lst_vis if gene not in shared_with_breast]
sub_genes_not_in_intersection = [gene for gene in lst_sub if gene not in shared_with_breast]


In [352]:
# Saving the final vis and sub lists, that don't have breast genes, as one-column CSV files ("Gene")

# NOTE(review): absolute, machine-specific output directory - consider making it
# configurable or relative to the notebook before sharing.
subdirectory = "C:/python/Fat_tissue_analysis"
# exist_ok=True creates the directory when missing without a separate existence check
os.makedirs(subdirectory, exist_ok=True)

file_path_vis = os.path.join(subdirectory, "vis.csv")
df_vis = pd.DataFrame(vis_genes_not_in_intersection, columns=["Gene"])
df_vis.to_csv(file_path_vis, index=False)

file_path_sub = os.path.join(subdirectory, "sub.csv")
df_sub = pd.DataFrame(sub_genes_not_in_intersection, columns=["Gene"])
df_sub.to_csv(file_path_sub, index=False)