<a href="https://colab.research.google.com/github/Dowell-Lab/psea/blob/main/notebook_examples/one_comorbid_many_gene.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages we will use

In [32]:
import pandas as pd
import plotly.express as px
from scipy.stats import zscore
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.ensemble import GradientBoostingRegressor

# Read in the files we will look at

In [33]:
# Significance settings shared by the cells below.
cuttoff = 0.1  # condition-gene links are kept only when their adjusted p-value is below this
adjpvalcol = "p_value_BenjaminiHochberg"  # results column that the cutoff is applied to

In [34]:
# PSEA score table with adjusted p-values (per the file name), read into `df`;
# the first CSV column is used as the index.
url="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/psea_scores_20240923-152820.adjpval.csv"
df = pd.read_csv(url, index_col=0)
# Gene annotation table, read into `genedf`; it is merged onto the PSEA rows
# further down to attach gene names and biotypes.
url2="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/genes.csv"
genedf = pd.read_csv(url2, index_col=0)

In [35]:
# Copy the gene ID into a "value" column so that genedf can be merged with the
# PSEA table, which is joined on a column named "value".
genedf["value"]=genedf["gene_id"]

In [36]:
# Load the medical disorders (comorbidities) for all the individuals with
# Trisomy 21 in the Human Trisome Project.
# NOTE: this rebinds `url`, which earlier held the PSEA scores address.
url = "https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/comorbid_file.csv"
comorbid_df = pd.read_csv(url, index_col=0)



In [37]:
# Load the normalized counts for all the individuals with Trisomy 21 in the
# Human Trisome Project. These are not the real count data but are similar to
# real count data.
# Earlier Google Drive source, kept for reference only:
#df=pd.read_csv('/content/drive/MyDrive/normcounts.csv')
url_counts="https://media.githubusercontent.com/media/Dowell-Lab/psea/refs/heads/main/testdata/value_expression_large.csv"
gene_exp_df=pd.read_csv(url_counts, index_col=0)



# Create a data frame of each gene linked to a comorbidity

In [38]:
def filter_condition_gene_links_by_threshold(df, cuttoff, adjpvalcol, gene_annotations=None):
  """Keep the significant condition-gene links and attach gene annotations.

  Parameters
  ----------
  df : pandas.DataFrame
      Condition-gene results. Must contain the column named by ``adjpvalcol``
      and a ``"value"`` column used as the merge key.
  cuttoff : float
      Rows are kept only when ``df[adjpvalcol] < cuttoff`` (strictly below).
  adjpvalcol : str
      Name of the adjusted p-value column the cutoff is applied to.
  gene_annotations : pandas.DataFrame, optional
      Annotation table with a ``"value"`` column. When omitted, the
      notebook-level ``genedf`` is used, which is what this function always
      did before the parameter existed.

  Returns
  -------
  pandas.DataFrame
      Inner merge of the surviving rows with the annotation table on
      ``"value"``; rows without a matching annotation are dropped.
  """
  if gene_annotations is None:
    # Backward-compatible default: fall back to the notebook-global table.
    gene_annotations = genedf
  # dropna() removes a row if ANY of its columns is missing, not only the p-value.
  complete_rows = df.dropna()
  significant_rows = complete_rows[complete_rows[adjpvalcol] < cuttoff]
  return significant_rows.merge(gene_annotations, how="inner", on="value")


def acomorbidanditsgenes_metadata(acomorbid, genedf, cuttoff=0.1, adjpvalcol="p_value_BenjaminiHochberg"):
  """Return the annotated, significant gene links for one condition.

  Parameters
  ----------
  acomorbid : str
      Condition name to select; compared against the ``"binary_attribute"``
      column.
  genedf : pandas.DataFrame
      Accepted for interface compatibility. It is not forwarded here; the
      filtering helper performs the annotation merge itself.
  cuttoff : float, default 0.1
      Adjusted p-value cutoff passed to the filtering helper.
  adjpvalcol : str, default "p_value_BenjaminiHochberg"
      Column the cutoff is applied to.

  Returns
  -------
  pandas.DataFrame
      Rows for ``acomorbid`` only, sorted by ``"NES"`` in ascending order.

  Notes
  -----
  The results table is read from the notebook-level ``df`` because this
  signature has no parameter for it.
  """
  df_nona_threshold_names = filter_condition_gene_links_by_threshold(df, cuttoff, adjpvalcol)
  # Select on the function's own argument. The previous body compared against
  # the global `acondition`, so the `acomorbid` argument was silently ignored.
  is_requested_condition = df_nona_threshold_names["binary_attribute"] == acomorbid
  aconditiondf = df_nona_threshold_names[is_requested_condition]
  return aconditiondf.sort_values(["NES"])


def acomorbidanditsgenesexp(acomorbid, gene_exp_df, comorbid_df, df, genedf, cuttoff=0.1, adjpvalcol="p_value_BenjaminiHochberg", collect="all"):
  """Build a per-patient table of condition status plus linked-gene expression.

  Parameters
  ----------
  acomorbid : str
      Condition name; must be a column of ``comorbid_df`` and a value in the
      ``"binary_attribute"`` column of ``df``.
  gene_exp_df : pandas.DataFrame
      Expression table with a ``"Patient"`` column and one column per gene ID.
  comorbid_df : pandas.DataFrame
      Table with a ``"Patient"`` column and one column per condition.
  df : pandas.DataFrame
      Condition-gene results table that the significance filter is applied to.
  genedf : pandas.DataFrame
      Accepted for interface compatibility; the filtering helper performs the
      annotation merge itself.
  cuttoff : float, default 0.1
      Adjusted p-value cutoff.
  adjpvalcol : str, default "p_value_BenjaminiHochberg"
      Column the cutoff is applied to.
  collect : str, default "all"
      ``"high"`` keeps genes with ``NES < 0``, ``"low"`` keeps genes with
      ``NES > 0``; any other value keeps every linked gene.

  Returns
  -------
  pandas.DataFrame
      ``comorbid_df[[acomorbid, "Patient"]]`` merged on ``"Patient"`` with the
      expression columns of the selected genes (gene columns sorted by ID).

  Raises
  ------
  KeyError
      If a selected gene ID or ``"Patient"`` is not a column of ``gene_exp_df``.
  """
  # Filter the table the caller passed in, with the caller's cutoff. Before,
  # `cuttoff` was hard-coded to 0.1 here and the `df` argument was never used.
  linked = filter_condition_gene_links_by_threshold(df, cuttoff, adjpvalcol)
  aconditiondf = linked[linked["binary_attribute"] == acomorbid]
  # NOTE(review): "high" selects negative NES and "low" positive NES; confirm
  # this matches the sign convention of the scores being loaded.
  if collect == "high":
    aconditiondf = aconditiondf[aconditiondf["NES"] < 0]
  elif collect == "low":
    aconditiondf = aconditiondf[aconditiondf["NES"] > 0]
  geneensmbllist = sorted(aconditiondf["value"].to_list())
  patientgenesexpdf = gene_exp_df[geneensmbllist + ["Patient"]]
  onecomborbiddf = comorbid_df[[acomorbid, "Patient"]]
  return onecomborbiddf.merge(patientgenesexpdf, on="Patient")

def splitpeoplewithandwithout(acomorbid, adfofexpressionandcomorbid):
  """Split an expression table into patients with and without a condition.

  Rows whose ``acomorbid`` column equals 1 form the first frame and rows where
  it equals 0 form the second; rows with any other value appear in neither.
  The ``"Patient"`` and ``acomorbid`` columns are removed from both results,
  and the original row labels are kept.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(with_condition, without_condition)``.
  """
  status = adfofexpressionandcomorbid[acomorbid]
  groups = []
  for flag in (1, 0):
    members = adfofexpressionandcomorbid[status == flag].copy()
    groups.append(members.drop(columns=["Patient", acomorbid]))
  with_condition, without_condition = groups
  return with_condition, without_condition

def zscoregenes(acomorbid, genexpconditiondf):
    """Z-score every gene column of a patient-by-gene expression table.

    Each column other than ``"Patient"`` and ``acomorbid`` is passed through
    ``scipy.stats.zscore`` on its own, so each gene is standardized across the
    rows of the table. The two identifier columns are carried over unchanged
    and placed first, followed by the genes in their original order.

    Returns
    -------
    pandas.DataFrame
        Columns ``["Patient", acomorbid, <genes...>]``.
    """
    id_columns = ["Patient", acomorbid]
    gene_columns = [col for col in genexpconditiondf.columns if col not in id_columns]
    standardized = genexpconditiondf.drop(columns=id_columns).apply(zscore)
    # Re-attach the identifier columns; assignment aligns on the row index.
    for id_column in id_columns:
        standardized[id_column] = genexpconditiondf[id_column]
    return standardized[id_columns + gene_columns]


# Here are all the conditions with linked genes

In [39]:
# Apply the configured cutoff to every condition-gene link, then list the
# distinct conditions that still have at least one linked gene.
df_nona_threshold_names = filter_condition_gene_links_by_threshold(df, cuttoff,  adjpvalcol)
df_nona_threshold_names["binary_attribute"].unique()

array(['patent_foramen_ovale', 'obesity_disorder', 'depressive_disorder',
       'hypothyroidism', 'atrioventricular_septal_defect',
       'skeletal_system_disorder', 'otorhinolaryngologic_disease',
       'obstructive_sleep_apnea_syndrome', 'ventricular_septal_defect',
       'atopic_eczema', 'patent_ductus_arteriosus',
       'respiratory_system_disorder', 'digestive_system_disorder',
       'heart_disorder', 'gastroesophageal_reflux_disease',
       'eye_disorder', 'asthma', 'nervous_system_disorder',
       'eustachian_tube_disorder', 'psychiatric_disorder', 'tic_disorder',
       'constipation_disorder', 'hearing_loss_disorder', 'strabismus',
       'vitamin_D_deficiency', 'sleep_apnea_syndrome',
       'attention_deficit-hyperactivity_disorder', 'atrial_septal_defect',
       'folliculitis', 'skin_disorder', 'anxiety',
       'conductive_hearing_loss_disorder', 'congenital_heart_disease'],
      dtype=object)

# Look at the metadata for the genes linked to one comorbidity

In [40]:
# Condition explored in the rest of the notebook. To look at a different one,
# uncomment one of the alternatives below instead.
acondition = "respiratory_system_disorder"
#acondition = "strabismus"
#acondition="eustachian_tube_disorder"
#acondition="gastroesophageal_reflux_disease"


In [52]:
# Genes linked to the chosen condition, sorted by NES; display a subset of columns.
# NOTE: the adjusted p-value shown is Benjamini-Yekutieli, whereas the filter's
# default cutoff column is Benjamini-Hochberg.
gene_connected_metadata = acomorbidanditsgenes_metadata(acondition, genedf)
gene_connected_metadata[["binary_attribute", "value", "NES", "gene_name", "p_value_BenjaminiYekutieli"]]

Unnamed: 0,binary_attribute,value,NES,gene_name,p_value_BenjaminiYekutieli
76,respiratory_system_disorder,ENSG00000183527,-4.367379,PSMG1,0.008419
103,respiratory_system_disorder,ENSG00000154640,-4.144038,BTG3,0.019197
188,respiratory_system_disorder,ENSG00000234883,-4.023974,MIR155HG,0.057409
160,respiratory_system_disorder,ENSG00000205581,-3.974979,HMGN1,0.042551
140,respiratory_system_disorder,ENSG00000159200,-3.933645,RCAN1,0.033943
124,respiratory_system_disorder,ENSG00000273271,-3.916407,AP000254.1,0.028674
168,respiratory_system_disorder,ENSG00000205670,-3.897384,SMIM11A,0.049366
248,respiratory_system_disorder,ENSG00000159079,-3.785886,CFAP298,0.116866
250,respiratory_system_disorder,ENSG00000154723,-3.736676,ATP5PF,0.118893
312,respiratory_system_disorder,ENSG00000156256,-3.657672,USP16,0.185187


In [42]:
# Keep only the linked genes whose annotated biotype is "protein_coding".
protien_coding_gene_connected_metadata = gene_connected_metadata[gene_connected_metadata["gene_biotype"]=="protein_coding"]
protien_coding_gene_connected_metadata

Unnamed: 0,binary_attribute,value,runpsea,NES,pval,p_value_bonf,p_value_holm,p_value_BenjaminiHochberg,p_value_BenjaminiYekutieli,seqnames,...,end,width,strand,source,type,gene_id,gene_version,gene_name,gene_source,gene_biotype
76,respiratory_system_disorder,ENSG00000183527,included,-4.367379,4e-06,0.063431,0.063144,0.000817,0.008419,21,...,39183851,9083,-,ensembl_havana,gene,ENSG00000183527,11,PSMG1,ensembl_havana,protein_coding
103,respiratory_system_disorder,ENSG00000154640,included,-4.144038,1.1e-05,0.192556,0.191387,0.001863,0.019197,21,...,17612947,19295,-,ensembl_havana,gene,ENSG00000154640,14,BTG3,ensembl_havana,protein_coding
160,respiratory_system_disorder,ENSG00000205581,included,-3.974979,4e-05,0.664725,0.658393,0.004129,0.042551,21,...,39349647,7333,-,ensembl_havana,gene,ENSG00000205581,10,HMGN1,ensembl_havana,protein_coding
140,respiratory_system_disorder,ENSG00000159200,included,-3.933645,2.8e-05,0.464375,0.460504,0.003293,0.033943,21,...,34615142,102001,-,ensembl_havana,gene,ENSG00000159200,17,RCAN1,ensembl_havana,protein_coding
168,respiratory_system_disorder,ENSG00000205670,included,-3.897384,4.8e-05,0.813593,0.805455,0.00479,0.049366,21,...,34407866,32387,+,ensembl_havana,gene,ENSG00000205670,10,SMIM11A,ensembl_havana,protein_coding
248,respiratory_system_disorder,ENSG00000159079,included,-3.785886,0.000168,1.0,1.0,0.011339,0.116866,21,...,32612866,20788,-,ensembl_havana,gene,ENSG00000159079,18,CFAP298,ensembl_havana,protein_coding
250,respiratory_system_disorder,ENSG00000154723,included,-3.736676,0.000172,1.0,1.0,0.011536,0.118893,21,...,25735673,19171,-,ensembl_havana,gene,ENSG00000154723,12,ATP5PF,ensembl_havana,protein_coding
312,respiratory_system_disorder,ENSG00000156256,included,-3.657672,0.000335,1.0,1.0,0.017968,0.185187,21,...,29054488,29860,+,ensembl_havana,gene,ENSG00000156256,14,USP16,ensembl_havana,protein_coding
326,respiratory_system_disorder,ENSG00000142188,included,-3.623632,0.000377,1.0,1.0,0.01936,0.199527,21,...,33480011,47527,-,ensembl_havana,gene,ENSG00000142188,16,TMEM50B,ensembl_havana,protein_coding
350,respiratory_system_disorder,ENSG00000154719,included,-3.476012,0.000464,1.0,1.0,0.022103,0.227794,21,...,25607517,21862,-,ensembl_havana,gene,ENSG00000154719,13,MRPL39,ensembl_havana,protein_coding


# Look at the patient expression for each of the genes in the data

### gene expression

In [43]:
# Per-patient table: the condition column plus the expression of every gene
# linked to the condition. The commented variants restrict the gene set via `collect`.
genexpconditiondf = acomorbidanditsgenesexp(acondition,gene_exp_df, comorbid_df, df, genedf)
#genexpconditiondf = acomorbidanditsgenesexp(acondition,gene_exp_df, comorbid_df, df, genedf, collect="high") #use if you want genes up in the disorder
#genexpconditiondf = acomorbidanditsgenesexp(acondition,gene_exp_df, comorbid_df, df, genedf, collect="low") #use if you want genes down in the disorder



### Z-scored gene expression, split by patients with and without the condition

In [44]:
# Z-score each gene column across patients so that genes share a common scale.
zscore_genexpconditiondf = zscoregenes(acondition, genexpconditiondf)


In [45]:
# Split the z-scored table into patients with the condition (wco) and without it (woco).
wco, woco = splitpeoplewithandwithout(acondition, zscore_genexpconditiondf)


In [46]:
# Discard the original row labels and renumber each group from 0.
wco = wco.reset_index(drop=True)
woco = woco.reset_index(drop=True)

In [47]:
# Heatmap of z-scored expression for patients WITH the condition
# (rows = patients, columns = linked genes). The color range is fixed at
# -3..3 so this figure and the "without" figure share one scale.
fig = px.imshow(
    wco,
    color_continuous_scale='Viridis',
    zmin=-3,
    zmax=3,
    title=f"Z-scored expression of linked genes: patients with {acondition}",
    labels=dict(x="Gene", y="Patient (row number)", color="z-score"),
)
fig.show()

In [48]:
# Heatmap of z-scored expression for patients WITHOUT the condition
# (rows = patients, columns = linked genes). The color range is fixed at
# -3..3 so this figure and the "with" figure share one scale.
fig = px.imshow(
    woco,
    color_continuous_scale='Viridis',
    zmin=-3,
    zmax=3,
    title=f"Z-scored expression of linked genes: patients without {acondition}",
    labels=dict(x="Gene", y="Patient (row number)", color="z-score"),
)
fig.show()