<a href="https://colab.research.google.com/github/Dowell-Lab/pop_inf_beta/blob/main/colab/genes_are_different.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd

In [25]:
indir="https://raw.githubusercontent.com/Dowell-Lab/pop_inf_beta/refs/heads/main/testdata/"


In [26]:
metadataPRO = "metaPRO.txt"
metadataRNA = "metaRNA.txt"
popdiffPRO = "pop_PROgenebody_PROseq_res.txt"
popdiffRNA = "pop_RNAgene_RNAseq_res.txt"
normcountsPRO = "normalized__master__PRO.csv"
normcountsRNA = "normalized__master__RNA.csv"
PROsufix= "_PROgenebody_PROseq_res.txt"
RNAsufix= "_RNAgene_RNAseq_res.txt"

# Pull in the meta data for the experiment

In [27]:
fn = indir+metadataPRO
PROmeta = pd.read_csv(fn)
fn = indir+metadataRNA
RNAmeta = pd.read_csv(fn)

In [28]:
people = PROmeta["genotype"].unique()
people

array(['ChenChao', 'Dave', 'Eric', 'Ethan', 'Khaondo', 'Niyilolawa',
       'Pedro', 'Srivathani'], dtype=object)

# Pull in the normalized counts for every gene in every person

In [29]:
PROcounts = pd.read_csv(indir+normcountsPRO, index_col=0)
RNAcounts = pd.read_csv(indir+normcountsRNA, index_col=0)

# Which genes are different in the population after treating the population with interferon beta. This was measured via both PRO-seq and RNA-seq.

In [50]:
popdiffPROdf = pd.read_csv(indir+popdiffPRO, index_col=0)
popdiffPROdf = popdiffPROdf.sort_values("padj")
popdiffRNAdf = pd.read_csv(indir+popdiffRNA, index_col=0)
popdiffRNAdf = popdiffRNAdf.sort_values("padj")

In [51]:
popdiffRNAdf

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
DTX3L,11268.033605,3.254709,0.107344,30.320488,6.156466e-202,1.412540e-197
USP18,3161.533571,4.362786,0.148979,29.284542,1.631945e-188,1.872167e-184
SLFN5,12528.305968,3.220319,0.110883,29.042399,1.919481e-185,1.468019e-181
APOL6,15036.688842,3.437850,0.118853,28.925306,5.739085e-184,2.633551e-180
PLSCR1,3834.971435,3.494580,0.120801,28.928439,5.241196e-184,2.633551e-180
...,...,...,...,...,...,...
SPRY3_1,0.000000,,,,,
VAMP7_1,0.000000,,,,,
IL9R_1,0.000000,,,,,
WASIR1_1,0.000000,,,,,


# Find the adjusted p-value for each gene in each person and each assay

In [40]:
padj_per_person = pd.DataFrame(index=popdiffRNAdf.index)
padj_per_person["pop_RNA"] = popdiffRNAdf["padj"]
padj_per_person["pop_PRO"] = popdiffPROdf["padj"]

for person in people:
    print(person)
    persondiffPROdf = pd.read_csv(indir+person+PROsufix, index_col=0)
    persondiffRNAdf = pd.read_csv(indir+person+RNAsufix, index_col=0)
    persondiffPROdf = persondiffPROdf[["padj"]]
    persondiffRNAdf = persondiffRNAdf[["padj"]]
    persondiffPROdf.columns = [person+"_PRO"]
    persondiffRNAdf.columns = [person+"_RNA"]
    padj_per_person = pd.concat([padj_per_person, persondiffPROdf, persondiffRNAdf], axis=1)




ChenChao
Dave
Eric
Ethan
Khaondo
Niyilolawa
Pedro
Srivathani


# Create a data frame with only genes that significant in at least one comparison (In one person or in the population in RNA seq or in PRO-seq)

In [48]:
threshold=0.01
sig_padj_per_person = padj_per_person.copy()
sig_padj_per_person["min_padj"] = sig_padj_per_person.min(axis=1)
sig_padj_per_person = sig_padj_per_person[sig_padj_per_person["min_padj"]<threshold] #If it was significant in any experiment keep the gene
sig_padj_per_person.sort_values("pop_RNA")

Unnamed: 0,pop_RNA,pop_PRO,ChenChao_PRO,ChenChao_RNA,Dave_PRO,Dave_RNA,Eric_PRO,Eric_RNA,Ethan_PRO,Ethan_RNA,Khaondo_PRO,Khaondo_RNA,Niyilolawa_PRO,Niyilolawa_RNA,Pedro_PRO,Pedro_RNA,Srivathani_PRO,Srivathani_RNA,min_padj
DTX3L,1.412540e-197,4.360128e-38,4.360128e-38,1.412540e-197,8.583627e-23,2.587407e-101,5.030985e-40,4.767178e-268,2.992660e-40,4.880995e-209,7.572255e-32,5.633599e-147,4.193822e-42,2.036026e-172,4.756796e-29,3.995864e-146,1.883170e-34,1.222962e-157,4.767178e-268
USP18,1.872167e-184,1.459065e-49,1.459065e-49,1.872167e-184,9.385243e-24,1.964340e-118,1.788279e-36,8.571460e-183,4.936345e-38,3.440160e-153,5.891097e-45,3.310760e-158,9.124827e-51,1.337141e-151,1.405153e-54,4.613727e-204,2.219790e-34,1.107774e-130,4.613727e-204
SLFN5,1.468019e-181,3.964858e-40,3.964858e-40,1.468019e-181,9.834914e-33,8.481532e-160,1.833730e-45,1.594368e-248,7.212161e-48,2.358637e-245,3.448248e-39,3.053801e-146,7.942320e-51,5.944580e-198,1.275917e-51,3.865349e-190,6.941539e-39,4.603887e-140,1.594368e-248
APOL6,2.633551e-180,5.671051e-59,5.671051e-59,2.633551e-180,4.336187e-30,1.981444e-76,2.624352e-38,5.449274e-150,6.167975e-61,5.945410e-205,2.149333e-68,8.888193e-184,2.688942e-61,1.228000e-155,2.663202e-60,2.117945e-166,1.498494e-46,6.630205e-149,5.945410e-205
PLSCR1,2.633551e-180,1.717607e-47,1.717607e-47,2.633551e-180,1.238461e-46,3.088674e-154,3.160305e-63,4.538355e-259,7.128719e-68,2.078463e-241,2.170758e-38,3.081114e-111,1.389872e-55,6.054947e-192,2.624259e-54,9.887244e-206,4.472556e-44,4.491567e-145,4.538355e-259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LOC105376265,,2.007870e-04,2.007870e-04,,6.742309e-01,,5.931764e-01,,8.342790e-01,6.122152e-01,4.293560e-01,7.672554e-01,4.917870e-01,9.981534e-01,6.917063e-01,,9.354765e-01,,2.007870e-04
LOC613206,,1.927038e-01,1.927038e-01,,1.661774e-01,,2.189470e-01,,2.149226e-01,,2.802404e-03,,3.562111e-02,,8.582688e-01,,9.397871e-01,,2.802404e-03
LOC124902295,,6.924051e-02,6.924051e-02,,1.055255e-01,,5.728212e-02,,1.318091e-02,7.982670e-01,1.371913e-01,1.262902e-01,2.161579e-03,9.769784e-01,6.384940e-02,,2.274832e-01,,2.161579e-03
LOC105376314,,1.077472e-02,1.077472e-02,,6.619048e-02,,9.118980e-01,,5.859693e-03,,2.907501e-02,,3.398457e-02,,7.813282e-02,,3.742192e-01,,5.859693e-03


# Find genes that are significant in the population but not in any one person

In [60]:
keepcols = [cn for cn in sig_padj_per_person.columns if "pop" in cn]
pop_adj_df = sig_padj_per_person[keepcols]
keepcols = [cn for cn in sig_padj_per_person.columns if not "pop" in cn]
people_adj_df = sig_padj_per_person[keepcols]
keepcols = [cn for cn in people_adj_df.columns if "RNA" in cn]
people_adj_df_RNA = people_adj_df[keepcols]
keepcols = [cn for cn in people_adj_df.columns if "PRO" in cn]
people_adj_df_PRO = people_adj_df[keepcols]
keepcols = [cn for cn in pop_adj_df.columns if "RNA" in cn]
pop_adj_df_RNA = pop_adj_df[keepcols]
keepcols = [cn for cn in pop_adj_df.columns if "PRO" in cn]
pop_adj_df_PRO = pop_adj_df[keepcols]

In [65]:
sig_pop_RNA = pop_adj_df_RNA[pop_adj_df_RNA["pop_RNA"]<threshold]
sig_pop_PRO = pop_adj_df_PRO[pop_adj_df_PRO["pop_PRO"]<threshold]
sig_people_RNA = people_adj_df_RNA[people_adj_df_RNA.min(axis=1)<threshold]
sig_people_PRO = people_adj_df_PRO[people_adj_df_PRO.min(axis=1)<threshold]

In [70]:
RNA_genes_pop_not_people = [gn for gn in sig_pop_RNA.index if gn not in sig_people_RNA.index]
print("RNA genes", RNA_genes_pop_not_people)
PRO_genes_pop_not_people = [gn for gn in sig_pop_PRO.index if gn not in sig_people_PRO.index]
print("PRO genes", PRO_genes_pop_not_people)

RNA genes []
PRO genes []


# Find genes that are significant in people not in population

In [71]:
RNA_genes_pop_not_people = [gn for gn in sig_people_RNA.index if gn not in sig_pop_PRO.index]
print("RNA genes", RNA_genes_pop_not_people)
PRO_genes_pop_not_people = [gn for gn in sig_people_PRO.index if gn not in sig_pop_PRO.index]
print("PRO genes", PRO_genes_pop_not_people)

RNA genes ['LOC124900384', 'LOC729737', 'LOC100132287', 'LINC00115', 'LINC01128', 'PLEKHN1', 'HES4', 'AGRN', 'TNFRSF4', 'B3GALT6', 'UBE2J2', 'ACAP3', 'PUSL1', 'CPTP', 'DVL1', 'MRPL20-AS1', 'MRPL20-DT', 'ATAD3B', 'SLC35E2B', 'SLC35E2A', 'NADK', 'PRKCZ', 'SKI', 'PANK4', 'PRXL2B', 'TPRG1L', 'WRAP73', 'TP73', 'TP73-AS3', 'LRRC47', 'DFFB', 'NPHP4', 'KCNAB2', 'ICMT', 'GPR153', 'ZBTB48', 'THAP3', 'ERRFI1', 'GPR157', 'TMEM201', 'PIK3CD', 'CTNNBIP1', 'CASZ1', 'TARDBP', 'UBIAD1', 'AGTRAP', 'MTHFR', 'NPPA-AS1', 'KIAA2013', 'TNFRSF8', 'MIR7846', 'TNFRSF1B', 'TMEM51', 'ARHGEF19', 'RNU1-1', 'RCC2', 'IFFO2', 'EMC1-AS1', 'SLC66A1', 'HTR6', 'PINK1', 'EPHB2', 'LUZP1', 'ZNF436', 'E2F2', 'ID3', 'ELOA', 'CNR2', 'IFNLR1', 'MIR6731', 'RUNX3-AS1', 'LINC02793', 'MACO1', 'PAQR7', 'PDIK1L', 'CNKSR1', 'ZDHHC18', 'GPN2', 'LOC101928391', 'MAP3K6', 'AHDC1', 'SMPDL3B', 'XKR8', 'EYA3', 'PTAFR', 'SNHG3', 'RCC1', 'RAB42', 'SRSF4', 'MECR', 'LINC01226', 'LOC124903917', 'LOC128031832', 'TMEM39B', 'IQCC', 'MTMR9LP', 'MARCKS

# Plot one gene

In [74]:

# Reshape the PROcounts DataFrame from wide to long format
PROcounts_long = PROcounts.reset_index().melt(id_vars='index', var_name='sample', value_name='count')

# Split the sample column into 'name' and 'assay' columns
PROcounts_long[['assay', 'treatment',"person", "replicate"]] = PROcounts_long['sample'].str.split('.', expand=True)

# Display the reshaped DataFrame (optional)
PROcounts_long

# Reshape the RNAcounts DataFrame from wide to long format
RNAcounts_long = RNAcounts.reset_index().melt(id_vars='index', var_name='sample', value_name='count')

# Split the sample column into 'name' and 'assay' columns
RNAcounts_long[['assay', 'treatment',"person", "replicate"]] = RNAcounts_long['sample'].str.split('.', expand=True)

# Display the reshaped DataFrame (optional)
RNAcounts_long

Unnamed: 0,index,sample,count,assay,treatment,person,replicate
0,DDX11L1,PRO.BSA.ChenChao.1,0.859218,PRO,BSA,ChenChao,1
1,WASH7P,PRO.BSA.ChenChao.1,68.737431,PRO,BSA,ChenChao,1
2,MIR6859-1,PRO.BSA.ChenChao.1,0.000000,PRO,BSA,ChenChao,1
3,MIR1302-2,PRO.BSA.ChenChao.1,0.000000,PRO,BSA,ChenChao,1
4,MIR1302-2HG,PRO.BSA.ChenChao.1,0.000000,PRO,BSA,ChenChao,1
...,...,...,...,...,...,...,...
1351771,SPRY3_1,PRO.IFNB.Srivathani.2,0.000000,PRO,IFNB,Srivathani,2
1351772,VAMP7_1,PRO.IFNB.Srivathani.2,0.000000,PRO,IFNB,Srivathani,2
1351773,IL9R_1,PRO.IFNB.Srivathani.2,0.000000,PRO,IFNB,Srivathani,2
1351774,WASIR1_1,PRO.IFNB.Srivathani.2,0.000000,PRO,IFNB,Srivathani,2


Unnamed: 0,index,sample,count,assay,treatment,person,replicate
0,DDX11L1,RNA.BSA.ChenChao.1,0.000000,RNA,BSA,ChenChao,1
1,WASH7P,RNA.BSA.ChenChao.1,329.667183,RNA,BSA,ChenChao,1
2,MIR6859-1,RNA.BSA.ChenChao.1,5.908014,RNA,BSA,ChenChao,1
3,MIR1302-2HG,RNA.BSA.ChenChao.1,0.000000,RNA,BSA,ChenChao,1
4,MIR1302-2,RNA.BSA.ChenChao.1,0.000000,RNA,BSA,ChenChao,1
...,...,...,...,...,...,...,...
2027659,SPRY3_1,RNA.IFN.Srivathani.3,0.000000,RNA,IFN,Srivathani,3
2027660,VAMP7_1,RNA.IFN.Srivathani.3,0.000000,RNA,IFN,Srivathani,3
2027661,IL9R_1,RNA.IFN.Srivathani.3,0.000000,RNA,IFN,Srivathani,3
2027662,WASIR1_1,RNA.IFN.Srivathani.3,0.000000,RNA,IFN,Srivathani,3


In [80]:
onegene="STMN1"
one_gene_PROcounts_long = PROcounts_long[PROcounts_long["index"]==onegene]
one_gene_RNAcounts_long = RNAcounts_long[RNAcounts_long["index"]==onegene]
one_gene_long = pd.concat([one_gene_PROcounts_long, one_gene_RNAcounts_long])
one_gene_long

Unnamed: 0,index,sample,count,assay,treatment,person,replicate
665,STMN1,PRO.BSA.ChenChao.1,1532.844714,PRO,BSA,ChenChao,1
42908,STMN1,PRO.BSA.ChenChao.2,1415.304513,PRO,BSA,ChenChao,2
85151,STMN1,PRO.BSA.Dave.1,815.451741,PRO,BSA,Dave,1
127394,STMN1,PRO.BSA.Dave.3,1118.321847,PRO,BSA,Dave,3
169637,STMN1,PRO.BSA.Eric.1,1970.331842,PRO,BSA,Eric,1
...,...,...,...,...,...,...,...
1817114,STMN1,RNA.IFN.Pedro.2,7625.063336,RNA,IFN,Pedro,2
1859357,STMN1,RNA.IFN.Pedro.3,7627.898256,RNA,IFN,Pedro,3
1901600,STMN1,RNA.IFN.Srivathani.1,6140.611613,RNA,IFN,Srivathani,1
1943843,STMN1,RNA.IFN.Srivathani.2,6718.818481,RNA,IFN,Srivathani,2
