<a href="https://colab.research.google.com/github/Dowell-Lab/pop_inf_beta/blob/main/colab/genes_are_different.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Base URL of the test-data directory; input file names are appended to it.
indir = "https://raw.githubusercontent.com/Dowell-Lab/pop_inf_beta/refs/heads/main/testdata/"


In [3]:
# Suffix shared by every differential-expression result file of an assay.
# File names are "<who><suffix>", where <who> is "pop" or a person's name.
PROsufix = "_PROgenebody_PROseq_res.txt"
RNAsufix = "_RNAgene_RNAseq_res.txt"

# Sample metadata tables, one per assay.
metadataPRO, metadataRNA = "metaPRO.txt", "metaRNA.txt"

# Population-level differential-expression results.
popdiffPRO = "pop" + PROsufix
popdiffRNA = "pop" + RNAsufix

# Normalized count matrices, one per assay.
normcountsPRO, normcountsRNA = "normalized__master__PRO.csv", "normalized__master__RNA.csv"

# Pull in the metadata for the experiment

In [4]:
# Load the sample metadata table of each assay straight from the data URL.
PROmeta = pd.read_csv(indir + metadataPRO)
RNAmeta = pd.read_csv(indir + metadataRNA)

In [5]:
# Distinct values of the PRO metadata "genotype" column, in order of first
# appearance; the notebook treats each value as one person.
people = PROmeta.loc[:, "genotype"].unique()
people

array(['ChenChao', 'Dave', 'Eric', 'Ethan', 'Khaondo', 'Niyilolawa',
       'Pedro', 'Srivathani'], dtype=object)

# Pull in the normalized counts for every gene in every person

In [6]:
# Normalized count matrices; the first CSV column becomes the row index.
PROcounts, RNAcounts = (
    pd.read_csv(indir + counts_file, index_col=0)
    for counts_file in (normcountsPRO, normcountsRNA)
)

# Which genes change in the population after treatment with interferon beta? This was measured with both PRO-seq and RNA-seq.

In [7]:
# Population-level differential-expression tables, ordered by ascending
# adjusted p-value (rows without a padj value sort to the end).
popdiffPROdf = pd.read_csv(indir + popdiffPRO, index_col=0).sort_values("padj")
popdiffRNAdf = pd.read_csv(indir + popdiffRNA, index_col=0).sort_values("padj")

In [8]:
# Inspect the RNA-seq population results: sorted by adjusted p-value, so the
# smallest padj values are at the top and rows with missing padj at the bottom.
popdiffRNAdf

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
DTX3L,11268.033605,3.254709,0.107344,30.320488,6.156466e-202,1.412540e-197
USP18,3161.533571,4.362786,0.148979,29.284542,1.631945e-188,1.872167e-184
SLFN5,12528.305968,3.220319,0.110883,29.042399,1.919481e-185,1.468019e-181
APOL6,15036.688842,3.437850,0.118853,28.925306,5.739085e-184,2.633551e-180
PLSCR1,3834.971435,3.494580,0.120801,28.928439,5.241196e-184,2.633551e-180
...,...,...,...,...,...,...
SPRY3_1,0.000000,,,,,
VAMP7_1,0.000000,,,,,
IL9R_1,0.000000,,,,,
WASIR1_1,0.000000,,,,,


# Find the adjusted p-value for each gene in each person and each assay

In [None]:
# Gather one adjusted-p-value column per comparison, then join them in a single
# concat. Joining once avoids re-copying the growing table on every loop
# iteration, and because concat(axis=1) aligns on the union of row labels, a
# gene that appears in only one of the result tables keeps its value instead of
# being dropped by alignment to the RNA population index.
padj_columns = [
    popdiffRNAdf["padj"].rename("pop_RNA"),
    popdiffPROdf["padj"].rename("pop_PRO"),
]

for person in people:
    print(person)
    # Per person: PRO column first, then RNA, giving "<person>_PRO", "<person>_RNA".
    for assay, suffix in (("PRO", PROsufix), ("RNA", RNAsufix)):
        person_res = pd.read_csv(indir + person + suffix, index_col=0)
        padj_columns.append(person_res["padj"].rename(person + "_" + assay))

padj_per_person = pd.concat(padj_columns, axis=1)




ChenChao
Dave
Eric
Ethan
Khaondo
Niyilolawa
Pedro
Srivathani


# Create a data frame with only the genes that are significant in at least one comparison (in one person or in the population, in RNA-seq or in PRO-seq)

In [None]:
# Adjusted-p-value cutoff used for every significance call below.
threshold = 0.01

# Add each gene's smallest padj over all comparisons, then keep only the genes
# whose smallest value is under the cutoff (significant in at least one comparison).
sig_padj_per_person = padj_per_person.assign(min_padj=padj_per_person.min(axis=1))
sig_padj_per_person = sig_padj_per_person.loc[sig_padj_per_person["min_padj"].lt(threshold)]
sig_padj_per_person.sort_values("pop_RNA")

# Find genes that are significant in the population but not in any one person

In [None]:
# Partition the significance table by scope (population vs. individual people)
# and then by assay. Columns are matched by substring of their name, so the
# helper column "min_padj" lands in people_adj_df but in neither per-assay frame.
all_cols = list(sig_padj_per_person.columns)
pop_cols = [col for col in all_cols if "pop" in col]
people_cols = [col for col in all_cols if "pop" not in col]

pop_adj_df = sig_padj_per_person[pop_cols]
people_adj_df = sig_padj_per_person[people_cols]

people_adj_df_RNA = people_adj_df[[col for col in people_cols if "RNA" in col]]
people_adj_df_PRO = people_adj_df[[col for col in people_cols if "PRO" in col]]
pop_adj_df_RNA = pop_adj_df[[col for col in pop_cols if "RNA" in col]]
pop_adj_df_PRO = pop_adj_df[[col for col in pop_cols if "PRO" in col]]

In [None]:
# Genes under the cutoff at the population level, per assay.
sig_pop_RNA = pop_adj_df_RNA.loc[pop_adj_df_RNA["pop_RNA"].lt(threshold)]
sig_pop_PRO = pop_adj_df_PRO.loc[pop_adj_df_PRO["pop_PRO"].lt(threshold)]

# Genes under the cutoff in at least one person (row-wise minimum), per assay.
sig_people_RNA = people_adj_df_RNA.loc[people_adj_df_RNA.min(axis=1).lt(threshold)]
sig_people_PRO = people_adj_df_PRO.loc[people_adj_df_PRO.min(axis=1).lt(threshold)]

In [None]:
# Genes called significant for the population that no individual person reaches,
# listed in the order they appear in the population table.
RNA_genes_pop_not_people = sig_pop_RNA.index[~sig_pop_RNA.index.isin(sig_people_RNA.index)].tolist()
print("RNA genes", RNA_genes_pop_not_people)

PRO_genes_pop_not_people = sig_pop_PRO.index[~sig_pop_PRO.index.isin(sig_people_PRO.index)].tolist()
print("PRO genes", PRO_genes_pop_not_people)

# Find genes that are significant in at least one person but not in the population

In [None]:
# Genes significant in at least one person but not at the population level.
# These lists get their own names (*_people_not_pop) so that they do not
# overwrite the *_pop_not_people lists, which hold the opposite comparison.
RNA_genes_people_not_pop = [gn for gn in sig_people_RNA.index if gn not in sig_pop_RNA.index]
print("RNA genes", RNA_genes_people_not_pop)
PRO_genes_people_not_pop = [gn for gn in sig_people_PRO.index if gn not in sig_pop_PRO.index]
print("PRO genes", PRO_genes_people_not_pop)

# Plot one gene

In [None]:

def counts_to_long(counts):
    """Reshape a wide count table into long format, one row per (row label, sample).

    Parameters
    ----------
    counts : pandas.DataFrame
        Wide table with one column per sample. Every column label must consist
        of exactly four dot-separated fields: assay, treatment, person, replicate.

    Returns
    -------
    pandas.DataFrame
        Columns "index" (the original row label), "sample", "count", "assay",
        "treatment", "person" and "replicate".
    """
    # Name the row index explicitly so reset_index() always produces an "index"
    # column, even when the loaded table carries a different index name.
    long_df = counts.rename_axis("index").reset_index().melt(
        id_vars="index", var_name="sample", value_name="count"
    )
    # Split the sample label into its four dot-separated parts.
    long_df[["assay", "treatment", "person", "replicate"]] = long_df["sample"].str.split(".", expand=True)
    return long_df


# Same reshape for both assays.
PROcounts_long = counts_to_long(PROcounts)
RNAcounts_long = counts_to_long(RNAcounts)

# Only the last expression of a cell is displayed; show the RNA table.
RNAcounts_long

In [None]:
def getonegene(onegene="STMN1"):
  """Return the long-format count rows of one gene from both assays.

  Reads the module-level PROcounts_long and RNAcounts_long tables, keeps the
  rows whose "index" column equals `onegene`, and stacks them with the PRO
  rows first and the RNA rows second.
  """
  per_assay_rows = [
      table[table["index"] == onegene]
      for table in (PROcounts_long, RNAcounts_long)
  ]
  return pd.concat(per_assay_rows)

  # prompt: make two new columns in one_gene_long that come from the treatment column and the values are the count column

def one_gene_long_to_IFN_BSA_split(one_gene_long):
  """Pair the BSA and IFN counts of each experiment side by side.

  Parameters
  ----------
  one_gene_long : pandas.DataFrame
      Long-format rows with at least the columns "assay", "person",
      "replicate", "treatment" and "count". The frame is not modified.

  Returns
  -------
  pandas.DataFrame
      One row per matched experiment with columns "experiment", "assay",
      "person", "replicate", "count_BSA" and "count_IFN". An experiment is
      the "<assay>.<person>.<replicate>" key; experiments lacking either a
      BSA row or an IFN row are dropped by the inner merge.
  """
  # assign() builds a new frame, so the caller's DataFrame does not silently
  # gain an "experiment" column.
  keyed = one_gene_long.assign(
      experiment=one_gene_long['assay'] + '.' + one_gene_long['person'] + '.' + one_gene_long['replicate'])
  # Treatments are matched by substring, so labels that merely contain
  # "BSA" / "IFN" are accepted as well.
  bsa_rows = keyed.loc[keyed['treatment'].str.contains('BSA'),
                       ["experiment", "assay", "person", "replicate", "count"]]
  ifn_rows = keyed.loc[keyed['treatment'].str.contains('IFN'), ["experiment", "count"]]
  return bsa_rows.merge(ifn_rows, on="experiment", suffixes=("_BSA", "_IFN"))

In [None]:
# Gene to inspect (another example gene: "STMN1").
onegene = "DTX3L"

# Long-format counts for that gene, then BSA and IFN counts paired per experiment.
one_gene_long = getonegene(onegene=onegene)
onegene_BSA_IFN = one_gene_long_to_IFN_BSA_split(one_gene_long)

# One table per assay for plotting.
onegene_BSA_IFN_RNA = onegene_BSA_IFN.loc[onegene_BSA_IFN["assay"].eq("RNA")]
onegene_BSA_IFN_PRO = onegene_BSA_IFN.loc[onegene_BSA_IFN["assay"].eq("PRO")]


In [None]:
# RNA-seq: BSA count (x) against IFN count (y) for the selected gene, coloured
# by person, with an OLS trendline.
rna_bsa = onegene_BSA_IFN_RNA['count_BSA']
rna_ifn = onegene_BSA_IFN_RNA['count_IFN']

fig = px.scatter(onegene_BSA_IFN_RNA, x="count_BSA", y="count_IFN", color="person", trendline="ols")
# Red dotted reference line joining (min x, min y) to (max x, max y).
# NOTE(review): this is the diagonal of the data's bounding box, not y = x — confirm intent.
fig.add_shape(
    type="line",
    x0=min(rna_bsa), y0=min(rna_ifn),
    x1=max(rna_bsa), y1=max(rna_ifn),
    line={"color": "red", "width": 2, "dash": "dot"},
)
fig.show()

In [None]:

# RNA-seq scatter of BSA count (x) against IFN count (y), drawn with identical
# x and y axis ranges so both treatments share one scale.
rna_bsa = onegene_BSA_IFN_RNA['count_BSA']
rna_ifn = onegene_BSA_IFN_RNA['count_IFN']

fig = px.scatter(onegene_BSA_IFN_RNA, x="count_BSA", y="count_IFN", color="person", trendline="ols")
# Red dotted reference line joining (min x, min y) to (max x, max y).
# NOTE(review): this is the diagonal of the data's bounding box, not y = x — confirm intent.
fig.add_shape(
    type="line",
    x0=min(rna_bsa), y0=min(rna_ifn),
    x1=max(rna_bsa), y1=max(rna_ifn),
    line={"color": "red", "width": 2, "dash": "dot"},
)

# Shared range: smallest and largest count seen in either treatment.
axis_range = [
    min(min(rna_bsa), min(rna_ifn)),
    max(max(rna_bsa), max(rna_ifn)),
]
fig.update_xaxes(range=axis_range)
fig.update_yaxes(range=axis_range)

fig.show()

In [None]:
# PRO-seq: BSA count (x) against IFN count (y) for the selected gene, coloured
# by person, with an OLS trendline.
pro_bsa = onegene_BSA_IFN_PRO['count_BSA']
pro_ifn = onegene_BSA_IFN_PRO['count_IFN']

fig = px.scatter(onegene_BSA_IFN_PRO, x="count_BSA", y="count_IFN", color="person", trendline="ols")
# Red dotted reference line joining (min x, min y) to (max x, max y).
# NOTE(review): this is the diagonal of the data's bounding box, not y = x — confirm intent.
fig.add_shape(
    type="line",
    x0=min(pro_bsa), y0=min(pro_ifn),
    x1=max(pro_bsa), y1=max(pro_ifn),
    line={"color": "red", "width": 2, "dash": "dot"},
)
fig.show()

In [None]:
# prompt: force the x and y axis in the graph above to be the same values


# PRO-seq scatter of BSA count (x) against IFN count (y), drawn with identical
# x and y axis ranges so both treatments share one scale.
pro_bsa = onegene_BSA_IFN_PRO['count_BSA']
pro_ifn = onegene_BSA_IFN_PRO['count_IFN']

fig = px.scatter(onegene_BSA_IFN_PRO, x="count_BSA", y="count_IFN", color="person", trendline="ols")
# Red dotted reference line joining (min x, min y) to (max x, max y).
# NOTE(review): this is the diagonal of the data's bounding box, not y = x — confirm intent.
fig.add_shape(
    type="line",
    x0=min(pro_bsa), y0=min(pro_ifn),
    x1=max(pro_bsa), y1=max(pro_ifn),
    line={"color": "red", "width": 2, "dash": "dot"},
)

# Shared range: smallest and largest count seen in either treatment.
axis_range = [
    min(min(pro_bsa), min(pro_ifn)),
    max(max(pro_bsa), max(pro_ifn)),
]
fig.update_xaxes(range=axis_range)
fig.update_yaxes(range=axis_range)

fig.show()