In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from matplotlib.backends.backend_pdf import PdfPages

# 1.Load data

In [2]:
insertion_statistics = pd.read_csv("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/results/HD_DIT_HAP/15_insertion_level_depletion_analysis/insertions_LFC.tsv", index_col=[0,1,2,3], header=[0,1], sep="\t")
fitting_results = pd.read_csv("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/results/HD_DIT_HAP/16_insertion_level_curve_fitting/insertions_LFC_fitted.tsv", index_col=[0,1,2,3], sep="\t")
gene_clusters = pd.read_excel("../../resources/20250317_plateau_GWMs.xlsx")
annotations = pd.read_csv("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/results/HD_DIT_HAP/12_concatenated/annotations.tsv", index_col=[0,1,2,3], sep="\t")
ingene_index = annotations.query("Type != 'Intergenic region' and Distance_to_stop_codon > 4").index


# 2.Selected genes

In [3]:
selected_genes = [
    "alm1",
    "atp25",
    "bst1",
    "cdc12",
    "cdc15",
    "cdc7",
    "cdk9",
    "cid14",
    "cog6",
    "cwf24",
    "cwf3",
    "dbl2",
    "dcp1",
    "dfr1",
    "esf1",
    "eso1",
    "fcp1",
    "gmh4",
    "gpi17",
    "gpi18",
    "hfi1",
    "hsf1",
    "its3",
    "mcm4",
    "mdm34",
    "met9",
    "mis6",
    "mms19",
    "mre11",
    "nbs1",
    "not1",
    "nup189",
    "pan1",
    "peg1",
    "pep3",
    "rib2",
    "rib7",
    "rlp7",
    "rpt2",
    "sec8",
    "sip1",
    "sog2",
    "spt7",
    "ssr2",
    "tfg1",
    "tif471",
    "tpr1",
    "uba2",
    "vps11",
    "vps16",
    "lsh1",
    "cdc12",
    "gpi18",
    "cid14",
    "efg1",
    "tup12",
    "mrp11",
    "atp25",
    "mre11",
    "sld3",
    "wrs1",
    "prh1",
    "mis6",
    "meu31",
    "met9",
    "cdc15",
    "taf2",
    "cdc123",
    "vps41",
    "bst1",
    "rlp7",
    "bfr2",
    "ulp2",
    "swc4",
    "rpn9",
    "pst2",
    "fta2",
    "cit1",
    "trl1",
    "ssr2",
    "vps11",
    "rpp101",
    "vma5",
    "abo1",
    "peg1",
    "ntr1",
    "alm1",
    "nup189",
    "sen1",
    "rpl702",
    "mms19",
    "pub1",
    "its3",
    "pan1",
    "tif471",
    "tpr1",
    "hsf1",
    "sea3",
    "ino80",
    "sip1",
    "mtr10",
    "uba2",
    "pdc2",
    "ucp3",
    "ste20",
    "cyr1",
    "taf51",
    "cwf24",
    "naf1",
    "rsm27",
    "cog6",
    "cdc7",
    "spt7",
    "imp2",
    "cbp1",
    "sog2",
    "hfi1",
    "tad2",
    "ani1",
    "cnp3",
    "eso1",
    "utp6",
    "cog3",
    "dbl2",
    "rib2",
    "sec8",
    "asp1",
    "mcm4",
    "cwf5",
    "git3",
    "ekc1",
    "ksg1",
    "tfg1",
    "bdp1",
    "pep3",
    "syp1",
    "med14",
    "nse6",
    "lid2",
    "cut8",
    "tfa2",
    "dia4",
    "drp1",
    "mrpl1",
    "rpc2",
    "mcb1",
    "pog1",
    "mor2",
    "rpm2",
    "ssr3",
    "fab1",
    "hem15",
    "pcp1",
    "ams2",
    "dbp5",
    "rpn7",
    "los1",
    "zas1",
    "stt4",
    "mrpl10",
    "mrp51",
    "ero12",
    "mms1",
    "raf2",
    "dna2",
    "gpi12",
    "ssr4",
    "omh6",
    "cup1",
    "sin1",
    "nup120",
    "tim23",
    "yta12",
    "rrp9",
    "smi1",
    "rng3",
    "rad52",
    "pdi1",
    "nnk1",
    "rok1",
    "alp5",
    "iml1",
    "ppp1",
    "elp1",
    "urb1",
    "SPAC8F11.04",
    "npl4",
    "cul4",
    "sgd1",
    "gta1",
    "rpc17",
    "end4",
    "hmg1",
    "vam6",
    "syf2",
    "isa1",
    "cwf25",
    "enp1",
    "rrp6",
    "cwf4",
    "sec39",
    "sfb3",
    "pef1",
    "tti1",
    "utp14",
    "mpe1",
    "gem1",
    "SPBC15D4.11c",
    "rna15",
    "etd1",
    "eca39",
    "mrps26",
    "for3",
    "fkh2",
    "cnx1",
    "rce1",
    "cut4",
    "tpz1",
    "cut20",
    "lac1",
    "cut23",
    "mis18",
    "mis14",
    "its8",
    "srb4",
    "cnp20",
    "mrp20",
    "ned1",
    "ptr8",
    "aur1",
    "vid21",
    "oct1",
    "SPAC806.02c",
    "tim50",
    "mis19",
    "tit1",
    "pep12",
    "zip1",
    "myo1",
    "nak1",
    "nuc2",
    "pga1",
    "smc5",
    "arp5",
    "fes1",
    "ypt1",
    "nop16",
    "ebp2",
    "rgf3",
    "pkd2",
    "sfc1",
    "rps402",
    "trz1",
    "vps25",
    "lcb1",
    "gea1",
    "rsm7",
    "cdc13",
    "asc1",
    "ada1",
    "rng2",
    "vma8",
    "hem3",
    "rad55",
    "cog2",
    "ulp1",
    "msl1",
    "nup186",
    "saf3",
    "pfd6",
    "cft1",
    "ppc89",
    "pob1",
    "dpm3",
    "enp2",
    "nup85",
    "pir2",
    "mdj1",
    "npp106",
    "fta4",
    "erd1",
    "cdc28",
    "rsd1",
    "flx1",
    "SPCC613.08",
    "plc1",
    "git7",
    "pct1",
    "yml6",
    "tho7",
    "tif303",
    "ini1",
    "prp43",
    "grn1",
    "bpl1",
    "tho5",
    "lip2",
    "rrn7",
    "scj1",
    "sec16",
    "orc4",
    "ura1",
    "sap62",
    "hsp78",
    "pst1",
    "cay1",
    "vps52",
    "srp21",
    "amo1",
    "rrp40",
    "rps1002",
    "rct1",
    "uso1",
    "prp16",
    "rsa1",
    "lip5",
    "ysh1",
    "nop52",
    "prp39",
    "rqh1",
    "rcl1",
    "swf1",
    "sec27",
    "nse2",
    "msw1",
    "kin1",
    "cut9",
    "prp22",
    "vps51",
    "pop100",
    "mob2",
    "tfb4",
    "ost3",
    "spf30",
    "tho1",
    "mca1",
    "cnd3",
    "emp65",
    "mcm3",
    "pfh1",
    "rfc1",
    "air1",
    "uba1",
    "med18",
    "pus7",
    "lam1",
    "cdc10",
    "naa20",
    "brr6",
    "mis15",
    "kei1",
    "taf10",
    "sec5",
    "cwf22",
    "uri1",
    "shk1",
    "vma13",
    "jmj3",
    "mrpl25",
    "omh3",
    "SPBC1861.05",
    "trz2",
    "pbn1",
    "cyb502",
    "mrpl31",
    "alp4",
    "mto1",
    "sec22",
    "krs1",
    "tsc3",
    "exo8",
    "mts4",
    "hrs1",
    "taf6",
    "sec3",
    "ufd1",
    "pof1",
    "rpf2",
    "ync13",
    "res1",
    "ckb1",
    "glo3",
    "rrp42",
    "arp8",
    "itr2",
    "sam1",
    "trs23",
    "vps901",
    "prp24",
]

selected_genes = list(set(selected_genes))

In [26]:
print("\n".join(selected_genes))

arp8
erd1
vps25
rib2
srb4
its3
prh1
eso1
cog2
msl1
nnk1
glo3
tho1
tim50
cay1
cul4
vma13
npp106
tfg1
rpm2
jmj3
itr2
drp1
bst1
ero12
rlp7
peg1
taf51
ypt1
sec16
pct1
nop52
sfc1
vma5
rrp40
dfr1
ssr2
cbp1
myo1
gta1
sec22
brr6
dbp5
kei1
spf30
meu31
rna15
gpi12
ync13
kin1
ste20
dcp1
uri1
imp2
enp2
ams2
hem15
srp21
yta12
cdc13
saf3
SPAC8F11.04
SPCC613.08
grn1
mrpl31
pan1
vps52
fcp1
rpp101
tif303
cut20
fes1
tpr1
tho7
sgd1
tpz1
mcm4
cdc28
rcl1
pga1
alp5
pdc2
pus7
vps11
sog2
rad55
uso1
rrn7
ani1
scj1
lcb1
arp5
zas1
gea1
ulp1
emp65
cog6
atp25
mis15
ysh1
los1
sam1
mto1
rgf3
fta4
ucp3
urb1
bpl1
tad2
wrs1
cwf24
cnp20
stt4
alm1
mms19
ssr3
prp43
pst2
cit1
utp14
vma8
trs23
SPBC15D4.11c
trl1
ptr8
ssr4
smc5
lsh1
mrpl1
swc4
dbl2
pfh1
yml6
rad52
nup186
rng2
rng3
pof1
trz1
mdj1
taf2
enp1
esf1
sen1
orc4
rpc17
swf1
krs1
ufd1
pkd2
bdp1
vps51
rce1
pep3
cut4
cdc123
tif471
cdk9
etd1
rpt2
cut23
dia4
sec5
mrpl25
for3
cwf5
mor2
pir2
utp6
vam6
tim23
asc1
its8
rpn7
sap62
rps402
cnp3
tfb4
naf1
rrp42
SPAC806.02c
cnd3
mrp

In [4]:
gene_clusters.query("Name in @selected_genes")["cluster_order"].value_counts()

cluster_order
7     89
8     69
9     54
4     34
10    32
3     23
5     15
11    10
12     9
1      8
2      6
6      2
Name: count, dtype: int64

In [6]:
in_selected_gene = annotations.loc[ingene_index].query("Name in @selected_genes")

In [7]:
fitting_results_in_selected_gene = fitting_results.merge(
    in_selected_gene, left_index=True, right_index=True, how="inner", suffixes=("", "_annotation")
)


In [23]:
def create_scatter_plot(df, xlabels, ylabels):
    """
    Create a scatter plot with Gaussian KDE coloring.
    
    Parameters:
    df (DataFrame): DataFrame containing the data.
    xlabel (str): Label for the x-axis.
    ylabel (str): Label for the y-axis.
    """
    fig, ax = plt.subplots(len(xlabels), len(ylabels), figsize=(5*len(ylabels), 5 * len(xlabels)))
    for i, xlabel in enumerate(xlabels):
        for j, ylabel in enumerate(ylabels):
            x = df[xlabel]
            y = df[ylabel]
            x, y = x[~x.isna() & ~y.isna()], y[~x.isna() & ~y.isna()]
            xy = np.vstack([x, y])
            z = gaussian_kde(xy)(xy)
            ax[i, j].scatter(x, y, c=z, s=20)
            ax[i, j].set_xlabel(xlabel)
            ax[i, j].set_ylabel(ylabel)
            ax[i, j].set_title(f'{xlabel} vs {ylabel}')
    plt.tight_layout()
    return fig

In [None]:
# create_scatter_plot(fitting_results_in_selected_gene, [""])

TypeError: create_scatter_plot() missing 1 required positional argument: 'ylabels'

In [17]:
fitting_results_in_selected_gene["Length_Bin"] = pd.cut(
    fitting_results_in_selected_gene["ParentalRegion_length"],
    bins=np.arange(0, 6000, 200),
    include_lowest=True,
)
fitting_results_in_selected_gene["Length_Bin_Label"] = fitting_results_in_selected_gene["Length_Bin"].apply(lambda x: f"{int(x.left)}-{int(x.right)}")

In [25]:
with PdfPages("./genes_with_domain_differences.pdf") as pdf:
    for length_bin, group in fitting_results_in_selected_gene.groupby("Length_Bin_Label"):
        print(length_bin)
        if group.empty:
            continue
        fig = create_scatter_plot(group, ["Distance_to_stop_codon", "Fraction_to_stop_codon"], ["t2", "t3", "t4", "A", "um"])
        fig.suptitle(f"Length Bin: {length_bin}", y=1.05, fontsize=16)
        plt.tight_layout()
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

  for length_bin, group in fitting_results_in_selected_gene.groupby("Length_Bin_Label"):


0-200
200-400
400-600
600-800
800-1000
1000-1200
1200-1400
1400-1600
1600-1800
1800-2000
2000-2200
2200-2400
2400-2600
2600-2800
2800-3000
3000-3200
3200-3400
3400-3600
3600-3800
3800-4000
4000-4200
4200-4400
4400-4600
4600-4800
4800-5000
5000-5200
5200-5400
5400-5600
5600-5800
