In [1]:
from pathlib import Path
import pandas as pd
from dataclasses import dataclass

# 1. Configuration

In [2]:
@dataclass
class Config:
    """Locations of the pipeline tables used in this notebook.

    Every table property reads its TSV from disk on each access; nothing is
    cached, so bind the result to a variable if it is needed more than once.

    Attributes:
        project_dir: Results directory of one pipeline run. A plain string is
            accepted and converted to ``Path``.
    """

    project_dir: Path = Path("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/results/HD_diploid")

    def __post_init__(self) -> None:
        # Allow Config(project_dir="...") with a plain string; the properties
        # below rely on the ``/`` operator of Path.
        self.project_dir = Path(self.project_dir)

    @property
    def reports_dir(self) -> Path:
        """Reports directory that belongs to ``project_dir``.

        Derived as ``<root>/reports/<run>`` from ``<root>/results/<run>``.
        For the default ``project_dir`` this is exactly the location that was
        previously hard-coded.
        NOTE(review): assumes every run keeps results/ and reports/ side by
        side under the same root -- confirm for non-default projects.
        """
        return self.project_dir.parent.parent / "reports" / self.project_dir.name

    @property
    def annotation(self) -> pd.DataFrame:
        """Annotation table, indexed by its first four columns."""
        return pd.read_csv(
            self.project_dir / "12_concatenated/annotations.tsv",
            sep="\t",
            index_col=[0, 1, 2, 3],
        )

    @property
    def insertion_level_results(self) -> pd.DataFrame:
        """Insertion-level LFC table, indexed by its first four columns."""
        return pd.read_csv(
            self.project_dir / "14_insertion_level_depletion_analysis/LFC.tsv",
            sep="\t",
            index_col=[0, 1, 2, 3],
        )

    @property
    def insertion_density_results(self) -> pd.DataFrame:
        """Insertion density table (default integer index)."""
        # Follow project_dir instead of a fixed absolute path, so a Config
        # created for another run does not silently load this run's table.
        return pd.read_csv(
            self.reports_dir / "insertion_density_analysis" / "insertion_density_analysis.tsv",
            sep="\t",
        )


# 2. Load data

In [3]:
# Build the configuration with its default project directory. No file is read
# here; each table is loaded when the corresponding property is accessed.
data_config = Config()

# 3. Analysis

In [4]:
# Lookup table: value of the "Name" column -> value of "total_insertions",
# taken from the insertion density table.
total_insertions_dict = (
    data_config.insertion_density_results
    .set_index("Name")["total_insertions"]
    .to_dict()
)

In [11]:
# Index labels of the insertion-level rows whose YES0 value exceeds 1.
# NOTE(review): the threshold of 1 is hard-coded -- confirm it is the intended cutoff.
depletion_from_diploid_to_spores = (
    data_config.insertion_level_results
    .query("YES0 > 1")
    .index
)

In [12]:
# Count, per "Name", how many of the selected insertions are annotated to it,
# after discarding rows whose Type is 'Intergenic region'.
dropout_genes = (
    data_config.annotation
    .loc[depletion_from_diploid_to_spores]
    .query("Type != 'Intergenic region'")["Name"]
    .value_counts()
    .to_frame("Count")
)

In [13]:
# Attach the per-name insertion total and the share of it that was selected
# (Count / total_insertions). Names missing from the lookup yield NaN.
dropout_genes = dropout_genes.assign(
    total_insertions=lambda frame: frame.index.map(total_insertions_dict),
    enrichment_ratio=lambda frame: frame["Count"] / frame["total_insertions"],
)

In [14]:
# Keep names where more than half of the insertions were selected and that
# have more than 2 insertions in total; then display the result.
sig_dropout_genes = dropout_genes.query(
    "enrichment_ratio > 0.5 and total_insertions > 2"
)
sig_dropout_genes


Unnamed: 0_level_0,Count,total_insertions,enrichment_ratio
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cox1,45,71.0,0.633803
cob1,21,41.0,0.512195
cob1-I1,14,25.0,0.560000
cop1,13,19.0,0.684211
atp6,13,22.0,0.590909
...,...,...,...
atp16,2,3.0,0.666667
isd11,2,3.0,0.666667
fta3,2,3.0,0.666667
mes1,2,3.0,0.666667


In [15]:
# Print the selected names one per line (plain text, easy to copy elsewhere).
# str.join requires every index label to be a string.
print("\n".join(sig_dropout_genes.index))

cox1
cob1
cob1-I1
cop1
atp6
arg6
cox3
tif471
rpt5
rpt1
cox1-I1b
atg13
pik3
mat2-Pi
sec26
cox1-I2b
rpt4
ste11
rpt6
fps1
css1
ura4
efr3
atp9
nab2
rpn6
rsc58
rpn3
trs20
hrd1
SPAPB17E12.14c
puf1
msk1
scl1
SPBC36.11
tsc13
SPCC1235.01
ymr1
var1
deb1
rps1802
atp12
snd301
tfg3
SPAC186.02c
tim23
mug74
SPCC297.06c
SPBC18E5.14c
atp20
mug65
stm1
nre1
rmr1
SPAC222.18
SPBC405.02c
ght2
der1
efm2
rps2801
tip1
sec31
SPAC24B11.05
SPAC4A8.02c
imt3
cpc2
rpl2101
gpx1
tif308
oss1
SPBC14C8.11c
wtf20
mug166
rpl44
fbp1
prp2
arg5
aim18
SPAC18G6.12c
ivn1
SPAC11D3.16c
nup61
SPAC56F8.15
erf1
tcd1
atp16
isd11
fta3
mes1
SPAC4H3.09


In [21]:
# Spot check: show the dropout_genes row whose Name is 'mei2'.
dropout_genes.query("Name == 'mei2'")

Unnamed: 0_level_0,Count,total_insertions,enrichment_ratio
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mei2,4,16.0,0.25


In [None]:
# Display the insertion-level table. Accessing this property re-reads LFC.tsv
# from disk each time it is evaluated.
data_config.insertion_level_results

In [20]:
# Spot check: list every annotated insertion whose Name is 'mei2'.
data_config.annotation.query("Name == 'mei2'")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Chr_Interval,Start_Interval,End_Interval,Transcript,Length,Strand_Interval,Feature,Systematic ID,Type,Accumulated_CDS_bases,...,Distance_to_region_end,Fraction_to_region_start,Fraction_to_region_end,Distance_to_start_codon,Distance_to_stop_codon,Fraction_to_start_codon,Fraction_to_stop_codon,Residue_affected,Residue_frame,Insertion_direction
Chr,Coordinate,Strand,Target,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
I,4511069,+,ACAA,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,2223,0.013,0.987,2223.0,30.0,0.987,0.013,742.0,0.0,Reverse
I,4511069,-,ACAA,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,2223,0.013,0.987,2223.0,30.0,0.987,0.013,742.0,0.0,Forward
I,4511197,+,TTCT,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,2095,0.070,0.930,2095.0,158.0,0.930,0.070,699.0,1.0,Reverse
I,4511197,-,TTCT,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,2095,0.070,0.930,2095.0,158.0,0.930,0.070,699.0,1.0,Forward
I,4511283,+,TTCA,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,2009,0.108,0.892,2009.0,244.0,0.892,0.108,670.0,2.0,Reverse
I,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
I,4513116,-,TGAA,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,176,0.922,0.078,176.0,2077.0,0.078,0.922,59.0,2.0,Forward
I,4513153,+,TAAT,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,139,0.938,0.062,139.0,2114.0,0.062,0.938,47.0,1.0,Reverse
I,4513153,-,TAAT,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,139,0.938,0.062,139.0,2114.0,0.062,0.938,47.0,1.0,Forward
I,4513162,+,TTAG,I,4511039,4513292,SPAC27D7.03c.1,2253,-,CDS,SPAC27D7.03c,Coding gene,0.0,...,130,0.942,0.058,130.0,2123.0,0.058,0.942,44.0,1.0,Reverse
