# 1. Import libraries

In [1]:
import sys
from pathlib import Path
from tqdm import tqdm

# scientific
import pandas as pd
import numpy as np

# bio
import gffutils

# custom
sys.path.append(str(Path.cwd().parent / "src"))
from utils import update_sysIDs
from enrichment_functions import OntologyDataConfig, load_ontology_data
from pombe_feature_functions import config, DNA_level_features, extract_protein_features_from_peptide_sequence
from protein_structure_functions import pLDDT_statistics_report

# 2. Constants

In [2]:
# Directory of AlphaFold structure files; passed as structure_dir to pLDDT_statistics_report.
# NOTE(review): this path climbs three levels to "resource/", whereas the other literal data
# paths in this notebook use "../../resources/" — confirm the location is intended.
ALPHAFOLD_DATABASE_DIR = Path("../../../resource/AlphaFold_Dataset/20251107_downloaded/UP000002485_284812_SCHPO_v6")

# 3. Configurations

In [3]:
# Project configuration object; it supplies the file paths used throughout the notebook.
cfg = config()

# (Re)build the gffutils database from the GFF3 annotation. force=True overwrites any
# existing database file, so this step is repeated on every run.
gffutils.create_db(cfg.gff3_file, cfg.database_file, force=True)

# Open a handle on the database file that was just written.
db = gffutils.FeatureDB(cfg.database_file)

In [4]:
# Gene metadata table (tab-separated) at the path given by the project config.
gene_meta = pd.read_csv(cfg.gene_meta_file, sep="\t")

# Lookup: UniProt accession -> gene systematic ID (a later row wins if an accession repeats).
uniprot2id = {
    uniprot_id: systematic_id
    for uniprot_id, systematic_id in zip(gene_meta["uniprot_id"], gene_meta["gene_systematic_id"])
}

# 4. Collect features

## 4.1 DNA-level features

In [None]:
# Every mRNA feature in the annotation database.
mRNAs = list(db.features_of_type('mRNA'))

# One DNA_level_features record per mRNA. Built with a comprehension so that no unused
# loop index or scratch variable is left behind in the notebook namespace.
coding_gene_DNA_features = [
    DNA_level_features.from_gffutils_feature(mRNA, db, cfg)
    for mRNA in mRNAs
]

# Tabulate the records.
coding_gene_DNA_features_df = pd.DataFrame(coding_gene_DNA_features)

# Distinct gene IDs among the mRNA records; later cells use this list to restrict
# external tables to coding genes.
coding_genes = coding_gene_DNA_features_df['Gene_id'].unique().tolist()

In [None]:
# Column dtypes and non-null counts of the DNA-level feature table.
coding_gene_DNA_features_df.info()

In [None]:
# Preview the first rows of the DNA-level feature table.
coding_gene_DNA_features_df.head()

In [None]:
# Summary statistics of the DNA-level feature table.
coding_gene_DNA_features_df.describe()

## 4.2 RNA-level features

### RNA abundance

In [None]:
# Four replicate columns from sheet "Table_S2" of the PMID 23101633 supplementary workbook,
# indexed by the 'Systematic.name' column.
mRNA_abundance = (
    pd.read_excel(
        "../../resources/Literature/PMID23101633/mmc1.xlsx",
        sheet_name="Table_S2",
        comment="#",
    )
    .set_index('Systematic.name')
    .loc[:, ['MM1.tot.cpc_ex', 'MM2.tot.cpc_ex', 'MN1.tot.cpc_ex', 'MN2.tot.cpc_ex']]
    .copy()
)

# Relabel the columns as (condition, replicate) pairs so replicates can be grouped by condition.
mRNA_abundance.columns = pd.MultiIndex.from_tuples(
    [
        ("EMM_Proliferating_Cell_RNA_Abundance", "replicate1"),
        ("EMM_Proliferating_Cell_RNA_Abundance", "replicate2"),
        ("EMM_Nitrogen_Starved_Cell_RNA_Abundance", "replicate1"),
        ("EMM_Nitrogen_Starved_Cell_RNA_Abundance", "replicate2"),
    ],
    names=["Condition", "Replicate"],
)

# Per-gene mean, standard deviation and coefficient of variation across the replicates
# of each condition.
mean_mRNA_abundance = mRNA_abundance.T.groupby(level="Condition").mean().T
std_mRNA_abundance = mRNA_abundance.T.groupby(level="Condition").std().T
cv_mRNA_abundance = std_mRNA_abundance.div(mean_mRNA_abundance)

mRNA_abundance_statistics = pd.concat(
    [mean_mRNA_abundance, std_mRNA_abundance, cv_mRNA_abundance],
    axis=1,
    keys=['mean', 'std', 'cv'],
)

# Replace the published identifiers with the values returned by update_sysIDs.
mRNA_abundance_statistics.index = update_sysIDs(mRNA_abundance_statistics.index.tolist(), cfg.gene_meta_file)

# Keep coding genes only, drop rows with any missing statistic, round to 3 decimals.
# NOTE(review): a gene whose replicates are all zero in one condition has cv = 0/0 = NaN
# and is therefore removed entirely by dropna() — confirm this is intended.
coding_gene_mRNA_abundace_statistics = (
    mRNA_abundance_statistics
    .loc[mRNA_abundance_statistics.index.isin(coding_genes)]
    .copy()
    .dropna()
    .round(3)
)

# Flatten the (statistic, condition) column pairs into "<statistic>_<condition>" names.
coding_gene_mRNA_abundace_statistics.columns = [
    "_".join(levels).strip() for levels in coding_gene_mRNA_abundace_statistics.columns
]

### RNA kinetics

In [None]:
# tAIg, half-life and synthesis-rate columns from sheet "Table" of the PMID 27825301
# supplementary workbook, indexed by "Gene ID" and given notebook-friendly names.
mRNA_kinetics = (
    pd.read_excel("../../resources/Literature/PMID27825301/12864_2016_3237_MOESM22_ESM.xls", sheet_name="Table")
    .loc[:, ["Gene ID", "tAIg", "HL - Mata (5)", "SR - Mata (5)"]]
    .set_index("Gene ID")
    .rename(
        columns={
            "HL - Mata (5)": "mRNA_half_life_minutes",
            "SR - Mata (5)": "mRNA_synthesis_rate_per_minute",
        }
    )
)

# Replace the published identifiers with the values returned by update_sysIDs.
mRNA_kinetics.index = update_sysIDs(mRNA_kinetics.index.tolist(), cfg.gene_meta_file)

# Keep coding genes only, drop rows with any missing value, round to 3 decimals.
coding_gene_mRNA_kinetics = (
    mRNA_kinetics
    .loc[mRNA_kinetics.index.isin(coding_genes)]
    .copy()
    .dropna()
    .round(3)
)

### Concatenate RNA features

In [None]:
# Outer join on the gene index: a gene present in only one of the two RNA tables is kept,
# with NaN in the columns of the other table.
coding_gene_RNA_features_df = coding_gene_mRNA_abundace_statistics.join(
    coding_gene_mRNA_kinetics,
    how="outer",
)

In [None]:
# Column dtypes and non-null counts of the merged RNA-level feature table.
coding_gene_RNA_features_df.info()

## 4.3 Protein-level features

### Protein basic properties

In [None]:
# Protein metadata as exposed by the project config.
# NOTE(review): the type of cfg.protein_meta (path vs. loaded table) is not visible here — confirm before use.
protein_metadata = cfg.protein_meta

### Protein features from the peptide sequences

In [5]:
# Derive per-protein features from the PomBase peptide FASTA file.
# NOTE(review): the returned structure and its key/index are defined in pombe_feature_functions — confirm there.
protein_features_from_peptide_sequences = extract_protein_features_from_peptide_sequence(cfg.PomBase_resource_dir / "genome_sequence_and_features" / "peptide.fa")





### Protein abundance

In [None]:
# PomBase quantitative gene expression table (tab-separated).
gene_abundance = pd.read_csv(cfg.PomBase_resource_dir / "RNA_metadata" / "quantitative_gene_expression.tsv", sep="\t")


def _protein_copies_per_cell(condition: str) -> pd.DataFrame:
    """Return gene ID and protein copies per cell from PMID:23101633 for one condition.

    Rows with a missing gene ID or copy number are dropped and the copy number is cast to float.
    """
    wanted = (
        (gene_abundance["reference"] == 'PMID:23101633')
        & (gene_abundance["type"] == 'protein')
        & (gene_abundance["condition"] == condition)
    )
    return (
        gene_abundance.loc[wanted, ["gene_systematic_id", "copies_per_cell"]]
        .dropna()
        .astype({"copies_per_cell": float})
    )


EMM_proliferating_cell_protein_abundance = _protein_copies_per_cell('glucose MM,standard temperature')
EMMN_quiescent_cell_protein_abundance = _protein_copies_per_cell('glucose MM,nitrogen absent,standard temperature')

# Inner join: only genes measured in both conditions are kept; the suffixes tell the two
# copies_per_cell columns apart.
protein_abundance = EMM_proliferating_cell_protein_abundance.merge(
    EMMN_quiescent_cell_protein_abundance,
    on="gene_systematic_id",
    suffixes=("_EMM_Proliferating_Cell", "_EMMN_Quiescent_Cell"),
).set_index("gene_systematic_id")

### Protein turnover

In [None]:
# Protein turnover table from the PMID 25466257 supplementary workbook. "n.d." cells are read
# as missing, and rows lacking either a degradation rate or a half-life are discarded.
protein_kinetics = pd.read_excel(
    "../../resources/Literature/PMID25466257/1-s2.0-S2211124714009346-mmc3.xlsx",
    na_values=["n.d."],
).dropna(subset=["Degradation rates (min-1)", "t1/2 (min)"])

# Where "ENSG" is empty, fall back to the value in "Gene name".
protein_kinetics["ENSG"] = protein_kinetics["ENSG"].fillna(protein_kinetics["Gene name"])

# An "ENSG" cell may hold several ";"-separated identifiers. Parking the half-life in the
# index lets explode() repeat it for every identifier; reset_index() then restores it as a
# column, giving one row per (half-life, identifier) pair.
protein_kinetics = protein_kinetics[["ENSG", "t1/2 (min)"]].set_index("t1/2 (min)")
protein_kinetics = protein_kinetics["ENSG"].str.split(";").explode().reset_index()

# Identifiers as returned by update_sysIDs. The column was previously misspelled
# "gene_systeamtic_id"; it now uses the same "gene_systematic_id" name as the other tables.
protein_kinetics["gene_systematic_id"] = update_sysIDs(protein_kinetics["ENSG"].tolist(), cfg.gene_meta_file)

### pLDDT features

In [None]:
# pLDDT statistics for the "pdb.gz" structure files under ALPHAFOLD_DATABASE_DIR.
# NOTE(review): the result is expected to carry a "uniprot_id" column (read by the following
# cell) — confirm against pLDDT_statistics_report.
pLDDTs = pLDDT_statistics_report(
    structure_dir=ALPHAFOLD_DATABASE_DIR,
    structure_format="pdb.gz"
)

In [None]:
# Attach the gene systematic ID for each UniProt accession; accessions absent from
# uniprot2id map to NaN.
pLDDTs = pLDDTs.assign(Systematic_ID=pLDDTs["uniprot_id"].map(uniprot2id))

### Domain features

In [16]:
# PomBase table of protein families and domains (tab-separated).
protein_domains = pd.read_csv(cfg.PomBase_resource_dir / "Protein_features" / "protein_families_and_domains.tsv", sep="\t")

# Number of PFAM rows per systematic ID.
pfam_domain_counts = (
    protein_domains[protein_domains["database"] == 'PFAM']
    .groupby("systematic_id")
    .size()
    .rename("PFAM_domain_count")
)

### Concatenate protein features

## 4.4 Evolutionary-level features

### Ortholog count

In [None]:
def get_ortholog_counts(ortholog_file: Path) -> pd.Series:
    """Count the orthologs listed for each gene in a two-column ortholog table.

    Args:
        ortholog_file: Header-less, tab-separated file. The first column is the gene
            systematic ID; the second holds "|"-separated orthologs, or "NONE".

    Returns:
        Integer Series named "orthologs", indexed by "gene_systematic_id". Genes whose
        entry is "NONE" (read as missing) get 0.
    """

    def _n_entries(ids) -> int:
        # Missing entries stay NaN after the split instead of becoming a list.
        return len(ids) if isinstance(ids, list) else 0

    table = pd.read_csv(
        ortholog_file,
        sep="\t",
        header=None,
        names=["gene_systematic_id", "orthologs"],
        index_col=0,
        na_values="NONE",
    )
    return table["orthologs"].str.split("|").apply(_n_entries)

# Curated ortholog tables shipped with the PomBase resources, one file per target species.
ortholog_dir = cfg.PomBase_resource_dir / "curated_orthologs"

num_of_orthologs_with_japonicus = get_ortholog_counts(ortholog_dir / "pombe_japonicus_orthologs.txt")
num_of_orthologs_with_cerevisiae = get_ortholog_counts(ortholog_dir / "pombe_cerevisiae_orthologs.txt")
num_of_orthologs_with_human = get_ortholog_counts(ortholog_dir / "pombe_human_orthologs.txt")


### Paralog count

In [None]:

# Paralog table exported from Ensembl BioMart (tab-separated).
pombe_paralogs = pd.read_csv("../../resources/pombe_paralog_from_ensemble_biomart_export.tsv", sep="\t")

# Number of table rows per coding gene, keyed by (Gene stable ID, Gene name).
# - size() replaces apply(lambda g: g.shape[0]): same counts, no per-group Python call,
#   and it still returns a Series when no rows match.
# - dropna=False keeps genes whose "Gene name" is missing; by default groupby discards any
#   row with a NaN key, which would silently drop every unnamed gene.
# NOTE(review): rows are counted whether or not they actually list a paralog — if the
# export has a placeholder row for genes without paralogs, those genes count as 1; verify.
paralog_count = (
    pombe_paralogs
    .query("`Gene stable ID` in @coding_genes")
    .groupby(["Gene stable ID", "Gene name"], dropna=False)
    .size()
    .rename("paralog_count")
)



## 4.5 Network-level features

### GO term richness

In [18]:
# GO ontology, annotation and slim-term files shipped with the PomBase resources.
ontology_dir = cfg.PomBase_resource_dir / "ontologies_and_associations"

ontology_inputs = OntologyDataConfig(
    ontology_obo=ontology_dir / "go-basic.obo",
    ontology_association_gaf=ontology_dir / "gene_ontology_annotation.gaf.tsv",
    slim_terms_table=[
        ontology_dir / "bp_go_slim_terms.tsv",
        ontology_dir / "mf_go_slim_terms.tsv",
        ontology_dir / "cc_go_slim_terms.tsv",
    ],
).load_data()

# The loader returns seven objects; gene2go is the one used below for GO term richness.
dag, objanno, ns2assoc, gene2go, go2genes, slim_dag, ns2slim_assoc = load_ontology_data(
    ontology_inputs,
    relationships={"is_a", "part_of"},
    propagate_counts=True,
    load_obsolete=False,
    prt=None,
)

/data/c/yangyusheng_optimized/DIT_HAP_pipeline/resources/pombase_data/2025-10-01/ontologies_and_associations/go-basic.obo: fmt(1.2) rel(2025-10-10) 42,666 Terms; optional_attrs(def relationship)
HMS:0:00:01.890109  49,447 annotations READ: /data/c/yangyusheng_optimized/DIT_HAP_pipeline/resources/pombase_data/2025-10-01/ontologies_and_associations/gene_ontology_annotation.gaf.tsv 


In [20]:
# GO term richness = number of distinct GO terms associated with each gene in gene2go.
gene2gorichness = {
    gene: len(set(go_terms))
    for gene, go_terms in tqdm(gene2go.items(), desc="Calculating GO term richness")
}

# One row per gene, single column "GO_term_richness".
go_richness = pd.DataFrame.from_dict(gene2gorichness, orient="index", columns=["GO_term_richness"])

Calculating GO term richness: 100%|██████████| 5262/5262 [00:00<00:00, 190203.28it/s]


### PPI degree and GI degree

In [None]:
# BioGRID interaction table for S. pombe (tab3 format, tab-separated).
biogrid_data = pd.read_csv("../../resources/BioGrid/BIOGRID-ORGANISM-Schizosaccharomyces_pombe_972h-5.0.251.tab3.txt", sep="\t")

# One row per distinct (interactor A, interactor B, system type) record; repeated evidence
# for the same record is collapsed.
PPI_and_GI = biogrid_data[['Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Experimental System Type']].drop_duplicates()

PPI = PPI_and_GI.query("`Experimental System Type` == 'physical'")
GI = PPI_and_GI.query("`Experimental System Type` == 'genetic'")


def _interaction_degree(edges: pd.DataFrame, name: str) -> pd.Series:
    """Count the distinct interaction partners of every gene in an edge table.

    A gene is counted whether it appears as interactor A or as interactor B, and a pair
    recorded in both directions counts once. Grouping on interactor A alone (the previous
    behavior) gave no degree to genes that only appear as interactor B and double-counted
    nothing but still missed reciprocal partners.

    Args:
        edges: Table with the two BioGRID "Systematic Name Interactor" columns.
        name: Name given to the returned Series.

    Returns:
        Integer Series of partner counts. The index keeps the historical name
        'Systematic Name Interactor A' so that existing joins on it keep working.
        A self-interaction contributes one partner (the gene itself).
    """
    col_a = 'Systematic Name Interactor A'
    col_b = 'Systematic Name Interactor B'
    forward = edges[[col_a, col_b]].rename(columns={col_a: "gene", col_b: "partner"})
    reverse = edges[[col_b, col_a]].rename(columns={col_b: "gene", col_a: "partner"})
    undirected = pd.concat([forward, reverse], ignore_index=True).drop_duplicates()
    return undirected.groupby("gene").size().rename_axis(col_a).rename(name)


PPI_degrees = _interaction_degree(PPI, "PPI_degree")
GI_degrees = _interaction_degree(GI, "GI_degree")


## 4.6 Phenotype-level features

### FYPO Essentiality

In [5]:
# Header-less, tab-separated two-column file; the column names are assigned here
# (gene systematic ID and its viability annotation).
FYPO_viability = pd.read_csv(cfg.PomBase_resource_dir / "Gene_metadata" / "gene_viability.tsv", sep="\t", header=None, names=["gene_systematic_id", "FYPOviability"])

### Deletion Library Results

In [8]:
# Keep only the updated systematic ID, the dispensability call and the category columns
# from the Hayles_2013 workbook.
DeletionLibrary_essentiality = pd.read_excel("../../resources/Hayles_2013_OB_merged_categories_sysIDupdated.xlsx")[["Updated_Systematic_ID", "Gene dispensability. This study", "Category"]]

### Transposon insertion density