# Hyperparameters

In [2]:
import json
import os
import re
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.model_selection import train_test_split
import random
import torch

import utils.dataset_functions as dataf

# Initializations
SEED=42
random.seed(SEED)
np.random.seed(SEED)

# Directory containing MSigDB JSON files
JSON_DIR = "/home/gdallagl/myworkdir/data/MSigDB/msigdb_v2025.1.Hs_json_files_to_download_locally"

# cellcycel geensets saving apth
CELL_CYCLE_CSV_PATH = "/home/gdallagl/myworkdir/data/MSigDB/cell_cycle_genesets.csv"

# Guaranted genes list
GUARANTEED_GENES_PATH = "/home/gdallagl/myworkdir/data/MSigDB/julies_cycling_signatures_cancer.tsv"

# Updated keywords pattern with word boundaries to avoid false matches
KEYWORDS_PATTERN = "|".join([
    "_PROLIFERATION_", #avoid "proliferative" "proliferator"
    "_CYCLING",  # avoid "recycling"
    "CELL_CYCLE",
    "_CC_", "_G1_", "_S_PHASE_", "_G2_", "_M_PHASE_", # avoid "aCCumbens"
    "MITOSIS", "MITOTIC",
    "CDK",
    "CHECKPOINT"
])

# Exclusion pattern
EXCLUSION_PATTERN = r"MEIOTIC|MEIOSIS|FATTY_ACID_CYCLING_MODEL"


# KEYWORDS_PATTERN = "|".join([
#     # Key regulatory phrases
#     "_CELL_CYCLE_CHECKPOINT",
#     "CELL_CYCLE_REGULATION",
#     "CELL_CYCLE_CONTROL",

#     "HALLMARK_E2F_TARGETS", 
#     " HALLMARK_G2M_CHECKPOINT",
#     # Core regulators
#     "CDK", "_CYCLIN",

#     # Transitions
#     "_G1_S", "G1_S_", "G2_M", "_S_PHASE_", "M_PHASE", "MITOTIC_CHECKPOINT"
# ])
# # Exclusion pattern
# EXCLUSION_PATTERN = r"MEIOTIC|MEIOSIS|FATTY_ACID_CYCLING_MODEL|CDK5|CANCER|APOPTOSIS|RECYCLING|SPINDLE|KINETOCHORE|CYTOKINESIS|CENTROSOME|KEGG_MEDICUS_PATHOGEN|KEGG_MEDICUS_VARIANT|KEGG_MEDICUS_REFERENCE|PREDICTED|HE_LIM_SUN_FETAL_LUNG|MALIGNANT|MORF_CDK2"

# Human proteome path
HUMAN_PROTEOME_PATH = "/home/gdallagl/myworkdir/data/UniRef50/human_proteome.tsv"

# mapping protein-gene apth
MAPPING_PATH = "/home/gdallagl/myworkdir/ESMSec/data/UniRef50/HUMAN_9606_idmapping.dat"

# Minimum frequency threshold for filtering ambiguous genes
MIN_FREQ_AMBIGOUS = 2

# min number of postive samples per positive cluster
MIN_SAMPLE_N_POSITIVE = 2

# multiplocative factor to kwno how much morenegative class to sample
NEGATIVE_CLASS_MULT = 3

# if use as postive class only the guaranted genes
ONLY_GUARANTEED = False
only_guaranteed = "only-guaranteed_" if ONLY_GUARANTEED else ""

# id samplepostive class or take all postive egens
TAKE_ALL_POSITIVE_GENES = False

# If use only curated (swissprot) proteins or not
ONLY_REVIEWED_PROTEINS = True

# savifn csv datset
FINAL_DATASET_PATH = f"/home/gdallagl/myworkdir/ESMSec/data/cell_cycle/dataset-cell-cycle_{only_guaranteed}{MIN_SAMPLE_N_POSITIVE}:{NEGATIVE_CLASS_MULT}.csv"
print(FINAL_DATASET_PATH)

# Autorelaod
%load_ext autoreload
%autoreload 2

/home/gdallagl/myworkdir/ESMSec/data/cell_cycle/dataset-cell-cycle_2:3.csv


In [45]:
column_names = [
    'Gene Name', 'AC', 'Variant AA Change', 'Source DB ID',
    'Consequence Type', 'Clinical Significance', 'Phenotype/Disease',
    'Phenotype/Disease Source', 'Cytogenetic Band', 'Chromosome Coordinate',
    'Ensembl gene ID', 'Ensembl transcript ID', 'Ensembl translation ID',
    'Evidence'
]
df_original = pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/mutations/uniprot_homo_sapiens_variation.txt", skiprows=164, sep="\t", #nrows=98,
            header=None,           # Crucial: Tells pandas there is NO header in the lines being read
            names=column_names,     # Assign the custom column names
            usecols=['Gene Name', 'Variant AA Change', 'Source DB ID', 'Consequence Type', 'Clinical Significance', 'Phenotype/Disease',] # **Only load these two columns**
        )

In [49]:

df = df_original[
    (df_original["Clinical Significance"] != "-") &
    (~ df_original["Clinical Significance"].isna()) &
    (df_original["Consequence Type"].isin(['missense variant']))
].copy()

df

Unnamed: 0,Gene Name,Variant AA Change,Source DB ID,Consequence Type,Clinical Significance,Phenotype/Disease
33,NSRP1,p.Lys30Glu,RCV004545615,missense variant,Likely benign,NSRP1-related disorder
273,NSRP1,p.Asp273Glu,RCV003493117,missense variant,Variant of uncertain significance,"Neurodevelopmental disorder with spasticity, s..."
1027,SERPINB6,p.Met5Ile,RCV001783729,missense variant,Likely pathogenic,Autosomal recessive nonsyndromic hearing loss 91
1035,SERPINB6,p.Val45Ile,RCV004757137,missense variant,"Variant of uncertain significance, Likely benign",SERPINB6-related disorder
1036,SERPINB6,p.Val45Ile,RCV001335154,missense variant,"Variant of uncertain significance, Likely benign",Autosomal recessive nonsyndromic hearing loss 91
...,...,...,...,...,...,...
35687810,NT5C3A,p.Asp229Val,RCV001001047,missense variant,Variant of uncertain significance,Hemolytic anemia due to pyrimidine 5' nucleoti...
35687819,NT5C3A,p.Val233Leu,RCV003486174,missense variant,Variant of uncertain significance,Hemolytic anemia due to pyrimidine 5' nucleoti...
35687861,NT5C3A,p.Leu270Pro,RCV003133084,missense variant,Variant of uncertain significance,Hemolytic anemia due to pyrimidine 5' nucleoti...
35687866,NT5C3A,p.Gly275Arg,RCV000004745,missense variant,Pathogenic,Hemolytic anemia due to pyrimidine 5' nucleoti...


In [52]:
PATHOGENIC_TERMS = ['Pathogenic', 'Likely pathogenic']
BENIGN_TERMS = ['Benign', 'Likely benign']
VUS_TERM = 'Variant of uncertain significance'
OTHER_TERMS = ['Risk factor', 'Drug response', 'Protective', 'association', 'Conflicting interpretations of pathogenicity']


def simplify_clinical_significance(text):
    """Refactors a clinical significance string into a smaller category."""
    if not isinstance(text, str):
        return 'Unknown'
    
    # Standardize the string: lowercase and split by comma/space to get individual terms
    terms = set([term.strip() for part in text.lower().split(',') for term in part.split() if term.strip()])

    # Check for core classifications
    is_pathogenic = any(p_term.lower() in terms for p_term in PATHOGENIC_TERMS)
    is_benign = any(b_term.lower() in terms for b_term in BENIGN_TERMS)
    is_vus = VUS_TERM.lower() in terms
    
    # Check for other modifiers
    is_other = any(o_term.lower().replace(' ', '_') in terms for o_term in OTHER_TERMS)

    # ------------------------------------------------------------------
    # Apply Classification Logic
    # ------------------------------------------------------------------
    
    if is_pathogenic and is_benign:
        # Mixed categories (e.g., Pathogenic, Benign) are highly ambiguous and should be flagged
        return 'Conflicting/Ambiguous'
    
    elif is_pathogenic:
        return 'Pathogenic'
    
    elif is_benign:
        return 'Benign'
    
    elif is_vus:
        # Check if VUS is combined with only "other" terms, but not P/LP/B/LB
        return 'VUS'
        
    elif is_other:
        # Handle single modifiers that didn't contain P/LP/B/LB/VUS
        return 'Other/Modifier'

    return 'Unclassified'

df['Simplified Significance'] = df['Clinical Significance'].apply(simplify_clinical_significance)

# Display the new categories
print(df[['Clinical Significance', 'Simplified Significance']].head(12))
print("\nNew Categories Count:")
print(df['Simplified Significance'].value_counts())

                                  Clinical Significance  \
33                                        Likely benign   
273                   Variant of uncertain significance   
1027                                  Likely pathogenic   
1035   Variant of uncertain significance, Likely benign   
1036   Variant of uncertain significance, Likely benign   
1046                  Variant of uncertain significance   
1052                  Variant of uncertain significance   
1057                                             Benign   
1059   Likely benign, Variant of uncertain significance   
1073                                             Benign   
1111  Likely pathogenic, Variant of uncertain signif...   
1118                  Variant of uncertain significance   

     Simplified Significance  
33                    Benign  
273             Unclassified  
1027              Pathogenic  
1035                  Benign  
1036                  Benign  
1046            Unclassified  
1052           

In [53]:
df['Simplified Significance'].value_counts()

df = df[
    (df['Simplified Significance'].isin(["Pathogenic", "Benign"]))
].copy()

df

Unnamed: 0,Gene Name,Variant AA Change,Source DB ID,Consequence Type,Clinical Significance,Phenotype/Disease,Simplified Significance
33,NSRP1,p.Lys30Glu,RCV004545615,missense variant,Likely benign,NSRP1-related disorder,Benign
1027,SERPINB6,p.Met5Ile,RCV001783729,missense variant,Likely pathogenic,Autosomal recessive nonsyndromic hearing loss 91,Pathogenic
1035,SERPINB6,p.Val45Ile,RCV004757137,missense variant,"Variant of uncertain significance, Likely benign",SERPINB6-related disorder,Benign
1036,SERPINB6,p.Val45Ile,RCV001335154,missense variant,"Variant of uncertain significance, Likely benign",Autosomal recessive nonsyndromic hearing loss 91,Benign
1057,SERPINB6,p.Met94Val,RCV001807008,missense variant,Benign,Autosomal recessive nonsyndromic hearing loss 91,Benign
...,...,...,...,...,...,...,...
35687646,NT5C3A,p.Asp132Val,RCV000004736,missense variant,"Pathogenic, Likely pathogenic",Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
35687716,NT5C3A,p.Val174Ile,RCV003133083,missense variant,"Likely benign, Variant of uncertain significance",Hemolytic anemia due to pyrimidine 5' nucleoti...,Benign
35687798,NT5C3A,p.Asn224Ser,rs104894028,missense variant,Pathogenic,Adenomas and Adenocarcinomas,Pathogenic
35687801,NT5C3A,p.Asn224Ser,RCV000004743,missense variant,Pathogenic,Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic


In [57]:
df[(df["Gene Name"] == "NT5C3A")]

Unnamed: 0,Gene Name,Variant AA Change,Source DB ID,Consequence Type,Clinical Significance,Phenotype/Disease,Simplified Significance
31233377,NT5C3A,p.Asp98Val,RCV000004736,missense variant,"Pathogenic, Likely pathogenic",Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
31233446,NT5C3A,p.Val140Ile,RCV003133083,missense variant,"Likely benign, Variant of uncertain significance",Hemolytic anemia due to pyrimidine 5' nucleoti...,Benign
31233518,NT5C3A,p.Asn190Ser,RCV000004743,missense variant,Pathogenic,Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
31233574,NT5C3A,p.Gly241Arg,RCV000004745,missense variant,Pathogenic,Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
31233657,NT5C3A,p.Asp86Val,RCV000004736,missense variant,"Pathogenic, Likely pathogenic",Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
31233668,NT5C3A,p.Val128Ile,RCV003133083,missense variant,"Likely benign, Variant of uncertain significance",Hemolytic anemia due to pyrimidine 5' nucleoti...,Benign
31233687,NT5C3A,p.Asn178Ser,RCV000004743,missense variant,Pathogenic,Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
31233697,NT5C3A,p.Gly229Arg,RCV000004745,missense variant,Pathogenic,Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic
35687511,NT5C3A,p.Phe31Leu,RCV001802318,missense variant,"Benign, Likely benign",Hemolytic anemia due to pyrimidine 5' nucleoti...,Benign
35687646,NT5C3A,p.Asp132Val,RCV000004736,missense variant,"Pathogenic, Likely pathogenic",Hemolytic anemia due to pyrimidine 5' nucleoti...,Pathogenic


In [None]:
df["Consequence Type"].unique()
['missense variant',
       'stop gained',  
       'inframe deletion', 'delins', 'duplication', 'stop retained']

array(['missense variant', 'frameshift', 'nonsense', 'stop lost',
       'stop gained', 'insertion', 'initiator codon variant',
       'inframe deletion', 'delins', 'duplication', 'stop retained'],
      dtype=object)

In [6]:
df = pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/cell_cycle/datasets/3_parties/oncokb.org-cancer-genes_cancerGeneList.tsv", sep="\t")
df = df[df["Gene Type"].isin(["ONCOGENE", "TSG"])]
df

Unnamed: 0,Hugo Symbol,Entrez Gene ID,GRCh37 Isoform,GRCh37 RefSeq,GRCh38 Isoform,GRCh38 RefSeq,Gene Type,# of occurrence within resources (Column J-P),OncoKB Annotated,MSK-IMPACT,MSK-HEME,FOUNDATION ONE,FOUNDATION ONE HEME,Vogelstein,COSMIC CGC (v99),Gene Aliases
0,ABL1,25,ENST00000318560,NM_005157.4,ENST00000318560,NM_005157.4,ONCOGENE,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"ABL, JTK7, c-ABL"
1,AKT1,207,ENST00000349310,NM_001014431.1,ENST00000349310,NM_001014431.1,ONCOGENE,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"AKT, PKB, PRKBA, RAC, RAC-alpha"
2,ALK,238,ENST00000389048,NM_004304.4,ENST00000389048,NM_004304.4,ONCOGENE,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,CD246
3,AMER1,139285,ENST00000330258,NM_152424.3,ENST00000374869,NM_152424.3,TSG,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"FAM123B, FLJ39827, RP11-403E24.2, WTX"
4,APC,324,ENST00000257430,NM_000038.5,ENST00000257430,NM_000038.5,TSG,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"DP2.5, PPP1R46"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,YY1,7528,ENST00000262238,NM_003403.4,ENST00000262238,NM_003403.4,ONCOGENE,1,Yes,No,No,No,No,No,No,"INO80S, UCRBP, YIN-YANG-1"
1197,ZFP36L1,677,ENST00000336440,NM_001244698.1,ENST00000336440,NM_001244698.1,TSG,1,Yes,No,No,No,No,No,No,"Berg36, RNF162B, TIS11B, cMG1"
1198,ZFP36L2,678,ENST00000282388,NM_006887.4,ENST00000282388,NM_006887.4,TSG,1,Yes,No,No,No,No,No,No,"ERF2, RNF162C, TIS11D"
1200,ZNF292,23036,ENST00000369577,NM_015021.1,ENST00000369577,NM_015021.3,TSG,1,Yes,No,No,No,No,No,No,"KIAA0530, ZFP292, Zn-15, Zn-16, bA393I2.3"


## Save

# ------------------------------
# ------------------------------
# ------------------------------
# ------------------------------
# ------------------------------
# ------------------------------

# YU dataset

In [None]:
from Bio import SeqIO
import pandas as pd

# Parse all sequences in the FASTA file
records = list(SeqIO.parse("/home/gdallagl/myworkdir/ESMSec/data/cell_cycle/0_class_yu_et_all.fasta", "fasta"))
data = {
    "protein": [record.id for record in records],
    "sequence": [str(record.seq) for record in records],
    "label": [0 for record in records],
}
df_neg = pd.DataFrame(data)
print(df_neg.shape)


records = list(SeqIO.parse("/home/gdallagl/myworkdir/ESMSec/data/cell_cycle/1_class_yu_et_all.fasta", "fasta"))
data = {
    "protein": [record.id for record in records],
    "sequence": [str(record.seq) for record in records],
    "label": [1 for record in records],
}
df_pos = pd.DataFrame(data)
print(df_pos.shape)

df = pd.concat([df_neg, df_pos])

df


In [None]:
train_idx, test_idx = train_test_split(
    df.index,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)

# Assign splits
df['set'] = ''
df.loc[train_idx, 'set'] = 'train'
df.loc[test_idx, 'set'] = 'test'
df.loc[test_idx[0], 'set'] = 'val' #just temp
df.reset_index(drop=True, inplace=True)
df["protein"] = range(df["protein"].shape[0])

print(df.groupby('set')['label'].value_counts().unstack(fill_value=0))
display(df)
df

In [None]:
df.iloc[[332]].sequence.to_list()

In [None]:
df.to_csv("/home/gdallagl/myworkdir/ESMSec/data/cell_cycle/yu_et_all_cyclins.csv")

In [None]:
import pandas as pd

# 2. Use pd.read_csv with appropriate parameters
df = pd.read_csv(
    "/home/gdallagl/myworkdir/ESMSec/data/pfam/pdb_pfam_mapping.txt", 
    sep='\t', 
    comment='#', # Skips lines that start with '#' (the metadata lines)
    skipinitialspace=True # Handles any extra spaces after the tab delimiter
)

# --- Explanation of the Output ---
print(df.head())
print(f"\nDataFrame shape: {df.shape}")
df.columns

In [None]:
# --- 2. Select the necessary columns ---
# The PDB information is redundant if you only want protein-Pfam mapping
df_mapping = df[['UNIPROT_ACCESSION', 'PFAM_ACCESSION']].copy()

# --- 3. Filter for unique protein-domain pairs ---
# Remove duplicates (e.g., if the same domain is found in multiple chains/PDB structures)
df_unique = df_mapping.drop_duplicates()

# --- 4. Group by Protein and Aggregate Pfam IDs ---
# Group by the UniProt ID and aggregate all unique Pfam IDs into a list (or string)

# Option A: Aggregate into a list (better for analysis)
df_result_list = df_unique.groupby('UNIPROT_ACCESSION')['PFAM_ACCESSION'].agg(list).reset_index()
df_result_list.columns = ['Protein_Accession', 'Pfam_Families_List']

# Option B: Aggregate into a single string separated by a semicolon (easier for saving/display)
df_result_string = df_unique.groupby('UNIPROT_ACCESSION')['PFAM_ACCESSION'].agg(lambda x: ';'.join(sorted(x))).reset_index()
df_result_string.columns = ['Protein_Accession', 'Pfam_Families_String']


# --- Output the preferred DataFrame ---
print("--- DataFrame with Pfam Families as a List per Protein (Option A) ---")
print(df_result_list.head())
print(f"\nTotal unique proteins: {len(df_result_list)}")

print("\n--- DataFrame with Pfam Families as a Semicolon-Separated String (Option B) ---")
print(df_result_string.head())

In [None]:
prot = pd.read_csv("/home/gdallagl/myworkdir/ESMSec/data/UniRef50/human_proteome.tsv", sep="\t")
prot

In [None]:
df_result_string = df_result_string[df_result_string.Protein_Accession.isin(set(prot[prot.Reviewed == "reviewed"].Entry))]

df_result_string