In [None]:
import pandas as pd
import os
import requests
from xml.etree import ElementTree as ET
from time import sleep

# Paths to the TTD (Therapeutic Target Database) files, relative to the current folder
file_genes = "./ttd_targets_full_data.txt"
file_drugs = "./ttd_drug_synonyms.txt"
file_mapping = "./ttd_drug_target_mapping.xlsx"
file_activity = "./ttd_target_compound_activity.txt"

# Input CSV files with gene lists (REPLACE with your own paths).
# Each file is read later with pandas and must contain a "Gene" column.
input_files = [
    # AML files
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\AML\AML_depscore_-0.2_zscore_-0.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\AML\AML_depscore_-0.2_zscore_-1.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\AML\AML_depscore_-0.2_zscore_-1.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\AML\AML_depscore_-0.15_zscore_-0.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\AML\AML_depscore_-0.15_zscore_-1.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\AML\AML_depscore_-0.15_zscore_-1.csv",
    
    # ALL files
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\ALL\ALL_depscore_-0.2_zscore_-0.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\ALL\ALL_depscore_-0.2_zscore_-1.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\ALL\ALL_depscore_-0.2_zscore_-1.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\ALL\ALL_depscore_-0.15_zscore_-0.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\ALL\ALL_depscore_-0.15_zscore_-1.5.csv",
    r"C:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\thresholds\gene_selection\ALL\ALL_depscore_-0.15_zscore_-1.csv"
]

# Accumulates one summary-statistics dict per processed input file
summary_data = []

# Helper functions (unchanged from the previous version)
def get_uniprot_id(uniprot_ac):
    """Resolve a UniProt entry name to its first listed accession.

    Args:
        uniprot_ac: UniProt entry name as stored in TTD; only values ending
            in "_HUMAN" are looked up.

    Returns:
        "N/A" when the input is "N/A" or does not end in "_HUMAN".
        Otherwise the text of the first <accession> element of the entry's
        XML record, or the input value itself when the request fails, the
        XML cannot be parsed, or no accession is present.
    """
    if uniprot_ac == "N/A" or not uniprot_ac.endswith("_HUMAN"):
        return "N/A"

    ns = "{http://uniprot.org/uniprot}"
    try:
        reply = requests.get(
            f"https://www.uniprot.org/uniprot/{uniprot_ac}.xml", timeout=10
        )
        reply.raise_for_status()
        entry = ET.fromstring(reply.text).find(f"{ns}entry")
        found = entry.findall(f"{ns}accession") if entry is not None else []
        if found:
            return found[0].text
    except Exception:
        # Best effort: any network/parsing problem falls back to the input.
        return uniprot_ac
    return uniprot_ac

def get_protein_name(uniprot_ac):
    """Fetch the full protein name for a UniProt entry name.

    Args:
        uniprot_ac: UniProt entry name as stored in TTD; only values ending
            in "_HUMAN" are looked up.

    Returns:
        The text of the entry's recommendedName/fullName element if present,
        otherwise of the first alternativeName/fullName element. "N/A" when
        the input is "N/A" or not a "_HUMAN" entry, when neither name is
        found, or when the request or XML parsing fails.
    """
    if uniprot_ac == "N/A" or not uniprot_ac.endswith("_HUMAN"):
        return "N/A"

    ns = "{http://uniprot.org/uniprot}"
    try:
        reply = requests.get(
            f"https://www.uniprot.org/uniprot/{uniprot_ac}.xml", timeout=10
        )
        reply.raise_for_status()
        entry = ET.fromstring(reply.text).find(f"{ns}entry")
        if entry is None:
            return "N/A"
        # Prefer the recommended name; fall back to an alternative name.
        for name_kind in ("recommendedName", "alternativeName"):
            node = entry.find(f".//{ns}{name_kind}/{ns}fullName")
            if node is not None:
                return node.text
        return "N/A"
    except Exception:
        # Best effort: any network/parsing problem yields "N/A".
        return "N/A"

def parse_genes_and_drugs(file_path):
    """Parse the TTD target file into per-target and per-drug dictionaries.

    The file is read as UTF-8, one tab-separated record per line. Only lines
    whose first field is "T" followed by digits are used; every other line
    (headers, blank lines) is skipped. The second field is a tag:

    * GENENAME / UNIPROID - the third field is stored on the target.
    * DRUGINFO - third field is the drug ID, fourth the drug name, fifth
      (optional) the status. A missing or empty status is stored as "N/A"
      instead of dropping the drug.

    Args:
        file_path: path to the TTD targets file.

    Returns:
        A tuple (genes_data, drugs_data).
        genes_data maps target ID -> {"GENENAME", "UNIPROID", "DRUGS"}, where
        "DRUGS" maps drug ID -> {"DRUGNAME", "STATUS"}.
        drugs_data maps drug ID -> {"DRUGNAME", "STATUS", "SYNONYMS"}; the
        first occurrence of a drug wins and "SYNONYMS" starts as "N/A".
    """
    genes_data = {}
    drugs_data = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            target_id = parts[0]
            if not (target_id.startswith("T") and target_id[1:].isdigit()):
                continue

            target = genes_data.setdefault(
                target_id,
                {"GENENAME": "N/A", "UNIPROID": "N/A", "DRUGS": {}},
            )
            if len(parts) < 3:
                continue

            tag = parts[1]
            if tag in ("GENENAME", "UNIPROID"):
                target[tag] = parts[2]
            elif tag == "DRUGINFO" and len(parts) >= 4:
                drug_id, drug_name = parts[2], parts[3]
                # The status column is optional; keep the drug either way.
                status = parts[4] if len(parts) > 4 and parts[4] else "N/A"
                target["DRUGS"][drug_id] = {
                    "DRUGNAME": drug_name,
                    "STATUS": status,
                }
                if drug_id not in drugs_data:
                    drugs_data[drug_id] = {
                        "DRUGNAME": drug_name,
                        "STATUS": status,
                        "SYNONYMS": "N/A",
                    }
    return genes_data, drugs_data

def add_drug_synonyms(file_path, drugs_data):
    """Merge drug names and synonyms from the TTD synonyms file into drugs_data.

    The file is read as UTF-8, tab-separated; lines with fewer than three
    fields are ignored. The second field is a tag and the third its value:

    * TTDDRUID starts a new drug record (created in drugs_data with "N/A"
      name and synonyms if it is not already there).
    * DRUGNAME overwrites the current drug's "DRUGNAME".
    * SYNONYMS values are collected and stored, joined with ";", under
      "SYNONYMS" when the record ends.

    Args:
        file_path: path to the TTD drug synonyms file.
        drugs_data: dict keyed by drug ID; modified in place.

    Returns:
        None.
    """
    active_id = None
    collected = []

    def flush():
        # Store the synonyms gathered for the record that just ended.
        if active_id and collected:
            drugs_data[active_id]["SYNONYMS"] = ";".join(collected)

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) < 3:
                continue
            tag, value = fields[1], fields[2]
            if tag == "TTDDRUID":
                flush()
                active_id, collected = value, []
                if active_id not in drugs_data:
                    drugs_data[active_id] = {
                        "DRUGNAME": "N/A",
                        "SYNONYMS": "N/A"
                    }
            elif not active_id:
                # Name/synonym lines before any drug record are ignored.
                continue
            elif tag == "DRUGNAME":
                drugs_data[active_id]["DRUGNAME"] = value
            elif tag == "SYNONYMS":
                collected.append(value)
        flush()

def extract_chembl(synonyms):
    """Return the first ChEMBL identifier found in a ";"-separated synonym string.

    Args:
        synonyms: synonyms joined with ";", or the placeholder "N/A".

    Returns:
        The first whitespace-delimited token of the first synonym that starts
        with "CHEMBL" and is longer than the bare prefix; "N/A" if none.
    """
    if synonyms == "N/A":
        return "N/A"
    for candidate in map(str.strip, synonyms.split(";")):
        # A value starting with "CHEMBL" cannot also start with "SCHEMBL",
        # so the single prefix test is sufficient.
        if not candidate.startswith("CHEMBL"):
            continue
        token = candidate.split()[0]
        if len(token) > 6:
            return token
    return "N/A"

def split_activity(activity_str):
    """Split an activity string into (type, value) at the first whitespace run.

    Args:
        activity_str: activity text, a missing value, or the placeholder "N/A".

    Returns:
        ("N/A", "N/A") for missing values or "N/A"; (first_token, remainder)
        when the text has at least two whitespace-separated parts; otherwise
        (activity_str, "N/A") with the input returned unmodified.
    """
    if pd.isna(activity_str) or activity_str == "N/A":
        return ("N/A", "N/A")
    tokens = activity_str.split(None, 1)
    if len(tokens) < 2:
        return (activity_str, "N/A")
    kind, value = tokens
    return (kind, value)

# Load the TTD reference data once; every input file below reuses it.
try:
    genes_data, drugs_data = parse_genes_and_drugs(file_genes)
    add_drug_synonyms(file_drugs, drugs_data)
    # The mapping workbook is read without a header row; columns are named here.
    mapping_df = pd.read_excel(file_mapping, header=None, names=["TARGETID", "TTDDRUID", "Highest_status", "MOA"])
    activity_df = pd.read_csv(file_activity, sep="\t")
except Exception as e:
    print(f"Ошибка при загрузке файлов TTD: {e}")
    # exit() is an interactive helper added by the site module and reports
    # success (status 0); raise SystemExit with a non-zero status instead so
    # the failure is visible to whatever launched the script.
    raise SystemExit(1)

# Process every gene-list file: for each gene, collect its TTD targets, the
# drugs linked to those targets, and write one result CSV per input file.

# Index TTD target IDs by gene name once instead of rescanning genes_data for
# every gene. Insertion order matches the order of genes_data.
targets_by_gene = {}
for tid, info in genes_data.items():
    targets_by_gene.setdefault(info.get("GENENAME"), []).append(tid)

# UniProt lookups keyed by the TTD UNIPROID value. The same protein occurs
# many times across genes and input files, so each one is fetched only once
# per run (a failed lookup is therefore not retried within the run).
uniprot_cache = {}


def _empty_summary(file_name):
    """Return a zero-filled summary row for a file that produced no output."""
    return {
        "File Name": file_name,
        "Total Rows": 0,
        "Yes in Drug Mechanism": 0,
        "Unique Genes": 0,
        "Genes with Yes in Mechanism": 0
    }


for input_file in input_files:
    try:
        # Read the gene list; the CSV must contain a "Gene" column.
        gene_df = pd.read_csv(input_file)
        input_genes = gene_df["Gene"].tolist()

        final_data = []
        for gene in input_genes:
            # Genes without any TTD target are skipped entirely.
            for target_id in targets_by_gene.get(gene, []):
                protein = genes_data[target_id].get("UNIPROID", "N/A")
                if protein not in uniprot_cache:
                    if protein != "N/A":
                        uniprot_cache[protein] = (
                            get_protein_name(protein),
                            get_uniprot_id(protein),
                        )
                        # Throttle only after UniProt was actually queried.
                        sleep(0.5)
                    else:
                        uniprot_cache[protein] = ("N/A", "N/A")
                protein_name, uniprot_id = uniprot_cache[protein]

                # Drugs come from two sources: DRUGINFO rows of the target
                # file and the drug-target mapping workbook.
                target_drugs = genes_data[target_id].get("DRUGS", {})
                mapping_drugs = mapping_df[mapping_df["TARGETID"] == target_id]
                all_drug_ids = set(target_drugs.keys()) | set(mapping_drugs["TTDDRUID"].unique())

                if not all_drug_ids:
                    # Keep the target in the output even without drugs.
                    final_data.append([
                        gene, protein_name, uniprot_id, target_id,
                        "N/A", "N/A", "N/A",
                        "N/A", "N/A", "N/A"
                    ])
                    continue

                # Sorted (by string form, so mixed/missing IDs cannot break the
                # comparison) to make the row order reproducible between runs.
                for drug_id in sorted(all_drug_ids, key=str):
                    drug_info = target_drugs.get(drug_id, {})
                    drug_name = drug_info.get("DRUGNAME", "N/A")
                    status = drug_info.get("STATUS", "N/A")
                    mapping_info = mapping_drugs[mapping_drugs["TTDDRUID"] == drug_id]

                    if drug_name == "N/A":
                        # Drug known only from the mapping workbook: take the
                        # status from there and the name from the drug table.
                        if not mapping_info.empty:
                            status = mapping_info.iloc[0]["Highest_status"]
                        if drug_id in drugs_data:
                            drug_name = drugs_data[drug_id].get("DRUGNAME", "N/A")

                    # Mechanism of action always comes from the first mapping row.
                    moa = "N/A"
                    if not mapping_info.empty and pd.notna(mapping_info.iloc[0]["MOA"]):
                        moa = mapping_info.iloc[0]["MOA"]

                    synonyms = drugs_data.get(drug_id, {}).get("SYNONYMS", "N/A")
                    chembl_id = extract_chembl(synonyms)

                    activity_matches = activity_df[
                        (activity_df["TTD Target ID"] == target_id) &
                        (activity_df["TTD Drug/Compound ID"] == drug_id)
                    ]
                    if activity_matches.empty:
                        activity = "N/A"
                    else:
                        activity = "; ".join(activity_matches["Activity"].astype(str).unique())

                    final_data.append([
                        gene, protein_name, uniprot_id, target_id,
                        drug_name, drug_id, status,
                        moa, activity, chembl_id
                    ])

        # Save the results into the current working directory.
        if not final_data:
            print(f"Нет данных для сохранения из {input_file}")
            summary_data.append(_empty_summary(os.path.basename(input_file)))
        else:
            output_df = pd.DataFrame(final_data, columns=[
                "Gene", "Protein name", "UniProt ID", "Protein TTD ID",
                "Drug name", "Drug TTD ID", "Max phase TTD",
                "Action type TTD", "Activity TTD", "Drug CHEMBL ID"
            ])
            output_df = output_df.drop_duplicates()

            # Split "Activity TTD" into its type and value parts.
            activity_split = output_df["Activity TTD"].apply(split_activity)
            output_df["Activity type TTD"] = activity_split.apply(lambda x: x[0])
            output_df["Activity value TTD"] = activity_split.apply(lambda x: x[1])

            cols = [
                "Gene", "UniProt ID", "Protein name", "Protein TTD ID",
                "Drug name", "Drug TTD ID", "Max phase TTD",
                "Action type TTD", "Activity type TTD", "Activity value TTD",
                "Drug CHEMBL ID"
            ]
            output_df = output_df[cols]

            # The output keeps the input file's base name.
            output_filename = os.path.basename(input_file)
            output_path = os.path.join(os.getcwd(), output_filename)

            try:
                output_df.to_csv(output_path, index=False, encoding="utf-8")
                print(f"Файл сохранён: {output_path}")

                # Collect statistics for the summary table.
                yes_mask = output_df["Action type TTD"] == "Yes"
                summary_data.append({
                    "File Name": output_filename,
                    "Total Rows": len(output_df),
                    "Yes in Drug Mechanism": int(yes_mask.sum()),
                    "Unique Genes": output_df["Gene"].nunique(),
                    "Genes with Yes in Mechanism": output_df.loc[yes_mask, "Gene"].nunique()
                })
            except Exception as e:
                print(f"Ошибка при сохранении {output_path}: {e}")
                summary_data.append(_empty_summary(output_filename))

    except Exception as e:
        # Per-file boundary: report the problem and continue with the next file.
        print(f"Ошибка при обработке {input_file}: {e}")
        summary_data.append(_empty_summary(os.path.basename(input_file)))

# Write the per-file summary table into the current working directory.
if not summary_data:
    print("Нет данных для сводки.")
else:
    # Keep only the reported columns, then transpose so that every input
    # file becomes one column and every statistic one row.
    kept_columns = ["File Name", "Total Rows", "Unique Genes"]
    summary_df = pd.DataFrame(summary_data)[kept_columns].T

    # Label the columns "File 1", "File 2", ...
    summary_df.columns = [f"File {pos}" for pos in range(1, summary_df.shape[1] + 1)]

    summary_path = os.path.join(os.getcwd(), "summary.csv")
    try:
        # index=True keeps the statistic names as the first CSV column.
        summary_df.to_csv(summary_path, index=True)
        print(f"\nСводка сохранена: {summary_path}")
    except Exception as e:
        print(f"Ошибка при сохранении сводки: {e}")

Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\AML_depscore_-0.2_zscore_-0.5_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\AML_depscore_-0.2_zscore_-1.5_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\AML_depscore_-0.2_zscore_-1_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\AML_depscore_-0.15_zscore_-0.5_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\AML_depscore_-0.15_zscore_-1.5_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\AML_depscore_-0.15_zscore_-1_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\ALL_depscore_-0.2_zscore_-0.5_drugs.csv
Файл сохранён: c:\Users\rusla\OneDrive\Рабочий стол\Диплом\Гены\chembl_uniprot_ttd\ttd\ALL_depscore_-0.2_zscore_