In [90]:
import re
from chembl_webresource_client.new_client import new_client
from typing import List, Dict, Set, Optional, Tuple

def normalize(text: str) -> str:
    """Aggressively normalize text for loose name matching.

    Upper-cases ``text`` and removes hyphens, en/em dashes, quotes, common
    punctuation, brackets, slashes, plus signs and all whitespace.

    Args:
        text: The drug name or synonym to normalize.

    Returns:
        The upper-cased text with every listed character removed.
    """
    # The en dash (U+2013) and em dash (U+2014) are written as escapes so the
    # pattern cannot be corrupted when the file is re-encoded.
    return re.sub(r"[-\u2013\u2014'\",.(){}\[\]:;!?\\/+\s]", "", text.upper())

def basic_clean(text: str) -> str:
    """Lightly normalize text: upper-case it and drop punctuation.

    Unlike ``normalize``, whitespace and plus signs are kept, so word
    boundaries survive.

    Args:
        text: The drug name or synonym to clean.

    Returns:
        The upper-cased text without hyphens, en/em dashes, quotes, common
        punctuation, brackets or slashes.
    """
    # The en dash (U+2013) and em dash (U+2014) are written as escapes so the
    # pattern cannot be corrupted when the file is re-encoded.
    return re.sub(r"[-\u2013\u2014'\",.(){}\[\]:;!?\\/]", "", text.upper())

def get_chembl_family_mapping(chembl_ids: List[str]) -> Dict[str, str]:
    """Map each related ChEMBL ID to the listed ID it is grouped under.

    Every ID in ``chembl_ids`` maps to itself. Every additional molecule
    returned by the ``molecule_form`` resource when filtering on
    ``parent_chembl_id`` maps to the listed ID it was queried for.

    Args:
        chembl_ids: The ChEMBL IDs that act as group representatives.

    Returns:
        A dict ``{any_related_chembl_id: listed_chembl_id}``.

    Lookup failures are reported on stdout and skipped (best effort), so the
    mapping may be incomplete but always contains the listed IDs.
    """
    molecule_form = new_client.molecule_form

    # Register the listed IDs first: an ID that is both listed and a member of
    # another listed ID's family must map to itself regardless of list order.
    mapping = {chembl_id: chembl_id for chembl_id in chembl_ids}

    for chembl_id in chembl_ids:
        try:
            # Fetch all members of this ID's family.
            family_members = list(
                molecule_form.filter(parent_chembl_id=chembl_id).only('molecule_chembl_id')
            )
            for member in family_members:
                if 'molecule_chembl_id' in member:
                    # setdefault keeps the self-mapping of listed IDs intact.
                    mapping.setdefault(member['molecule_chembl_id'], chembl_id)
        except Exception as e:
            print(f"Ошибка при получении семейства для {chembl_id}: {e}")

    return mapping

def get_synonyms_dict(chembl_ids: List[str], family_mapping: Dict[str, str]) -> Dict[str, Set[str]]:
    """Collect normalized synonyms per listed ChEMBL ID.

    Every molecule named in ``family_mapping`` is fetched and its synonyms,
    passed through ``normalize``, are added to the set of the listed ID the
    molecule maps to.

    Args:
        chembl_ids: The listed IDs; each one becomes a key of the result.
        family_mapping: ``{related_chembl_id: listed_chembl_id}``.

    Returns:
        A dict ``{listed_chembl_id: set_of_normalized_synonyms}``. IDs whose
        lookups fail keep whatever was collected so far (possibly an empty
        set); failures are reported on stdout.
    """
    molecule = new_client.molecule
    syn_dict = {chembl_id: set() for chembl_id in chembl_ids}

    for chembl_id, main_id in family_mapping.items():
        try:
            mol = molecule.get(chembl_id)
            # The field may be present but null; treat that as "no synonyms".
            synonyms = mol.get("molecule_synonyms") or []
            # Skip malformed entries individually so that one bad synonym does
            # not discard all the others of the same molecule.
            cleaned_synonyms = {
                normalize(entry["molecule_synonym"])
                for entry in synonyms
                if isinstance(entry, dict)
                and isinstance(entry.get("molecule_synonym"), str)
            }
            # Attach the synonyms to the listed ID this molecule belongs to.
            syn_dict[main_id].update(cleaned_synonyms)
        except Exception as e:
            print(f"Ошибка при получении синонимов для {chembl_id}: {e}")

    return syn_dict

def _synonym_names(mol: Dict) -> List[str]:
    """Return the synonym strings stored in one ChEMBL molecule record."""
    synonyms = mol.get("molecule_synonyms") or []
    return [
        entry["molecule_synonym"]
        for entry in synonyms
        if isinstance(entry, dict) and isinstance(entry.get("molecule_synonym"), str)
    ]


def _match_record(main_id: str, main_mol: Dict, stage: str,
                  matched_synonym: str, original_chembl_id: Optional[str]) -> Dict:
    """Build the result entry describing one matched drug name."""
    return {
        "chembl_id": main_id,
        "preferred_name": main_mol.get("pref_name", "N/A"),
        "stage": stage,
        "matched_synonym": matched_synonym,
        "original_chembl_id": original_chembl_id,
    }


def find_matches(chembl_ids: List[str], drug_names: List[str]) -> Tuple[Dict[str, Dict], List[str]]:
    """Match drug names against the synonyms of the given ChEMBL families.

    Names are tried in four increasingly tolerant stages; a name is resolved
    by the first stage that matches it:

    1. ``"first"``  - case-insensitive comparison (``str.lower``).
    2. ``"second"`` - comparison after ``str.upper``.
    3. ``"third"``  - comparison after ``basic_clean``.
    4. ``"fourth"`` - membership of ``normalize(name)`` in the per-family
       synonym sets built by ``get_synonyms_dict``.

    Args:
        chembl_ids: The listed ChEMBL IDs; every match is reported against
            one of them.
        drug_names: The names to resolve. Duplicates are collapsed.

    Returns:
        ``(results, not_found)``. ``results`` maps each matched name to a
        dict with the keys ``chembl_id``, ``preferred_name``, ``stage``,
        ``matched_synonym`` and ``original_chembl_id`` (the family member
        whose synonym matched; ``None`` for stage four). ``not_found`` lists
        the unmatched names in their input order.
    """
    molecule = new_client.molecule

    # Family mapping and the normalized synonym sets (already keyed by listed ID).
    family_mapping = get_chembl_family_mapping(chembl_ids)
    syn_dict = get_synonyms_dict(chembl_ids, family_mapping)

    # Fetch every related molecule exactly once instead of once per
    # drug / stage / candidate combination.
    records: Dict[str, Dict] = {}
    synonyms_by_id: Dict[str, List[str]] = {}
    for chembl_id in family_mapping:
        try:
            mol = molecule.get(chembl_id)
            names = _synonym_names(mol)
        except Exception as e:
            # Best effort: report the failure and leave this molecule out.
            print(f"Ошибка при получении молекулы {chembl_id}: {e}")
            continue
        records[chembl_id] = mol
        synonyms_by_id[chembl_id] = names

    results: Dict[str, Dict] = {}
    # dict.fromkeys de-duplicates while keeping the caller's order.
    pending = list(dict.fromkeys(drug_names))

    # Stages 1-3 differ only in the key function applied to both sides.
    # NOTE: stage two is kept for compatibility; it can only add matches for
    # the rare strings whose lower() forms differ while upper() forms agree.
    stage_keys = (
        ("first", str.lower),
        ("second", str.upper),
        ("third", basic_clean),
    )
    for stage, key in stage_keys:
        if not pending:
            break

        # key(synonym) -> (member ID, synonym as stored); first occurrence wins.
        index: Dict[str, Tuple[str, str]] = {}
        for chembl_id, names in synonyms_by_id.items():
            # A match is only reportable when the listed molecule was fetched.
            if family_mapping[chembl_id] not in records:
                continue
            for name in names:
                index.setdefault(key(name), (chembl_id, name))

        unmatched = []
        for drug in pending:
            hit = index.get(key(drug))
            if hit is None:
                unmatched.append(drug)
                continue
            chembl_id, name = hit
            main_id = family_mapping[chembl_id]
            results[drug] = _match_record(main_id, records[main_id], stage, name, chembl_id)
        pending = unmatched

    # Stage 4: aggressive normalization, searched among the listed IDs only.
    not_found = []
    for drug in pending:
        norm_drug = normalize(drug)
        main_id = next(
            (
                candidate
                for candidate in chembl_ids
                if candidate in records and norm_drug in syn_dict.get(candidate, ())
            ),
            None,
        )
        if main_id is None:
            not_found.append(drug)
            continue
        # The member that contributed the synonym is unknown at this stage.
        results[drug] = _match_record(main_id, records[main_id], "fourth", norm_drug, None)

    return results, not_found

# Example usage (unchanged)
if __name__ == "__main__":
    # Demo run: resolve a few drug names against a handful of ChEMBL families
    # and print the matches grouped by the stage that found them.
    chembl_ids = [
        "CHEMBL25", "CHEMBL112", "CHEMBL941", "CHEMBL600",
        "CHEMBL521", "CHEMBL405"
    ]

    drug_names = [
        "aspirin", "acetyl-salicylic acid", "Acetyl salicylic acid", "AMPHETAMINE ASPARTATE",
        "AMPHETAMINE"
    ]

    found, not_found = find_matches(chembl_ids, drug_names)

    # Section headers, keyed by the "stage" value stored in each result.
    stages = {
        "first": "✅ Найдено на первой стадии (точное совпадение):",
        "second": "🔄 Найдено на второй стадии (верхний регистр):",
        "third": "🧹 Найдено на третьей стадии (мягкая нормализация):",
        "fourth": "🧨 Найдено на четвёртой стадии (радикальная нормализация):"
    }

    for stage, header in stages.items():
        print(f"\n{header}")
        for drug, info in found.items():
            if info["stage"] != stage:
                continue
            original_info = f" (original: {info['original_chembl_id']})" if info['original_chembl_id'] else ""
            # find_matches passes the record's pref_name through unchanged, so
            # it is None when the record holds a null; guard before ljust().
            preferred_name = info['preferred_name'] or "N/A"
            print(f"{drug.ljust(25)} -> {preferred_name.ljust(20)} (ChEMBL: {info['chembl_id']}{original_info}, matched: {info.get('matched_synonym', 'N/A')})")

    print("\n❌ Не найдено вовсе:")
    for drug in not_found:
        print(drug)


✅ Найдено на первой стадии (точное совпадение):
AMPHETAMINE ASPARTATE     -> AMPHETAMINE          (ChEMBL: CHEMBL405 (original: CHEMBL1200377), matched: Amphetamine aspartate)
aspirin                   -> ASPIRIN              (ChEMBL: CHEMBL25 (original: CHEMBL25), matched: Aspirin)
AMPHETAMINE               -> AMPHETAMINE          (ChEMBL: CHEMBL405 (original: CHEMBL405), matched: Amphetamine)

🔄 Найдено на второй стадии (верхний регистр):

🧹 Найдено на третьей стадии (мягкая нормализация):
acetyl-salicylic acid     -> ASPIRIN              (ChEMBL: CHEMBL25 (original: CHEMBL25), matched: Acetylsalicylic Acid)

🧨 Найдено на четвёртой стадии (радикальная нормализация):
Acetyl salicylic acid     -> ASPIRIN              (ChEMBL: CHEMBL25, matched: ACETYLSALICYLICACID)

❌ Не найдено вовсе:
