In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse('full database.xml')
root = tree.getroot()

# Initialize lists to store the extracted data
drugbank_ids = []
names = []
descriptions = []
states = []
groups_list = []
categories_list = []
atc_codes_list = []
targets_list = []

# Iterate over each drug in the XML
for drug in root.findall('drug'):
    drugbank_id = drug.find('drugbank-id').text
    name = drug.find('name').text
    description = drug.find('description').text
    state = drug.find('state').text
    groups = [group.text for group in drug.find('groups').findall('group')]
    categories = [category.find('category').text for category in drug.find('categories').findall('category')]

    atc_codes = []
    for atc_code in drug.find('atc-codes').findall('atc-code'):
        code = atc_code.get('code')
        levels = [level.text for level in atc_code.findall('level')]
        atc_codes.append(f"{code}: {'; '.join(levels)}")
    
    targets = []
    for target in drug.find('targets').findall('target'):
        target_id = target.find('id').text
        target_name = target.find('name').text
        targets.append(f"{target_id}: {target_name}")
    
    # Append the extracted data to the lists
    drugbank_ids.append(drugbank_id)
    names.append(name)
    descriptions.append(description)
    states.append(state)
    groups_list.append('; '.join(groups))
    categories_list.append('; '.join(categories))
    atc_codes_list.append(' | '.join(atc_codes))
    targets_list.append(' | '.join(targets))



# Create a pandas DataFrame
data = {
    'DrugBank ID': drugbank_ids,
    'Name': names,
    'Description': descriptions,
    'State': states,
    'Groups': groups_list,
    'Categories': categories_list,
    'ATC Codes': atc_codes_list,
    'Targets': targets_list
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('C:/Users/agavr/Documents/Thesis_code/GNN-Predictor-for-Banned-Substances-in-Sports/drugbank_data.csv', index=False)


In [38]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse('full database.xml')
root = tree.getroot()

# Namespace dictionary to handle the default namespace
ns = {'db': 'http://www.drugbank.ca'}

# Initialize lists to store the extracted data
drugbank_ids = []
names = []
descriptions = []
states = []
groups_list = []
categories_list = []
atc_codes_list = []
targets_list = []

# Iterate over each drug in the XML
for drug in root.findall('db:drug', ns):
    drugbank_id = drug.find('db:drugbank-id[@primary="true"]', ns)
    name = drug.find('db:name', ns)
    description = drug.find('db:description', ns)
    state = drug.find('db:state', ns)

    groups = drug.find('db:groups', ns)
    categories = drug.find('db:categories', ns)
    atc_codes = drug.find('db:atc-codes', ns)
    targets = drug.find('db:targets', ns)

    # Extract text if elements are found, else use None or empty string
    drugbank_ids.append(drugbank_id.text if drugbank_id is not None else None)
    names.append(name.text if name is not None else None)
    descriptions.append(description.text if description is not None else None)
    states.append(state.text if state is not None else None)

    groups_list.append('; '.join([group.text for group in groups.findall('db:group', ns)]) if groups is not None else '')
    categories_list.append('; '.join([category.find('db:category', ns).text for category in categories.findall('db:category', ns)]) if categories is not None else '')

    atc_code_texts = []
    if atc_codes is not None:
        for atc_code in atc_codes.findall('db:atc-code', ns):
            code = atc_code.get('code')
            levels = [level.text for level in atc_code.findall('db:level', ns)]
            atc_code_texts.append(f"{code}: {'; '.join(levels)}")
    atc_codes_list.append(' | '.join(atc_code_texts))

    target_texts = []
    if targets is not None:
        for target in targets.findall('db:target', ns):
            target_id = target.find('db:id', ns)
            target_name = target.find('db:name', ns)
            target_texts.append(f"{target_id.text if target_id is not None else 'Unknown'}: {target_name.text if target_name is not None else 'Unknown'}")
    targets_list.append(' | '.join(target_texts))

# Create a pandas DataFrame
data = {
    'DrugBank ID': drugbank_ids,
    'Name': names,
    'Description': descriptions,
    'State': states,
    'Groups': groups_list,
    'Categories': categories_list,
    'ATC Codes': atc_codes_list,
    'Targets': targets_list
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('C:/Users/agavr/Documents/Thesis_code/GNN-Predictor-for-Banned-Substances-in-Sports/drugbank_data.csv', index=False)


In [24]:
smiles_list = []
molecular_formulas = []

# Iterate over each drug in the XML and extract the new information
for drug in root.findall('db:drug', ns):
    drugbank_id = drug.find('db:drugbank-id[@primary="true"]', ns).text if drug.find('db:drugbank-id[@primary="true"]', ns) is not None else None

    # Extract SMILES and Molecular Formula from calculated properties
    smiles = None
    molecular_formula = None
    calculated_properties = drug.find('db:calculated-properties', ns)
    if calculated_properties is not None:
        for property in calculated_properties.findall('db:property', ns):
            kind = property.find('db:kind', ns)
            value = property.find('db:value', ns)
            if kind is not None and value is not None:
                if kind.text == "SMILES":
                    smiles = value.text
                elif kind.text == "Molecular Formula":
                    molecular_formula = value.text

    # Extract Molecular Formula from experimental properties if not found in calculated properties
    if molecular_formula is None:
        experimental_properties = drug.find('db:experimental-properties', ns)
        if experimental_properties is not None:
            for property in experimental_properties.findall('db:property', ns):
                kind = property.find('db:kind', ns)
                value = property.find('db:value', ns)
                if kind is not None and value is not None and kind.text == "Molecular Formula":
                    molecular_formula = value.text

    smiles_list.append((drugbank_id, smiles))
    molecular_formulas.append((drugbank_id, molecular_formula))

# Convert the lists to DataFrames for merging
smiles_df = pd.DataFrame(smiles_list, columns=['DrugBank ID', 'SMILES'])
molecular_formulas_df = pd.DataFrame(molecular_formulas, columns=['DrugBank ID', 'Molecular Formula'])

# Merge the new data with the existing DataFrame
df = df.merge(smiles_df, on='DrugBank ID', how='left')
df = df.merge(molecular_formulas_df, on='DrugBank ID', how='left')

# Save the updated DataFrame to a CSV file
df.to_csv('C:/Users/agavr/Documents/Thesis_code/GNN-Predictor-for-Banned-Substances-in-Sports/drugbank_data_updated.csv', index=False)

In [26]:
# Assuming 'df' is your DataFrame
columns_to_drop = ['SMILES_x', 'Molecular Formula_x', 'SMILES_y', 'Molecular Formula_y']

# Drop the specified columns
df = df.drop(columns=columns_to_drop)

# Optionally, save the cleaned DataFrame to a CSV file
df.to_csv('C:/Users/agavr/Documents/Thesis_code/GNN-Predictor-for-Banned-Substances-in-Sports/drugbank_data_updated.csv', index=False)
import pandas as pd

In [2]:

import pandas as pd
df = pd.read_csv('drugbank_data_updated.csv', encoding='utf-8')

df.head(10)

Unnamed: 0,DrugBank ID,Name,Description,State,Groups,Categories,ATC Codes,Targets,SMILES,Molecular Formula
0,DB00001,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,solid,approved; withdrawn,"Amino Acids, Peptides, and Proteins; Anticoagu...",B01AE02: Direct thrombin inhibitors; ANTITHROM...,BE0000048: Prothrombin,,C287H440N80O111S6
1,DB00002,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,liquid,approved,"Amino Acids, Peptides, and Proteins; Antibodie...",L01FE01: EGFR (Epidermal Growth Factor Recepto...,BE0000767: Epidermal growth factor receptor | ...,,C6484H10042N1732O2023S36
2,DB00003,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,liquid,approved,"Amino Acids, Peptides, and Proteins; Cough and...","R05CB13: Mucolytics; EXPECTORANTS, EXCL. COMBI...",BE0004796: DNA,,C1321H1999N339O396S9
3,DB00004,Denileukin diftitox,A recombinant DNA-derived cytotoxic protein co...,liquid,approved; investigational,"ADP Ribose Transferases; Amino Acids, Peptides...",L01XX29: Other antineoplastic agents; OTHER AN...,BE0000658: Interleukin-2 receptor subunit alph...,,C2560H4042N678O799S17
4,DB00005,Etanercept,Dimeric fusion protein consisting of the extra...,liquid,approved; investigational,"Agents reducing cytokine levels; Amino Acids, ...",L04AB01: Tumor necrosis factor alpha (TNF-alph...,BE0000704: Tumor necrosis factor | BE0001087: ...,,C2224H3475N621O698S36
5,DB00006,Bivalirudin,Bivalirudin is a synthetic 20 residue peptide ...,solid,approved; investigational,"Amino Acids, Peptides, and Proteins; Anticoagu...",B01AE06: Direct thrombin inhibitors; ANTITHROM...,BE0000048: Prothrombin,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,C98H138N24O33
6,DB00007,Leuprolide,Leuprolide is a synthetic 9-residue peptide an...,solid,approved; investigational,Adrenal Cortex Hormones; Agents Causing Muscle...,L02AE51: Gonadotropin releasing hormone analog...,BE0000203: Gonadotropin-releasing hormone rece...,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,C59H84N16O12
7,DB00008,Peginterferon alfa-2a,Peginterferon alfa-2a is a form of recombinant...,liquid,approved; investigational,"Adjuvants, Immunologic; Alcohols; Alfa Interfe...",L03AB11: Interferons; IMMUNOSTIMULANTS; IMMUNO...,BE0000385: Interferon alpha/beta receptor 2 | ...,,
8,DB00009,Alteplase,Alteplase is a recombinant tissue plasminogen ...,liquid,approved; investigational,"Agents causing angioedema; Amino Acids, Peptid...",B01AD02: Enzymes; ANTITHROMBOTIC AGENTS; ANTIT...,BE0000211: Plasminogen | BE0000538: Fibrinogen...,,C2569H3928N746O781S40
9,DB00010,Sermorelin,Sermorelin acetate is the acetate salt of an a...,liquid,approved; withdrawn,"Amino Acids, Peptides, and Proteins; Anterior ...",V04CD03: Tests for pituitary function; OTHER D...,BE0000625: Growth hormone-releasing hormone re...,,C149H246N44O42S


In [3]:


# Count the number of drugs with SMILES as None
num_none_smiles = df['SMILES'].isnull().sum()

print(f"Number of drugs with SMILES as None: {num_none_smiles}")


Number of drugs with SMILES as None: 4656


In [15]:
import pandas as pd
df2 = pd.read_csv('smiles\\raw\smiles.csv', encoding='utf-8')

filtered_df_2 = df2[df2['SMILES'].notnull() & (df2['SMILES'] != '') & (df2['SMILES'] != 'None')]



In [9]:
# Read the name_list.csv file into a DataFrame
name_list_df = pd.read_csv('name_list.csv', encoding='utf-8')

# Filter the name_list_df to keep only rows where the Name is in filtered_df
filtered_name_list_df = name_list_df[name_list_df['Name'].isin(filtered_df['Name'])]

# Save the updated DataFrame back to name_list.csv
filtered_name_list_df.to_csv('name_list.csv', index=False, encoding='utf-8')

print("name_list.csv has been updated to include only drug names with valid SMILES.")

name_list.csv has been updated to include only drug names with valid SMILES.


In [18]:
len(filtered_name_list_df)


11925

In [19]:
import pandas as pd

def find_strings_in_dataset(string_list, dataset):
    # Initialize a 'Doping' column with default value 0.0
    
    
    for string in string_list:
        words = string.lower().split()
        found = dataset.apply(lambda row: any(word in ' '.join(map(str, row.values)).lower() for word in words), axis=1)
        if found.any():
            dataset.loc[found, 'Doping'] = 1.0
    
    return dataset

# Example list of strings to search for
aas_doped = [
    "Androstenediol", "1-Androstenedione", "Androsterone", "Epiandrosterone", "1-Testosterone",
    "4-Androstenediol", "4-Hydroxytestosterone", "5-Androstenedione", "7ɑ-Hydroxy-DHEA",
    "7ß-Hydroxy-DHEA", "7-Keto-DHEA", "11ß-Methyl-19-nortestosterone", "17ɑ-Methylepithiostanol (epistane)",
    "19-Norandrostenediol", "19-Norandrostenedione", "Androst-4-ene-3,11,17-trione (11-ketoandrostenedione, adrenosterone)",
    "Androstanolone (5ɑ-dihydrotestosterone, 17ß-hydroxy-5ɑ-androstan-3-one)", "Androstenediol", "Androstenedione",
    "Bolasterone", "Boldenone", "Boldione", "Calusterone", "Clostebol", "Danazol", "Dehydrochlormethyltestosterone",
    "Desoxymethyltestosterone", "Dimethandrolone", "Drostanolone", "Epiandrosterone", "Epi-dihydrotestosterone",
    "Epitestosterone", "Ethylestrenol", "Fluoxymesterone", "Formebolone", "Furazabol", "Gestrinone", "Mestanolone",
    "Mesterolone", "Metandienone", "Metenolone", "Methandriol", "Methasterone", "Methyl-1-testosterone",
    "Methylclostebol", "Methyldienolone", "Methylnortestosterone", "Methyltestosterone", "Metribolone", "Mibolerone",
    "Nandrolone", "Norboletone", "Norclostebol", "Norethandrolone", "Oxabolone", "Oxandrolone", "Oxymesterone",
    "Oxymetholone", "Prasterone", "Prostanozol", "Quinbolone", "Stanozolol", "Stenbolone", "Testosterone",
    "Tetrahydrogestrinone", "Tibolone", "Trenbolone", "Trestolone"
]

anabolic_other_doped = [
    "Clenbuterol", "Osilodrostat", "Ractopamine", "Andarine", "Enobosarm (ostarine)", "LGD-4033 (ligandrol)",
    "RAD140", "S-23", "YK-11", "Zeranol", "Zilpaterol"
]

peptide_growth_mimetic_doped = [
    "EPO-Fc", "polyethylene glycol", "CNTO-530", "peginesatide", "cobalt", "daprodustat (GSK1278863)", "IOX2",
    "molidustat (BAY 85-3934)", "roxadustat (FG-4592)", "vadadustat (AKB-6548)", "xenon", "K-11706", "luspatercept",
    "sotatercept", "asialo", "carbamylated EPO (CEPO)", "buserelin", "deslorelin", "goserelin", "histrelin",
    "leuprorelin", "nafarelin", "triptorelin", "gonadorelin", "kisspeptin", "corticorelin", "tetracosactide",
    "lonapegsomatropin", "somapacitan", "somatrogon", "AOD-9604", "hGH 176-191", "CJC-1293", "CJC-1295",
    "sermorelin", "tesamorelin", "anamorelin", "capromorelin", "ibutamoren (MK-677)", "ipamorelin",
    "lenomorelin (ghrelin)", "macimorelin", "tabimorelin", "alexamorelin", "examorelin (hexarelin)", "GHRP-1",
    "GHRP-2 (pralmorelin)", "GHRP-3", "GHRP-4", "GHRP-5", "GHRP-6"
]

beta_2_doped = [
    "Arformoterol", "Fenoterol", "Formoterol", "Higenamine", "Indacaterol", "Levosalbutamol", "Olodaterol",
    "Procaterol", "Reproterol", "Salbutamol", "Salmeterol", "Terbutaline", "Tretoquinol (trimetoquinol)", "Tulobuterol",
    "Vilanterol"
]

hormone_metabolic_modulators_doped = [
    "2-Androstenol (5ɑ-androst-2-en-17-ol)", "2-Androstenone (5ɑ-androst-2-en-17-one)", "3-Androstenol (5ɑ-androst-3-en-17-ol)",
    "3-Androstenone (5ɑ-androst-3-en-17-one)", "4-Androstene-3,6,17 trione (6-oxo)", "Aminoglutethimide", "Anastrozole",
    "androstatrienedione", "arimistane", "Exemestane", "Formestane", "Letrozole", "Testolactone", "Bazedoxifene",
    "Clomifene", "Cyclofenil", "Fulvestrant", "Ospemifene", "Raloxifene", "Tamoxifen", "Toremifene", "ACE-031",
    "bimagrumab", "follistatin", "myostatin", "propeptide", "apitegromab", "domagrozumab", "landogrozumab", "stamulumab",
    "AICAR", "GW1516", "GW501516", "SR9009", "SR9011", "Meldonium", "Trimetazidine"
]

diuretics_doped = [
    "Acetazolamide", "amiloride", "bumetanide", "canrenone", "chlortalidone", "etacrynic", "furosemide", "indapamide",
    "metolazone", "spironolactone", "thiazides", "bendroflumethiazide", "chlorothiazide", "hydrochlorothiazide",
    "torasemide", "triamterene", "vaptan", "conivaptan", "mozavaptan", "tolvaptan", "Albumin", "dextran",
    "hydroxyethylstarch", "mannitol", "Desmopressin", "Probenecid"
]

stimulants_doped = [
    "Adrafinil", "Amfepramone", "Amfetamine", "Amfetaminil", "Amiphenazole", "Benfluorex", "Benzylpiperazine",
    "Bromantan", "Clobenzorex", "Cocaine", "Cropropamide", "Crotetamide", "Fencamine", "Fenetylline", "Fenfluramine",
    "Fenproporex", "Fonturacetam [4-phenylpiracetam (carphedon)]", "Furfenorex", "Lisdexamfetamine", "Mefenorex",
    "Mephentermine", "Mesocarb", "Metamfetamine(d-)", "p-methylamfetamine", "Modafinil", "Norfenfluramine",
    "Phendimetrazine", "Phentermine", "Prenylamine", "Prolintane", "2-phenylpropan-1-amine (ß-methylphenylethylamine, BMPEA)",
    "3-Methylhexan-2-amine (1,2-dimethylpentylamine)", "4-Fluoromethylphenidate", "methylhexaneamine",
    "4-Methylpentan-2-amine (1,3-dimethylbutylamine)", "5-Methylhexan-2-amine (1,4-dimethylamylamine, 1,4-dimethylpentylamine, 1,4-DMAA)",
    "Benzfetamine", "Cathine", "Cathinone", "mephedrone", "methedrone", "ɑ-pyrrolidinovalerophenone",
    "Dimetamfetamine (dimethylamphetamine)", "Ephedrine", "Epinephrine (adrenaline)", "Etamivan", "Ethylphenidate",
    "Etilamfetamine", "Etilefrine", "Famprofazone", "Fenbutrazate", "Fencamfamin", "Heptaminol", "Hydrafinil (fluorenol)",
    "Hydroxyamfetamine (parahydroxyamphetamine)", "Isometheptene", "Levmetamfetamine", "Meclofenoxate",
    "Methylenedioxymethamphetamine", "Methylephedrine", "Methylnaphthidate [(±)-methyl-2-(naphthalen-2-yl)-2-(piperidin-2-yl)acetate]",
    "Methylphenidate", "Nikethamide", "Norfenefrine", "Octodrine (1,5-dimethylhexylamine)", "Octopamine",
    "Oxilofrine (methylsynephrine)", "Pemoline", "Pentetrazol", "Phenethylamine", "Phenmetrazine", "Phenpromethamine",
    "Propylhexedrine", "Pseudoephedrine", "Selegiline", "Sibutramine", "Solriamfetol", "Strychnine",
    "Tenamfetamine (methylenedioxyamphetamine)", "Tuaminoheptane"
]

narcotics_doped = [
    "Buprenorphine", "Dextromoramide", "Diamorphine (heroin)", "Fentanyl", "Hydromorphone", "Methadone", "Morphine",
    "Nicomorphine", "Oxycodone", "Oxymorphone", "Pentazocine", "Pethidine", "Tramadol"
]

glucocorticoids_doped = [
    "Beclometasone", "Betamethasone", "Budesonide", "Ciclesonide", "Cortisone", "Deflazacort", "Dexamethasone",
    "Flunisolide", "Fluocortolone", "Fluticasone", "Hydrocortisone", "Methylprednisolone", "Mometasone",
    "Prednisolone", "Prednisone", "Triamcinolone acetonide"
]

beta_blockers_doped = [
    "Acebutolol", "Alprenolol", "Atenolol", "Betaxolol", "Bisoprolol", "Bunolol", "Carteolol", "Carvedilol",
    "Celiprolol", "Esmolol", "Labetalol", "Metipranolol", "Metoprolol", "Nadolol", "Nebivolol", "Oxprenolol",
    "Pindolol", "Propranolol", "Sotalol", "Timolol"
]

all_strings = (
    aas_doped + anabolic_other_doped + peptide_growth_mimetic_doped + beta_2_doped + hormone_metabolic_modulators_doped +
    diuretics_doped + stimulants_doped + narcotics_doped + glucocorticoids_doped + beta_blockers_doped
)

# Load your dataset
filtered_name_list_df = pd.read_csv('name_list.csv')

# Apply the function
filtered_name_list_df = find_strings_in_dataset(all_strings, filtered_name_list_df)

# Save the result
filtered_name_list_df.to_csv('doping_list.csv', index=False)

print(filtered_name_list_df.shape)


(11925, 3)


In [22]:
count_doing_1 = filtered_name_list_df[filtered_name_list_df['Doping'] == 1.0].shape[0]
print(f'Number of entities marked 1.0 under Doing column: {count_doing_1}')


Number of entities marked 1.0 under Doing column: 390


In [38]:
import pandas as pd
import numpy as np
def find_float_categories(df):
    # Select rows where the Categories column is of type float
    # float_categories = df[df['Categories'].apply(lambda x: isinstance(x, float) and  np.isnan(x))]
    # return float_categories
    rows_with_smiles_nan_categories = df[df['SMILES'].notna() & df['Categories'].isna()]
    return rows_with_smiles_nan_categories

only_smiles = pd.read_csv('only_smiles.csv')



# Find rows with float values in Categories column
rows_with_smiles_nan_categories = find_float_categories(only_smiles)
print(len(rows_with_smiles_nan_categories))


4688


In [63]:
import pandas as pd

def find_matching_drugs(df, categories, atccodes):
    matching_drugs = []

    for index, row in df.iterrows():
        if pd.isna(row['Categories']):
            continue  # Skip rows with NaN in Categories
        if pd.isna(row['ATC Codes' ]):
            continue
        
        for category in categories:
            if category.lower() in row['Categories'].lower() and len(set(category.lower().split()) & set(row['Categories'].lower().split())):
                matching_drugs.append(row['Name'])
                break  # If a match is found for a drug, no need to check other categories
    
        for atc in atccodes:
            if atc.lower() in row['ATC Codes'].lower() and len(set(atc.lower().split()) & set(row['ATC Codes'].lower().split())):
                if row['Name'] not in matching_drugs:
                    matching_drugs.append(row['Name'])
                break  # If a match is found for a drug, no need to check other categories
    return matching_drugs

doping_categories = [
    "AAS"
    "Non-Approved Substances",
    "Anabolic Agents",
    "Exogenous AAS",
    "Endogenous AAS when administered exogenously",
    "Other Anabolic Agents",
    "Peptide Hormones, Growth Factors, Related Substances, and Mimetics",
    "Growth Factors",
    "Growth"
    "Mimetics"
    "Erythropoietins (EPO) and agents affecting erythropoiesis",
    "EPO"
    "Chorionic Gonadotrophin (CG) and Luteinizing Hormone (LH) in males",
    "CG"
    "Corticotrophins",
    "Growth Hormone (GH) and its releasing factors",
    "GH"
    "Growth factors and growth factor modulators",
    "Beta-2 Agonists",
    "Beta-2"
    "Hormone and Metabolic Modulators",
    "Aromatase inhibitors",
    "Selective Estrogen Receptor Modulators (SERMs)",
    "SERMs"
    "Other anti-estrogenic substances",
    "Agents modifying myostatin function(s)",
    "Metabolic modulators",
    "Metabolic"
    "Diuretics and Masking Agents",
    "Diuretics",
    "Masking agents",
    "Stimulants",
    "Narcotics",
    "Cannabinoids",
    "Glucocorticoids",
    "Beta Blockers",
    "Manipulation of Blood and Blood Components",
    "Blood doping",
    "Chemical and physical manipulation",
    "Chemical and Physical Manipulation",
    "Gene and Cell Doping",
    "Gene doping",
    "Use of cells, genes, genetic elements, or modulation of gene expression"
]
only_smiles = pd.read_csv('only_smiles.csv')
# Find matching drugs
matching_drugs = find_matching_drugs(only_smiles, doping_categories, doping_categories)
print("Matching Drugs:")
print(len(matching_drugs))
for drug in matching_drugs:
    print(drug)


Matching Drugs:
86
Reserpine
Midodrine
Milrinone
Chlorthalidone
Aminoglutethimide
Norepinephrine
Phenylephrine
Methylphenidate
Bendroflumethiazide
Prazosin
Dronabinol
Nabilone
Metolazone
Fenfluramine
Clonidine
Metaraminol
Oxandrolone
Epinephrine
Furosemide
Methoxamine
Tretinoin
Fenoldopam
Dobutamine
Bumetanide
Levosimendan
Dopamine
Exemestane
Hydrochlorothiazide
Letrozole
Trichlormethiazide
Isoprenaline
Deserpidine
Arbutamine
Sibutramine
Guanethidine
Rescinnamine
Fluoxymesterone
Anastrozole
Hydralazine
Ephedrine
Mephentermine
Amrinone
Ethylestrenol
Pargyline
Prasterone
Stanolone
Enoximone
Rimonabant
Droxidopa
Oxymetholone
Methyltestosterone
Stanozolol
Pinacidil
Formestane
Etilefrine
Cannabidiol
Tibolone
Synephrine
Moxonidine
Dopexamine
Norethandrolone
Cafedrine
Theodrenaline
Dihydralazine
Nandrolone
Oxabolone cipionate
Bucladesine
Octopamine
Ibopamine
Amezinium metilsulfate
Norfenefrine
Mebutizide
Quinbolone
Dimetofrine
Cyclopenthiazide
Bietaserpine
Metandienone
Mesterolone
Methoserpid

In [60]:
import pandas as pd

# Load the DataFrame from a CSV file
df = pd.read_csv('doping_list.csv')

# Define a function to merge the doping columns
def merge_doping_columns(row):
    if (pd.notna(row['Doping1']) and row['Doping1'] == 1.0) or (pd.notna(row['Doping2']) and row['Doping2'] == 1.0):
        return 1.0
    return 0.0

# Apply the function to create the merged Doping column
df['Doping'] = df.apply(merge_doping_columns, axis=1)


# Drop the original Doping1 and Doping2 columns
df = df.drop(columns=['Doping1', 'Doping2'])
df['Doping'] = df['Doping'].astype(int)


count_0 = df[df['Doping'] == 0].shape[0]
count_1 = df[df['Doping'] == 1].shape[0]


# Display the resulting DataFrame
df.head(25)
df.to_csv('labeled_data.csv', index=False)
print(count_0, count_1)


11505 420


In [66]:
import pandas as pd

# List of words to search for
search_terms = [
    "Adrenergic beta-2 Receptor Agonists", "Peptide Hormones", "Hematopoietic Cell Growth Factors", 
    "Erythropoiesis-Stimulating Agents", "Chorionic Gonadotropin", "Adrenocorticotropic Hormone", 
    "Selective Beta 2-adrenergic Agonists", "Adrenergic beta-Agonists", "Hormones", "Hormone Antagonists", 
    "Hormone Substitutes", "Selective Estrogen Receptor Modulators", "Estrogen Agonist/Antagonist", 
    "Estrogen Receptor Modulators", "Diuretics", "Stimulants", "Narcotics", "Cannabinoids", 
    "Glucocorticoids", "Beta-Blockers (Beta1 Selective)"
]

def find_matching_drugs(df, search_terms):
    matching_drugs = []

    for index, row in df.iterrows():
        if any(term.lower() in str(row['Categories']).lower() for term in search_terms) or \
           any(term.lower() in str(row['ATC Codes']).lower() for term in search_terms):
            matching_drugs.append(row['Name'])
    
    return matching_drugs



df_check = pd.read_csv('only_smiles.csv')
# Find matching drugs
matching_drugs = find_matching_drugs(df_check, search_terms)
print("Matching Drugs:")
print(len(matching_drugs))
for drug in matching_drugs:
    print(drug)


Matching Drugs:
940
Leuprolide
Goserelin
Desmopressin
Cetrorelix
Vasopressin
Abarelix
Ascorbic acid
Calcitriol
Calcifediol
Ergocalciferol
Cholecalciferol
Valsartan
Ramipril
Flunisolide
Amphetamine
Pentagastrin
Nicotine
Esmolol
Tramadol
Betaxolol
Fluconazole
Caffeine
Sildenafil
Reserpine
Trospium
Midodrine
Torasemide
Isoetharine
Diflorasone
Methyclothiazide
Milrinone
Oxiconazole
Alclometasone
Cabergoline
Terconazole
Medrysone
Diethylstilbestrol
Clotrimazole
Sulfanilamide
Sulfisoxazole
Metoprolol
Chlorotrianisene
Isradipine
Olmesartan
Liothyronine
Amcinonide
Atomoxetine
Etonogestrel
Morphine
Desogestrel
Chlorthalidone
Ethoxzolamide
Codeine
Fluorometholone
Hydromorphone
Methadone
Atenolol
Diltiazem
Alfuzosin
Megestrol acetate
Methylergometrine
Aminoglutethimide
Sulfadiazine
Levonorgestrel
Norepinephrine
Timolol
Dydrogesterone
Amlodipine
Triamterene
Phenylephrine
Carbimazole
Nimodipine
Beclomethasone dipropionate
Progesterone
Phenylpropanolamine
Nisoldipine
Gentian violet cation
Acetohexam

In [73]:
def check_doping_status(df, matching_drugs):
    doping_status = {}

def check_doping_status(df, matching_drugs):
    for drug in matching_drugs:
        if drug in df['Name'].values:
            doping_status = df[df['Name'] == drug]['Doping'].values[0]
            print(f"{drug}: {doping_status}")

df = pd.read_csv('labeled_data.csv')

doping_status = check_doping_status(df, matching_drugs)

count_0 = df[df['Doping'] == 0].shape[0]
count_1 = df[df['Doping'] == 1].shape[0]



Leuprolide: 1
Goserelin: 1
Desmopressin: 1
Cetrorelix: 0
Vasopressin: 1
Abarelix: 0
Ascorbic acid: 0
Calcitriol: 0
Calcifediol: 0
Ergocalciferol: 0
Cholecalciferol: 0
Valsartan: 0
Ramipril: 0
Flunisolide: 1
Amphetamine: 0
Pentagastrin: 0
Nicotine: 0
Esmolol: 1
Tramadol: 1
Betaxolol: 1
Fluconazole: 0
Caffeine: 0
Sildenafil: 0
Reserpine: 0
Trospium: 0
Midodrine: 1
Torasemide: 1
Isoetharine: 1
Diflorasone: 0
Methyclothiazide: 1
Milrinone: 0
Oxiconazole: 0
Alclometasone: 0
Cabergoline: 0
Terconazole: 0
Medrysone: 1
Diethylstilbestrol: 0
Clotrimazole: 0
Sulfanilamide: 0
Sulfisoxazole: 0
Metoprolol: 1
Chlorotrianisene: 0
Isradipine: 0
Olmesartan: 0
Liothyronine: 0
Amcinonide: 0
Atomoxetine: 0
Etonogestrel: 0
Morphine: 1
Desogestrel: 0
Chlorthalidone: 1
Ethoxzolamide: 0
Codeine: 0
Fluorometholone: 0
Hydromorphone: 1
Methadone: 1
Atenolol: 1
Diltiazem: 0
Alfuzosin: 0
Megestrol acetate: 0
Methylergometrine: 0
Aminoglutethimide: 1
Sulfadiazine: 0
Levonorgestrel: 0
Norepinephrine: 1
Timolol: 1
Dy

In [4]:
import pandas as pd
def count_doping_status(df):
    count_0 = 0
    count_1 = 0
    
    #for drug in matching_drugs:
    for drug in df['Name'].values:
        doping_status = df[df['Name'] == drug]['Doping'].values[0]
        if doping_status == 0:
            count_0 += 1
        elif doping_status == 1:
            count_1 += 1
    
    return count_0, count_1
df = pd.read_csv('labeled_data.csv')


# Count doping status
count_0, count_1 = count_doping_status(df)

print(f'Number of drugs with Doping value 0: {count_0}')
print(f'Number of drugs with Doping value 1: {count_1}')

Number of drugs with Doping value 0: 11299
Number of drugs with Doping value 1: 626


In [77]:
def print_drugs_with_doping_value_0(df, matching_drugs):
    drugs_with_doping_0 = []

    for drug in matching_drugs:
        if drug in df['Name'].values:
            doping_status = df[df['Name'] == drug]['Doping'].values[0]
            if doping_status == 0:
                drugs_with_doping_0.append(drug)
    
    for drug in drugs_with_doping_0:
        print(drug)

print_drugs_with_doping_value_0(df, matching_drugs)

Cetrorelix
Abarelix
Ascorbic acid
Calcitriol
Calcifediol
Ergocalciferol
Cholecalciferol
Valsartan
Ramipril
Amphetamine
Pentagastrin
Nicotine
Fluconazole
Caffeine
Sildenafil
Reserpine
Trospium
Diflorasone
Milrinone
Oxiconazole
Alclometasone
Cabergoline
Terconazole
Diethylstilbestrol
Clotrimazole
Sulfanilamide
Sulfisoxazole
Chlorotrianisene
Isradipine
Olmesartan
Liothyronine
Amcinonide
Atomoxetine
Etonogestrel
Desogestrel
Ethoxzolamide
Codeine
Fluorometholone
Diltiazem
Alfuzosin
Megestrol acetate
Methylergometrine
Sulfadiazine
Levonorgestrel
Dydrogesterone
Amlodipine
Phenylephrine
Carbimazole
Nimodipine
Beclomethasone dipropionate
Progesterone
Phenylpropanolamine
Nisoldipine
Gentian violet cation
Acetohexamide
Carboprost tromethamine
Chloramphenicol
Levothyroxine
Meperidine
Prazosin
Picrotoxin
Montelukast
Celecoxib
Fosinopril
Darifenacin
Flutamide
Dextrothyroxine
Lercanidipine
Benazepril
Desoximetasone
Zafirlukast
Propylthiouracil
Acetohydroxamic acid
Lamotrigine
Bosentan
Doxapram
Cinnar

In [7]:
import csv

# Read the misaligned CSV file with UTF-8 encoding
with open('drugbank_data.csv', 'r', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    lines = list(reader)

# Prepare a list to store the corrected rows
corrected_rows = []

# Initialize a temporary storage for a drug entry
current_entry = []

# Loop through each line to correct misalignment
for line in lines:
    # Skip empty lines
    if not line:
        continue
    # If the line starts with 'DB', it indicates the beginning of a new drug entry
    if line[0].startswith('DB'):
        if current_entry:
            # Append the completed entry to the corrected rows
            corrected_rows.append(current_entry)
        # Start a new drug entry
        current_entry = line
    else:
        # Ensure there is something to append to
        if current_entry:
            # Append additional lines to the current drug entry
            current_entry[-1] += ' ' + ' '.join(line[1:])

# Don't forget to add the last entry
if current_entry:
    corrected_rows.append(current_entry)

# Write the corrected rows to a new CSV file with UTF-8 encoding
with open('corrected_drug_list.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(corrected_rows)

print("CSV file has been corrected and saved as 'corrected_drug_list.csv'.")


CSV file has been corrected and saved as 'corrected_drug_list.csv'.


In [13]:
drug_names = df['Name'].tolist()

# Print the list of drug names
print(drug_names)


['Lepirudin', 'Cetuximab', 'Dornase alfa', 'Denileukin diftitox', 'Etanercept', 'Bivalirudin', 'Leuprolide', 'Peginterferon alfa-2a', 'Alteplase', 'Sermorelin', 'Interferon alfa-n1', 'Darbepoetin alfa', 'Urokinase', 'Goserelin', 'Reteplase', 'Erythropoietin', 'Salmon calcitonin', 'Interferon alfa-n3', 'Pegfilgrastim', 'Sargramostim', 'Peginterferon alfa-2b', 'Asparaginase Escherichia coli', 'Thyrotropin alfa', 'Antihemophilic factor, human recombinant', 'Anakinra', 'Gramicidin D', 'Human immunoglobulin G', 'Anistreplase', 'Insulin human', 'Tenecteplase', 'Menotropins', 'Interferon gamma-1b', 'Interferon alfa-2a', 'Desmopressin', 'Coagulation factor VIIa Recombinant Human', 'Oprelvekin', 'Palifermin', 'Glucagon', 'Aldesleukin', 'Botulinum toxin type B', 'Omalizumab', 'Lutropin alfa', 'Lyme disease vaccine (recombinant OspA)', 'Insulin lispro', 'Insulin glargine', 'Collagenase clostridium histolyticum', 'Rasburicase', 'Cetrorelix', 'Adalimumab', 'Somatotropin', 'Imiglucerase', 'Abciximab

In [7]:
# Extract the first 1000 drug names
import csv
first_1000_names = filtered_df['Name'].tolist()

# Print the first 1000 drug names
for i, name in enumerate(first_1000_names, start=1):
    print(f"{i}. {name}")

with open('names.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    for name in first_1000_names:
        writer.writerow([name])

1. Bivalirudin
2. Leuprolide
3. Goserelin
4. Gramicidin D
5. Desmopressin
6. Cetrorelix
7. Vasopressin
8. Daptomycin
9. Cyclosporine
10. Abarelix
11. Pyridoxal phosphate
12. Cyanocobalamin
13. Tetrahydrofolic acid
14. Histidine
15. Ademetionine
16. Pyruvic acid
17. Phenylalanine
18. Biotin
19. Choline
20. Lysine
21. Arginine
22. Ascorbic acid
23. Spermine
24. Aspartic acid
25. Ornithine
26. L-Glutamine
27. Adenosine phosphate
28. alpha-Linolenic acid
29. Serine
30. Methionine
31. Tyrosine
32. Calcitriol
33. Lutein
34. Cystine
35. Succinic acid
36. Riboflavin
37. N-Acetylglucosamine
38. Glutamic acid
39. Glutathione
40. Phosphatidyl serine
41. Glycine
42. Calcifediol
43. Pyridoxal
44. Creatine
45. Leucine
46. Tryptophan
47. Cysteine
48. Thiamine
49. Ergocalciferol
50. Dihomo-gamma-linolenic acid
51. Citrulline
52. Threonine
53. NADH
54. Folic acid
55. Icosapent
56. Alanine
57. Valine
58. Vitamin A
59. Vitamin E
60. Pyridoxine
61. Lipoic acid
62. Isoleucine
63. Aspartame
64. Cholecalcife