In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import csv

# Extract all elements from xml to csv

In [2]:
# Load the XML file
tree = ET.parse('first_1000_entries.xml')
root = tree.getroot()

In [5]:
import xml.etree.ElementTree as ET
import csv

# Function to recursively extract data from XML
def extract_element_data(elem, data_dict=None, parent_key=""):
    if data_dict is None:
        data_dict = {}

    for child in elem:
        # Construct the key for the CSV
        key = f"{parent_key}/{child.tag}" if parent_key else child.tag

        # If the element has children, recursively extract data
        if len(child):
            extract_element_data(child, data_dict, key)
        else:
            # Leaf node: store the text data
            data_dict[key] = child.text

    return data_dict

# Function to process each item and write to CSV
def xml_to_csv_with_nested(xml_file, csv_file):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Open CSV for writing
    with open(csv_file, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Process each XML element and its nested elements
        for item in root:
            data_dict = extract_element_data(item)

            # Write header only once (for the first item)
            if csvfile.tell() == 0:
                csvwriter.writerow(data_dict.keys())

            # Write the row data
            csvwriter.writerow(data_dict.values())

# Example usage
xml_file = 'first_100_entries.xml'
csv_file = 'output.csv'

# Convert XML to CSV with nested structures
xml_to_csv_with_nested(xml_file, csv_file)


# Drug-interaction

In [2]:
import xml.etree.ElementTree as ET
import csv

def extract_drug_interactions(drug, data_list=None):
    if data_list is None:
        data_list = []

    # Extract drug information from the parent 'drug' element
    drug_id = drug.findtext("{http://www.drugbank.ca}drugbank-id")
    drug_name = drug.findtext("{http://www.drugbank.ca}name")

    # Find all 'drug-interaction' elements
    for drug_interaction in drug.findall(".//{http://www.drugbank.ca}drug-interaction"): 
        interaction_data = {
            "drug-id": drug_id,
            "drug-name": drug_name
        }

        # Iterate through all sub-elements in the 'drug-interaction' block
        for child in drug_interaction:
            if child.tag == "{http://www.drugbank.ca}drugbank-id":
                interaction_data["drug-interact-id"] = child.text
            elif child.tag == "{http://www.drugbank.ca}name":
                interaction_data["drug-interact-name"] = child.text
            else:
                interaction_data[child.tag] = child.text

        # Add the extracted data to the list
        data_list.append(interaction_data)

    return data_list

# Function to extract drug-interaction info and write it to CSV
def xml_to_csv_drug_interaction(xml_file, csv_file):
    try:
        # Parse the XML file
        print(f"Parsing XML file: {xml_file}")
        tree = ET.parse(xml_file)
        root = tree.getroot()
        print("XML file parsed successfully.")
    except FileNotFoundError:
        print("Error: The XML file was not found.")
        return
    except ET.ParseError:
        print("Error: The XML file could not be parsed.")
        return

    try:
        # Extract the 'drug-interaction' data
        print("Extracting 'drug-interaction' data...")
        drug_interaction_data = []
        for drug in root.findall(".//{http://www.drugbank.ca}drug"):
            drug_interaction_data.extend(extract_drug_interactions(drug))
        print(f"Extracted {len(drug_interaction_data)} 'drug-interaction' entries.")
    except Exception as e:
        print(f"Error during data extraction: {e}")
        return

    try:
        # Open CSV for writing
        print(f"Writing data to CSV file: {csv_file}")
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)

            # Write header once, based on the keys of the first item
            if drug_interaction_data:
                headers = sorted(drug_interaction_data[0].keys())
                csvwriter.writerow(headers)
                print(f"CSV headers written: {headers}")

                # Write the row data for each drug interaction
                for data in drug_interaction_data:
                    row = [data.get(key, '') for key in headers]
                    csvwriter.writerow(row)
                print("CSV data written successfully.")
            else:
                print("No 'drug-interaction' data found to write.")
    except IOError:
        print("Error: Could not write to the CSV file.")
        return
    except Exception as e:
        print(f"Error during CSV writing: {e}")
        return

# Example usage
xml_file = 'first_2000_entries.xml'  # Your input XML file
csv_file = 'drug_interactions_2000.csv'  # Your output CSV file

# Extract 'drug-interaction' info and write to CSV
xml_to_csv_drug_interaction(xml_file, csv_file)

Parsing XML file: first_2000_entries.xml
XML file parsed successfully.
Extracting 'drug-interaction' data...
Extracted 1304013 'drug-interaction' entries.
Writing data to CSV file: drug_interactions_2000.csv
CSV headers written: ['drug-id', 'drug-interact-id', 'drug-interact-name', 'drug-name', '{http://www.drugbank.ca}description']
CSV data written successfully.


Remove duplicate

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('D:/Frank/envs299/2000 dataset size/drug_interactions_2000.csv', encoding='ISO-8859-1')

# Remove duplicate rows based on the 'description' column
df_unique = df.drop_duplicates(subset=['description'])

# Save the cleaned data back to a new CSV file
df_unique.to_csv('D:/Frank/envs299/2000 dataset size/cleaned_drug_interactions_2000.csv', index=False)

print("Duplicate rows based on 'description' column have been removed.")


Duplicate rows based on 'description' column have been removed.


Contain only common drug list

In [15]:
import pandas as pd

# Path to the CSV file
csv_file_path = "D:/Frank/envs299/2000 dataset size/drug_interactions_2000.csv"


# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Filter rows where either column contains any element from the list
filtered_df = df[
    df["drug-interact-name"].isin(common_drug_names) & 
    df["drug-name"].isin(common_drug_names)
]

# Remove duplicate rows based on the 'description' column
filtered_df = filtered_df.drop_duplicates(subset=['description'])

# Save the filtered DataFrame to a new CSV file (optional)
filtered_df.to_csv("D:/Frank/envs299/2000 dataset size/drug_interactions_2000_filtered_file.csv", index=False)

# Display the filtered DataFrame
print(filtered_df)

         drug-id drug-interact-id    drug-interact-name      drug-name  \
2116     DB00006          DB01254             Dasatinib    Bivalirudin   
2118     DB00006          DB01586  Ursodeoxycholic acid    Bivalirudin   
2144     DB00006          DB00908             Quinidine    Bivalirudin   
2145     DB00006          DB00675             Tamoxifen    Bivalirudin   
2146     DB00006          DB00539            Toremifene    Bivalirudin   
...          ...              ...                   ...            ...   
1043875  DB01200          DB01260              Desonide  Bromocriptine   
1043876  DB01200          DB01410           Ciclesonide  Bromocriptine   
1043945  DB01200          DB01406               Danazol  Bromocriptine   
1043973  DB01200          DB01320          Fosphenytoin  Bromocriptine   
1043998  DB01200          DB01656           Roflumilast  Bromocriptine   

                                               description  
2116     The risk or severity of bleeding and hemo

Tuple Generation

In [10]:
import pandas as pd

# Load the dataset
file_path = 'D:/Frank/envs299/2000 dataset size/drug_interactions_2000_filtered_file.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Function to clean mechanisms by removing unnecessary placeholders and fragments
def clean_mechanism(row):
    drug_a = row['drug A']
    drug_b = row['drug B']
    description = row['description'].lower()

    # List of unnecessary phrases or fragments to remove
    phrases_to_remove = [
        "can be", "when combined with", "is combined with", "combined with", "in combination with", 
        "co-administered", "co-administer", "administered with", "can", "may", "with", "d when", 
        "of d when it", "when it", "of d", "of used", "used", " d."
    ]

    # Remove drug names
    clean_description = description.replace(drug_a.lower(), "").replace(drug_b.lower(), "")
    
    # Remove unnecessary phrases or fragments
    for phrase in phrases_to_remove:
        clean_description = clean_description.replace(phrase, "")
    
    # Remove any excess spaces and adjust periods
    clean_description = ' '.join(clean_description.split()).replace(" .", ".").strip()

    # Extract the action (increase, decrease)
    if "decrease" in clean_description:
        action = "decrease"
        clean_description = clean_description.replace("decrease", "").strip()
    elif "increase" in clean_description:
        action = "increase"
        clean_description = clean_description.replace("increase", "").strip()
    else:
        action = "unknown"

    # The remaining text is the mechanism
    mechanism = clean_description.strip()

    return (drug_a, drug_b, mechanism, action)

# Apply the cleaning function and compose the final tuple
data['final_tuple_composed'] = data.apply(
    lambda row: clean_mechanism(row), axis=1
)

# Ensure 'final_tuple_composed' is treated as strings
data['final_tuple_composed'] = data['final_tuple_composed'].astype(str)

# Function to safely split the tuple into four parts
def safe_split_tuple(row):
    try:
        # Remove parentheses and split
        parts = row.strip("()").split(",")
        # Ensure there are four parts
        if len(parts) != 4:
            return [None, None, None, None]
        # Strip extra quotes and whitespace
        return [part.strip(" '") for part in parts]
    except Exception as e:
        return [None, None, None, None]

# Apply the safe splitting function
data[['drug A', 'drug B', 'mechanism', 'action']] = pd.DataFrame(
    data['final_tuple_composed'].apply(safe_split_tuple).tolist(), index=data.index
)


# Export the dataset with separated tuple columns
output_file_path = 'D:/Frank/envs299/2000 dataset size/drug_interactions_2000_tuple.csv'  # Replace with your output file path
data.to_csv(output_file_path, index=False)

print(f"Processed file saved to {output_file_path}")


Processed file saved to D:/Frank/envs299/2000 dataset size/drug_interactions_2000_tuple.csv


Replace Drug A and Drug B in description

In [19]:
file_path = 'D:/Frank/envs299/2000 dataset size/drug_interactions_2000_filtered_file.csv'  # Replace with your file path
df = pd.read_csv(file_path)
# Create a new column to store the modified descriptions
df['modified_description'] = df.apply(
    lambda row: row['description'].replace(row['drug A'], 'Drug A').replace(row['drug B'], 'Drug B'),
    axis=1
)

# Generate a mapping of unique descriptions to unique indices
description_to_index = {description: idx for idx, description in enumerate(df['modified_description'].unique())}

# Create a new column 'index' based on the mapping
df['index'] = df['modified_description'].map(description_to_index)



# Save the updated DataFrame with the new column to a new CSV file
output_file_path_with_new_column = 'D:/Frank/envs299/2000 dataset size/updated_drug_interactions_with_new_column.csv'
df.to_csv(output_file_path_with_new_column, index=False)

output_file_path_with_new_column

'D:/Frank/envs299/2000 dataset size/updated_drug_interactions_with_new_column.csv'

Remove rows with frequency less than 10

In [20]:
file_path = 'D:/Frank/envs299/2000 dataset size/updated_drug_interactions_with_new_column.csv'  # Replace with your file path
df = pd.read_csv(file_path)
# Count the occurrences of each modified description
description_counts = df['modified_description'].value_counts()

# Filter out rows where the modified description occurs less than 10 times
df_filtered = df[df['modified_description'].map(description_counts) >= 10]

# Generate a mapping of unique descriptions to unique indices for the filtered data
description_to_index_filtered = {description: idx for idx, description in enumerate(df_filtered['modified_description'].unique())}

# Create a new column 'index' based on the filtered mapping
df_filtered['index'] = df_filtered['modified_description'].map(description_to_index_filtered)

# Save the updated DataFrame to a new CSV file
output_file_filtered = 'D:/Frank/envs299/2000 dataset size/filtered_drug_interactions_with_index.csv'
df_filtered.to_csv(output_file_filtered, index=False)

# Display the path to the saved file
output_file_filtered


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['index'] = df_filtered['modified_description'].map(description_to_index_filtered)


'D:/Frank/envs299/2000 dataset size/filtered_drug_interactions_with_index.csv'

# Target Id

In [3]:
import xml.etree.ElementTree as ET
import csv

# Define the namespace
ns = {'ns0': 'http://www.drugbank.ca'}

# Function to extract 'targets/polypeptide/@id'
def extract_drug_targets(drug, data_list=None):
    if data_list is None:
        data_list = []

    # Extract drug information from the parent 'drug' element
    drug_id = drug.findtext("ns0:drugbank-id", namespaces=ns)
    drug_name = drug.findtext("ns0:name", namespaces=ns)

    # Find all 'target' elements
    for target in drug.findall(".//ns0:target", ns):
        target_data = {
            "drug-id": drug_id,
            "drug-name": drug_name
        }

        # Iterate through all 'polypeptide' elements within 'target'
        for polypeptide in target.findall(".//ns0:polypeptide", ns):
            polypeptide_id = polypeptide.get("id")  # Get the 'id' attribute from the 'polypeptide' element
            if polypeptide_id:
                target_data["polypeptide-id"] = polypeptide_id
                # Add the extracted data to the list
                data_list.append(target_data.copy())

                # Debugging: print out the data being added
                print(f"Extracted: drug_id={drug_id}, drug_name={drug_name}, polypeptide_id={polypeptide_id}")

    return data_list

# Function to extract 'target/polypeptide/@id' info and write it to CSV
def xml_to_csv_targets(xml_file, csv_file):
    try:
        # Parse the XML file
        print(f"Parsing XML file: {xml_file}")
        tree = ET.parse(xml_file)
        root = tree.getroot()
        print("XML file parsed successfully.")
    except FileNotFoundError:
        print("Error: The XML file was not found.")
        return
    except ET.ParseError:
        print("Error: The XML file could not be parsed.")
        return

    try:
        # Extract the 'target/polypeptide/@id' data
        print("Extracting 'target/polypeptide/@id' data...")
        drug_target_data = []
        for drug in root.findall(".//ns0:drug", ns):
            drug_target_data.extend(extract_drug_targets(drug))
        print(f"Extracted {len(drug_target_data)} 'target/polypeptide/@id' entries.")
    except Exception as e:
        print(f"Error during data extraction: {e}")
        return

    try:
        # Open CSV for writing
        print(f"Writing data to CSV file: {csv_file}")
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)

            # Write header once, based on the keys of the first item
            if drug_target_data:
                headers = sorted(drug_target_data[0].keys())
                csvwriter.writerow(headers)
                print(f"CSV headers written: {headers}")

                # Write the row data for each drug target
                for data in drug_target_data:
                    row = [data.get(key, '') for key in headers]
                    csvwriter.writerow(row)
                print("CSV data written successfully.")
            else:
                print("No 'target/polypeptide/@id' data found to write.")
    except IOError:
        print("Error: Could not write to the CSV file.")
        return
    except Exception as e:
        print(f"Error during CSV writing: {e}")
        return

# Example usage
xml_file = 'first_2000_entries.xml'  # Your input XML file
csv_file = 'drug_targets_2000.csv'       # Your output CSV file

# Extract 'target/polypeptide/@id' info and write to CSV
xml_to_csv_targets(xml_file, csv_file)


Parsing XML file: first_2000_entries.xml
XML file parsed successfully.
Extracting 'target/polypeptide/@id' data...
Extracted: drug_id=DB00001, drug_name=Lepirudin, polypeptide_id=P00734
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P00533
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=O75015
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P02745
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P02746
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P02747
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P08637
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P12314
Extracted: drug_id=DB00002, drug_name=Cetuximab, polypeptide_id=P12318
Extracted: drug_id=DB00004, drug_name=Denileukin diftitox, polypeptide_id=P01589
Extracted: drug_id=DB00004, drug_name=Denileukin diftitox, polypeptide_id=P14784
Extracted: drug_id=DB00004, drug_name=Denileukin diftitox, polypeptide_id=P31785
Ext

One hot encoding

In [4]:
import pandas as pd

# Load the CSV file
file_path = r"D:/Frank/envs299/drug_targets_2000.csv"
data = pd.read_csv(file_path)

# One-hot encode the 'polypeptide-id' column, using 0 and 1 for binary representation
data_encoded = pd.get_dummies(data, columns=['polypeptide-id'], dtype=int)

# Group by 'drug-name' and aggregate with 'max' to combine all polypeptide IDs for each drug
data_combined = data_encoded.groupby('drug-name', as_index=False).max()

# Set 'drug-name' as the index for the final feature matrix (X)
X = data_combined.set_index('drug-name')

# Display the combined feature matrix
print("Combined feature matrix (X):\n", X.head())

# Save the result to a new CSV file
output_path = r"D:/Frank/envs299/drug_targets_one_hot_encoded.csv"
X.to_csv(output_path)
print(f"Combined data with one-hot encoding saved to {output_path}")


Combined feature matrix (X):
                                                     drug-id  \
drug-name                                                     
(+)-2-(4-biphenyl)propionic acid                    DB02047   
(1R,2R,3S,4R,6S)-3,4,6-Trihydroxy-5-{[(S)-hydro...  DB02028   
(2Z)-2-(Benzoylamino)-3-[4-(2-bromophenoxy)phen...  DB01720   
(2s)-2-Amino-4-(Methylsulfanyl)-1-Pyridin-2-Ylb...  DB01882   
(3,4-Dihydroxy-Phenyl)-Triphenyl-Arsonium           DB02086   

                                                    polypeptide-id_A0A024R8I1  \
drug-name                                                                       
(+)-2-(4-biphenyl)propionic acid                                            0   
(1R,2R,3S,4R,6S)-3,4,6-Trihydroxy-5-{[(S)-hydro...                          0   
(2Z)-2-(Benzoylamino)-3-[4-(2-bromophenoxy)phen...                          0   
(2s)-2-Amino-4-(Methylsulfanyl)-1-Pyridin-2-Ylb...                          0   
(3,4-Dihydroxy-Phenyl)-Triphenyl-Arsonium 

# Enzyme

In [5]:
import xml.etree.ElementTree as ET
import csv

# Define the namespace
ns = {'ns0': 'http://www.drugbank.ca'}

# Function to extract 'targets/polypeptide/@id'
def extract_drug_targets(drug, data_list=None):
    if data_list is None:
        data_list = []

    # Extract drug information from the parent 'drug' element
    drug_id = drug.findtext("ns0:drugbank-id", namespaces=ns)
    drug_name = drug.findtext("ns0:name", namespaces=ns)

    # Find all 'target' elements
    for target in drug.findall(".//ns0:enzymes", ns):
        target_data = {
            "drug-id": drug_id,
            "drug-name": drug_name
        }

        # Iterate through all 'polypeptide' elements within 'target'
        for polypeptide in target.findall(".//ns0:polypeptide", ns):
            polypeptide_id = polypeptide.get("id")  # Get the 'id' attribute from the 'polypeptide' element
            if polypeptide_id:
                target_data["polypeptide-id"] = polypeptide_id
                # Add the extracted data to the list
                data_list.append(target_data.copy())

                # Debugging: print out the data being added
                print(f"Extracted: drug_id={drug_id}, drug_name={drug_name}, polypeptide_id={polypeptide_id}")

    return data_list

# Function to extract 'target/polypeptide/@id' info and write it to CSV
def xml_to_csv_targets(xml_file, csv_file):
    try:
        # Parse the XML file
        print(f"Parsing XML file: {xml_file}")
        tree = ET.parse(xml_file)
        root = tree.getroot()
        print("XML file parsed successfully.")
    except FileNotFoundError:
        print("Error: The XML file was not found.")
        return
    except ET.ParseError:
        print("Error: The XML file could not be parsed.")
        return

    try:
        # Extract the 'target/polypeptide/@id' data
        print("Extracting 'enzyme/polypeptide/@id' data...")
        drug_target_data = []
        for drug in root.findall(".//ns0:drug", ns):
            drug_target_data.extend(extract_drug_targets(drug))
        print(f"Extracted {len(drug_target_data)} 'enzyme/polypeptide/@id' entries.")
    except Exception as e:
        print(f"Error during data extraction: {e}")
        return

    try:
        # Open CSV for writing
        print(f"Writing data to CSV file: {csv_file}")
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)

            # Write header once, based on the keys of the first item
            if drug_target_data:
                headers = sorted(drug_target_data[0].keys())
                csvwriter.writerow(headers)
                print(f"CSV headers written: {headers}")

                # Write the row data for each drug target
                for data in drug_target_data:
                    row = [data.get(key, '') for key in headers]
                    csvwriter.writerow(row)
                print("CSV data written successfully.")
            else:
                print("No 'target/polypeptide/@id' data found to write.")
    except IOError:
        print("Error: Could not write to the CSV file.")
        return
    except Exception as e:
        print(f"Error during CSV writing: {e}")
        return

# Example usage
xml_file = 'first_2000_entries.xml'  # Your input XML file
csv_file = 'drug_enzymes_2000.csv'       # Your output CSV file

# Extract 'target/polypeptide/@id' info and write to CSV
xml_to_csv_targets(xml_file, csv_file)


Parsing XML file: first_2000_entries.xml
XML file parsed successfully.
Extracting 'enzyme/polypeptide/@id' data...
Extracted: drug_id=DB00006, drug_name=Bivalirudin, polypeptide_id=P05164
Extracted: drug_id=DB00008, drug_name=Peginterferon alfa-2a, polypeptide_id=P05177
Extracted: drug_id=DB00011, drug_name=Interferon alfa-n1, polypeptide_id=P05177
Extracted: drug_id=DB00013, drug_name=Urokinase, polypeptide_id=P39900
Extracted: drug_id=DB00018, drug_name=Interferon alfa-n3, polypeptide_id=P05177
Extracted: drug_id=DB00019, drug_name=Pegfilgrastim, polypeptide_id=P08246
Extracted: drug_id=DB00022, drug_name=Peginterferon alfa-2b, polypeptide_id=P05177
Extracted: drug_id=DB00022, drug_name=Peginterferon alfa-2b, polypeptide_id=P10635
Extracted: drug_id=DB00022, drug_name=Peginterferon alfa-2b, polypeptide_id=P11712
Extracted: drug_id=DB00025, drug_name=Antihemophilic factor, human recombinant, polypeptide_id=P00734
Extracted: drug_id=DB00025, drug_name=Antihemophilic factor, human recom

One hot encoding for enzyme

In [6]:
import pandas as pd

# Use the specified file path
file_path = r"D:/Frank/envs299/drug_enzymes_2000.csv"

# Load the CSV file
data = pd.read_csv(file_path)

# Display initial data structure
print("Data preview before processing:\n", data.head())

# One-hot encode the 'polypeptide-id' column with binary 0 and 1 values
data_encoded = pd.get_dummies(data, columns=['polypeptide-id'], dtype=int)

# Group by 'drug-name' and aggregate with 'max' to combine all polypeptide IDs for each drug
data_combined = data_encoded.groupby('drug-name', as_index=False).max()

# Set 'drug-name' as the index for the final feature matrix (X)
X = data_combined.set_index('drug-name')

# Display the combined feature matrix
print("Combined feature matrix (X):\n", X.head())

# Save the result to a new CSV file
output_path = r"D:/Frank/envs299/drug_enzymes_one_hot_encoded.csv"
X.to_csv(output_path)
print(f"Combined data with one-hot encoding saved to {output_path}")


Data preview before processing:
    drug-id              drug-name polypeptide-id
0  DB00006            Bivalirudin         P05164
1  DB00008  Peginterferon alfa-2a         P05177
2  DB00011     Interferon alfa-n1         P05177
3  DB00013              Urokinase         P39900
4  DB00018     Interferon alfa-n3         P05177
Combined feature matrix (X):
                                                 drug-id  \
drug-name                                                 
1-(2-Phenylethyl)-4-phenyl-4-acetoxypiperidine  DB01562   
4-Methoxyamphetamine                            DB01472   
5-androstenedione                               DB01456   
5alpha-androstane-3alpha,17beta-diol            DB01530   
8-azaguanine                                    DB01667   

                                                polypeptide-id_D6RB81  \
drug-name                                                               
1-(2-Phenylethyl)-4-phenyl-4-acetoxypiperidine                      0   
4-Methoxya

# Pathway

Extract smpdb

In [None]:
import xml.etree.ElementTree as ET
import csv

# Define the namespace
ns = {'ns0': 'http://www.drugbank.ca'}

# Function to extract 'pathways/pathway/smpdb-id' from all drugs
def extract_smpdb_id(drug, data_list=None):
    if data_list is None:
        data_list = []

    # Extract drug information from the parent 'drug' element
    drug_id = drug.findtext("ns0:drugbank-id", namespaces=ns)
    drug_name = drug.findtext("ns0:name", namespaces=ns)

    # Find all 'pathway' elements
    for pathway in drug.findall(".//ns0:pathway", ns):
        smpdb_id = pathway.findtext("ns0:smpdb-id", namespaces=ns)
        if smpdb_id:
            pathway_data = {
                "drug-id": drug_id,
                "drug-name": drug_name,
                "smpdb-id": smpdb_id
            }

            # Add the extracted data to the list
            data_list.append(pathway_data)

            # Debugging: print out the data being added
            print(f"Extracted: drug_id={drug_id}, drug_name={drug_name}, smpdb_id={smpdb_id}")

    return data_list

# Function to extract 'smpdb-id' info from all drugs and write to CSV
def xml_to_csv_smpdb_ids(xml_file, csv_file):
    try:
        # Parse the XML file
        print(f"Parsing XML file: {xml_file}")
        tree = ET.parse(xml_file)
        root = tree.getroot()
        print("XML file parsed successfully.")
    except FileNotFoundError:
        print("Error: The XML file was not found.")
        return
    except ET.ParseError:
        print("Error: The XML file could not be parsed.")
        return

    try:
        # Extract 'smpdb-id' data for all drugs
        print("Extracting 'pathways/pathway/smpdb-id' data for all drugs...")
        drug_pathway_data = []
        for drug in root.findall(".//ns0:drug", ns):
            drug_pathway_data.extend(extract_smpdb_id(drug))
        print(f"Extracted {len(drug_pathway_data)} 'smpdb-id' entries.")
    except Exception as e:
        print(f"Error during data extraction: {e}")
        return

    try:
        # Open CSV for writing
        print(f"Writing data to CSV file: {csv_file}")
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)

            # Write header once, based on the keys of the first item
            if drug_pathway_data:
                headers = sorted(drug_pathway_data[0].keys())
                csvwriter.writerow(headers)
                print(f"CSV headers written: {headers}")

                # Write the row data for each pathway
                for data in drug_pathway_data:
                    row = [data.get(key, '') for key in headers]
                    csvwriter.writerow(row)
                print("CSV data written successfully.")
            else:
                print("No 'smpdb-id' data found to write.")
    except IOError:
        print("Error: Could not write to the CSV file.")
        return
    except Exception as e:
        print(f"Error during CSV writing: {e}")
        return

# Example usage
xml_file = 'first_100_entries.xml'  # Your input XML file
csv_file = 'smpdb_ids.csv'          # Your output CSV file

# Extract 'smpdb-id' info from all drugs and write to CSV
xml_to_csv_smpdb_ids(xml_file, csv_file)


Extract Pathway hsa from Kegg

From drug id

In [22]:
import requests
from bs4 import BeautifulSoup

def extract_all_hsa_content(drug_id):
    # URL for the specific drug page in KEGG
    url = f"https://www.genome.jp/dbget-bin/www_bget?{drug_id}"
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Initialize list to capture any line with "hsa"
        hsa_content = []

        # Parse each line to find any content that includes "hsa"
        for line in soup.stripped_strings:
            if "hsa" in line:
                hsa_content.append(line)

        # Display all extracted HSA content for inspection
        if hsa_content:
            print(f"HSA-related content for drug '{drug_id}':")
            for line in hsa_content:
                print(line)
        else:
            print(f"No HSA-related content found for drug '{drug_id}'.")
    else:
        print(f"Failed to retrieve page for drug '{drug_id}'.")

# Example usage
extract_all_hsa_content("D07665")


HSA-related content for drug 'Bivalirudin' (KEGG ID: D03136):
hsa04080
hsa04610


From Drug Name

In [None]:
pip install bs4

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Load drug names from CSV file
file_path = 'drug_fingerprints_2000.csv'
drug_data = pd.read_csv(file_path)

# Ensure the column with drug names is named 'drug_name' (or modify to the actual column name)
if 'drug_name' not in drug_data.columns:
    print("Please ensure there is a column named 'drug_name' in the CSV file.")
else:
    # Define a function to get the KEGG drug ID by name
    def get_drug_id_by_name(drug_name):
        search_url = f"https://rest.kegg.jp/find/drug/{drug_name}"
        response = requests.get(search_url)
        
        if response.status_code == 200 and response.text:
            drug_id = response.text.split('\t')[0].replace("dr:", "")
            return drug_id
        else:
            return None

    # Define a function to extract HSA codes from the KEGG drug page
    def extract_hsa_codes(drug_id):
        url = f"https://www.genome.jp/dbget-bin/www_bget?{drug_id}"
        response = requests.get(url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            hsa_codes = []

            for line in soup.stripped_strings:
                if line.startswith("hsa"):
                    hsa_codes.append(line.split()[0])  # Capture only the HSA code part
            return hsa_codes
        else:
            return []

    # Process each drug name to retrieve HSA codes
    results = []
    for drug_name in drug_data['drug_name']:
        drug_id = get_drug_id_by_name(drug_name)
        if drug_id:
            hsa_codes = extract_hsa_codes(drug_id)
            results.append({'drug_name': drug_name, 'hsa_codes': ", ".join(hsa_codes)})
        else:
            results.append({'drug_name': drug_name, 'hsa_codes': "Not Found"})

    # Save the results to a new CSV file
    output_df = pd.DataFrame(results)
    output_file_path = 'drug_hsa_codes_1000.csv'
    output_df.to_csv(output_file_path, index=False)

    print(f"HSA codes have been extracted and saved to {output_file_path}.")


HSA codes have been extracted and saved to drug_hsa_codes_1000.csv.


Separation （扒网页使得所有hsa code都在一个cell里，需将其分离）

In [11]:
import pandas as pd

# Load the CSV file
file_path = r"D:/Frank/envs299/drug_hsa_codes_2000.csv"
data_hsa_codes = pd.read_csv(file_path)

# Split the 'hsa_codes' column into multiple columns, removing any leading/trailing spaces
hsa_codes_expanded_cleaned = data_hsa_codes['hsa_codes'].str.split(',', expand=True).apply(lambda x: x.str.strip())

# Concatenate the original 'drug_name' column with the cleaned and expanded HSA codes
data_hsa_expanded_cleaned = pd.concat([data_hsa_codes[['drug_name']], hsa_codes_expanded_cleaned], axis=1)

# Save the cleaned and expanded DataFrame to a new CSV file
output_path = r"D:/Frank/envs299/drug_hsa_codes_expanded_cleaned.csv"
data_hsa_expanded_cleaned.to_csv(output_path, index=False)

print(f"Expanded and cleaned HSA codes saved to {output_path}")

Expanded and cleaned HSA codes saved to D:/Frank/envs299/drug_hsa_codes_expanded_cleaned.csv


One hot encoding

In [12]:
import pandas as pd

# Load the expanded HSA codes CSV file
file_path = r"D:/Frank/envs299/drug_hsa_codes_expanded_cleaned.csv"
data = pd.read_csv(file_path)

# Reshape the data to have each HSA code as a separate row with the corresponding drug name
data_melted = data.melt(id_vars=['drug_name'], value_name='hsa_code').drop(columns=['variable'])

# Drop rows where 'hsa_code' is NaN (i.e., empty cells from the original expansion)
data_melted = data_melted.dropna(subset=['hsa_code'])

# Perform one-hot encoding on the 'hsa_code' column
data_encoded = pd.get_dummies(data_melted, columns=['hsa_code'], dtype=int)

# Group by 'drug_name' to aggregate each drug's HSA codes in a one-hot encoded format
data_combined = data_encoded.groupby('drug_name', as_index=False).max()

# Display the one-hot encoded feature matrix
print("One-hot encoded feature matrix:\n", data_combined.head())

# Save the result to a new CSV file
output_path = r"D:/Frank/envs299/drug_hsa_codes_one_hot_encoded.csv"
data_combined.to_csv(output_path, index=False)
print(f"One-hot encoded data saved to {output_path}")




One-hot encoded feature matrix:
      drug_name  hsa_code_hsa00010  hsa_code_hsa00051  hsa_code_hsa00052  \
0          ATP                  0                  0                  0   
1     Abarelix                  0                  0                  0   
2  Acamprosate                  0                  0                  0   
3     Acarbose                  0                  0                  1   
4   Acebutolol                  0                  0                  0   

   hsa_code_hsa00071  hsa_code_hsa00130  hsa_code_hsa00140  hsa_code_hsa00230  \
0                  0                  0                  0                  0   
1                  0                  0                  0                  0   
2                  0                  0                  0                  0   
3                  0                  0                  0                  0   
4                  0                  0                  0                  0   

   hsa_code_hsa00232  hsa_cod

# Substructure

In [None]:
pip install rdkit

In [17]:
from rdkit import Chem
from rdkit.Chem import AllChem

Extract molecular fingerprint from Pubmed

In [None]:
import pandas as pd
import requests
from rdkit import Chem
from rdkit.Chem import AllChem

# Load the drug names from the file
file_path = r"D:/Frank/envs299/drug_targets_one_hot_encoded.csv"
drug_data = pd.read_csv(file_path)

# Prepare a list to store the results
fingerprint_data = []

# Loop through each drug in the file
for drug_name in drug_data['drug-name']:  # Assuming the column is named 'drug-name'
    print(f"Processing {drug_name}...")

    # Construct the PubChem API URL for the Isomeric SMILES
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug_name}/property/IsomericSMILES/JSON"
    
    # Make the GET request to PubChem
    response = requests.get(url)
    
    if response.status_code == 200:
        # Parse the JSON response and extract the SMILES string
        data = response.json()
        try:
            smiles = data['PropertyTable']['Properties'][0]['IsomericSMILES']
            print(f"Isomeric SMILES for {drug_name}: {smiles}")
            
            # Define the molecule from SMILES
            molecule = Chem.MolFromSmiles(smiles)
            
            # Generate the molecular fingerprint
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=1024)
            
            # Convert the fingerprint to a list of bits
            fingerprint_bits = list(fingerprint)
            
            # Add the drug name and fingerprint to the results list
            fingerprint_data.append([drug_name] + fingerprint_bits)
        
        except (IndexError, KeyError) as e:
            #print(f"SMILES not found for {drug_name}")
            fingerprint_data.append([drug_name] + ["SMILES not found"])
    else:
        #print(f"Failed to retrieve data for {drug_name}. Status code: {response.status_code}")
        fingerprint_data.append([drug_name] + ["API request failed"])

# Create a DataFrame from the results
fingerprint_df = pd.DataFrame(fingerprint_data)

# Set the column names
fingerprint_df.columns = ['drug-name'] + [f'bit_{i}' for i in range(1024)]


# Save the fingerprint data to a new CSV file
output_path = r"D:/Frank/envs299/drug_fingerprints_2000.csv"
fingerprint_df.to_csv(output_path, index=False)
print(f"Molecular fingerprints saved to {output_path}")


Extract SMILES from Drugbank (but Drugbank dataset is incomplete for SMILES)

In [None]:
import xml.etree.ElementTree as ET
import csv

# Define the namespace
ns = {'ns0': 'http://www.drugbank.ca'}

# Function to extract SMILES value for each drug
def extract_smiles(drug, data_list=None):
    if data_list is None:
        data_list = []

    # Extract drug information from the parent 'drug' element
    drug_id = drug.findtext("ns0:drugbank-id", namespaces=ns)
    drug_name = drug.findtext("ns0:name", namespaces=ns)

    # Find the SMILES string in calculated-properties/property[9]/value
    smiles = drug.find(".//ns0:calculated-properties/ns0:property[9]/ns0:value", namespaces=ns)

    # If SMILES string is found, add to data list
    if smiles is not None:
        data_list.append({
            "drug-id": drug_id,
            "drug-name": drug_name,
            "SMILES": smiles.text
        })

        # Debugging: print out the data being added
        print(f"Extracted: drug_id={drug_id}, drug_name={drug_name}, SMILES={smiles.text}")
    else:
        print(f"No SMILES found for drug_id={drug_id}")

    return data_list

# Function to extract SMILES for all drugs and write them to CSV
def xml_to_csv_smiles(xml_file, csv_file):
    try:
        # Parse the XML file
        print(f"Parsing XML file: {xml_file}")
        tree = ET.parse(xml_file)
        root = tree.getroot()
        print("XML file parsed successfully.")
    except FileNotFoundError:
        print("Error: The XML file was not found.")
        return
    except ET.ParseError:
        print("Error: The XML file could not be parsed.")
        return

    try:
        # Extract SMILES data
        print("Extracting SMILES data...")
        drug_smiles_data = []
        for drug in root.findall(".//ns0:drug", ns):
            drug_smiles_data.extend(extract_smiles(drug))
        print(f"Extracted SMILES for {len(drug_smiles_data)} drugs.")
    except Exception as e:
        print(f"Error during data extraction: {e}")
        return

    try:
        # Open CSV for writing
        print(f"Writing data to CSV file: {csv_file}")
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)

            # Write header
            if drug_smiles_data:
                headers = sorted(drug_smiles_data[0].keys())
                csvwriter.writerow(headers)
                print(f"CSV headers written: {headers}")

                # Write the row data for each drug
                for data in drug_smiles_data:
                    row = [data.get(key, '') for key in headers]
                    csvwriter.writerow(row)
                print("CSV data written successfully.")
            else:
                print("No SMILES data found to write.")
    except IOError:
        print("Error: Could not write to the CSV file.")
    except Exception as e:
        print(f"Error during CSV writing: {e}")

# Example usage
xml_file = 'first_100_entries.xml'  # Your input XML file
csv_file = 'drug_smiles.csv'        # Your output CSV file

# Extract SMILES info and write to CSV
xml_to_csv_smiles(xml_file, csv_file)


Data cleaning

In [8]:
import pandas as pd

# Load the CSV file
file_path = r"D:/Frank/envs299/drug_fingerprints_2000.csv"
data = pd.read_csv(file_path)

# Remove rows where the 'bit_0' column indicates an API failure
data_cleaned = data[data['bit_0'] != 'API request failed']
output_path = r"D:/Frank/envs299/drug_fingerprints_2000.csv"
data_cleaned.to_csv(output_path, index=False)

# Jaccard Matrix

In [18]:
import pandas as pd
from sklearn.metrics import jaccard_score
import numpy as np

# Load the CSV file
file_path = r"D:/Frank/envs299/drug_fingerprints_2000.csv"
data = pd.read_csv(file_path)

# Set 'drug-name' as index if it's one of the first two columns
data.set_index('drug-name', inplace=True)

# Remove any non-binary columns (select only columns with values 0 and 1)
binary_data = data.loc[:, (data.isin([0, 1]).all())]

# Initialize an empty DataFrame for storing the similarity scores
jaccard_similarity_matrix = pd.DataFrame(
    np.zeros((len(binary_data), len(binary_data))), 
    index=binary_data.index, 
    columns=binary_data.index
)

# Calculate pairwise Jaccard similarity only for the upper triangle
for i, drug1 in enumerate(binary_data.index):
    for j, drug2 in enumerate(binary_data.index):
        if j > i:  # Calculate only for the upper triangle
            # Calculate Jaccard similarity score between two binary vectors
            similarity = jaccard_score(binary_data.loc[drug1], binary_data.loc[drug2])
            jaccard_similarity_matrix.loc[drug1, drug2] = similarity
            jaccard_similarity_matrix.loc[drug2, drug1] = similarity  # Fill symmetric entry

# Set diagonal to 1 (self-similarity)
np.fill_diagonal(jaccard_similarity_matrix.values, 1)

# Save the Jaccard similarity matrix to a new CSV file
output_path = r"D:/Frank/envs299/drug_jaccard_similarity_substructure.csv"
jaccard_similarity_matrix.to_csv(output_path)
print(f"Jaccard similarity matrix saved to {output_path}")


Jaccard similarity matrix saved to D:/Frank/envs299/drug_jaccard_similarity_substructure.csv


# Make four input matrix have the same size

In [None]:
import pandas as pd
import csv

# Load the original CSV files
file_path_enzymes = r"D:/Frank/envs299/2000 dataset size/drug_jaccard_similarity_enzymes.csv"
file_path_targets = r"D:/Frank/envs299/2000 dataset size/drug_jaccard_similarity_targets.csv"
file_path_substructure = r"D:/Frank/envs299/2000 dataset size/drug_jaccard_similarity_substructure.csv"
file_path_pathways = r"D:/Frank/envs299/2000 dataset size/drug_jaccard_similarity_pathways.csv"

# Read each CSV file and extract the list of drug names
df_enzymes = pd.read_csv(file_path_enzymes, index_col=0)
df_targets = pd.read_csv(file_path_targets, index_col=0)
df_substructure = pd.read_csv(file_path_substructure, index_col=0)
df_pathways = pd.read_csv(file_path_pathways, index_col=0)

# Get the common drug names from the first row and first column of each DataFrame
common_drug_names = set(df_enzymes.index) & set(df_targets.index) & set(df_substructure.index) & set(df_pathways.index)
common_drug_names = common_drug_names & set(df_enzymes.columns) & set(df_targets.columns) & set(df_substructure.columns) & set(df_pathways.columns)
common_drug_names = list(common_drug_names)  # Convert to list for indexing
print(common_drug_names)

with open("common_drug_names.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(common_drug_names)

# Filter each matrix to include only rows and columns with common drug names
df_enzymes_filtered = df_enzymes.loc[common_drug_names, common_drug_names]
df_targets_filtered = df_targets.loc[common_drug_names, common_drug_names]
df_substructure_filtered = df_substructure.loc[common_drug_names, common_drug_names]
df_pathways_filtered = df_pathways.loc[common_drug_names, common_drug_names]

# Save each filtered matrix to new CSV files
df_enzymes_filtered.to_csv(r"D:/Frank/envs299/drug_jaccard_similarity_enzymes_filtered.csv")
df_targets_filtered.to_csv(r"D:/Frank/envs299/drug_jaccard_similarity_targets_filtered.csv")
df_substructure_filtered.to_csv(r"D:/Frank/envs299/drug_jaccard_similarity_substructure_filtered.csv")
df_pathways_filtered.to_csv(r"D:/Frank/envs299/drug_jaccard_similarity_pathways_filtered.csv")

print("Filtered matrices with common drug names in both rows and columns have been saved.")


['Granisetron', 'Doxepin', 'Ketoprofen', 'Rosuvastatin', 'Mequitazine', 'Flurbiprofen', 'Aminoglutethimide', 'Ambenonium', 'Encainide', 'Treprostinil', 'Ephedrine', 'Reserpine', 'Amiodarone', 'Flunitrazepam', 'Apomorphine', 'Bimatoprost', 'Methimazole', 'Oxycodone', 'Tripelennamine', 'Pyridostigmine', 'Desmopressin', 'Oxtriphylline', 'Quinidine', 'Eszopiclone', 'Oxazepam', 'Methantheline', 'Allopurinol', 'Frovatriptan', 'Lisuride', 'Bupropion', 'Diclofenac', 'Methysergide', 'Dopamine', 'Midomafetamine', 'Phenprocoumon', 'Flupentixol', 'Metoprolol', 'Erlotinib', 'Levodopa', 'Practolol', 'Benazepril', 'Etoricoxib', 'Disulfiram', 'Triamcinolone', 'Fluorometholone', 'Fluticasone propionate', 'Tacrolimus', 'Estrone', 'Acebutolol', 'Betamethasone', 'Pemetrexed', 'Vinblastine', 'Dextroamphetamine', 'Tasosartan', 'Flurazepam', 'Alfuzosin', 'Sorafenib', 'Pilocarpine', 'Bromfenac', 'Acetaminophen', 'Ziprasidone', 'Aminocaproic acid', 'Flurandrenolide', 'Ergocalciferol', 'Phenobarbital', 'Risperi

Map the ddi category to the matrix

In [22]:
import pandas as pd

# Load the provided CSV files
filtered_interactions_file = 'D:/Frank/envs299/2000 dataset size/updated_normalized_drug_interactions_with_changeable.csv'
jaccard_similarity_file = 'D:/Frank/envs299/2000 dataset size/drug_jaccard_similarity_enzymes_filtered.csv'

# Read the data into DataFrames
filtered_interactions_df = pd.read_csv(filtered_interactions_file)
jaccard_similarity_df = pd.read_csv(jaccard_similarity_file, index_col=0)  # Assuming the first column is the index

# Prepare the matrix with the same format as the Jaccard similarity matrix
# Initialize an empty matrix with the same dimensions and indices/columns
matrix_with_indices = pd.DataFrame(
    index=jaccard_similarity_df.index,
    columns=jaccard_similarity_df.columns
)

# Populate the matrix with the 'index' values from the filtered_interactions_df
for _, row in filtered_interactions_df.iterrows():
    drug_a = row['drug A']
    drug_b = row['drug B']
    index_value = row['index']
    is_changeable = row['Changeable']  # Check the Changeable column
    
    # Fill the matrix for both directions if Changeable = 1, else only in one direction
    if drug_a in matrix_with_indices.index and drug_b in matrix_with_indices.columns:
        matrix_with_indices.loc[drug_a, drug_b] = index_value
        if is_changeable == 1:
            matrix_with_indices.loc[drug_b, drug_a] = index_value

# Save the matrix to a new CSV file
output_matrix_file = 'D:/Frank/envs299/2000 dataset size/matrix_with_indices.csv'
matrix_with_indices.to_csv(output_matrix_file)

print(f"Matrix saved to: {output_matrix_file}")


Matrix saved to: D:/Frank/envs299/2000 dataset size/matrix_with_indices.csv
