# Offline Activities of Module II


## Task 1

Retrieve all approved drugs from the ChEMBL database, sort them by approval year and name.

In [1]:

from chembl_webresource_client.new_client import new_client
import pandas as pd
from tqdm.notebook import tqdm

# Initialize the drug client
drug = new_client.drug

# Fetch all approved drugs with a first approval year from 2014 onwards
approved_drugs = drug.filter(first_approval__gte=2014, max_phase=4)

# Helper function to extract INN names
def get_inn_name(drug_info):
    synonyms = drug_info.get('molecule_synonyms', [])
    inn_names = [syn['synonyms'] for syn in synonyms if syn['syn_type'] == 'INN']
    return "; ".join(inn_names) if inn_names else "No INN"

# Prepare lists to hold data for DataFrame construction
data = {
    'molecule_chembl_id': [],
    'first_approval': [],
    'inn_name': [],
    'max_phase': []
}

# Iterate over each drug record
for drug in tqdm(approved_drugs, desc="Processing drugs"):
    data['molecule_chembl_id'].append(drug.get('molecule_chembl_id', ''))
    data['first_approval'].append(drug.get('first_approval', ''))
    inn_name = get_inn_name(drug)  # Capture the INN name using the helper function
    data['inn_name'].append(inn_name)
    data['max_phase'].append(drug.get('max_phase', ''))

# Convert to DataFrame
df_drugs = pd.DataFrame(data)

# Sort DataFrame by INN name and first approval
df_drugs.sort_values(by=['inn_name', 'first_approval'], ascending=[True, True], inplace=True)

# Export to CSV
df_drugs.to_csv('sorted_approved_drugs.csv', index=False)

Processing drugs:   0%|          | 0/543 [00:00<?, ?it/s]

## Task 2

For each approved drug since 2014 that you identified in step (1), retrieve a list of UniProt accession numbers, namely protein targets associated with the drug;


In [2]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
from tqdm.notebook import tqdm

# Initialize ChEMBL client
chembl_client = new_client

# Define the size of chunks for processing large datasets
chunk_size = 25 

# Fetch approved drugs since 2014
approved_drugs = chembl_client.drug.filter(first_approval__gte=2014, max_phase=4).only(['molecule_chembl_id', 'first_approval'])
approved_drugs_df = pd.DataFrame([drug for drug in tqdm(approved_drugs, desc="Loading approved drugs")])

# Display basic drug information
print(approved_drugs_df.head())

# Select the first 50 drugs for demonstration purposes
drugs_approved = approved_drugs_df['molecule_chembl_id'][:50]

# Dictionary to store compound to target ChEMBL IDs
comp2target = {d: set() for d in drugs_approved}

# Retrieve activities to map compounds to target ChEMBL IDs
keys = list(comp2target.keys())
for i in tqdm(range(0, len(keys), chunk_size), desc="Mapping compounds to targets"):
    activities = chembl_client.activity.filter(molecule_chembl_id__in=keys[i:i + chunk_size]).only('molecule_chembl_id', 'target_chembl_id')
    for activity in activities:
        comp2target[activity['molecule_chembl_id']].add(activity['target_chembl_id'])

# Dictionary to store compound to UniProt IDs
for key in tqdm(keys, desc="Mapping ChEMBL IDs to UniProt"):
    uniprots = set()
    vals = list(comp2target[key])
    for i in range(0, len(vals), chunk_size):
        targets = chembl_client.target.filter(target_chembl_id__in=vals[i:i + chunk_size]).only('target_components')
        for target in targets:
            for component in target['target_components']:
                uniprots.add(component['accession'])
    comp2target[key] = uniprots

# Creating DataFrame for compound to UniProt mapping
comp2uniprot_df = pd.DataFrame([(k, v) for k, vals in comp2target.items() for v in vals], columns=['ChEMBL_ID', 'UniProt'])

# Display the DataFrame
print(comp2uniprot_df.head())

# Export the results
comp2uniprot_df.to_csv('chembl_drugs_to_uniprot.tsv', sep='\t', index=False)


Loading approved drugs:   0%|          | 0/543 [00:00<?, ?it/s]

  applicants atc_code_description  first_approval molecule_chembl_id  \
0       None                 None            2022           CHEMBL40   
1       None                 None            2022           CHEMBL45   
2       None                 None            2017          CHEMBL110   
3       None                 None            2018        CHEMBL17860   
4       None                 None            2017       CHEMBL278623   

  research_codes synonyms  
0           None     None  
1           None     None  
2           None     None  
3           None     None  
4           None     None  


Mapping compounds to targets:   0%|          | 0/2 [00:00<?, ?it/s]

Mapping ChEMBL IDs to UniProt:   0%|          | 0/50 [00:00<?, ?it/s]

  ChEMBL_ID UniProt
0  CHEMBL40  A6XA80
1  CHEMBL40  P11712
2  CHEMBL40  P11229
3  CHEMBL40  P50406
4  CHEMBL40  P02768


## Task 3

For each protein with a UniProt accession number that you identified in step (2), retrieve UniProt keywords associated with it

In [3]:
import requests
from tqdm.notebook import tqdm
import pandas as pd


# Load the DataFrame with UniProt accessions from the saved file
df = pd.read_csv('chembl_drugs_to_uniprot.tsv', sep='\t', usecols=['UniProt'])


# Base URL for the UniProt REST API
base_url = "https://www.ebi.ac.uk/proteins/api/proteins"

# Function to retrieve keywords for a given UniProt accession
def fetch_keywords(accession):
    response = requests.get(f"{base_url}/{accession}", headers={"Accept": "application/json"})
    if response.status_code == 200:
        json_data = response.json()
        # Extract keywords, handling cases where no keywords are found
        keywords = [keyword['value'] for keyword in json_data.get('keywords', [])]
        return keywords
    else:
        print(f"Failed to retrieve data for {accession}: {response.status_code}")
        return []

# List to hold keywords for each protein
all_keywords = []

# Loop over UniProt accessions with progress bar
for accession in tqdm(df['UniProt'], desc="Fetching keywords"):
    keywords = fetch_keywords(accession)
    all_keywords.append((accession, keywords))

# Convert results to DataFrame
keywords_df = pd.DataFrame(all_keywords, columns=['UniProt', 'Keywords'])

# Display the DataFrame
print(keywords_df)

# Export the results to CSV
keywords_df.to_csv('uniprot_keywords.csv', index=False)

Fetching keywords:   0%|          | 0/3582 [00:00<?, ?it/s]

     UniProt                                           Keywords
0     A6XA80  [Endoplasmic reticulum, Leukotriene biosynthes...
1     P11712  [3D-structure, Alternative splicing, Direct pr...
2     P11229  [3D-structure, Alternative splicing, Cell memb...
3     P50406  [3D-structure, Cell membrane, Disulfide bond, ...
4     P02768  [3D-structure, Alternative splicing, Calcium, ...
...      ...                                                ...
3577  P0DTD1  [3D-structure, Activation of host autophagy by...
3578  Q12809  [3D-structure, Alternative splicing, Cell memb...
3579  P14416  [3D-structure, Alternative splicing, Cell memb...
3580  P03372  [3D-structure, Activator, Alternative promoter...
3581  P10275  [3D-structure, Activator, Alternative splicing...

[3582 rows x 2 columns]
