In [12]:
# AI and Biotechnology/Bioinformatics
# AI and Drug Discovery Course: QSAR Data Curation
# Student: Michelle [Your Last Name]
# Assignment 2: QSAR Data Curation Using ChEMBL

"""
This notebook demonstrates how to collect and preprocess bioactivity data
from ChEMBL for QSAR modeling of KRAS (GTPase KRas).
"""


'\nThis notebook demonstrates how to collect and preprocess bioactivity data \nfrom ChEMBL for QSAR modeling of KRAS (GTPase KRas).\n'

In [13]:
# PART 1: Setup and Environment Configuration

# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# Create data directory
!mkdir -p "/content/gdrive/My Drive/Colab Notebooks/data"

# Install ChEMBL web service client
!pip install chembl_webresource_client

# Import required libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client
import numpy as np

print("Setup complete!")


Mounted at /content/gdrive/
Setup complete!


In [14]:
### PART 2: Target Selection and Identification


# Search for KRAS target in ChEMBL database
target = new_client.target
target_query = target.search("KRAS")
targets = pd.DataFrame.from_dict(target_query)

print(f"Found {len(targets)} KRAS-related targets")
print("\nTop 5 targets:")
print(targets[['target_chembl_id', 'pref_name', 'organism', 'target_type']].head())

# Select the primary KRAS target (GTPase KRas)
selected_target = targets.target_chembl_id[0]
print(f"\nSelected target: {selected_target}")
print(f"Target name: {targets.pref_name[0]}")

Found 7 KRAS-related targets

Top 5 targets:
  target_chembl_id                                        pref_name  \
0    CHEMBL2189121                                      GTPase KRas   
1    CHEMBL5483196                            Protein cereblon-KRAS   
2    CHEMBL5169273  von Hippel-Lindau disease tumor suppressor/KRAS   
3    CHEMBL5465393                                        SOS1-KRAS   
4    CHEMBL4523623                                       PDE6D/KRAS   

       organism                  target_type  
0  Homo sapiens               SINGLE PROTEIN  
1  Homo sapiens  PROTEIN-PROTEIN INTERACTION  
2  Homo sapiens  PROTEIN-PROTEIN INTERACTION  
3  Homo sapiens              PROTEIN COMPLEX  
4  Homo sapiens  PROTEIN-PROTEIN INTERACTION  

Selected target: CHEMBL2189121
Target name: GTPase KRas


In [15]:
# PART 3: Bioactivity Data Retrieval


# Retrieve bioactivity data with IC50 values
activity = new_client.activity
results = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

# Convert to DataFrame
df1 = pd.DataFrame.from_dict(results)

print(f"\nRetrieved {len(df1)} bioactivity records")
print(f"Columns: {df1.shape[1]}")

# Save raw data
df1.to_csv('bioactivity_raw_data.csv', index=False)
!cp bioactivity_raw_data.csv "/content/gdrive/My Drive/Colab Notebooks/data"

print("\nRaw data saved successfully!")
print(df1.head())


Retrieved 5760 bioactivity records
Columns: 46

Raw data saved successfully!
  action_type activity_comment  activity_id activity_properties  \
0        None             None     13352855                  []   
1        None             None     13352856                  []   
2        None             None     14548911                  []   
3        None             None     14548912                  []   
4        None             None     14548913                  []   

  assay_chembl_id                                  assay_description  \
0   CHEMBL2399318  Inhibition of full-length human KRas4B (amino ...   
1   CHEMBL2399319  Inhibition of full-length human KRas4B (amino ...   
2   CHEMBL3223233  Inhibition of recombinant HA-tagged K-Ras G12V...   
3   CHEMBL3223233  Inhibition of recombinant HA-tagged K-Ras G12V...   
4   CHEMBL3223233  Inhibition of recombinant HA-tagged K-Ras G12V...   

  assay_type assay_variant_accession assay_variant_mutation bao_endpoint  ...  \
0    

In [16]:
### PART 4: Data Preprocessing and Quality Control


# Check for missing values in critical columns
print("\nMissing values analysis:")
critical_cols = ['standard_value', 'canonical_smiles', 'standard_type', 'standard_units']
for col in critical_cols:
    if col in df1.columns:
        missing = df1[col].isna().sum()
        print(f"{col}: {missing} missing ({missing/len(df1)*100:.2f}%)")

# Filter records with valid bioactivity values
df2 = df1[df1["standard_value"].notna()].copy()
print(f"\nAfter removing missing standard_value: {len(df2)} records")

# Filter records with valid SMILES
df2 = df2[df2["canonical_smiles"].notna()]
df2 = df2[df2["canonical_smiles"].str.lower() != "none"]
df2 = df2[df2["canonical_smiles"].str.strip() != ""]
print(f"After removing invalid SMILES: {len(df2)} records")

# Standardize units (ensure all IC50 values are in nM)
df2 = df2[df2["standard_units"] == "nM"]
print(f"After filtering for nM units: {len(df2)} records")


Missing values analysis:
standard_value: 22 missing (0.38%)
canonical_smiles: 8 missing (0.14%)
standard_type: 0 missing (0.00%)
standard_units: 22 missing (0.38%)

After removing missing standard_value: 5738 records
After removing invalid SMILES: 5730 records
After filtering for nM units: 5730 records


In [17]:
### PART 5: Bioactivity Classification


# Assign bioactivity classes based on IC50 thresholds
# Active: IC50 <= 1000 nM
# Intermediate: 1000 nM < IC50 < 10000 nM
# Inactive: IC50 >= 10000 nM

bioactivity_class = []
for value in df2['standard_value']:
    value = float(value)
    if value >= 10000:
        bioactivity_class.append("inactive")
    elif value <= 1000:
        bioactivity_class.append("active")
    else:
        bioactivity_class.append("intermediate")

# Add bioactivity class to dataframe
df2['bioactivity_class'] = bioactivity_class

# Display class distribution
print("\nBioactivity class distribution:")
print(df2['bioactivity_class'].value_counts())
print("\nPercentage distribution:")
print(df2['bioactivity_class'].value_counts(normalize=True) * 100)



Bioactivity class distribution:
bioactivity_class
active          2925
intermediate    1581
inactive        1224
Name: count, dtype: int64

Percentage distribution:
bioactivity_class
active          51.047120
intermediate    27.591623
inactive        21.361257
Name: proportion, dtype: float64


In [18]:
### PART 6: Create Final Curated Dataset


# Select relevant columns for QSAR modeling
df3 = df2[[
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
    'bioactivity_class'
]].copy()

# Remove duplicates based on molecule_chembl_id
df3 = df3.drop_duplicates(subset='molecule_chembl_id', keep='first')
print(f"\nAfter removing duplicates: {len(df3)} unique compounds")

# Sort by standard_value
df3 = df3.sort_values('standard_value').reset_index(drop=True)

print("\nFinal curated dataset:")
print(df3.head(10))
print(f"\nDataset shape: {df3.shape}")
print(f"Columns: {df3.columns.tolist()}")


After removing duplicates: 2751 unique compounds

Final curated dataset:
  molecule_chembl_id                                   canonical_smiles  \
0      CHEMBL5741049  Oc1cc(Cl)c(C(F)(F)F)c(-c2ncc3c(N4CC5CCC(C4)N5)...   
1      CHEMBL4855757  Oc1cc(-c2ncc3c(N4CC5CCC(C4)N5)nc(OCC45CCCN4C(C...   
2      CHEMBL4857438  Oc1cc(-c2ncc3c(N4CC5CCC(C4)N5)nc(OC[C@@]45CCCN...   
3      CHEMBL5612044  [2H]C([2H])([2H])N(c1nc(OC([2H])([2H])C23CCC([...   
4      CHEMBL5611970  C#Cc1c(F)ccc2cc(O)cc(-c3ncc4c(N(C)[C@H]5C[C@@H...   
5      CHEMBL4863339  C#Cc1cccc2cc(O)cc(-c3ncc4c(N5CC6CCC(C5)N6)nc(O...   
6      CHEMBL5612889  C#Cc1c(F)ccc2cc(O)cc(-c3ncc4c(N(C)[C@H]5C[C@@H...   
7      CHEMBL5776467  Oc1ccc(OC(F)(F)F)c(-c2ncc3c(N4CC5CCC(C4)N5)nc(...   
8      CHEMBL5996049  C#Cc1c(F)ccc2cccc(-c3ncc4c(N5CC6CCC(C5)N6)nc(O...   
9      CHEMBL4858364  C#Cc1c(F)ccc2cc(O)cc(-c3ncc4c(N5CC6CCC(C5)N6)n...   

  standard_value bioactivity_class  
0            0.1            active  
1            0.1          

In [19]:
### PART 7: Save Preprocessed Data


# Save to CSV
output_file = 'bioactivity_preprocessed_data.csv'
df3.to_csv(output_file, index=False)

# Copy to Google Drive
!cp {output_file} "/content/gdrive/My Drive/Colab Notebooks/data"

print(f"\nPreprocessed data saved to: {output_file}")
print(f"Total records in final dataset: {len(df3)}")

# Display summary statistics
print("\nSummary Statistics:")
print(df3['standard_value'].describe())


Preprocessed data saved to: bioactivity_preprocessed_data.csv
Total records in final dataset: 2751

Summary Statistics:
count        2751
unique       1069
top       10000.0
freq          352
Name: standard_value, dtype: object


In [20]:
### PART 8: Data Quality Report



print("DATA CURATION SUMMARY REPORT")
print(f"Target: {targets.pref_name[0]} ({selected_target})")
print(f"Initial bioactivity records: {len(df1)}")
print(f"Final curated records: {len(df3)}")
print(f"Records removed: {len(df1) - len(df3)} ({(len(df1)-len(df3))/len(df1)*100:.2f}%)")
print("\nFinal dataset composition:")
for class_name in ['active', 'intermediate', 'inactive']:
    count = (df3['bioactivity_class'] == class_name).sum()
    pct = count / len(df3) * 100
    print(f"  {class_name.capitalize()}: {count} compounds ({pct:.2f}%)")


print("\n✓ Data curation complete! Dataset ready for QSAR modeling.")

DATA CURATION SUMMARY REPORT
Target: GTPase KRas (CHEMBL2189121)
Initial bioactivity records: 5760
Final curated records: 2751
Records removed: 3009 (52.24%)

Final dataset composition:
  Active: 1376 compounds (50.02%)
  Intermediate: 698 compounds (25.37%)
  Inactive: 677 compounds (24.61%)

✓ Data curation complete! Dataset ready for QSAR modeling.
