In [2]:
# PART 1: Setup and Environment Configuration

# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# Create data directory
!mkdir -p "/content/gdrive/My Drive/Colab Notebooks/data/EGFR"

# Install ChEMBL web service client
!pip install chembl_webresource_client

# Import required libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client
import numpy as np

print("Setup complete!")

Mounted at /content/gdrive/
Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-25.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-25.3.0-py3-none-any.whl (70 kB

In [4]:
### PART 2: Target Selection and Identification


# Search for EGFR target in ChEMBL database
target = new_client.target
target_query = target.search("EGFR")
targets = pd.DataFrame.from_dict(target_query)

print(f"Found {len(targets)} EGFR-related targets")
print("\nTop 5 targets:")
print(targets[['target_chembl_id', 'pref_name', 'organism', 'target_type']].head())

# Select the primary EGFR target (GTPase KRas)
selected_target = targets.target_chembl_id[0]
print(f"\nSelected target: {selected_target}")
print(f"Target name: {targets.pref_name[3]}")

Found 20 EGFR-related targets

Top 5 targets:
  target_chembl_id                                          pref_name  \
0       CHEMBL3608                   Epidermal growth factor receptor   
1    CHEMBL4523747                                        EGFR/PPP1CA   
2    CHEMBL5465557                                          CCN2-EGFR   
3        CHEMBL203                   Epidermal growth factor receptor   
4    CHEMBL4523680  Protein cereblon/Epidermal growth factor receptor   

       organism                  target_type  
0  Mus musculus               SINGLE PROTEIN  
1  Homo sapiens  PROTEIN-PROTEIN INTERACTION  
2  Homo sapiens  PROTEIN-PROTEIN INTERACTION  
3  Homo sapiens               SINGLE PROTEIN  
4  Homo sapiens  PROTEIN-PROTEIN INTERACTION  

Selected target: CHEMBL3608
Target name: Epidermal growth factor receptor


In [5]:
# PART 3: Bioactivity Data Retrieval


# Retrieve bioactivity data with IC50 values
activity = new_client.activity
results = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

# Convert to DataFrame
df1 = pd.DataFrame.from_dict(results)

print(f"\nRetrieved {len(df1)} bioactivity records")
print(f"Columns: {df1.shape[1]}")

# Save raw data
df1.to_csv('bioactivity_raw_data.csv', index=False)
!cp bioactivity_raw_data.csv "/content/gdrive/My Drive/Colab Notebooks/data/EGFR"

print("\nRaw data saved successfully!")
print(df1.head())


Retrieved 97 bioactivity records
Columns: 46

Raw data saved successfully!
  action_type activity_comment  activity_id activity_properties  \
0        None             None       110221                  []   
1        None             None       113118                  []   
2        None             None       119387                  []   
3        None             None       133319                  []   
4        None             None       193384                  []   

  assay_chembl_id                                  assay_description  \
0    CHEMBL675511  Inhibition of epidermal growth factor receptor...   
1    CHEMBL675511  Inhibition of epidermal growth factor receptor...   
2    CHEMBL675511  Inhibition of epidermal growth factor receptor...   
3    CHEMBL675511  Inhibition of epidermal growth factor receptor...   
4    CHEMBL675513  Inhibition of epidermal growth factor receptor...   

  assay_type assay_variant_accession assay_variant_mutation bao_endpoint  ...  \
0      

In [6]:
### PART 4: Data Preprocessing and Quality Control


# Check for missing values in critical columns
print("\nMissing values analysis:")
critical_cols = ['standard_value', 'canonical_smiles', 'standard_type', 'standard_units']
for col in critical_cols:
    if col in df1.columns:
        missing = df1[col].isna().sum()
        print(f"{col}: {missing} missing ({missing/len(df1)*100:.2f}%)")

# Filter records with valid bioactivity values
df2 = df1[df1["standard_value"].notna()].copy()
print(f"\nAfter removing missing standard_value: {len(df2)} records")

# Filter records with valid SMILES
df2 = df2[df2["canonical_smiles"].notna()]
df2 = df2[df2["canonical_smiles"].str.lower() != "none"]
df2 = df2[df2["canonical_smiles"].str.strip() != ""]
print(f"After removing invalid SMILES: {len(df2)} records")

# Standardize units (ensure all IC50 values are in nM)
df2 = df2[df2["standard_units"] == "nM"]
print(f"After filtering for nM units: {len(df2)} records")


Missing values analysis:
standard_value: 3 missing (3.09%)
canonical_smiles: 0 missing (0.00%)
standard_type: 0 missing (0.00%)
standard_units: 2 missing (2.06%)

After removing missing standard_value: 94 records
After removing invalid SMILES: 94 records
After filtering for nM units: 83 records


In [7]:
### PART 5: Bioactivity Classification


# Assign bioactivity classes based on IC50 thresholds
# Active: IC50 <= 1000 nM
# Intermediate: 1000 nM < IC50 < 10000 nM
# Inactive: IC50 >= 10000 nM

bioactivity_class = []
for value in df2['standard_value']:
    value = float(value)
    if value >= 10000:
        bioactivity_class.append("inactive")
    elif value <= 1000:
        bioactivity_class.append("active")
    else:
        bioactivity_class.append("intermediate")

# Add bioactivity class to dataframe
df2['bioactivity_class'] = bioactivity_class

# Display class distribution
print("\nBioactivity class distribution:")
print(df2['bioactivity_class'].value_counts())
print("\nPercentage distribution:")
print(df2['bioactivity_class'].value_counts(normalize=True) * 100)



Bioactivity class distribution:
bioactivity_class
inactive        46
active          22
intermediate    15
Name: count, dtype: int64

Percentage distribution:
bioactivity_class
inactive        55.421687
active          26.506024
intermediate    18.072289
Name: proportion, dtype: float64


In [8]:
### PART 6: Create Final Curated Dataset


# Select relevant columns for QSAR modeling
df3 = df2[[
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
    'bioactivity_class'
]].copy()

# Remove duplicates based on molecule_chembl_id
df3 = df3.drop_duplicates(subset='molecule_chembl_id', keep='first')
print(f"\nAfter removing duplicates: {len(df3)} unique compounds")

# Sort by standard_value
df3 = df3.sort_values('standard_value').reset_index(drop=True)

print("\nFinal curated dataset:")
print(df3.head(10))
print(f"\nDataset shape: {df3.shape}")
print(f"Columns: {df3.columns.tolist()}")


After removing duplicates: 81 unique compounds

Final curated dataset:
  molecule_chembl_id                                   canonical_smiles  \
0       CHEMBL545315  C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OCCCN...   
1         CHEMBL7917                  COc1cc2ncnc(Nc3cccc(Cl)c3)c2cc1OC   
2       CHEMBL341946         O=C(NCCCc1ccccc1)c1cc(NCc2cc(O)ccc2O)ccc1O   
3      CHEMBL4284413            O=C(Nc1nccs1)C(c1ccccc1)N1Cc2ccccc2C1=O   
4      CHEMBL4434788  CC(C)(O)Cn1/c(=N/C(=O)c2ccnc(-c3ccccc3)c2)[nH]...   
5       CHEMBL388978  CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...   
6       CHEMBL426587  CN(C)CCCn1nc(C2=C(c3cn(-c4csc5ccccc45)c4ccccc3...   
7       CHEMBL359486  CN(C)CCCn1nc(C2=C(c3cn(-c4cnc5ccccc5c4)c4ccccc...   
8       CHEMBL360304  CN(C)CCCn1nc(C2=C(c3cn(-c4cccc5ccccc45)c4ccccc...   
9       CHEMBL368895  CN(C)CCCn1nc(C2=C(c3cn(-c4ccc5ccccc5c4)c4ccccc...   

  standard_value bioactivity_class  
0            1.6            active  
1          100.0            

In [9]:
### PART 7: Save Preprocessed Data


# Save to CSV
output_file = 'bioactivity_preprocessed_data.csv'
df3.to_csv(output_file, index=False)

# Copy to Google Drive
!cp {output_file} "/content/gdrive/My Drive/Colab Notebooks/data/EGFR"

print(f"\nPreprocessed data saved to: {output_file}")
print(f"Total records in final dataset: {len(df3)}")

# Display summary statistics
print("\nSummary Statistics:")
print(df3['standard_value'].describe())


Preprocessed data saved to: bioactivity_preprocessed_data.csv
Total records in final dataset: 81

Summary Statistics:
count          81
unique         60
top       10000.0
freq           10
Name: standard_value, dtype: object


In [10]:
### PART 8: Data Quality Report



print("DATA CURATION SUMMARY REPORT")
print(f"Target: {targets.pref_name[0]} ({selected_target})")
print(f"Initial bioactivity records: {len(df1)}")
print(f"Final curated records: {len(df3)}")
print(f"Records removed: {len(df1) - len(df3)} ({(len(df1)-len(df3))/len(df1)*100:.2f}%)")
print("\nFinal dataset composition:")
for class_name in ['active', 'intermediate', 'inactive']:
    count = (df3['bioactivity_class'] == class_name).sum()
    pct = count / len(df3) * 100
    print(f"  {class_name.capitalize()}: {count} compounds ({pct:.2f}%)")


print("\n✓ Data curation complete! Dataset ready for QSAR modeling.")

DATA CURATION SUMMARY REPORT
Target: Epidermal growth factor receptor (CHEMBL3608)
Initial bioactivity records: 97
Final curated records: 81
Records removed: 16 (16.49%)

Final dataset composition:
  Active: 20 compounds (24.69%)
  Intermediate: 15 compounds (18.52%)
  Inactive: 46 compounds (56.79%)

✓ Data curation complete! Dataset ready for QSAR modeling.
