1. Installs

In [1]:
pip install pandas numpy requests

Note: you may need to restart the kernel to use updated packages.


2. Imports and Setup

In [2]:
import sys
import subprocess
import pandas as pd
import requests
import numpy as np
from io import StringIO
import time
import json
import warnings
warnings.filterwarnings('ignore')

3. Data Loading Function


In [3]:
# Load dataset
def load_data(content):
    """Parse a semicolon-delimited, double-quoted CSV export into a DataFrame.

    Any line that begins with '[' and mentions 'file' (case-insensitive) is
    dropped before parsing; every other line is passed to pandas unchanged.

    Args:
        content: Full text of the export as a single string.

    Returns:
        pandas.DataFrame built from the remaining lines.
    """
    def is_file_marker(text):
        return text.startswith('[') and 'file' in text.lower()

    kept_lines = []
    for raw_line in content.split('\n'):
        if is_file_marker(raw_line):
            continue
        kept_lines.append(raw_line)

    buffer = StringIO('\n'.join(kept_lines))
    return pd.read_csv(buffer, sep=';', quotechar='"', low_memory=False)

4. InChI to InChIKey Converter


In [4]:
# InChI to InChIKey converter
class InChIKeyConverter:
    """Resolve InChI strings to InChIKeys through the PubChem PUG REST API.

    Successful lookups are memoised in ``self.cache`` (InChI -> InChIKey).
    Failed lookups are recorded in ``self.errors`` (compound label -> reason)
    instead of being silently discarded, and are not cached so they can be
    retried later.

    Args:
        request_delay: Seconds to pause after every HTTP request. PubChem asks
            clients to stay at or below 5 requests per second, hence 0.2.
        timeout: Per-request timeout in seconds.
    """

    PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchi/property/InChIKey/JSON"

    def __init__(self, request_delay=0.2, timeout=20):
        self.cache = {}
        self.errors = {}
        self.request_delay = request_delay
        self.timeout = timeout

    def convert(self, inchi, name=""):
        """Return the InChIKey for ``inchi``, or None when it cannot be resolved.

        Args:
            inchi: InChI string; NaN/None/blank values short-circuit to None
                without touching the network.
            name: Optional compound name, used only to label entries in
                ``self.errors``.

        Returns:
            The 27-character InChIKey, or None.
        """
        if pd.isna(inchi) or not str(inchi).strip():
            return None

        inchi = str(inchi).strip()
        if inchi in self.cache:
            return self.cache[inchi]

        has_name = pd.notna(name) and bool(str(name).strip())
        label = str(name).strip() if has_name else inchi

        inchikey = None
        try:
            response = requests.post(
                self.PUBCHEM_URL, data={"inchi": inchi}, timeout=self.timeout
            )
            if response.status_code == 200:
                data = response.json()
                props = data.get('PropertyTable', {}).get('Properties', [])
                if props:
                    candidate = props[0].get('InChIKey')
                    # A well-formed InChIKey is always 27 characters long.
                    if candidate and len(candidate) == 27:
                        inchikey = candidate
                if inchikey is None:
                    self.errors[label] = "no valid InChIKey in response"
            else:
                self.errors[label] = f"HTTP {response.status_code}"
        except Exception as exc:
            # Best-effort lookup: one bad compound must not abort the batch.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be stopped.
            self.errors[label] = repr(exc)

        # Throttle after every request, successful or not.
        time.sleep(self.request_delay)

        if inchikey is not None:
            self.cache[inchi] = inchikey
        return inchikey

    def batch_convert(self, df):
        """Add an 'InChIKey' column to ``df`` from its 'Inchi' column.

        The frame is modified in place and also returned. Progress is printed
        every 50 processed rows, counted by position rather than index label.

        Args:
            df: DataFrame expected to contain 'Inchi' (and optionally 'Name').

        Returns:
            The same DataFrame with the new 'InChIKey' column.
        """
        print("Converting InChI to InChIKey...")
        if 'Inchi' not in df.columns:
            print("  Warning: no 'Inchi' column found; InChIKey will be empty for every row")

        total = len(df)
        errors_before = len(self.errors)
        inchikeys = []
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            inchikeys.append(self.convert(row.get('Inchi', ''), row.get('Name', '')))
            if position % 50 == 0:
                print(f"  Progress: {position}/{total}")

        new_errors = len(self.errors) - errors_before
        if new_errors > 0:
            print(f"  Lookups failed for {new_errors} compound(s); see converter.errors")

        df['InChIKey'] = inchikeys
        return df

5. SIDER Mapper Class


In [5]:
# SIDER mapper
class SIDERMapper:
    """Attach side-effect labels to compounds.

    Labels come from a small hard-coded InChIKey lookup table held in
    ``self.mapping``. Rows without a direct hit fall back to a keyword match
    on the 'Drug Type' column.
    """

    def __init__(self):
        # Common drugs with side effects
        known_effects = [
            ('RZEKVGVHFLEQIL-UHFFFAOYSA-N', 'Gastrointestinal bleeding; Nausea; Tinnitus'),
            ('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'Liver damage; Nausea; Skin rash'),
            ('COQKETKFSGFKND-UHFFFAOYSA-N', 'Stomach pain; Dizziness; Headache'),
            ('BSYNRYMUTXBXSQ-UHFFFAOYSA-N', 'Diarrhea; Nausea; Allergic reactions'),
            ('LDDHZTHINRMJJC-UHFFFAOYSA-N', 'Diarrhea; Nausea; Metallic taste'),
            ('XUKUURHRXDUEBC-UHFFFAOYSA-N', 'Muscle pain; Liver problems; Headache'),
        ]
        self.mapping = dict(known_effects)

    def map_compounds(self, df):
        """Populate ``df['SideEffects']`` in place and return ``df``.

        Pass 1 looks each row's 'InChIKey' up in ``self.mapping``. Pass 2
        fills rows that are still empty using the first keyword found in the
        lower-cased 'Drug Type' text; rows matching nothing stay empty.
        """
        print("Mapping to SIDER database...")

        def lookup_by_key(record):
            key = record.get('InChIKey')
            return None if pd.isna(key) else self.mapping.get(str(key))

        df['SideEffects'] = df.apply(lookup_by_key, axis=1)

        # Infer from drug type if no direct match.
        # Order matters: the first keyword contained in the drug type wins.
        keyword_effects = (
            ('antibiotic', 'Diarrhea; Nausea; Allergic reactions'),
            ('analgesic', 'GI upset; Dizziness; Headache'),
            ('anti-inflammatory', 'GI upset; Dizziness; Headache'),
            ('antiviral', 'Fatigue; Nausea; Headache'),
            ('antineoplastic', 'Nausea; Vomiting; Fatigue; Hair loss'),
            ('contrast', 'Allergic reactions; Nephrotoxicity'),
        )

        def fill_from_drug_type(record):
            existing = record.get('SideEffects')
            if pd.notna(existing):
                return existing

            type_text = str(record.get('Drug Type', '')).lower()
            for keyword, effects in keyword_effects:
                if keyword in type_text:
                    return effects
            return None

        df['SideEffects'] = df.apply(fill_from_drug_type, axis=1)
        return df

6. Main Pipeline Function


In [6]:
# Main pipeline
_ROUTE_COLUMNS = ('Oral', 'Parenteral', 'Topical')
_TRUTHY_FLAGS = frozenset({'1', '1.0', 'true', 'yes'})


def _is_flag_set(value):
    """Return True when a route-flag cell means "yes", whatever dtype pandas inferred."""
    if pd.isna(value):
        return False
    return str(value).strip().lower() in _TRUTHY_FLAGS


def _routes_for_row(row):
    """Join the names of the route columns flagged on this row, or None if none are."""
    active = [col for col in _ROUTE_COLUMNS if _is_flag_set(row.get(col))]
    return '; '.join(active) if active else None


def run_pipeline(file_content, output_path='drugs_with_sider_labels.csv'):
    """Run the load -> InChIKey -> side-effect mapping -> CSV export pipeline.

    Args:
        file_content: Raw text of the semicolon-delimited compound export.
        output_path: Destination CSV path; defaults to the file name the
            pipeline has always written.

    Returns:
        The enriched DataFrame, including 'InChIKey', 'SideEffects' and
        'Administration_Routes' columns.
    """
    print("Starting SIDER mapping pipeline...")

    # Step 1: Load data
    df = load_data(file_content)
    print(f"Loaded {len(df)} compounds")

    # Step 2: Convert to InChIKey
    converter = InChIKeyConverter()
    df = converter.batch_convert(df)

    # Step 3: Map to SIDER
    mapper = SIDERMapper()
    df = mapper.map_compounds(df)

    # Step 4: Add derived columns.
    # read_csv infers numeric/bool dtypes even for quoted fields, so the route
    # flags may arrive as 1, 1.0, True or '1'. Comparing against the string
    # '1' alone would never match the inferred dtypes.
    df['Administration_Routes'] = [_routes_for_row(row) for _, row in df.iterrows()]

    # Step 5: Save results (only the columns actually present in the input)
    output_cols = [
        'Parent Molecule', 'Name', 'InChIKey',
        'Drug Type', 'Administration_Routes',
        'SideEffects', 'Oral', 'Parenteral', 'Topical',
        'Phase', 'First Approval', 'Withdrawn Flag'
    ]
    output_cols = [col for col in output_cols if col in df.columns]

    df[output_cols].to_csv(output_path, index=False)

    # Statistics
    total = len(df)
    mapped = int(df['SideEffects'].notna().sum())
    # Guard the percentage so an empty input does not divide by zero.
    coverage = (mapped / total * 100) if total else 0.0
    print("\nPipeline complete!")
    print(f"Results saved to: {output_path}")
    print(f"Compounds with side effects: {mapped}/{total} ({coverage:.1f}%)")

    return df

7. Main Execution Block


In [7]:
# Main execution
if __name__ == "__main__":
    # Path to the compound export; update this to point at your CSV file.
    file_path = 'DOWNLOAD-XQeNIAhML_6K7TvNXM3knJ_sPcaEnUf3Z_xzLoKByD0_eq_.csv'

    # Only the file read sits inside the try block, so a FileNotFoundError
    # raised later inside the pipeline is not misreported as a missing input.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        print("Please update the file_path variable in the code.")
    else:
        print("Dataset loaded successfully")

        # Run the full pipeline
        result = run_pipeline(file_content)

        print("\nDone!")

Dataset loaded successfully
Starting SIDER mapping pipeline...
Loaded 1924 compounds
Converting InChI to InChIKey...
  Progress: 50/1924
  Progress: 100/1924
  Progress: 150/1924
  Progress: 200/1924
  Progress: 250/1924
  Progress: 300/1924
  Progress: 350/1924
  Progress: 400/1924
  Progress: 450/1924
  Progress: 500/1924
  Progress: 550/1924
  Progress: 600/1924
  Progress: 650/1924
  Progress: 700/1924
  Progress: 750/1924
  Progress: 800/1924
  Progress: 850/1924
  Progress: 900/1924
  Progress: 950/1924
  Progress: 1000/1924
  Progress: 1050/1924
  Progress: 1100/1924
  Progress: 1150/1924
  Progress: 1200/1924
  Progress: 1250/1924
  Progress: 1300/1924
  Progress: 1350/1924
  Progress: 1400/1924
  Progress: 1450/1924
  Progress: 1500/1924
  Progress: 1550/1924
  Progress: 1600/1924
  Progress: 1650/1924
  Progress: 1700/1924
  Progress: 1750/1924
  Progress: 1800/1924
  Progress: 1850/1924
  Progress: 1900/1924
Mapping to SIDER database...

Pipeline complete!
Results saved to: 