In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

In [None]:
# The 12 Tox21 assay endpoints modelled in this notebook. Each string is used
# verbatim as a label column name when indexing the Tox21 DataFrame, so the
# spelling must match the CSV header exactly. "NR-" / "SR-" are the prefixes
# used by the dataset itself.
TOX21_ENDPOINTS = [
    'NR-AR',           # Androgen Receptor
    'NR-AR-LBD',       # Androgen Receptor Ligand Binding Domain
    'NR-AhR',          # Aryl hydrocarbon Receptor
    'NR-Aromatase',    # Aromatase
    'NR-ER',           # Estrogen Receptor
    'NR-ER-LBD',       # Estrogen Receptor Ligand Binding Domain
    'NR-PPAR-gamma',   # Peroxisome Proliferator-Activated Receptor gamma
    'SR-ARE',          # Antioxidant Response Element
    'SR-ATAD5',        # ATPase Family AAA Domain Containing 5
    'SR-HSE',          # Heat Shock Element
    'SR-MMP',          # Mitochondrial Membrane Potential
    'SR-p53'           # p53 pathway
]

In [None]:
# Human-readable display name for each Tox21 endpoint key. These names are
# what the progress messages and the generated toxicity report show; every
# endpoint that is trained or predicted must have an entry here, otherwise the
# ENDPOINT_NAMES[endpoint] lookups raise KeyError.
ENDPOINT_NAMES = {
    'NR-AR': 'Androgen Receptor Disruption',
    'NR-AR-LBD': 'Androgen Receptor Binding',
    'NR-AhR': 'Aryl Hydrocarbon Receptor',
    'NR-Aromatase': 'Aromatase Inhibition',
    'NR-ER': 'Estrogen Receptor Disruption',
    'NR-ER-LBD': 'Estrogen Receptor Binding',
    'NR-PPAR-gamma': 'PPAR-gamma Activation',
    'SR-ARE': 'Antioxidant Response',
    'SR-ATAD5': 'DNA Damage Response',
    'SR-HSE': 'Heat Shock Response',
    'SR-MMP': 'Mitochondrial Toxicity',
    'SR-p53': 'p53 Tumor Suppressor'
}

# **Feature Extraction for the Tox21 Endpoints**

In [None]:
def extract_molecular_features(smiles):
    """
    Compute the 12 RDKit descriptors used as model inputs for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        list: 12 values, in this order: molecular weight, logP, TPSA,
        H-bond donors, H-bond acceptors, rotatable bonds, heavy-atom count,
        aromatic rings, fraction of sp3 carbons, BertzCT complexity,
        saturated rings, aliphatic rings.
        ``[0] * 12`` is returned as a failure sentinel when the input is not
        a non-blank string, when RDKit cannot parse it, or when a descriptor
        calculation raises.
    """
    failed = [0] * 12

    # Missing values (None, or NaN read from a CSV) and blank strings cannot be
    # parsed; return the sentinel instead of handing a non-string to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return failed

    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        return failed  # RDKit returns None for invalid SMILES

    try:
        return [
            Descriptors.MolWt(mol),                    # Molecular weight
            Descriptors.MolLogP(mol),                  # Lipophilicity
            Descriptors.TPSA(mol),                     # Topological polar surface area
            Descriptors.NumHDonors(mol),               # Hydrogen bond donors
            Descriptors.NumHAcceptors(mol),            # Hydrogen bond acceptors
            Descriptors.NumRotatableBonds(mol),        # Rotatable bonds
            Descriptors.HeavyAtomCount(mol),           # Heavy atom count
            Descriptors.NumAromaticRings(mol),         # Aromatic rings
            Descriptors.FractionCSP3(mol),             # Fraction sp3 carbons
            Descriptors.BertzCT(mol),                  # Molecular complexity
            Descriptors.NumSaturatedRings(mol),        # Saturated rings
            Descriptors.NumAliphaticRings(mol)         # Aliphatic rings
        ]
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long extraction run.
        return failed

In [None]:
def create_features_dataframe(smiles_list):
    """
    Build a descriptor table with one row per SMILES string.

    Each entry of ``smiles_list`` is passed to ``extract_molecular_features``;
    rows equal to the all-zero sentinel are counted as failures. Progress is
    printed every 1000 molecules.

    Returns:
        pandas.DataFrame with 12 named descriptor columns, rows in input order.
    """
    columns = [
        'molecular_weight', 'logp', 'tpsa', 'hbd', 'hba', 'rotatable_bonds',
        'heavy_atoms', 'aromatic_rings', 'fraction_sp3', 'complexity',
        'saturated_rings', 'aliphatic_rings'
    ]
    total = len(smiles_list)
    failure_marker = [0] * 12

    print(f"🧬 Extracting features for {total} molecules...")

    rows = []
    n_failed = 0

    # 1-based position so the progress check reads naturally
    for position, smi in enumerate(smiles_list, start=1):
        row = extract_molecular_features(smi)
        rows.append(row)

        if row == failure_marker:
            n_failed += 1

        if position % 1000 == 0:
            print(f"   Processed {position}/{total} molecules...")

    print(f"✅ Feature extraction complete! Failed: {n_failed}/{total}")

    return pd.DataFrame(rows, columns=columns)

# **Toxicity Prediction Model**

In [None]:
class AfrcanPhytochemicalToxicityPredictor:
    """
    Per-endpoint Random Forest toxicity predictor trained on Tox21 labels.

    One binary classifier is fitted for each entry of ``TOX21_ENDPOINTS`` on
    the descriptor table produced by ``create_features_dataframe``. The fitted
    models can then score a DataFrame of compounds or a single SMILES string.

    Attributes:
        models: dict mapping endpoint key -> fitted classifier.
        feature_names: descriptor column names used for training.
        training_stats: dict mapping endpoint key -> sample/positive counts.

    The class name keeps its original spelling because existing callers use
    it; ``AfricanPhytochemicalToxicityPredictor`` is a correctly spelled alias.
    """

    # Decision thresholds shared by the batch and single-compound code paths.
    TOXIC_THRESHOLD = 0.5         # probability above this -> 'Toxic'
    HIGH_RISK_THRESHOLD = 0.7     # probability above this -> 'High'
    MEDIUM_RISK_THRESHOLD = 0.3   # probability above this -> 'Medium'

    def __init__(self):
        self.models = {}
        self.feature_names = []
        self.training_stats = {}

    @staticmethod
    def _positive_class_probability(model, pred_proba):
        """
        Return the probability column that belongs to the positive class (label 1).

        ``predict_proba`` has one column per entry of ``model.classes_``. When a
        model was fitted on a single class there is only one column and its
        value is always 1.0; if that class is the negative one, the probability
        of toxicity is 0, not 1.
        """
        pred_proba = np.asarray(pred_proba, dtype=float)
        positive_columns = [i for i, label in enumerate(model.classes_) if label == 1]
        if not positive_columns:
            return np.zeros(pred_proba.shape[0])
        return pred_proba[:, positive_columns[0]]

    @classmethod
    def _prediction_label(cls, prob):
        """Map a toxic probability to 'Toxic' / 'Non-toxic'."""
        return 'Toxic' if prob > cls.TOXIC_THRESHOLD else 'Non-toxic'

    @classmethod
    def _risk_level(cls, prob):
        """Map a toxic probability to 'High' / 'Medium' / 'Low'."""
        if prob > cls.HIGH_RISK_THRESHOLD:
            return 'High'
        if prob > cls.MEDIUM_RISK_THRESHOLD:
            return 'Medium'
        return 'Low'

    def train_models(self, tox21_df):
        """
        Train one toxicity model per Tox21 endpoint.

        Args:
            tox21_df: DataFrame with a 'smiles' column and one label column
                per endpoint (missing labels as NaN).

        Side effects:
            Fills ``self.models``, ``self.feature_names`` and
            ``self.training_stats``; prints progress.
        """
        print("🤖 Training toxicity prediction models...")
        print(f"   Training on {len(tox21_df)} Tox21 compounds")

        # Extract features from Tox21 SMILES
        print("   Extracting features from Tox21 dataset...")
        tox21_features = create_features_dataframe(tox21_df['smiles'].tolist())
        self.feature_names = tox21_features.columns.tolist()

        # Rows where every descriptor is 0 are the feature-extraction failure
        # sentinel; training on them would teach the model a fake molecule.
        parsed_mask = ~(tox21_features == 0).all(axis=1).to_numpy()

        # Train model for each toxicity endpoint
        for endpoint in TOX21_ENDPOINTS:
            if endpoint not in tox21_df.columns:
                print(f"   ⚠️ Column {endpoint} not found in Tox21 data, skipping")
                continue

            print(f"   Training model for {ENDPOINT_NAMES[endpoint]}...")

            # Positional (numpy) masks: the feature frame always has a fresh
            # 0..n-1 index, while tox21_df may carry any index, so label-based
            # alignment between the two would be wrong.
            labels = tox21_df[endpoint]
            valid_mask = labels.notna().to_numpy() & parsed_mask
            X_train = tox21_features[valid_mask]
            y_train = labels.to_numpy()[valid_mask]

            if len(y_train) == 0:
                print(f"   ⚠️ No valid data for {endpoint}")
                continue

            # Train Random Forest model
            model = RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                class_weight='balanced',  # Handle imbalanced data
                n_jobs=-1
            )

            model.fit(X_train, y_train)
            self.models[endpoint] = model

            # Store training statistics
            n_samples = len(y_train)
            n_positive = int(y_train.sum())
            self.training_stats[endpoint] = {
                'n_samples': n_samples,
                'n_positive': n_positive,
                'n_negative': n_samples - n_positive,
                'positive_rate': n_positive / n_samples
            }

            print(f"   ✅ {endpoint}: {n_samples} samples, {n_positive} positive cases")

        print(f"🎯 Training complete! Trained {len(self.models)} models")

    def predict_afrodb_toxicity(self, filtered_afrodb_df):
        """
        Predict toxicity for every compound in a filtered DataFrame.

        Args:
            filtered_afrodb_df: DataFrame with a 'canonical_smiles' column.

        Returns:
            DataFrame with three columns per trained endpoint:
            ``<endpoint>_probability``, ``<endpoint>_prediction`` and
            ``<endpoint>_risk_level``; rows follow the input order with a
            fresh 0..n-1 index.
        """
        print("🔮 Making toxicity predictions for African phytochemicals...")

        # Extract features from AfroDB SMILES
        afrodb_features = create_features_dataframe(filtered_afrodb_df['canonical_smiles'].tolist())

        # Nothing to score: return an empty frame with the expected columns
        # instead of handing an empty batch to predict_proba.
        if afrodb_features.empty:
            empty_columns = [
                f"{endpoint}_{suffix}"
                for endpoint in self.models
                for suffix in ('probability', 'prediction', 'risk_level')
            ]
            return pd.DataFrame(columns=empty_columns)

        # Make predictions for each endpoint
        predictions = {}

        for endpoint, model in self.models.items():
            print(f"   Predicting {ENDPOINT_NAMES[endpoint]}...")

            toxic_prob = self._positive_class_probability(
                model, model.predict_proba(afrodb_features)
            )

            predictions[f"{endpoint}_probability"] = toxic_prob
            predictions[f"{endpoint}_prediction"] = [self._prediction_label(p) for p in toxic_prob]
            predictions[f"{endpoint}_risk_level"] = [self._risk_level(p) for p in toxic_prob]

        return pd.DataFrame(predictions)

    def predict_single_compound(self, smiles):
        """
        Predict toxicity for a single compound (for LLM integration).

        Args:
            smiles: SMILES string of the compound.

        Returns:
            dict with keys:
              'smiles'        - the input string,
              'valid_smiles'  - False when feature extraction fell back to the
                                all-zero sentinel (predictions are then not
                                meaningful),
              'predictions'   - per-endpoint dict with 'endpoint_name',
                                'toxic_probability', 'prediction', 'risk_level',
              'overall_risk'  - 'High' if >= 2 endpoints are high risk,
                                'Medium' if exactly 1, else 'Low'.
        """
        features = extract_molecular_features(smiles)

        # Use a named DataFrame when possible so the input matches what the
        # models were fitted on; fall back to a bare array otherwise.
        if self.feature_names and len(self.feature_names) == len(features):
            model_input = pd.DataFrame([features], columns=self.feature_names)
        else:
            model_input = np.array(features).reshape(1, -1)

        results = {
            'smiles': smiles,
            'valid_smiles': features != [0] * len(features),
            'predictions': {}
        }

        for endpoint, model in self.models.items():
            toxic_prob = float(
                self._positive_class_probability(model, model.predict_proba(model_input))[0]
            )

            results['predictions'][endpoint] = {
                'endpoint_name': ENDPOINT_NAMES[endpoint],
                'toxic_probability': round(toxic_prob, 3),
                'prediction': self._prediction_label(toxic_prob),
                'risk_level': self._risk_level(toxic_prob)
            }

        # Overall risk assessment
        high_risk_count = sum(1 for p in results['predictions'].values() if p['risk_level'] == 'High')
        results['overall_risk'] = 'High' if high_risk_count >= 2 else 'Medium' if high_risk_count >= 1 else 'Low'

        return results

    def save_models(self, filename='african_phytochemical_toxicity_models.pkl'):
        """
        Save the trained models and their metadata to ``filename`` with joblib.
        """
        model_data = {
            'models': self.models,
            'feature_names': self.feature_names,
            'training_stats': self.training_stats,
            'endpoints': TOX21_ENDPOINTS,
            'endpoint_names': ENDPOINT_NAMES
        }
        joblib.dump(model_data, filename)
        print(f"💾 Models saved to {filename}")

    def load_models(self, filename='african_phytochemical_toxicity_models.pkl'):
        """
        Load models previously written by ``save_models``.

        SECURITY: joblib files are pickles and can execute arbitrary code when
        loaded. Only load files you created yourself or otherwise trust.
        """
        model_data = joblib.load(filename)
        self.models = model_data['models']
        self.feature_names = model_data.get('feature_names', [])
        self.training_stats = model_data.get('training_stats', {})
        print(f"📂 Models loaded from {filename}")


# Correctly spelled alias; the original (misspelled) name stays valid.
AfricanPhytochemicalToxicityPredictor = AfrcanPhytochemicalToxicityPredictor

# **MAIN PIPELINE FUNCTION**

In [None]:
def run_your_pipeline(tox21_file, filtered_afrodb_df):
    """
    End-to-end run: train on Tox21, score the filtered compounds, save everything.

    Args:
        tox21_file: path to the Tox21 CSV (read with pandas).
        filtered_afrodb_df: DataFrame of compounds to score; must contain the
            SMILES column expected by ``predict_afrodb_toxicity``.

    Returns:
        (final_results, predictor): the input compounds with prediction
        columns appended, and the trained predictor object.

    Side effects:
        Writes the model file (via ``save_models``) and
        'african_phytochemicals_toxicity_predictions.csv'; prints progress and
        a per-endpoint risk summary.
    """
    print("🚀 African Phytochemical Toxicity Prediction Pipeline")
    print("=" * 60)

    # --- load training data ---
    print("📁 Loading Tox21 dataset...")
    tox21_df = pd.read_csv(tox21_file)
    print(f"   Loaded {len(tox21_df)} Tox21 compounds")
    print(f"   AfroDB filtered dataset: {len(filtered_afrodb_df)} compounds")

    # --- train and persist the models ---
    predictor = AfrcanPhytochemicalToxicityPredictor()
    predictor.train_models(tox21_df)
    predictor.save_models()

    # --- score the filtered compounds ---
    print("🔮 Making predictions on filtered African phytochemicals...")
    toxicity_predictions = predictor.predict_afrodb_toxicity(filtered_afrodb_df)

    # The prediction frame has a fresh 0..n-1 index, so the compound frame is
    # re-indexed the same way before the column-wise concat.
    compounds = filtered_afrodb_df.reset_index(drop=True)
    final_results = pd.concat([compounds, toxicity_predictions], axis=1)

    output_file = 'african_phytochemicals_toxicity_predictions.csv'
    final_results.to_csv(output_file, index=False)

    for message in (
        "✅ Pipeline complete!",
        f"📊 Results saved to: {output_file}",
        "🔧 Models saved for LLM integration",
    ):
        print(message)

    # --- per-endpoint summary of risk levels ---
    print("\n📈 Prediction Summary:")
    for endpoint in TOX21_ENDPOINTS:
        risk_column = f"{endpoint}_risk_level"
        if risk_column not in final_results.columns:
            continue
        tally = final_results[risk_column].value_counts()
        print(f"   {ENDPOINT_NAMES[endpoint]}:")
        for level in ('High', 'Medium', 'Low'):
            print(f"      {level} Risk: {tally.get(level, 0)}")

    return final_results, predictor


# **LLM Integration Functions**

In [None]:
def quick_phytochemical_toxicity_check(smiles, model_file='african_phytochemical_toxicity_models.pkl'):
    """
    Quick toxicity check for LLM integration.

    Loads the saved models from disk and scores one compound.

    Args:
        smiles: SMILES string of the compound to score.
        model_file: path of the model file written by ``save_models``.
            Defaults to the file name ``save_models`` uses by default, so
            existing single-argument calls behave as before.

    Returns:
        The result dict produced by ``predict_single_compound``.

    Note:
        The models are re-loaded on every call; for many compounds, load a
        predictor once and call ``predict_single_compound`` directly.
    """
    predictor = AfrcanPhytochemicalToxicityPredictor()
    predictor.load_models(model_file)

    return predictor.predict_single_compound(smiles)

def interpret_toxicity_results(results):
    """
    Render a single-compound prediction dict as a plain-text report.

    Args:
        results: dict with 'smiles', 'overall_risk' and 'predictions', where
            each prediction has 'risk_level', 'endpoint_name', 'prediction'
            and 'toxic_probability' (a 0-1 fraction, shown as a percentage).

    Returns:
        str: multi-line report, one line per endpoint, ending with a newline.
    """
    # Anything that is not High or Medium gets the green marker.
    markers = {'High': "🔴", 'Medium': "🟡"}

    lines = [
        f"🧪 Toxicity Analysis for: {results['smiles']}",
        f"🎯 Overall Risk Level: {results['overall_risk']}",
        "",
        "📋 Individual Endpoint Analysis:",
    ]

    for entry in results['predictions'].values():
        marker = markers.get(entry['risk_level'], "🟢")
        lines.append(
            f"   {marker} {entry['endpoint_name']}: {entry['prediction']} ({entry['toxic_probability']*100:.1f}%)"
        )

    return "\n".join(lines) + "\n"


## **Usage**

In [None]:
# Usage banner: this cell only prints example calls as text; it does not load
# data, train models or make predictions itself.
if __name__ == "__main__":
    print("🌿 African Phytochemical Toxicity Prediction System")
    print("=" * 50)


    # Example: full pipeline (training + batch prediction)
    print("\n🔧 To run with your datasets:")
    print("# Load your filtered AfroDB data")
    print("# filtered_results = your_filtered_dataframe")
    print("# results, predictor = run_your_pipeline('tox21.csv', filtered_results)")

    # Example: single-compound check using previously saved models
    print("\n🔍 For LLM integration:")
    print("# result = quick_phytochemical_toxicity_check('COC1=CC=CC2=C1C(=O)C1=C(O)C3=C...')")
    print("# report = interpret_toxicity_results(result)")
    print("# print(report)")

🌿 African Phytochemical Toxicity Prediction System

🔧 To run with your datasets:
# Load your filtered AfroDB data
# filtered_results = your_filtered_dataframe
# results, predictor = run_your_pipeline('tox21.csv', filtered_results)

🔍 For LLM integration:
# result = quick_phytochemical_toxicity_check('COC1=CC=CC2=C1C(=O)C1=C(O)C3=C...')
# report = interpret_toxicity_results(result)
# print(report)


# **Loading the Tox21 Data for Model training**

In [None]:
from google.colab import drive
import numpy as np
import pandas as pd

# Mount Google Drive at /content/drive so files stored there can be read.
# This only works inside Google Colab (it relies on google.colab.drive).
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Path to the Tox21 CSV on the mounted Drive; the path itself (not the frame)
# is what gets passed to the training pipeline.
tox21 = '/content/drive/MyDrive/tox21.csv'
# Named tox21_df rather than df so this preview frame is not silently replaced
# when another dataset is loaded into `df`.
tox21_df = pd.read_csv(tox21)
tox21_df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [None]:
# Download the COCONUT CSV export (June 2025 release, ~199 MB zip) and unpack
# it into ./coconut_csv. `unzip -o` overwrites files from a previous run, so
# the cell can be re-executed safely.
!wget https://coconut.s3.uni-jena.de/prod/downloads/2025-06/coconut_csv-06-2025.zip -O coconut_csv.zip
!unzip -o coconut_csv.zip -d coconut_csv

--2025-07-10 14:55:54--  https://coconut.s3.uni-jena.de/prod/downloads/2025-06/coconut_csv-06-2025.zip
Resolving coconut.s3.uni-jena.de (coconut.s3.uni-jena.de)... 141.35.104.26, 141.35.104.25, 2001:638:1558:2368::8d23:6819, ...
Connecting to coconut.s3.uni-jena.de (coconut.s3.uni-jena.de)|141.35.104.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 208243979 (199M) [application/zip]
Saving to: ‘coconut_csv.zip’


2025-07-10 14:56:07 (17.5 MB/s) - ‘coconut_csv.zip’ saved [208243979/208243979]

Archive:  coconut_csv.zip
  inflating: coconut_csv/coconut_csv-06-2025.csv  


In [None]:
import pandas as pd
# Show all columns when a DataFrame is displayed (the table is wide)
pd.set_option('display.max_columns', None)
# Load the unpacked COCONUT export; `df` is the frame the plant search and
# filter cells below operate on.
df = pd.read_csv("coconut_csv/coconut_csv-06-2025.csv")

print(df.shape)
df.head()




(695119, 44)


Unnamed: 0,identifier,canonical_smiles,standard_inchi,standard_inchi_key,name,iupac_name,annotation_level,total_atom_count,heavy_atom_count,molecular_weight,exact_molecular_weight,molecular_formula,alogp,topological_polar_surface_area,rotatable_bond_count,hydrogen_bond_acceptors,hydrogen_bond_donors,hydrogen_bond_acceptors_lipinski,hydrogen_bond_donors_lipinski,lipinski_rule_of_five_violations,aromatic_rings_count,qed_drug_likeliness,formal_charge,fractioncsp3,number_of_minimal_rings,van_der_walls_volume,contains_sugar,contains_ring_sugars,contains_linear_sugars,murcko_framework,np_likeness,chemical_class,chemical_sub_class,chemical_super_class,direct_parent_classification,np_classifier_pathway,np_classifier_superclass,np_classifier_class,np_classifier_is_glycoside,organisms,collections,dois,synonyms,cas
0,CNP0282693.1,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@@](O)(C(=O)...,InChI=1S/C27H29NO10/c1-11-6-12(28)7-18(37-11)3...,ITSGNOIFAJAQHJ-BMFNZSJVSA-N,Esorubicin,"(7~{S},9~{S})-7-[(2~{S},4~{R},6~{S})-4-amino-6...",3,67,38,527.53,527.17915,C27H29NO10,1.03,185.84,5,11,5,11,5,2,2,0.3,0,0.44,5,445.49,False,False,False,O1CCCCC1OC2c3cc4c(cc3CCC2)Cc5ccccc5C4,1.56,Naphthacenes,Tetracenequinones,Benzenoids,Tetracenequinones,Polyketides,Polycyclic aromatic polyketides,Anthracyclines,True,,ChEMBL NPs,,4'-Deoxydoxorubicin|63521-85-7|Esorubicin INN|...,63521-85-7
1,CNP0214016.1,COC1=CC2=CC(=C1Cl)N(C)C(=O)C[C@H](OC(=O)[C@H](...,InChI=1S/C34H46ClN3O10/c1-18-11-10-12-26(45-9)...,WKPWGQKGSOKKOO-RSFHAFMBSA-N,MAYTANSINE,"[(1~{S},2~{R},3~{S},5~{S},6~{S},16~{E},18~{E},...",4,94,48,692.21,691.28717,C34H46ClN3O10,3.53,156.47,5,10,2,10,2,1,1,0.35,0,0.59,4,618.34,False,False,False,O1CNC2CC=CC=CCc3cccc(c3)NCCCC4OC4CC1C2,2.42,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",Organic acids and derivatives,Alpha amino acid esters,Polyketides,Macrolides,Ansa macrolides,False,Maytenus buchananii|Maytenus confertiflorus|Ma...,ChEBI NPs|ChEMBL NPs|KNApSaCK|NCI DTP data|Sup...,,Maitansine|Maytansine USAN|Maitansinum|NSC-153...,35846-53-8
2,CNP0185052.1,NC1=NC(N)=C2N=C(CNC3=CC=C(C(=O)N[C@@H](CCCNC(=...,InChI=1S/C27H27N9O6/c28-21-20-22(36-27(29)35-2...,NYQPLPNEESYGNO-IBGZPJMESA-N,Talotrexin,"2-[[(4~{S})-4-carboxy-4-[[4-[(2,4-diaminopteri...",3,69,42,573.57,573.20843,C27H27N9O6,1.29,248.43,12,11,7,11,7,3,4,0.12,0,0.19,4,494.02,False,False,False,n1cnc2ncc(nc2c1)CNc3ccc(cc3)CNCCCCNCc4ccccc4,-0.62,Benzene and substituted derivatives,Benzoic acids and derivatives,Benzenoids,Hippuric acids and derivatives,Alkaloids,,,False,,ChEMBL NPs,,113857-87-7|PT523|Talotrexin INN|A8E516A20K|N(...,113857-87-7
3,CNP0074548.1,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(COC...,InChI=1S/C16H16N4O8S.Na/c1-26-19-9(8-3-2-4-27-...,URDOHUPGIOGTKV-JTBFTWTJSA-M,CEFUROXIME SODIUM,"sodium (6~{R},7~{R})-3-(carbamoyloxymethyl)-7-...",3,45,30,446.37,446.05083,C16H15N4NaO8S,-4.87,176.59,7,10,2,10,2,0,1,0.18,0,0.31,3,350.55,False,False,False,o1cccc1CCNC2CN3C=CCSC32,-0.04,Lactams,Beta lactams,Organoheterocyclic compounds,Cephalosporin 3'-carbamates,Amino acids and Peptides,β-lactams,Cephalosporins,False,,ChEMBL NPs,,Sodium cefuroxime|Anaptivan|Biociclin|Cefuroxi...,56238-63-2
4,CNP0602851.1,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N...,InChI=1S/C23H24N6O5S2/c1-34-27-16(14-11-36-23(...,YWKJNRNSJKEFMK-PQFQYKRASA-N,Cefquinome,"(6~{R},7~{R})-7-[[(2~{Z})-2-(2-aminothiazol-4-...",3,60,36,528.62,528.12496,C23H24N6O5S2,-0.65,153.92,7,10,2,10,2,1,2,0.21,0,0.39,5,438.41,False,False,False,n1cscc1CCNC2CN3C=C(CSC32)Cn4cccc5c4CCCC5,-0.18,Lactams,Beta lactams,Organoheterocyclic compounds,Cephalosporins,Amino acids and Peptides,β-lactams,Cephalosporins,False,,ChEMBL NPs,,84957-30-2|Cobactan|Cefquinoma|HR111V|Z74S078C...,84957-30-2


In [None]:
import pandas as pd
from collections import Counter

class PlantAgent:
    """
    Look up compounds by source organism in a compound table.

    The table must have an 'organisms' column whose entries are
    '|'-separated organism names (missing values allowed).
    """

    # Columns returned by search_by_plant; add or remove fields here.
    RESULT_COLUMNS = [
        'identifier', 'name', 'organisms',
        'canonical_smiles', 'molecular_formula',
        'molecular_weight', 'exact_molecular_weight', 'alogp',
        'topological_polar_surface_area', 'rotatable_bond_count',
        'hydrogen_bond_acceptors', 'hydrogen_bond_donors',
        'aromatic_rings_count', 'formal_charge', 'fractioncsp3',
        'qed_drug_likeliness', 'np_likeness', 'contains_sugar',
        'contains_ring_sugars', 'contains_linear_sugars',
        'chemical_class', 'chemical_sub_class', 'chemical_super_class',
        'np_classifier_class', 'np_classifier_superclass', 'np_classifier_pathway'
    ]

    def __init__(self, df):
        self.df = df

    def search_by_plant(self, plant_name, top_n=10):
        """
        Return up to ``top_n`` compounds whose organisms mention ``plant_name``.

        The match is a case-insensitive literal substring search. The query is
        treated as plain text, not as a regular expression, so names containing
        characters such as '(' or '.' match exactly as typed.

        Returns:
            DataFrame restricted to ``RESULT_COLUMNS``, or ``None`` (after
            printing a message) when the query is blank or nothing matches.
        """
        query = plant_name.strip() if isinstance(plant_name, str) else ''

        # A blank query would otherwise match every row with an organism.
        if not query:
            print(f"❌ No compounds found for: {plant_name}")
            return None

        matches = self.df['organisms'].str.contains(query, case=False, na=False, regex=False)
        results = self.df[matches]
        if results.empty:
            print(f"❌ No compounds found for: {plant_name}")
            return None
        return results[self.RESULT_COLUMNS].head(top_n)

    def list_all_organisms_tabular(self):
        """Returns organisms with occurrence counts in tabular format"""
        organism_counts = Counter(
            name.strip()
            for entry in self.df['organisms'].dropna()
            for name in entry.split('|')
            if name.strip()  # skip empty fragments such as 'A||B'
        )
        # list(...) so an empty table still yields a frame with both columns
        table = pd.DataFrame(list(organism_counts.items()), columns=['Organism', 'Count'])
        return table.sort_values(by='Count', ascending=False)








In [None]:
# Get user input
plant_query = input("Enter plant name (e.g., Vernonia amygdalina): ")
plant_agent = PlantAgent(df)
plant_info = plant_agent.search_by_plant(plant_query)
# search_by_plant returns None when nothing matches (it already prints a
# message), so only preview the result when there is one.
plant_info.head() if plant_info is not None else None

Enter plant name (e.g., Vernonia amygdalina): Vernonia amygdalina


Unnamed: 0,identifier,name,organisms,canonical_smiles,molecular_formula,molecular_weight,exact_molecular_weight,alogp,topological_polar_surface_area,rotatable_bond_count,hydrogen_bond_acceptors,hydrogen_bond_donors,aromatic_rings_count,formal_charge,fractioncsp3,qed_drug_likeliness,np_likeness,contains_sugar,contains_ring_sugars,contains_linear_sugars,chemical_class,chemical_sub_class,chemical_super_class,np_classifier_class,np_classifier_superclass,np_classifier_pathway
12752,CNP0284830.2,Vernodalol,Daucus virgatus|Vernonia amygdalina|Vernonia c...,C=C[C@@]12COC(=O)C(=C)[C@@H]1[C@@H](O)[C@H](C(...,C20H24O8,392.4,392.14712,0.46,119.36,6,8,2,0,0,0.45,0.29,2.73,False,False,False,Prenol lipids,Terpene lactones,Lipids and lipid-like molecules,Elemane sesquiterpenoids,Sesquiterpenoids,Terpenoids
73232,CNP0269056.3,vernolide,Vernonia amygdalina,C=C(C)C(=O)O[C@H]1C[C@@]23O[C@@H]2CCC(=C[C@H]2...,C19H22O7,362.38,362.13655,1.17,94.59,2,7,1,0,0,0.58,0.34,3.52,False,False,False,Lactones,Gamma butyrolactones,Organoheterocyclic compounds,Germacrane sesquiterpenoids,Sesquiterpenoids,Terpenoids
143540,CNP0257740.2,Hydroxyvernolide,Vernonia amygdalina|Vernonia colorata,C=C(CO)C(=O)O[C@H]1C[C@@]23O[C@@H]2CCC(=C[C@@H...,C19H22O8,378.38,378.13147,0.14,114.82,3,8,2,0,0,0.58,0.31,3.64,False,False,False,Hydroxy acids and derivatives,Beta hydroxy acids and derivatives,Organic acids and derivatives,Germacrane sesquiterpenoids,Sesquiterpenoids,Terpenoids
143582,CNP0269056.4,Vernolide,Vernonia amygdalina|Vernonia colorata,C=C(C)C(=O)O[C@H]1C[C@@]23O[C@@H]2CCC(=C[C@@H]...,C19H22O7,362.38,362.13655,1.17,94.59,2,7,1,0,0,0.58,0.34,3.52,False,False,False,Lactones,Gamma butyrolactones,Organoheterocyclic compounds,Germacrane sesquiterpenoids,Sesquiterpenoids,Terpenoids
144855,CNP0321988.4,Vernomygdin,Vernonia amygdalina,C=C1C(=O)O[C@H]2C=C3CC[C@H]4O[C@@]4(C[C@H](OC(...,C19H24O7,364.39,364.1522,1.25,94.59,2,7,1,0,0,0.68,0.34,3.39,False,False,False,Lactones,Gamma butyrolactones,Organoheterocyclic compounds,Germacrane sesquiterpenoids,Sesquiterpenoids,Terpenoids


In [None]:
class FilterAgent:
    """
    Drug-likeness filter over a compound table.

    Works on a private copy of the input frame, so the caller's DataFrame is
    never modified. Three boolean columns are added to the copy
    ('lipinski_pass', 'qed_pass', 'np_pass') and only rows passing all three
    are returned.
    """

    # Columns read by the Lipinski check.
    LIPINSKI_COLUMNS = (
        'molecular_weight', 'alogp',
        'hydrogen_bond_acceptors', 'hydrogen_bond_donors',
    )
    # Relaxed rule: a compound passes when at least this many of the four
    # Lipinski criteria hold.
    MIN_LIPINSKI_RULES = 2

    def __init__(self, df):
        self.df = df.copy()

    def lipinski_filter(self, row):
        """
        Relaxed Lipinski check for a single row (Series or dict-like).

        Returns True when at least ``MIN_LIPINSKI_RULES`` of the criteria
        MW < 500, logP < 5, HBA <= 10, HBD <= 5 hold. A missing key or a
        value that cannot be compared counts as a failed row.
        """
        try:
            rules_passed = sum([
                row['molecular_weight'] < 500,
                row['alogp'] < 5,
                row['hydrogen_bond_acceptors'] <= 10,
                row['hydrogen_bond_donors'] <= 5
            ])
        except (KeyError, TypeError, ValueError):
            # Only data problems are swallowed; anything else propagates.
            return False
        return bool(rules_passed >= self.MIN_LIPINSKI_RULES)

    def _lipinski_mask(self):
        """Column-wise equivalent of ``lipinski_filter`` for the whole table."""
        missing = [c for c in self.LIPINSKI_COLUMNS if c not in self.df.columns]
        if missing:
            # Same outcome as the row-wise check (every row fails), but visible.
            print(f"⚠️ Missing Lipinski columns {missing}; no compound can pass the Lipinski filter")
            return pd.Series(False, index=self.df.index)

        def numeric(column):
            # Unparseable values become NaN; comparisons with NaN are False,
            # so such a value simply fails that one criterion.
            return pd.to_numeric(self.df[column], errors='coerce')

        rules_passed = (
            (numeric('molecular_weight') < 500).astype(int)
            + (numeric('alogp') < 5).astype(int)
            + (numeric('hydrogen_bond_acceptors') <= 10).astype(int)
            + (numeric('hydrogen_bond_donors') <= 5).astype(int)
        )
        return rules_passed >= self.MIN_LIPINSKI_RULES

    def apply_filters(self, qed_threshold=0.5, np_threshold=0.3):
        """
        Apply the Lipinski, QED and NP-likeness filters.

        Args:
            qed_threshold: minimum 'qed_drug_likeliness' (inclusive).
            np_threshold: minimum 'np_likeness' (inclusive).

        Returns:
            A new DataFrame (independent copy) with the rows that pass all
            three filters, including the three ``*_pass`` columns.
        """
        # Apply Lipinski (vectorised: one pass per column instead of one
        # Python call per row)
        self.df['lipinski_pass'] = self._lipinski_mask()

        # Apply QED filter
        self.df['qed_pass'] = self.df['qed_drug_likeliness'] >= qed_threshold

        # Apply NP-likeness filter
        self.df['np_pass'] = self.df['np_likeness'] >= np_threshold

        # Return filtered DataFrame; .copy() so later edits by the caller do
        # not touch (or warn about) the agent's internal frame.
        filtered_df = self.df[
            self.df['lipinski_pass'] &
            self.df['qed_pass'] &
            self.df['np_pass']
        ].copy()
        print(f"✅ {len(filtered_df)} compounds passed all filters (Lipinski, QED, NP-likeness) out of {len(self.df)}")
        return filtered_df


In [None]:
# Instantiate and apply the drug-likeness filters with their default
# thresholds; `filtered_results` holds only the rows that pass every filter.
filter_agent = FilterAgent(df)
filtered_results = filter_agent.apply_filters()




✅ 151807 compounds passed Lipinski's rule out of 695119


In [None]:
# Preview the filtered compounds, including the three *_pass flag columns.
filtered_results.head()

Unnamed: 0,identifier,canonical_smiles,standard_inchi,standard_inchi_key,name,iupac_name,annotation_level,total_atom_count,heavy_atom_count,molecular_weight,exact_molecular_weight,molecular_formula,alogp,topological_polar_surface_area,rotatable_bond_count,hydrogen_bond_acceptors,hydrogen_bond_donors,hydrogen_bond_acceptors_lipinski,hydrogen_bond_donors_lipinski,lipinski_rule_of_five_violations,aromatic_rings_count,qed_drug_likeliness,formal_charge,fractioncsp3,number_of_minimal_rings,van_der_walls_volume,contains_sugar,contains_ring_sugars,contains_linear_sugars,murcko_framework,np_likeness,chemical_class,chemical_sub_class,chemical_super_class,direct_parent_classification,np_classifier_pathway,np_classifier_superclass,np_classifier_class,np_classifier_is_glycoside,organisms,collections,dois,synonyms,cas,lipinski_pass,qed_pass,np_pass
5,CNP0074843.1,O=C(O)C1=CC=CC(/C=C2\C[C@H]3C[C@@H](O)[C@H](/C...,InChI=1S/C25H32O4/c26-23(18-6-2-1-3-7-18)10-9-...,OINUMRGCICIETD-CGKNXJIZSA-N,Naxaprostene,"3-[(~{E})-[(3~{a}~{S},4~{R},5~{R},6~{a}~{S})-4...",3,61,29,396.53,396.23006,C25H32O4,4.67,77.76,5,3,3,3,3,0,1,0.63,0,0.56,4,382.14,False,False,False,c1ccc(cc1)C=C2CC3CCC(C=CCC4CCCCC4)C3C2,1.01,Prenol lipids,Monoterpenoids,Lipids and lipid-like molecules,Bicyclic monoterpenoids,,,,False,,ChEMBL NPs,,87269-59-8|Naxaprostene INN|3-(E)-(3aS|4R|5R|6...,87269-59-8,True,True,True
8,CNP0180168.1,CCCC(=O)O[C@]1(C(=O)CO)CC[C@H]2[C@@H]3CCC4=CC(...,InChI=1S/C25H36O6/c1-4-5-21(30)31-25(20(29)14-...,BMCQMVFGOVHVNG-TUFAYURCSA-N,hydrocortisone butyrate,"[(8~{S},9~{S},10~{R},11~{S},13~{S},14~{S},17~{...",3,67,31,432.56,432.25119,C25H36O6,3.13,100.9,5,6,2,6,2,0,0,0.65,0,0.8,4,404.91,False,False,False,C1=C2CCC3C(CCC4CCCC43)C2CCC1,2.14,Steroids and steroid derivatives,Pregnane steroids,Lipids and lipid-like molecules,"Gluco\/mineralocorticoids, progestogins and de...",Terpenoids,Steroids,Pregnane steroids,False,,ChEMBL NPs|Super Natural II,,Hydrocortisone 17-butyrate|Cortisol 17-butyrat...,13609-67-1,True,True,True
13,CNP0074882.1,CCC(=O)O[C@]1(C(=O)COC(C)=O)CC[C@H]2[C@@H]3C[C...,InChI=1S/C27H36O7/c1-6-23(32)34-27(22(31)14-33...,DALKLAYLIPSCQL-YPYQNWSCSA-N,Methylprednisolone aceponate,"[(6~{S},8~{S},9~{S},10~{R},11~{S},13~{S},14~{S...",3,70,34,472.58,472.2461,C27H36O7,3.34,106.97,5,7,1,7,1,0,0,0.61,0,0.7,4,436.0,False,False,False,C1=CC2C(=CC1)CCC3C2CCC4CCCC43,2.1,Steroids and steroid derivatives,Pregnane steroids,Lipids and lipid-like molecules,"Gluco\/mineralocorticoids, progestogins and de...",Terpenoids,Steroids,Pregnane steroids,False,,ChEMBL NPs,,86401-95-8|Advantan|Methylprednisolone acepona...,86401-95-8,True,True,True
19,CNP0166492.1,COC1=C(C(=O)OCCN(C)C)[C@H]2C[C@@H]3C4=C(CCN3C[...,InChI=1S/C26H35N3O5/c1-15-20-14-29-9-8-18-17-7...,HTVYZQDNQXGFOF-WWMNNVMHSA-N,DIMETHYLAMINOETHYL RESERPILINATE,"2-(dimethylamino)ethyl (1~{R},15~{S},16~{S},20...",2,69,34,469.58,469.25767,C26H35N3O5,3.09,76.26,6,7,1,7,1,0,2,0.65,0,0.58,5,434.84,False,False,False,O1C=CC2CC3c4[nH]c5ccccc5c4CCN3CC2C1,0.44,Yohimbine alkaloids,Yohimbine alkaloids,Alkaloids and derivatives,Yohimbine alkaloids,Alkaloids,Tryptophan alkaloids,Corynanthe type,True,,ChEMBL NPs,,CHEMBL2106350,,True,True,True
25,CNP0401611.1,CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3CCC4=CC(=O)...,InChI=1S/C30H37NO4/c1-18(32)30(35-19(2)33)15-1...,OOLLAFOLCSJHRE-ZHAKMVSLSA-N,Ulipristal acetate,"[(8~{S},11~{R},13~{S},14~{S},17~{R})-17-acetyl...",3,72,35,475.63,475.27226,C30H37NO4,5.54,63.68,4,5,0,5,0,1,1,0.53,0,0.57,5,449.94,False,False,False,c1ccc(cc1)C2C3=C4C(=CCCC4)CCC3C5CCCC5C2,1.19,Steroids and steroid derivatives,Steroid esters,Lipids and lipid-like molecules,Steroid esters,Terpenoids,Steroids,Pregnane steroids,False,,ChEMBL NPs,,126784-99-4|Ella|EllaOne|CDB-2914|CDB 2914|Uli...,126784-99-4,True,True,True


In [None]:
# Train the Tox21 models and score every filtered compound. `tox21` is the
# CSV path (not a DataFrame); the function also writes the model file and the
# predictions CSV to the working directory.
results, predictor = run_your_pipeline(tox21, filtered_results)

🚀 African Phytochemical Toxicity Prediction Pipeline
📁 Loading Tox21 dataset...
   Loaded 7831 Tox21 compounds
   AfroDB filtered dataset: 151807 compounds
🤖 Training toxicity prediction models...
   Training on 7831 Tox21 compounds
   Extracting features from Tox21 dataset...
🧬 Extracting features for 7831 molecules...




   Processed 1000/7831 molecules...


[14:57:50] Explicit valence for atom # 8 Al, 6, is greater than permitted


   Processed 2000/7831 molecules...


[14:57:52] Explicit valence for atom # 3 Al, 6, is greater than permitted
[14:57:52] Explicit valence for atom # 4 Al, 6, is greater than permitted


   Processed 3000/7831 molecules...


[14:57:54] Explicit valence for atom # 4 Al, 6, is greater than permitted


   Processed 4000/7831 molecules...


[14:57:55] Explicit valence for atom # 9 Al, 6, is greater than permitted
[14:57:55] Explicit valence for atom # 5 Al, 6, is greater than permitted


   Processed 5000/7831 molecules...


[14:57:56] Explicit valence for atom # 16 Al, 6, is greater than permitted


   Processed 6000/7831 molecules...


[14:57:58] Explicit valence for atom # 20 Al, 6, is greater than permitted


   Processed 7000/7831 molecules...
✅ Feature extraction complete! Failed: 8/7831
   Training model for Androgen Receptor Disruption...
   ✅ NR-AR: 7265 samples, 309.0 positive cases
   Training model for Androgen Receptor Binding...
   ✅ NR-AR-LBD: 6758 samples, 237.0 positive cases
   Training model for Aryl Hydrocarbon Receptor...
   ✅ NR-AhR: 6549 samples, 768.0 positive cases
   Training model for Aromatase Inhibition...
   ✅ NR-Aromatase: 5821 samples, 300.0 positive cases
   Training model for Estrogen Receptor Disruption...
   ✅ NR-ER: 6193 samples, 793.0 positive cases
   Training model for Estrogen Receptor Binding...
   ✅ NR-ER-LBD: 6955 samples, 350.0 positive cases
   Training model for PPAR-gamma Activation...
   ✅ NR-PPAR-gamma: 6450 samples, 186.0 positive cases
   Training model for Antioxidant Response...
   ✅ SR-ARE: 5832 samples, 942.0 positive cases
   Training model for DNA Damage Response...
   ✅ SR-ATAD5: 7072 samples, 264.0 positive cases
   Training model for 

In [None]:
# For LLM integration: score one SMILES string with the saved models and print
# the text report. Requires the model file written by a previous pipeline run.
result = quick_phytochemical_toxicity_check('CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C')
report = interpret_toxicity_results(result)
print(report)

📂 Models loaded from african_phytochemical_toxicity_models.pkl
🧪 Toxicity Analysis for: CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
🎯 Overall Risk Level: Low

📋 Individual Endpoint Analysis:
   🟢 Androgen Receptor Disruption: Non-toxic (8.6%)
   🟢 Androgen Receptor Binding: Non-toxic (1.6%)
   🟡 Aryl Hydrocarbon Receptor: Non-toxic (40.2%)
   🟡 Aromatase Inhibition: Non-toxic (44.6%)
   🟡 Estrogen Receptor Disruption: Non-toxic (47.4%)
   🟡 Estrogen Receptor Binding: Non-toxic (41.0%)
   🟡 PPAR-gamma Activation: Non-toxic (32.3%)
   🟡 Antioxidant Response: Toxic (54.5%)
   🟢 DNA Damage Response: Non-toxic (8.2%)
   🟡 Heat Shock Response: Non-toxic (35.6%)
   🟡 Mitochondrial Toxicity: Toxic (66.5%)
   🟢 p53 Tumor Suppressor: Non-toxic (16.7%)

