<a href="https://colab.research.google.com/github/Anonymous143w/ConsumeWise/blob/main/Drug_toxcity/Drug_Toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-cpu



In [23]:
import pandas as pd
import numpy as np
import os
import torch
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import login
import faiss
import re

In [42]:
class DrugToxicityRAG:
    """Retrieval-augmented toxicity analysis over a table of compounds.

    Typical call order (every step returns ``self`` so calls can be chained):

    1. ``load_data``               - read the compound and relationship CSVs
    2. ``load_models``             - load the sentence embedder and the Llama 3.2 LLM
    3. ``create_embeddings``       - embed compounds and build the FAISS index
    4. ``create_toxicity_labels``  - derive the binary ``is_toxic`` column
    5. ``retrieve_similar_compounds`` / ``generate_toxicity_analysis``
    """

    def __init__(self):
        # All state is populated lazily by the load_*/create_* methods.
        self.compounds_df = None          # DataFrame with NAME, DEF, full_text (+ is_toxic)
        self.relationships_df = None      # DataFrame with at least 'source' and 'target'
        self.embedding_model = None       # SentenceTransformer instance
        self.llm_tokenizer = None         # tokenizer of the causal LM
        self.llm_model = None             # causal LM used for generation
        self.compound_embeddings = None   # L2-normalised float32 matrix, one row per compound
        self.faiss_index = None           # FAISS index built over compound_embeddings

    @staticmethod
    def _l2_normalize(vectors):
        """Return ``vectors`` as a contiguous float32 2-D array with unit-length rows.

        Zero vectors are left as zeros instead of producing NaNs.
        """
        vectors = np.atleast_2d(np.asarray(vectors, dtype='float32'))
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return np.ascontiguousarray(vectors / norms)

    def load_data(self, compounds_path, relationships_path):
        """Load compound and relationship data from CSV files.

        Parameters:
        - compounds_path: CSV with at least the columns ``NAME`` and ``DEF``
        - relationships_path: CSV read into ``relationships_df``
          (``create_toxicity_labels`` expects ``source`` and ``target`` columns)

        Returns ``self``.
        """
        self.compounds_df = pd.read_csv(compounds_path)
        self.relationships_df = pd.read_csv(relationships_path)

        # Missing names/definitions become empty strings so the concatenation
        # below never yields NaN.
        self.compounds_df['DEF'] = self.compounds_df['DEF'].fillna('')
        self.compounds_df['NAME'] = self.compounds_df['NAME'].fillna('')

        # Text that is embedded for retrieval: "<name>: <definition>"
        self.compounds_df['full_text'] = self.compounds_df['NAME'] + ': ' + self.compounds_df['DEF']

        print(f"Loaded {len(self.compounds_df)} compounds and {len(self.relationships_df)} relationships")
        return self

    def load_models(self, embedding_model_name="all-MiniLM-L12-v2", llm_size="1b", use_quantization=False):
        """
        Load embedding model and Llama 3.2 model

        Parameters:
        - embedding_model_name: model name for sentence transformers
        - llm_size: "1b" (case-insensitive) selects the 1B model; any other
          value selects the 3B model
        - use_quantization: Whether to use 4-bit quantization (requires bitsandbytes)

        Returns ``self``.
        """
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer(embedding_model_name)

        # Select the appropriate Llama 3.2 model based on size
        if llm_size.lower() == "1b":
            model_name = "meta-llama/Llama-3.2-1B-Instruct"
        else:  # Default to 3B
            model_name = "meta-llama/Llama-3.2-3B-Instruct"

        print(f"Loading Llama 3.2 {llm_size.upper()} Instruct model...")
        self.llm_tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Set up model loading configuration
        model_kwargs = {
            "torch_dtype": torch.float16,
            "device_map": "auto"
        }

        # Add quantization config if requested
        if use_quantization:
            try:
                print("Attempting to use 4-bit quantization (requires bitsandbytes)")
                # Both imports happen inside the try block so that a missing
                # package really triggers the ImportError fallback below
                # (previously BitsAndBytesConfig was never imported at all,
                # which raised an uncaught NameError).
                import bitsandbytes  # noqa: F401
                from transformers import BitsAndBytesConfig

                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16
                )
            except ImportError:
                print("Warning: bitsandbytes not available. Falling back to standard loading.")

        # Load the model with the appropriate configuration
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            **model_kwargs
        )

        return self

    def create_embeddings(self):
        """Embed every compound's ``full_text`` and build the FAISS index.

        Embeddings are L2-normalised before indexing, so the squared L2
        distance ``d`` returned by the index maps to cosine similarity as
        ``1 - d / 2``.

        Raises:
        - ValueError: if no compounds have been loaded.

        Returns ``self``.
        """
        if self.compounds_df is None or len(self.compounds_df) == 0:
            raise ValueError("No compounds loaded; call load_data() with a non-empty CSV first.")

        print("Creating embeddings for compounds...")
        texts = self.compounds_df['full_text'].tolist()
        self.compound_embeddings = self._l2_normalize(self.embedding_model.encode(texts))

        # Create FAISS index for fast similarity search
        embedding_dim = self.compound_embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatL2(embedding_dim)
        self.faiss_index.add(self.compound_embeddings)

        return self

    def create_toxicity_labels(self):
        """Add a binary ``is_toxic`` column to ``compounds_df``.

        A compound is labelled 1 when its ``NAME`` appears as the ``source``
        of a relationship whose ``target`` contains "toxic" or "poisoning"
        (case-insensitive); otherwise 0.

        Returns ``self``.
        """
        toxic_compounds = self.relationships_df[
            self.relationships_df['target'].str.contains('toxic|poisoning', case=False, na=False)
        ]['source'].unique()

        self.compounds_df['is_toxic'] = self.compounds_df['NAME'].isin(toxic_compounds).astype(int)
        print(f"Identified {len(toxic_compounds)} toxic compounds")

        return self

    def retrieve_similar_compounds(self, query_text, k=5):
        """Retrieve the compounds most similar to ``query_text``.

        Parameters:
        - query_text: free-text description to embed and search for
        - k: maximum number of neighbours; clamped to the index size

        Returns a copy of the matching ``compounds_df`` rows, nearest first,
        with an added ``similarity`` column holding the cosine similarity
        (range [-1, 1]).

        Raises:
        - RuntimeError: if the index has not been built yet.
        """
        if self.faiss_index is None or self.embedding_model is None:
            raise RuntimeError("Index not built; call load_models() and create_embeddings() first.")

        # FAISS pads the result with label -1 when asked for more neighbours
        # than it holds; iloc[-1] would then silently return the last row.
        k = min(int(k), int(self.faiss_index.ntotal))
        if k <= 0:
            empty = self.compounds_df.iloc[0:0].copy()
            empty['similarity'] = np.array([], dtype='float32')
            return empty

        # The query is normalised the same way as the indexed vectors.
        query_embedding = self._l2_normalize(self.embedding_model.encode([query_text]))

        distances, indices = self.faiss_index.search(query_embedding, k)

        valid = indices[0] >= 0
        similar_compounds = self.compounds_df.iloc[indices[0][valid]].copy()
        # For unit vectors: ||a - b||^2 = 2 - 2*cos(a, b)  =>  cos = 1 - d/2
        similar_compounds['similarity'] = np.clip(1.0 - distances[0][valid] / 2.0, -1.0, 1.0)

        return similar_compounds

    def generate_toxicity_analysis(self, query_text, k=5, max_new_tokens=1024):
        """Generate toxicity analysis using RAG approach with Llama 3.2

        Parameters:
        - query_text: description of the compound to analyse
        - k: number of similar compounds to retrieve as context
        - max_new_tokens: upper bound on generated tokens (the prompt is not
          counted, so a long context cannot starve the answer)

        Returns a dict with:
        - "similar_compounds": DataFrame from ``retrieve_similar_compounds``
        - "analysis": the generated text (assistant turn only)

        Requires ``create_toxicity_labels`` to have been called, because the
        context reads the ``is_toxic`` column.
        """
        # Retrieve relevant compounds
        similar_compounds = self.retrieve_similar_compounds(query_text, k)

        # Create context from retrieved compounds
        context = "".join(
            f"Compound: {row['NAME']}\n"
            f"Definition: {row['DEF']}\n"
            f"Toxicity: {'Toxic' if row['is_toxic'] == 1 else 'Non-toxic'}\n\n"
            for _, row in similar_compounds.iterrows()
        )

        # Build the conversation and let the tokenizer render it with the
        # model's own chat template instead of hand-written role markers.
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a pharmacology and toxicology expert. Analyze chemicals and "
                    "compounds to determine their potential toxicity based on similar known compounds."
                ),
            },
            {
                "role": "user",
                "content": (
                    "I need to analyze this compound:\n"
                    f"{query_text}\n\n"
                    "Here is information about similar compounds from our database:\n"
                    f"{context}\n\n"
                    "Based on this information, provide a detailed toxicity analysis."
                ),
            },
        ]
        prompt = self.llm_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # The rendered template already contains the BOS token, so special
        # tokens must not be added a second time.
        inputs = self.llm_tokenizer(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(self.llm_model.device)

        # Llama tokenizers ship without a pad token; fall back to EOS.
        pad_token_id = self.llm_tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.llm_tokenizer.eos_token_id

        # Generate response from Llama 3.2
        output = self.llm_model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=pad_token_id
        )

        # Decode only the newly generated tokens, i.e. the assistant's answer.
        prompt_length = inputs.input_ids.shape[1]
        response = self.llm_tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        ).strip()

        return {
            "similar_compounds": similar_compounds,
            "analysis": response
        }

    def evaluate_retrieval(self, test_queries, ground_truth):
        """Evaluate the retrieval component with mean precision@10.

        Parameters:
        - test_queries: iterable of query strings
        - ground_truth: iterable (same length) of iterables of relevant
          compound names

        Returns ``{"mean_precision@10": value}``; the value is 0.0 when there
        are no queries. Precision is computed over the distinct retrieved
        names, which can be fewer than 10 for a small index.

        Raises:
        - ValueError: if the two inputs differ in length.
        """
        test_queries = list(test_queries)
        ground_truth = list(ground_truth)
        if len(test_queries) != len(ground_truth):
            raise ValueError(
                f"test_queries ({len(test_queries)}) and ground_truth "
                f"({len(ground_truth)}) must have the same length"
            )

        precision_at_k = []

        for query, relevant_compounds in zip(test_queries, ground_truth):
            retrieved = self.retrieve_similar_compounds(query, k=10)
            retrieved_set = set(retrieved['NAME'].tolist())
            relevant_set = set(relevant_compounds)

            # Calculate precision@10 (0.0 when nothing was retrieved)
            if retrieved_set:
                precision = len(retrieved_set & relevant_set) / len(retrieved_set)
            else:
                precision = 0.0
            precision_at_k.append(precision)

        return {
            "mean_precision@10": np.mean(precision_at_k) if precision_at_k else 0.0
        }

    def export_model(self, output_dir):
        """Export the retrieval artefacts for deployment.

        Writes into ``output_dir`` (created if missing):
        - ``compounds_df.pkl``       - the compound table
        - ``compounds_index.faiss``  - the FAISS index (holds the vectors)
        - ``embedding_model/``       - the sentence embedding model

        The LLM itself is not exported. Returns ``self``.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Save compound table (the embedding vectors live in the FAISS index,
        # not in this frame)
        self.compounds_df.to_pickle(os.path.join(output_dir, "compounds_df.pkl"))

        # Save FAISS index
        faiss.write_index(self.faiss_index, os.path.join(output_dir, "compounds_index.faiss"))

        # Save embedding model
        self.embedding_model.save(os.path.join(output_dir, "embedding_model"))

        print(f"Model exported to {output_dir}")
        return self

In [33]:
# Convenience entry point that wires the whole pipeline together
def run_rag_pipeline(compounds_path, relationships_path, llm_size="1b", use_quantization=False):
    """Build a ready-to-query DrugToxicityRAG instance.

    Runs, in order: data loading, model loading, embedding/index creation
    and toxicity labelling, then hands back the configured system.

    Parameters:
    - compounds_path: path to the compounds CSV
    - relationships_path: path to the relationships CSV
    - llm_size: forwarded to ``load_models``
    - use_quantization: forwarded to ``load_models``
    """
    # Every stage returns the instance itself, so the steps chain directly.
    return (
        DrugToxicityRAG()
        .load_data(compounds_path, relationships_path)
        .load_models(llm_size=llm_size, use_quantization=use_quantization)
        .create_embeddings()
        .create_toxicity_labels()
    )


In [34]:
# Helper that runs an analysis and prints a human-readable report
def analyze_new_compound(rag_system, compound_description):
    """Run the RAG analysis for one compound description and print the report.

    Prints the retrieved neighbours (name, similarity, toxicity flag)
    followed by the generated analysis text, and returns the raw result
    dict produced by ``rag_system.generate_toxicity_analysis``.
    """
    result = rag_system.generate_toxicity_analysis(compound_description)
    neighbours = result["similar_compounds"]

    # Neighbour listing, one bullet per retrieved compound
    print("Similar Compounds:")
    columns = zip(neighbours['NAME'], neighbours['similarity'], neighbours['is_toxic'])
    for name, score, toxic_flag in columns:
        verdict = 'Yes' if toxic_flag == 1 else 'No'
        print(f"- {name} (Similarity: {score:.2f}, Toxic: {verdict})")

    # Generated narrative
    print("\nAnalysis:")
    print(result["analysis"])

    return result

In [35]:
!pip install -U bitsandbytes

In [36]:
# Authenticate with the Hugging Face Hub (the Llama 3.2 checkpoints are gated).
# The token is read from the HF_TOKEN environment variable so it never lands in
# the notebook; when the variable is unset, login() falls back to prompting.
login(token=os.environ.get("HF_TOKEN"))


In [44]:
# Build the RAG system from the two CSV files, using the 1B Llama model
# and no 4-bit quantization.
rag_system = run_rag_pipeline(
    compounds_path="/content/MRDEF.csv",
    relationships_path="/content/umls_kg.csv",
    llm_size="1B",
    use_quantization=False,
)

Loaded 529 compounds and 257 relationships
Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading Llama 3.2 1B Instruct model...
Creating embeddings for compounds...
Identified 6 toxic compounds
