In [5]:
import pandas as pd
import nltk
import json
import os
import numpy as np
import requests
import time
from tqdm import tqdm

In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def simple_tokenize(text):
    """Simple tokenization function that doesn't require punkt"""
    # Split on whitespace and remove empty strings
    tokens = [token.strip() for token in text.split()]
    # Remove punctuation from tokens
    tokens = [token.strip('.,!?:;()[]{}""\'') for token in tokens if token.strip('.,!?:;()[]{}""\'')]
    return tokens

In [8]:
def generate_medical_qa_pairs_with_groq(api_key, num_pairs=100, topics=None, output_json_path="groq_generated_qa.json"):
    """
    Generate medical Q&A pairs using Groq API
    
    Args:
        api_key (str): Your Groq API key
        num_pairs (int): Number of Q&A pairs to generate
        topics (list): Optional list of medical topics to focus on
        output_json_path (str): Path to save the generated Q&A pairs
    
    Returns:
        list: The generated Q&A pairs
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    # Default medical topics if none provided
    if not topics:
        topics = [
            "diabetes", "hypertension", "asthma", "cancer", "heart disease",
            "arthritis", "allergies", "depression", "anxiety", "pneumonia",
            "influenza", "COVID-19", "pregnancy", "pediatrics", "geriatrics",
            "dermatology", "neurology", "cardiology", "gastroenterology", "hematology"
        ]
    
    all_qa_pairs = []
    pairs_per_request = 5  # Number of pairs to generate in each API call
    
    print(f"Generating {num_pairs} medical Q&A pairs using Groq API...")
    
    # Calculate number of API calls needed
    num_requests = (num_pairs + pairs_per_request - 1) // pairs_per_request
    
    for i in tqdm(range(num_requests)):
        # Select random topics for this batch
        selected_topics = np.random.choice(topics, size=min(pairs_per_request, len(topics)), replace=False)
        topics_str = ", ".join(selected_topics)
        
        # Create the prompt
        prompt = f"""Generate {pairs_per_request} medically accurate question-answer pairs about the following topics: {topics_str}.

For each pair:
1. Create a detailed medical question that a patient might ask
2. Provide a comprehensive, accurate answer based on current medical knowledge
3. Make sure the answer is informative and would be helpful to a patient

Format the output as a JSON array where each object has 'question' and 'answer' fields. Example format:
[
  {{
    "question": "What are the early warning signs of type 2 diabetes?",
    "answer": "Early warning signs of type 2 diabetes include increased thirst, frequent urination, unexplained weight loss, fatigue, blurred vision, slow-healing sores, and frequent infections. These symptoms occur because excess glucose in the bloodstream pulls fluid from tissues, making you feel thirsty, and leads to increased urination. If you experience these symptoms, consult a healthcare provider for proper testing and diagnosis."
  }},
  ...
]

Important: Only output valid JSON that can be parsed. Do not include any additional text before or after the JSON.
"""

        # Prepare the API request
        data = {
            "model": "llama3-70b-8192",  # You can change the model as needed
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7,
            "max_tokens": 4000,
            "response_format": {"type": "json_object"}
        }
        
        try:
            # Make the API call
            response = requests.post(url, headers=headers, json=data)
            response.raise_for_status()
            
            result = response.json()
            content = result["choices"][0]["message"]["content"]
            
            # Parse the JSON response
            qa_pairs = json.loads(content)
            
            # The response might be formatted differently depending on the model's output
            # Check if we have a direct array or if there's a nested structure
            if isinstance(qa_pairs, list):
                generated_pairs = qa_pairs
            elif isinstance(qa_pairs, dict) and 'pairs' in qa_pairs:
                generated_pairs = qa_pairs['pairs']
            else:
                # Try to find any array in the response
                for key, value in qa_pairs.items():
                    if isinstance(value, list) and len(value) > 0 and 'question' in value[0]:
                        generated_pairs = value
                        break
                else:
                    raise ValueError("Unexpected response format from Groq API")
            
            # Process each pair to add token information
            for j, pair in enumerate(generated_pairs):
                if 'question' not in pair or 'answer' not in pair:
                    continue
                    
                question = pair['question']
                answer = pair['answer']
                
                # Tokenize questions and answers
                question_tokens = simple_tokenize(question)
                answer_tokens = simple_tokenize(answer)
                
                # Remove stopwords (optional)
                stop_words = set(stopwords.words('english'))
                question_filtered = [w for w in question_tokens if w.lower() not in stop_words]
                answer_filtered = [w for w in answer_tokens if w.lower() not in stop_words]
                
                # Create entry for this QA pair
                entry = {
                    "id": len(all_qa_pairs) + j,
                    "question": {
                        "text": question,
                        "tokens": question_tokens,
                        "filtered_tokens": question_filtered
                    },
                    "answer": {
                        "text": answer,
                        "tokens": answer_tokens,
                        "filtered_tokens": answer_filtered
                    },
                    "source": "groq_api",
                    "topics": selected_topics.tolist()
                }
                
                all_qa_pairs.append(entry)
            
            # Add a small delay between requests to avoid rate limiting
            time.sleep(2)
            
        except Exception as e:
            print(f"Error in request {i+1}: {e}")
            time.sleep(5)  # Longer delay after an error
    
    # Trim to the requested number of pairs
    all_qa_pairs = all_qa_pairs[:num_pairs]
    
    # Write to JSON file
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(all_qa_pairs, f, ensure_ascii=False, indent=4)
    
    print(f"Successfully generated {len(all_qa_pairs)} QA pairs")
    print(f"Output saved to: {output_json_path}")
    
    return all_qa_pairs

def answer_question_with_groq(question, api_key, context=None, conversation_history=None):
    """
    Answer a question using Groq API
    
    Args:
        question (str): The question to answer
        api_key (str): Your Groq API key
        context (str, optional): Additional context to provide to the model
        conversation_history (list, optional): Previous conversation messages
        
    Returns:
        dict: Result with answer and metadata
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    # Prepare messages
    messages = []
    
    # Add system message with context if provided
    if context:
        messages.append({
            "role": "system", 
            "content": f"You are a medical assistant providing accurate and helpful information. Use the following context to inform your answers: {context}"
        })
    else:
        messages.append({
            "role": "system", 
            "content": "You are a medical assistant providing accurate and helpful information based on current medical knowledge."
        })
    
    # Add conversation history if provided
    if conversation_history:
        messages.extend(conversation_history)
    
    # Add the current question
    messages.append({"role": "user", "content": question})
    
    # Prepare the API request
    data = {
        "model": "llama3-70b-8192",  # You can change the model as needed
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 2048
    }
    
    try:
        start_time = time.time()
        
        # Make the API call
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        
        result = response.json()
        answer = result["choices"][0]["message"]["content"]
        
        processing_time = time.time() - start_time
        
        return {
            'answer': answer,
            'source': 'groq_api',
            'processing_time': processing_time,
            'context_used': bool(context),
            'model': result.get('model', 'llama3-70b-8192')
        }
    
    except Exception as e:
        print(f"Error with Groq API: {e}")
        return {
            'error': True,
            'message': str(e),
            'answer': f"Error getting answer: {str(e)}"
        }

In [9]:
def interactive_medical_qa_session(api_key, qa_data=None):
    """
    Run an interactive Q&A session using Groq API
    
    Args:
        api_key (str): Your Groq API key
        qa_data (list, optional): Previously generated Q&A pairs for context
    """
    print("\n=== Medical Q&A Interactive Session ===")
    print("Ask medical questions (type 'quit' to exit)")
    
    conversation_history = []
    
    while True:
        # Get question from user
        user_question = input("\nYour question: ")
        
        if user_question.lower() in ['quit', 'exit', 'q']:
            print("Ending session. Goodbye!")
            break
        
        # Try to find relevant context from previously generated QA pairs
        context = None
        if qa_data:
            # Simple keyword matching for demonstration
            keywords = set(simple_tokenize(user_question.lower()))
            best_match_score = 0
            best_match = None
            
            for qa_pair in qa_data:
                question_keywords = set(qa_pair['question']['filtered_tokens'])
                match_score = len(keywords.intersection(question_keywords))
                
                if match_score > best_match_score:
                    best_match_score = match_score
                    best_match = qa_pair
            
            if best_match and best_match_score > 0:
                context = f"Reference Q&A: Question: '{best_match['question']['text']}' Answer: '{best_match['answer']['text']}'"
        
        # Get answer from Groq
        print("Getting answer...")
        result = answer_question_with_groq(
            user_question, 
            api_key, 
            context=context,
            conversation_history=conversation_history
        )
        
        if 'error' in result and result['error']:
            print(f"Error: {result['message']}")
            continue
        
        # Print the answer
        print(f"\nAnswer (processed in {result['processing_time']:.2f}s):")
        print(result['answer'])
        
        # Update conversation history (keeping last 5 exchanges to manage context window)
        conversation_history.append({"role": "user", "content": user_question})
        conversation_history.append({"role": "assistant", "content": result['answer']})
        
        # Keep only the last 10 messages (5 exchanges)
        if len(conversation_history) > 10:
            conversation_history = conversation_history[-10:]

In [55]:
def load_processed_data(json_path):
    """Load the processed QA data from JSON file"""
    print(f"Loading data from: {json_path}")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

In [56]:
def build_qa_retrieval_model(data, model_dir="qa_model"):
    """Build a simple retrieval-based QA model using TF-IDF"""
    
    # Extract questions and answers
    questions = [item['question']['text'] for item in data]
    answers = [item['answer']['text'] for item in data]
    
    # Create TF-IDF vectorizer for questions
    print("Training TF-IDF vectorizer on questions...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    question_vectors = tfidf_vectorizer.fit_transform(questions)
    
    # Create directory for saving model artifacts
    os.makedirs(model_dir, exist_ok=True)
    
    # Save the vectorizer
    with open(os.path.join(model_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    
    # Save the question vectors
    with open(os.path.join(model_dir, 'question_vectors.pkl'), 'wb') as f:
        pickle.dump(question_vectors, f)
    
    # Save the answers for retrieval
    with open(os.path.join(model_dir, 'answers.pkl'), 'wb') as f:
        pickle.dump(answers, f)
    
    print(f"Model saved to directory: {model_dir}")
    
    return {
        'vectorizer': tfidf_vectorizer,
        'question_vectors': question_vectors,
        'answers': answers
    }

In [57]:
def evaluate_qa_model(model, data, test_size=0.2):
    """Evaluate the QA model using a test set"""
    
    # Extract questions and answers
    questions = [item['question']['text'] for item in data]
    answers = [item['answer']['text'] for item in data]
    
    # Split into train and test sets
    train_questions, test_questions, train_answers, test_answers = train_test_split(
        questions, answers, test_size=test_size, random_state=42
    )
    
    # Get model components
    vectorizer = model['vectorizer']
    train_question_vectors = vectorizer.transform(train_questions)
    
    # Evaluate on test set
    correct = 0
    total = len(test_questions)
    
    print(f"Evaluating on {total} test questions...")
    
    for i, test_question in enumerate(test_questions):
        # Vectorize the test question
        test_vector = vectorizer.transform([test_question])
        
        # Calculate similarity with all training questions
        similarities = cosine_similarity(test_vector, train_question_vectors).flatten()
        
        # Get the most similar question's index
        most_similar_idx = np.argmax(similarities)
        
        # Retrieve the corresponding answer
        predicted_answer = train_answers[most_similar_idx]
        
        # Check if the answer is correct (simple string match)
        if predicted_answer.lower() == test_answers[i].lower():
            correct += 1
        
        # Print progress
        if (i+1) % 100 == 0:
            print(f"Processed {i+1}/{total} test questions")
    
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f} ({correct}/{total})")
    
    return accuracy

def answer_question(question, model_dir="qa_model"):
    """Use the trained model to answer a new question"""
    
    # Load model components
    with open(os.path.join(model_dir, 'tfidf_vectorizer.pkl'), 'rb') as f:
        vectorizer = pickle.load(f)
    
    with open(os.path.join(model_dir, 'question_vectors.pkl'), 'rb') as f:
        question_vectors = pickle.load(f)
    
    with open(os.path.join(model_dir, 'answers.pkl'), 'rb') as f:
        answers = pickle.load(f)
    
    # Vectorize the new question
    question_vector = vectorizer.transform([question])
    
    # Calculate similarity with all known questions
    similarities = cosine_similarity(question_vector, question_vectors).flatten()
    
    # Get top 3 most similar questions' indices
    top_indices = similarities.argsort()[-3:][::-1]
    
    results = []
    for idx in top_indices:
        results.append({
            'similarity': similarities[idx],
            'answer': answers[idx]
        })
    
    return results

In [10]:
if __name__ == "__main__":
    # Replace with your actual Groq API key
    GROQ_API_KEY = "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"
    
    # Ask if user wants to generate new QA pairs or use interactive mode
    print("Medical Q&A System Using Groq API")
    print("1. Generate medical Q&A pairs")
    print("2. Interactive Q&A session")
    print("3. Generate Q&A pairs and then start interactive session")
    
    choice = input("Select an option (1/2/3): ")
    
    qa_data = None
    
    if choice == '1' or choice == '3':
        # Ask how many QA pairs to generate
        num_pairs = int(input("How many Q&A pairs to generate? (default: 50): ") or "50")
        
        # Generate Q&A pairs
        qa_data = generate_medical_qa_pairs_with_groq(
            GROQ_API_KEY, 
            num_pairs=num_pairs,
            output_json_path="groq_generated_qa.json"
        )
        
        print("\nExample generated Q&A pair:")
        if qa_data:
            example = qa_data[0]
            print(f"Question: {example['question']['text']}")
            print(f"Answer: {example['answer']['text']}")
    
    if choice == '2' or choice == '3':
        # If we didn't generate QA pairs but the file exists, load it
        if not qa_data and os.path.exists("groq_generated_qa.json"):
            try:
                with open("groq_generated_qa.json", 'r', encoding='utf-8') as f:
                    qa_data = json.load(f)
                print(f"Loaded {len(qa_data)} previously generated Q&A pairs for context")
            except Exception as e:
                print(f"Error loading previous Q&A pairs: {e}")
        
        # Run interactive session
        interactive_medical_qa_session(GROQ_API_KEY, qa_data=qa_data)

Medical Q&A System Using Groq API
1. Generate medical Q&A pairs
2. Interactive Q&A session
3. Generate Q&A pairs and then start interactive session


Select an option (1/2/3):  1
How many Q&A pairs to generate? (default: 50):  1


Generating 1 medical Q&A pairs using Groq API...


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Error in request 1: 400 Client Error: Bad Request for url: https://api.groq.com/openai/v1/chat/completions


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.39s/it]

Successfully generated 0 QA pairs
Output saved to: groq_generated_qa.json

Example generated Q&A pair:



