In [50]:
import pandas as pd
import nltk
import json
import os
import json
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [51]:
# Download only the NLTK stopword corpus; tokenization in this notebook is done
# by a hand-written whitespace tokenizer, so no punkt data is requested.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
def simple_tokenize(text):
    """Whitespace tokenizer that needs no punkt data.

    Splits ``text`` on whitespace, trims punctuation from both ends of every
    piece, and drops pieces that end up empty. Punctuation inside a token
    (e.g. the apostrophe in "don't") is left untouched.
    """
    edge_punctuation = '.,!?:;()[]{}""\''
    words = []
    for piece in text.split():
        # Trim stray whitespace first, then punctuation at either end.
        trimmed = piece.strip().strip(edge_punctuation)
        if trimmed:
            words.append(trimmed)
    return words

In [53]:
def process_qa_csv(csv_path, output_json_path):
    """Tokenize the question/answer columns of a CSV and save them as JSON.

    The CSV must contain a column named ``question``/``questions`` and one
    named ``answer``/``answers`` (matched case-insensitively). Every row is
    turned into a dict holding the row's index label as ``id`` and, for both
    the question and the answer, the raw text, the tokens produced by
    ``simple_tokenize`` and the tokens left after removing NLTK English
    stopwords.

    Args:
        csv_path: Path of the CSV file to read.
        output_json_path: Path the resulting list is written to (UTF-8 JSON,
            indented, non-ASCII characters kept as-is).

    Returns:
        The list of per-row dicts that was written to ``output_json_path``.

    Raises:
        ValueError: If no question column or no answer column is found.
    """
    print(f"Reading CSV from: {csv_path}")
    df = pd.read_csv(csv_path)

    # Show the columns that are actually present in the dataset.
    print("Columns in CSV:", df.columns.tolist())

    # Determine question and answer column names (case-insensitive).
    # str() keeps the lookup from crashing on non-string column labels.
    question_col = None
    answer_col = None
    for col in df.columns:
        label = str(col).lower()
        if label in ['question', 'questions']:
            question_col = col
        elif label in ['answer', 'answers']:
            answer_col = col

    if not question_col or not answer_col:
        raise ValueError(f"Could not find question and answer columns in: {df.columns}")

    print(f"Using columns: Question='{question_col}', Answer='{answer_col}'")

    # Built once: the stopword list does not change between rows, and reading
    # it from the corpus on every iteration is needlessly slow.
    stop_words = set(stopwords.words('english'))

    def _cell_text(value):
        # A missing cell is read as NaN; str(NaN) would store the literal
        # text "nan" as a question/answer, so map missing values to "".
        return "" if pd.isna(value) else str(value)

    processed_data = []
    for idx, row in df.iterrows():
        question = _cell_text(row[question_col])
        answer = _cell_text(row[answer_col])

        # Tokenize with the punkt-free tokenizer defined in this notebook.
        question_tokens = simple_tokenize(question)
        answer_tokens = simple_tokenize(answer)

        # Stopword comparison is case-insensitive; kept tokens retain their case.
        question_filtered = [w for w in question_tokens if w.lower() not in stop_words]
        answer_filtered = [w for w in answer_tokens if w.lower() not in stop_words]

        processed_data.append({
            "id": idx,
            "question": {
                "text": question,
                "tokens": question_tokens,
                "filtered_tokens": question_filtered
            },
            "answer": {
                "text": answer,
                "tokens": answer_tokens,
                "filtered_tokens": answer_filtered
            }
        })

    # Write to JSON file
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)

    print(f"Successfully processed {len(processed_data)} QA pairs")
    print(f"Output saved to: {output_json_path}")

    return processed_data

In [54]:
if __name__ == "__main__":
    # Preprocessing step: turn the raw QA CSV into the tokenized JSON file.
    input_csv = "medquad.csv"  # Input CSV path, relative to the working directory
    output_json = "processed_medquad.json"
    
    # Best-effort preview of the first 5 rows. A failure here is only printed;
    # execution still continues to process_qa_csv, which reads the file again.
    try:
        df_preview = pd.read_csv(input_csv, nrows=5)
        print("CSV Preview (first 5 rows):")
        print(df_preview)
    except Exception as e:
        print(f"Error previewing CSV: {e}")
    
    processed_data = process_qa_csv(input_csv, output_json)
    
    # Show one processed record as a sanity check (skipped when the list is empty).
    if processed_data:
        print(f"First entry sample: {processed_data[0]}")

CSV Preview (first 5 rows):
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  
Reading CSV from: medquad.csv
Columns in CSV: ['question', 'answer', 'source', 'focus_area']
Using columns: Question='question', Answer='answer'
Successfully processed 1641

In [55]:
def load_processed_data(json_path):
    """Read the processed QA records back from a UTF-8 JSON file.

    Prints the path being loaded and returns whatever the file decodes to.
    """
    print(f"Loading data from: {json_path}")
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [56]:
def build_qa_retrieval_model(data, model_dir="qa_model"):
    """Fit a TF-IDF index over the question texts and persist it with pickle.

    Args:
        data: Iterable of records exposing ``['question']['text']`` and
            ``['answer']['text']``.
        model_dir: Directory (created if missing) that receives
            ``tfidf_vectorizer.pkl``, ``question_vectors.pkl`` and
            ``answers.pkl``.

    Returns:
        Dict with the fitted ``'vectorizer'``, the ``'question_vectors'``
        matrix and the ``'answers'`` list, in the same row order as ``data``.
    """
    # Pull the raw texts out of the records.
    questions = [record['question']['text'] for record in data]
    answers = [record['answer']['text'] for record in data]

    # Vocabulary is capped at the 5000 top features.
    print("Training TF-IDF vectorizer on questions...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    question_vectors = tfidf_vectorizer.fit_transform(questions)

    os.makedirs(model_dir, exist_ok=True)

    # One pickle file per artifact, written in this order.
    artifacts = (
        ('tfidf_vectorizer.pkl', tfidf_vectorizer),
        ('question_vectors.pkl', question_vectors),
        ('answers.pkl', answers),
    )
    for filename, artifact in artifacts:
        with open(os.path.join(model_dir, filename), 'wb') as out_file:
            pickle.dump(artifact, out_file)

    print(f"Model saved to directory: {model_dir}")

    return {
        'vectorizer': tfidf_vectorizer,
        'question_vectors': question_vectors,
        'answers': answers
    }

In [57]:
def evaluate_qa_model(model, data, test_size=0.2):
    """Measure nearest-question retrieval accuracy on a held-out split.

    The QA pairs are split with a fixed ``random_state`` of 42. Each held-out
    question is matched to its most similar training question by cosine
    similarity, and that question's answer counts as correct only when it
    equals the held-out answer ignoring case.

    Only ``model['vectorizer']`` is used, and it is applied as-is (it is not
    refitted on the training split); ``model['question_vectors']`` is not read.

    Args:
        model: Dict providing a fitted ``'vectorizer'``.
        data: Records exposing ``['question']['text']`` and
            ``['answer']['text']``.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        Fraction of held-out questions whose retrieved answer matched.
    """
    all_questions = [pair['question']['text'] for pair in data]
    all_answers = [pair['answer']['text'] for pair in data]

    split = train_test_split(
        all_questions, all_answers, test_size=test_size, random_state=42
    )
    train_questions, test_questions, train_answers, test_answers = split

    vectorizer = model['vectorizer']
    train_matrix = vectorizer.transform(train_questions)

    total = len(test_questions)
    correct = 0

    print(f"Evaluating on {total} test questions...")

    for seen, (query, expected) in enumerate(zip(test_questions, test_answers), start=1):
        # Score this query against every training question.
        query_vector = vectorizer.transform([query])
        scores = cosine_similarity(query_vector, train_matrix).flatten()

        # The best-scoring training question supplies the predicted answer.
        predicted_answer = train_answers[np.argmax(scores)]

        # Simple case-insensitive string match.
        if predicted_answer.lower() == expected.lower():
            correct += 1

        # Progress report every 100 questions.
        if seen % 100 == 0:
            print(f"Processed {seen}/{total} test questions")

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f} ({correct}/{total})")

    return accuracy

def answer_question(question, model_dir="qa_model", top_k=3):
    """Return the stored answers whose questions best match ``question``.

    The vectorizer, question matrix and answer list are reloaded from
    ``model_dir`` on every call.

    Args:
        question: Free-text question to look up.
        model_dir: Directory containing ``tfidf_vectorizer.pkl``,
            ``question_vectors.pkl`` and ``answers.pkl``.
        top_k: Maximum number of results to return (default 3). Fewer are
            returned when fewer questions are stored.

    Returns:
        List of dicts with keys ``'similarity'`` (cosine similarity to the
        stored question) and ``'answer'``, ordered from most to least similar.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # Slicing with [-0:] would return *every* entry, so reject it explicitly.
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only point model_dir at artifacts produced by a trusted run.
    with open(os.path.join(model_dir, 'tfidf_vectorizer.pkl'), 'rb') as f:
        vectorizer = pickle.load(f)

    with open(os.path.join(model_dir, 'question_vectors.pkl'), 'rb') as f:
        question_vectors = pickle.load(f)

    with open(os.path.join(model_dir, 'answers.pkl'), 'rb') as f:
        answers = pickle.load(f)

    # Vectorize the new question with the stored vocabulary.
    question_vector = vectorizer.transform([question])

    # Similarity of the new question to every known question.
    similarities = cosine_similarity(question_vector, question_vectors).flatten()

    # argsort is ascending: take the last top_k entries and reverse them so
    # the best match comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]

    return [
        {'similarity': similarities[idx], 'answer': answers[idx]}
        for idx in top_indices
    ]

In [58]:
if __name__ == "__main__":
    # Load the tokenized QA records from the JSON file
    processed_json = "processed_medquad.json"
    qa_data = load_processed_data(processed_json)
    
    # Fit the TF-IDF index and pickle its artifacts into the default "qa_model" directory
    model = build_qa_retrieval_model(qa_data)
    
    # Report retrieval accuracy on a held-out split (default test_size of 0.2);
    # the returned accuracy value is not stored here
    evaluate_qa_model(model, qa_data)

Loading data from: processed_medquad.json
Training TF-IDF vectorizer on questions...
Model saved to directory: qa_model
Evaluating on 3283 test questions...
Processed 100/3283 test questions
Processed 200/3283 test questions
Processed 300/3283 test questions
Processed 400/3283 test questions
Processed 500/3283 test questions
Processed 600/3283 test questions
Processed 700/3283 test questions
Processed 800/3283 test questions
Processed 900/3283 test questions
Processed 1000/3283 test questions
Processed 1100/3283 test questions
Processed 1200/3283 test questions
Processed 1300/3283 test questions
Processed 1400/3283 test questions
Processed 1500/3283 test questions
Processed 1600/3283 test questions
Processed 1700/3283 test questions
Processed 1800/3283 test questions
Processed 1900/3283 test questions
Processed 2000/3283 test questions
Processed 2100/3283 test questions
Processed 2200/3283 test questions
Processed 2300/3283 test questions
Processed 2400/3283 test questions
Processed 25

In [63]:
 # Example of using the model to answer questions
print("\nTesting the model with sample questions:")
# Sample queries, each passed on its own to answer_question below.
test_questions = [
        "What are the symptoms of diabetes?",
        "How is pneumonia diagnosed?",
        "What treatments are available for arthritis?"
    ]
    
for question in test_questions:
    print(f"\nQuestion: {question}")
    # Uses answer_question's default model_dir; the pickled artifacts are
    # reloaded from disk on every call.
    results = answer_question(question)
    
    print("Top answers:")
    for i, result in enumerate(results):
        print(f"{i+1}. (Similarity: {result['similarity']:.4f})")
        # Answers longer than 200 characters are cut to 200 and suffixed with "...".
        print(f"   {result['answer'][:200]}..." if len(result['answer']) > 200 else f"   {result['answer']}")


Testing the model with sample questions:

Question: What are the symptoms of diabetes?
Top answers:
1. (Similarity: 1.0000)
   Many people with diabetes experience one or more symptoms, including extreme thirst or hunger, a frequent need to urinate and/or fatigue. Some lose weight without trying. Additional signs include sore...
2. (Similarity: 1.0000)
   Diabetes is often called a "silent" disease because it can cause serious complications even before you have symptoms. Symptoms can also be so mild that you dont notice them. An estimated 8 million peo...
3. (Similarity: 0.7529)
   Type 1 diabetes, which used to be called called juvenile diabetes or insulin-dependent diabetes, develops most often in young people. However, type 1 diabetes can also develop in adults. With this for...

Question: How is pneumonia diagnosed?
Top answers:
1. (Similarity: 0.6221)
   Pneumonia (nu-MO-ne-ah) is an infection in one or both of the lungs. Many germssuch as bacteria, viruses, and fungican cause pn