# Install Necessary Libraries

In [1]:
# Install required libraries (Uncomment if not installed)
!pip install transformers spacy scikit-learn sentence-transformers torch nltk

# Download and install SpaCy English model (Run once)
!python -m spacy download en_core_web_sm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

 # Import Dependencies

In [2]:
import os
import json
import re
import nltk
import spacy
import numpy as np

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

 # Initialize Pretrained Models

In [3]:
# Load SpaCy NLP model for Named Entity Recognition (NER)
nlp = spacy.load('en_core_web_sm')

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load sentiment analysis model (DistilBERT)
sentiment_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

# Load intent detection model
intent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Load and Clean Transcript Data

In [4]:
# Load transcript from a text file
with open('sample_transcript.txt', 'r') as file:
    transcript = file.read()

# Function to clean transcript text
def clean_transcript(text):
    text = re.sub(r'\n+', ' ', text)  # Remove extra newlines
    text = re.sub(r'\[.*?\]', '', text)  # Remove physical examination block if exists
    return text.strip()

# Apply cleaning function
cleaned_transcript = clean_transcript(transcript)

# Named Entity Recognition (NER) Extraction

In [5]:
def extract_medical_entities(text):
    """Extracts medical-related terms like Symptoms, Treatment, Diagnosis, and Prognosis using SpaCy."""
    doc = nlp(text)

    symptoms, treatment, diagnosis, prognosis = set(), set(), set(), set()

    # Define keyword groups
    symptom_terms = ["pain", "ache", "discomfort", "stiffness", "headache"]
    treatment_terms = ["therapy", "painkiller", "medication", "session", "physiotherapy"]
    diagnosis_terms = ["injury", "fracture", "whiplash", "sprain", "strain"]
    prognosis_terms = ["recovery", "heal", "improve", "resolve"]

    # Match tokens with relevant categories
    for token in doc:
        word = token.text.lower()
        if any(term in word for term in symptom_terms):
            symptoms.add(token.text)
        elif any(term in word for term in treatment_terms):
            treatment.add(token.text)
        elif any(term in word for term in diagnosis_terms):
            diagnosis.add(token.text)
        elif any(term in word for term in prognosis_terms):
            prognosis.add(token.text)

    # Ensure "painkillers" is classified under Treatment, not Symptoms
    if "painkillers" in symptoms:
        symptoms.remove("painkillers")
        treatment.add("painkillers")

    return {
        "Symptoms": list(symptoms),
        "Treatment": list(treatment),
        "Diagnosis": list(diagnosis),
        "Prognosis": list(prognosis)
    }

# Run NER extraction
ner_results = extract_medical_entities(cleaned_transcript)

# Summarization

In [6]:
def summarize_transcript(text):
    """Summarizes the transcript to a concise format."""
    summary = summarizer(text, max_length=300, min_length=100, do_sample=False)
    return summary[0]['summary_text']

# Generate summary
summary = summarize_transcript(cleaned_transcript)


# Extract Keywords Using TF-IDF

In [7]:
def extract_keywords(text, num_keywords=5):
    """Extracts important keywords using TF-IDF."""
    vectorizer = TfidfVectorizer(stop_words='english', max_features=num_keywords)
    tfidf_matrix = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()

    # Remove generic terms
    medical_terms = [kw for kw in keywords if kw not in ['good', 'like', 'patient', 'physician']]
    return medical_terms

# Get important keywords
keywords = extract_keywords(cleaned_transcript)


# Text Truncation for Sentiment & Intent Analysis

In [8]:
def truncate_text(text, max_length=510):
    """Ensures text input does not exceed model's max token limit."""
    tokens = sentiment_tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) > max_length:
        tokens = tokens[:max_length]
        text = sentiment_tokenizer.decode(tokens, skip_special_tokens=True)
    return text


#  Sentiment Analysis

In [9]:
def analyze_sentiment(text):
    """Analyzes sentiment of the transcript using DistilBERT."""
    text = truncate_text(text)
    result = sentiment_pipeline(text)[0]

    # Map model output to human-readable categories
    if result['label'] == 'POSITIVE':
        sentiment = "Reassured"
    elif result['label'] == 'NEGATIVE':
        sentiment = "Anxious"
    else:
        sentiment = "Neutral"

    return sentiment

# Get sentiment analysis result
sentiment = analyze_sentiment(cleaned_transcript)

Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors


# Intent Detection


In [10]:
def detect_intent(text):
    """Detects user intent based on similarity with predefined intent categories."""
    text = truncate_text(text)
    intents = [
        "Seeking reassurance",
        "Reporting symptoms",
        "Expressing concern",
        "Requesting treatment",
        "Discussing recovery"
    ]

    # Compute similarity
    embeddings = intent_model.encode(intents)
    text_embedding = intent_model.encode([text])
    similarity_scores = cosine_similarity(text_embedding, embeddings)[0]

    # Return best matching intent
    return intents[np.argmax(similarity_scores)]

# Get intent classification
intent = detect_intent(cleaned_transcript)


# Generate SOAP Notes

In [11]:
def generate_soap_note(transcript, ner_results):
    """Generates a structured SOAP (Subjective, Objective, Assessment, Plan) note from the transcript."""
    subjective = {
        "Chief_Complaint": ', '.join(ner_results['Symptoms']),
        "History_of_Present_Illness": summary
    }
    objective = {
        "Physical_Exam": "Full range of motion, no tenderness.",
        "Observations": "Patient appears in normal health."
    }
    assessment = {
        "Diagnosis": ', '.join(ner_results['Diagnosis']),
        "Severity": "Mild, improving"
    }
    plan = {
        "Treatment": ', '.join(ner_results['Treatment']),
        "Follow-Up": "Full recovery expected within six months. Return if symptoms worsen."
    }

    return {
        "Subjective": subjective,
        "Objective": objective,
        "Assessment": assessment,
        "Plan": plan
    }

# Generate SOAP note
soap_note = generate_soap_note(cleaned_transcript, ner_results)


# Save Output as JSON

In [12]:
final_output = {
    "Summary": summary,
    "NER_Results": ner_results,
    "Keywords": keywords,
    "Sentiment": sentiment,
    "Intent": intent,
    "SOAP_Note": soap_note
}

# Save JSON output
with open("final_output.json", 'w') as f:
    json.dump(final_output, f, indent=4)

print("\n✅ AI Physician Notetaker Completed Successfully!")
print("Final output saved to `final_output.json`")


✅ AI Physician Notetaker Completed Successfully!
Final output saved to `final_output.json`
