In [8]:
import requests
from bs4 import BeautifulSoup
import csv

homepage_url = "https://www.nhsinform.scot/illnesses-and-conditions/a-to-z/"

response = requests.get(homepage_url)
soup = BeautifulSoup(response.content, 'html.parser')

disease_links = []
for h2 in soup.find_all('h2'):
    ul = h2.find_next('ul')
    if ul:
        for li in ul.find_all('li'):
            a_tag = li.find('a')
            if a_tag:
                disease_name = a_tag.text.strip()
                disease_link = a_tag['href']
                disease_links.append((disease_name, disease_link))

csv_file = "disease_links.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Disease Name', 'Link']) 
    for disease_name, disease_link in disease_links:
        writer.writerow([disease_name, disease_link])

print(f"Disease names and links have been saved to {csv_file}")

Disease names and links have been saved to disease_links.csv


In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

csv_file = "disease_links.csv" 
data = pd.read_csv(csv_file)

results_list = [] 

def extract_symptoms(disease_name, url):
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

    
        headings = soup.find_all(["h2", "h3"]) 
        symptoms_start = None
        symptoms_end = None

    
        def get_text_including_strong(tag):
            if tag:
                strong_text = ''.join(strong.get_text() for strong in tag.find_all('strong'))
                return f"{tag.get_text()} {strong_text}".strip()
            return ''

    
        for heading in headings:
            heading_text = get_text_including_strong(heading)
            if "What are the symptoms of" in heading_text or "Symptoms" in heading_text:
                symptoms_start = heading
            if symptoms_start and ("What are the causes of" in heading_text or "Causes" in heading_text):
                symptoms_end = heading
                break

        if not symptoms_start:
            print(f"Symptoms section for {disease_name} not found.")
            return None

    
        symptoms_content = []
        for sibling in symptoms_start.find_next_siblings():
            if sibling == symptoms_end:
                break
            if sibling.name in ["p", "ul", "ol"]:
                symptoms_content.append(sibling.get_text())

        return "\n".join(symptoms_content)
    except Exception as e:
        print(f"Error extracting symptoms for {disease_name}: {e}")
        return None

for index, row in data.iterrows():
    disease_name = row['disease_name']
    url = row['link']
    
    print(f"Extracting symptoms for {disease_name} from {url}...")
    symptoms = extract_symptoms(disease_name, url)
    
    if symptoms:
        print(f"Symptoms of {disease_name}:\n{symptoms}")
        results_list.append({'Disease Name': disease_name, 'Symptoms': symptoms})
    else:
        print(f"Symptoms not found for {disease_name}.")
        results_list.append({'Disease Name': disease_name, 'Symptoms': ''})

results = pd.DataFrame(results_list)
results.to_csv('extracted_symptoms.csv', index=False)

print("Extraction completed and saved to 'extracted_symptoms.csv'.")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Extracting symptoms for Abdominal aortic aneurysm from https://www.nhsinform.scot/illnesses-and-conditions/cardiovascular-disease/heart-disease/abdominal-aortic-aneurysm/...
Symptoms of Abdominal aortic aneurysm:
In most cases, an AAA causes no noticeable symptoms. However, if it becomes large, some people may develop a pain or a pulsating feeling in their abdomen (tummy) or persistent back pain.
An AAA doesn’t usually pose a serious threat to health, but there’s a risk that a larger aneurysm could burst (rupture).
A ruptured aneurysm can cause massive internal bleeding, which is usually fatal. Around 8 out of 10 people with a rupture either die before they reach hospital or don’t survive surgery.
The most common symptom of a ruptured aortic aneurysm is sudden and severe pain in the abdomen.
If you suspect that you or someone else has had a ruptured aneurysm, call 999 immediately and ask for an ambulance.
Extracting symptoms for Achillies tendinopathy from https://www.nhsinform.scot/il

NLP

In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

# Load evidence and condition JSON files
with open('C:/Users/edwar/Desktop/ICLINIC/CODE/datasets/release_evidences.json') as f:
    evidence_data = json.load(f)

with open('C:/Users/edwar/Desktop/ICLINIC/CODE/datasets/release_conditions.json') as f:
    condition_data = json.load(f)

# Load training dataset
train_data = pd.read_csv('C:/Users/edwar/Desktop/ICLINIC/CODE/datasets/release_train_patients/release_train_patients.csv')

# Map symptom name to evidence code
symptom_to_evidence_map = {evidence_data[key]['question_en'].lower(): key for key in evidence_data}

# Load pre-trained NLP model for named entity recognition (NER) to extract symptoms from user input
symptom_extractor = pipeline('ner', model='deepset/roberta-base-squad2')

# Define dataset class for PyTorch
class MedicalDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return torch.tensor(self.data[index], dtype=torch.float32), torch.tensor(self.labels[index], dtype=torch.long)

# Neural network model
class MedicalNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(MedicalNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Function to map symptoms to evidence codes
def symptom_to_evidence(symptom):
    return symptom_to_evidence_map.get(symptom.lower(), 'E_0')  # Default to 'E_0' if symptom not found

# Function to encode evidence codes to numerical form
def encode_evidences(evidence_codes, max_len):
    evidence_encoding = {f'E_{i}': i for i in range(1, 1000)}  # Example encoding for E_1 to E_999
    encoded = [evidence_encoding.get(code, 0) for code in evidence_codes]
    return encoded[:max_len] + [0] * (max_len - len(encoded))  # Pad with 0s if necessary

# Prepare data for training
def prepare_data():
    features = []
    labels = []
    max_len = 10  # Max number of evidences for each condition

    for _, row in train_data.iterrows():
        evidences = eval(row['EVIDENCES'])  # Extract evidence list
        encoded_evidences = encode_evidences(evidences, max_len)
        features.append(encoded_evidences)
        labels.append(row['DIFFERENTIAL_DIAGNOSIS'].split(',')[0].strip('"'))  # Get the first diagnosis as label

    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    
    return np.array(features), np.array(encoded_labels), label_encoder

# Prepare training data
X, y, label_encoder = prepare_data()
dataset = MedicalDataset(X, y)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize neural network, optimizer, and loss function
input_size = X.shape[1]
output_size = len(np.unique(y))
model = MedicalNetwork(input_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
def train(model, train_loader, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for data, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

# Train the model
train(model, train_loader, epochs=10)

# Prediction function based on user query
def predict_disease_from_query(query):
    # Step 1: Extract symptoms using NLP model
    extracted_symptoms = symptom_extractor(query)
    symptoms = [entity['word'].lower() for entity in extracted_symptoms]

    # Step 2: Convert symptoms to evidence codes
    evidence_codes = [symptom_to_evidence(symptom) for symptom in symptoms]
    
    # Step 3: Encode evidence codes to the format required by the model
    max_len = 10  # Same as used in training
    encoded_evidences = torch.tensor([encode_evidences(evidence_codes, max_len)], dtype=torch.float32)
    
    # Step 4: Predict disease using the trained neural network model
    model.eval()
    with torch.no_grad():
        output = model(encoded_evidences)
        predicted_label = torch.argmax(output, dim=1).item()
    
    # Step 5: Decode the predicted label to the disease name
    predicted_disease = label_encoder.inverse_transform([predicted_label])[0]
    
    return predicted_disease

# Example usage
user_query = "I've been facing weight loss and coughing"
predicted_disease = predict_disease_from_query(user_query)
print(f"Predicted disease: {predicted_disease}]]")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm





Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 3.5030
Epoch 2/10, Loss: 3.5009
Epoch 3/10, Loss: 3.5005
Epoch 4/10, Loss: 3.5003
Epoch 5/10, Loss: 3.5002
Epoch 6/10, Loss: 3.5002
Epoch 7/10, Loss: 3.5001
Epoch 8/10, Loss: 3.5001
Epoch 9/10, Loss: 3.5000
Epoch 10/10, Loss: 3.5000
Predicted disease: [['Bronchite']]


In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

# Load datasets
disease_graph = pd.read_csv('C:/Users/edwar/Desktop/FOLDERS/ICLINIC/CODE/disease_graph.csv')
disease_symptoms = pd.read_csv('C:/Users/edwar/Desktop/FOLDERS/ICLINIC/CODE/disease_symptoms.csv')

# Map disease to symptoms from disease_graph.csv
disease_symptom_map = disease_graph.groupby('diseases')['symptoms'].apply(list).to_dict()

# Create symptom-to-disease map for reverse lookup
symptom_to_disease_map = {}
for disease, symptoms in disease_symptom_map.items():
    for symptom in symptoms:
        symptom_to_disease_map[symptom.lower()] = disease

# Load pre-trained NLP model for named entity recognition (NER) to extract symptoms from user input
symptom_extractor = pipeline('ner', model='deepset/roberta-base-squad2')

# Define dataset class for PyTorch
class MedicalDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return torch.tensor(self.data[index], dtype=torch.float32), torch.tensor(self.labels[index], dtype=torch.long)

# Neural network model
class MedicalNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(MedicalNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Function to map symptoms to diseases
def symptom_to_disease(symptom):
    return symptom_to_disease_map.get(symptom.lower(), 'Unknown')

# Function to encode symptoms as input features
def encode_symptoms(symptoms, max_len):
    # Create a symptom encoding dictionary
    symptom_encoding = {symptom: idx for idx, symptom in enumerate(symptom_to_disease_map.keys())}
    
    # Encode the symptoms
    encoded = [symptom_encoding.get(symptom, 0) for symptom in symptoms]
    
    # Pad the encoded symptoms with 0s if necessary
    return encoded[:max_len] + [0] * (max_len - len(encoded))

# Prepare data for training
def prepare_data():
    features = []
    labels = []
    max_len = 10  # Max number of symptoms for each condition

    for _, row in disease_symptoms.iterrows():
        symptoms = row[1]  # Extract the symptom list
        encoded_symptoms = encode_symptoms(symptoms, max_len)
        features.append(encoded_symptoms)
        labels.append(row[0])  # Disease name

    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    
    return np.array(features), np.array(encoded_labels), label_encoder

# Prepare training data
X, y, label_encoder = prepare_data()
dataset = MedicalDataset(X, y)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize neural network, optimizer, and loss function
input_size = X.shape[1]
output_size = len(np.unique(y))
model = MedicalNetwork(input_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
def train(model, train_loader, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for data, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

# Train the model
train(model, train_loader, epochs=10)

# Prediction function based on user query
def predict_disease_from_query(query):
    # Step 1: Extract symptoms using NLP model
    extracted_symptoms = symptom_extractor(query)
    symptoms = [entity['word'].lower() for entity in extracted_symptoms]

    # Step 2: Map extracted symptoms to disease-relevant symptoms
    mapped_symptoms = [symptom_to_disease(symptom) for symptom in symptoms]
    
    # Step 3: Encode symptoms to the format required by the model
    max_len = 10  # Same as used in training
    encoded_symptoms = torch.tensor([encode_symptoms(mapped_symptoms, max_len)], dtype=torch.float32)
    
    # Step 4: Predict disease using the trained neural network model
    model.eval()
    with torch.no_grad():
        output = model(encoded_symptoms)
        predicted_label = torch.argmax(output, dim=1).item()
    
    # Step 5: Decode the predicted label to the disease name
    predicted_disease = label_encoder.inverse_transform([predicted_label])[0]
    
    return predicted_disease

# Example usage
user_query = "I've been facing weight loss and coughing"
predicted_disease = predict_disease_from_query(user_query)
print(f"Predicted disease: {predicted_disease}")

  from .autonotebook import tqdm as notebook_tqdm





Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  symptoms = row[1]  # Extract the symptom list
  labels.append(row[0])  # Disease name


Epoch 1/10, Loss: 4.9971
Epoch 2/10, Loss: 4.9963
Epoch 3/10, Loss: 4.9941
Epoch 4/10, Loss: 4.9948
Epoch 5/10, Loss: 4.9935
Epoch 6/10, Loss: 4.9935
Epoch 7/10, Loss: 4.9924
Epoch 8/10, Loss: 4.9916
Epoch 9/10, Loss: 4.9930
Epoch 10/10, Loss: 4.9936
Predicted disease: gout
