<a href="https://colab.research.google.com/github/763730440/Healthcare-Int/blob/main/%E2%80%9Ctask2_ipynb%E2%80%9D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers
!pip install rank_bm25

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:
from google.colab import files
uploaded = files.upload()

Saving QA1.json to QA1.json


In [None]:
from google.colab import files
uploaded = files.upload()

Saving english-train.json to english-train.json


In [None]:
from google.colab import files
uploaded = files.upload()

Saving english-test.json to english-test.json


In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import BertTokenizer, BertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader, random_split

# Load dataset 加载数据集
def load_dataset(dataset_path):
    """Read and parse the JSON file at ``dataset_path``.

    Args:
        dataset_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON. The error is
            printed before being re-raised.
    """
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        try:
            parsed = json.load(handle)
        except json.JSONDecodeError as err:
            # Report the problem, but let the caller decide how to recover.
            print(f"Error loading JSON file: {err}")
            raise
    return parsed

def split_dataset(dataset, train_ratio=0.8):
    """Split ``dataset`` into a leading training part and a trailing test part.

    The split is positional (no shuffling): the first
    ``int(len(dataset) * train_ratio)`` items go to the training list and the
    remaining items go to the test list.

    Args:
        dataset: Indexable collection supporting ``len()``.
        train_ratio: Fraction of the items assigned to the training list.

    Returns:
        A ``(train_items, test_items)`` tuple of lists.
    """
    positions = range(len(dataset))
    cut = int(len(dataset) * train_ratio)
    # Slicing the range keeps ordinary slice semantics for out-of-range cuts.
    leading = [dataset[pos] for pos in positions[:cut]]
    trailing = [dataset[pos] for pos in positions[cut:]]
    return leading, trailing

def save_dataset(dataset, path):
    """Write ``dataset`` to ``path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than ``\\u``-escaped, and
    nested structures are indented by four spaces.

    Args:
        dataset: JSON-serialisable value to store.
        path: Destination file path; an existing file is overwritten.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(path, 'w', encoding='utf-8') as out_file:
        json.dump(dataset, out_file, **dump_options)

# Load the pretrained models shared by the functions below.
# - sentence_model: sentence-embedding model; main() passes it to
#   find_relevant_context() and predict_disease() for similarity search.
# - tokenizer / bert_model: BERT tokenizer and question-answering model, both
#   loaded from the 'prajjwal1/bert-tiny' checkpoint and fine-tuned in main().
# NOTE: these calls run at import time, before main() is invoked.
sentence_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
sentence_model = SentenceTransformer(sentence_model_name)
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
bert_model = BertForQuestionAnswering.from_pretrained('prajjwal1/bert-tiny')

# Define dataset class 定义数据集类
class QADataset(Dataset):
    """Dataset that turns patient/doctor dialogues into span-labelled QA examples.

    Each element of ``data`` is expected to be a dict whose ``'utterances'``
    list holds a ``'patient: ...'`` utterance at index 0 and a
    ``'doctor: ...'`` utterance at index 1. The patient text is used as the
    question and the doctor text as the passage; the labelled span is the
    position of the doctor text inside the encoded pair.

    Args:
        data: Sequence of dialogue dicts.
        tokenizer: Tokenizer providing ``encode_plus`` and ``encode``.
        max_length: Length every encoded example is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of dialogues."""
        return len(self.data)

    @staticmethod
    def _locate_answer(input_ids, answer_tokens):
        """Find where ``answer_tokens`` (or its longest surviving prefix) sits.

        When the encoded pair was truncated, only a prefix of the answer is
        present in ``input_ids``; searching for the complete answer would then
        always fail. This helper therefore looks for the longest prefix of
        ``answer_tokens`` occurring in ``input_ids``. On ties the later
        position wins, because the answer follows the question in the encoded
        pair.

        Returns:
            ``(start, end)`` inclusive token indices, or ``(0, 0)`` when no
            answer token is present (including an empty answer).
        """
        total = len(input_ids)
        wanted = len(answer_tokens)
        best_start, best_length = 0, 0
        for offset in range(total):
            matched = 0
            while (matched < wanted
                   and offset + matched < total
                   and input_ids[offset + matched] == answer_tokens[matched]):
                matched += 1
            if matched and matched >= best_length:
                best_start, best_length = offset, matched
        if best_length == 0:
            return 0, 0
        return best_start, best_start + best_length - 1

    def __getitem__(self, idx):
        """Encode dialogue ``idx`` and return its tensors and span labels.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of
            length ``max_length``) plus scalar ``'start_positions'`` and
            ``'end_positions'`` tensors.

        Raises:
            IndexError: If an utterance lacks the expected speaker prefix.
        """
        dialogue = self.data[idx]
        question = dialogue['utterances'][0].split('patient: ')[1]
        answer = dialogue['utterances'][1].split('doctor: ')[1]

        inputs = self.tokenizer.encode_plus(
            question,
            answer,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the batch dimension so the result is always 1-D.
        input_ids = inputs['input_ids'].squeeze(0)
        answer_tokens = self.tokenizer.encode(answer, add_special_tokens=False)

        start_position, end_position = self._locate_answer(
            input_ids.tolist(), answer_tokens
        )

        return {
            'input_ids': input_ids,
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'start_positions': torch.tensor(start_position),
            'end_positions': torch.tensor(end_position)
        }

# Define the model training function 定义模型训练函数
def train_model(model, tokenizer, train_dataset, epochs=10, batch_size=8, learning_rate=2e-5):
    """Fine-tune ``model`` on ``train_dataset`` with AdamW.

    The model is moved to CUDA when available (CPU otherwise), switched to
    training mode and updated in place. After every epoch the loss summed over
    all batches of that epoch is printed.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            start_positions=..., end_positions=...)`` whose output exposes
            ``.loss``.
        tokenizer: Not used by this function; kept for interface compatibility.
        train_dataset: Dataset yielding dicts with ``'input_ids'``,
            ``'attention_mask'``, ``'start_positions'`` and ``'end_positions'``.
        epochs: Number of passes over the data.
        batch_size: Examples per optimisation step.
        learning_rate: AdamW learning rate.
    """
    target = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.to(target)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    label_fields = ('attention_mask', 'start_positions', 'end_positions')
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for batch in loader:
            token_ids = batch['input_ids'].to(target)
            extras = {field: batch[field].to(target) for field in label_fields}

            step_loss = model(token_ids, **extras).loss

            optimizer.zero_grad()
            step_loss.backward()
            optimizer.step()

            running_loss += step_loss.item()

        print(f"Epoch {epoch_number}/{epochs}, Loss: {running_loss}")

# Define model evaluation function 定义模型评估函数
def evaluate_model(model, tokenizer, test_dataset):
    """Compute, print and return the mean per-batch loss on ``test_dataset``.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    evaluation mode; gradients are not tracked. Batches of four examples are
    processed in dataset order.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            start_positions=..., end_positions=...)`` whose output exposes
            ``.loss``.
        tokenizer: Not used by this function; kept for interface compatibility.
        test_dataset: Dataset yielding dicts with ``'input_ids'``,
            ``'attention_mask'``, ``'start_positions'`` and ``'end_positions'``.

    Returns:
        The loss averaged over batches, or ``float('nan')`` when the dataset
        yields no batches (instead of failing with a division by zero).
    """
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    total_loss = 0.0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)

            total_loss += outputs.loss.item()

    num_batches = len(test_dataloader)
    # An empty test set has no meaningful average; report NaN rather than crash.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Test Loss: {avg_loss}")
    return avg_loss

# Define a function to calculate sentence embeddings 定义函数以计算句子嵌入
def get_sentence_embedding(sentence, model):
    """Return the embedding ``model`` produces for ``sentence``.

    Args:
        sentence: Text handed to ``model.encode`` unchanged.
        model: Object exposing an ``encode`` method.

    Returns:
        Whatever ``model.encode(sentence)`` returns.
    """
    embedding = model.encode(sentence)
    return embedding

# Define a function to find the most relevant context 定义函数以查找最相关的上下文
def find_relevant_context(question, dataset, model):
    """Return the doctor utterance most similar to ``question``.

    Every dialogue's second utterance (with its ``'doctor: '`` prefix removed)
    is a candidate. All candidates are embedded in one batched ``encode`` call
    instead of one call per dialogue, then ranked by cosine similarity to the
    question embedding.

    Args:
        question: User question text.
        dataset: Iterable of dialogue dicts with an ``'utterances'`` list.
        model: Sentence-embedding model accepted by ``get_sentence_embedding``.

    Returns:
        The best-scoring candidate text, or ``""`` when ``dataset`` is empty.
        On equal scores the earliest candidate wins.

    Raises:
        IndexError: If a dialogue lacks a second utterance or the
            ``'doctor: '`` prefix.
    """
    contexts = [dialogue['utterances'][1].split('doctor: ')[1] for dialogue in dataset]
    if not contexts:
        return ""

    question_embedding = get_sentence_embedding(question, model)
    # One batched forward pass for all candidates.
    context_embeddings = get_sentence_embedding(contexts, model)

    # Result has shape (1, number_of_contexts); take the single question row.
    scores = util.pytorch_cos_sim(question_embedding, context_embeddings)[0]
    return contexts[int(torch.argmax(scores))]

# Define the function for generating answers 定义生成答案的函数
def generate_answer(question, context, tokenizer, model):
    """Extract an answer span for ``question`` from ``context``.

    The pair is encoded (truncated to 512 tokens), the model's start and end
    logits are read, and the tokens between the predicted start and end are
    decoded. The end position is chosen among positions at or after the start,
    so an inverted span can no longer produce an empty answer. When the
    unconstrained end prediction already lies at or after the start, the
    result is unchanged.

    Args:
        question: Question text.
        context: Passage to extract the answer from.
        tokenizer: Tokenizer providing ``encode_plus`` and ``decode``.
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``.

    Returns:
        The decoded answer text with special tokens removed.
    """
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=512,
        truncation=True
    )

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))

    # Single example in the batch: work on its logits row.
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    start_index = int(torch.argmax(start_logits))
    # Only consider end positions at or after the start so the span is valid.
    end_index = start_index + int(torch.argmax(end_logits[start_index:]))

    answer = tokenizer.decode(input_ids[0][start_index:end_index + 1], skip_special_tokens=True)
    return answer

# Define the function for predicting disease 定义猜测疾病的函数
def predict_disease(description, dataset, model):
    """Return the dataset description most similar to ``description``.

    Every dialogue's ``'description'`` value is a candidate. All candidates are
    embedded in one batched ``encode`` call instead of one call per dialogue,
    then ranked by cosine similarity to the embedding of the user's text.

    Args:
        description: Symptom description entered by the user.
        dataset: Iterable of dialogue dicts with a ``'description'`` key.
        model: Sentence-embedding model accepted by ``get_sentence_embedding``.

    Returns:
        The best-scoring ``'description'`` value, or ``""`` when ``dataset``
        is empty. On equal scores the earliest candidate wins.

    Raises:
        KeyError: If a dialogue has no ``'description'`` key.
    """
    candidates = [dialogue['description'] for dialogue in dataset]
    if not candidates:
        return ""

    description_embedding = get_sentence_embedding(description, model)
    # One batched forward pass for all candidates.
    candidate_embeddings = get_sentence_embedding(candidates, model)

    # Result has shape (1, number_of_candidates); take the single query row.
    scores = util.pytorch_cos_sim(description_embedding, candidate_embeddings)[0]
    return candidates[int(torch.argmax(scores))]

# main program 主程序
def main():
    """Train and evaluate the QA model, then run the interactive console menu.

    Steps:
        1. Load '/content/QA1.json', split it, and save both parts as JSON.
        2. Fine-tune the module-level ``bert_model`` on the training part and
           report the loss on the test part.
        3. Loop over a text menu until the user chooses to exit: option 1
           answers a question from the closest doctor utterance, option 2
           returns the closest stored description for the entered symptoms.
    """
    dataset = load_dataset('/content/QA1.json')
    train_data, test_data = split_dataset(dataset)

    outputs = (
        (train_data, '/content/train_data.json'),
        (test_data, '/content/test_data.json'),
    )
    for records, destination in outputs:
        save_dataset(records, destination)

    train_model(bert_model, tokenizer, QADataset(train_data, tokenizer))
    evaluate_model(bert_model, tokenizer, QADataset(test_data, tokenizer))

    print("Training and evaluation completed!")

    menu_text = "\n1. Seeking advice for known illnesses\n2. Describe the situation and predict the disease\n3. Exit"
    while True:
        print(menu_text)
        choice = input("Please enter your choice: ")

        if choice == '3':
            break

        if choice == '1':
            question = input("Please enter your question: ")
            context = find_relevant_context(question, dataset, sentence_model)
            if not context:
                print("Sorry, I am unable to provide advice based on the information you have provided. If you feel unwell, please contact a doctor!")
                continue
            answer = generate_answer(question, context, tokenizer, bert_model)
            print(f"Answer: {answer}")
            continue

        if choice == '2':
            description = input("Please describe your symptoms: ")
            disease = predict_disease(description, dataset, sentence_model)
            print(f"Robot guessing: {disease}, Robots can only roughly diagnose diseases based on your description. If you feel unwell, please contact a professional doctor immediately.")
            continue

        print("Invalid choice, please try again.")

if __name__ == "__main__":
    main()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 85.8674783706665
Epoch 2/10, Loss: 82.45511102676392
Epoch 3/10, Loss: 78.42881345748901
Epoch 4/10, Loss: 74.5737829208374
Epoch 5/10, Loss: 69.82380104064941
Epoch 6/10, Loss: 65.05723428726196
Epoch 7/10, Loss: 60.12103462219238
Epoch 8/10, Loss: 55.170539140701294
Epoch 9/10, Loss: 50.558276891708374
Epoch 10/10, Loss: 47.18704843521118
Test Loss: 2.8884381907326833
Training and evaluation completed!

1. Seeking advice for known illnesses
2. Describe the situation and predict the disease
3. Exit
Please enter your choice: 1
Please enter your question: Common Cold
Answer: use a humidifier to ease coughing and congestion. over - the - counter cough medicines and expectorants can help. consult a doctor if symptoms persist.

1. Seeking advice for known illnesses
2. Describe the situation and predict the disease
3. Exit
Please enter your choice: Stomach Ulcer
Invalid choice, please try again.

1. Seeking advice for known illnesses
2. Describe the situation and predict t