In [1]:
!pip install transformers torch scikit-learn pandas flask


Collecting flask
  Downloading flask-3.0.3-py3-none-any.whl (101 kB)
     ------------------------------------ 101.7/101.7 kB 983.8 kB/s eta 0:00:00
Collecting Werkzeug>=3.0.0
  Downloading werkzeug-3.0.4-py3-none-any.whl (227 kB)
     -------------------------------------- 227.6/227.6 kB 2.0 MB/s eta 0:00:00
Collecting itsdangerous>=2.1.2
  Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Collecting click>=8.1.3
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
     ---------------------------------------- 97.9/97.9 kB 2.8 MB/s eta 0:00:00
Collecting blinker>=1.6.2
  Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
Successfully installed Werkzeug-3.0.4 blinker-1.8.2 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0



[notice] A new release of pip available: 22.3 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from flask import Flask, request, jsonify
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the disease table from the CSV file next to the notebook.
data = pd.read_csv('disease_data.csv')

# Normalise both free-text columns to lower case.
for text_column in ('Symptoms', 'Disease'):
    data[text_column] = data[text_column].str.lower()


In [5]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
PRETRAINED_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
model = DistilBertModel.from_pretrained(PRETRAINED_NAME)



In [6]:
def get_embedding(text):
    """Embed text by mean-pooling the model's last hidden layer.

    The average over token positions is weighted by the tokenizer's
    attention mask, so padding positions (which appear when a list of
    strings of different lengths is tokenized with ``padding=True``) do not
    dilute the result. For a single string the mask is all ones, so this is
    the plain mean over its tokens.

    Args:
        text: The string (or list of strings) to embed. It is passed
            straight to the module-level ``tokenizer``.

    Returns:
        numpy.ndarray: the pooled embedding with size-1 dimensions squeezed
        out (for a single string this is expected to be a 1-D vector).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state

    # 1.0 for real tokens, 0.0 for padding; the extra trailing axis lets the
    # mask broadcast across the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against division by zero if a row had no unmasked tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_embedding = summed / token_counts
    return sentence_embedding.squeeze().numpy()

# Embed every symptom description once, up front; each cell of the new
# column holds whatever get_embedding returns for that row.
data['Embedding'] = data['Symptoms'].apply(get_embedding)


In [7]:
import numpy as np

def predict_disease(user_input):
    """Return the disease and solution of the best-matching row in ``data``.

    The lower-cased input is embedded with ``get_embedding`` and scored by
    cosine similarity against every vector stored in ``data['Embedding']``.
    The row with the highest score is selected.

    Args:
        user_input: Free-text description from the user; ``.lower()`` is
            called on it, so it must be a string.

    Returns:
        tuple: ``(disease, solution)`` taken from the 'Disease' and
        'Solution' columns of the selected row.
    """
    query_vector = get_embedding(user_input.lower())

    # Stack the per-row vectors into one matrix so all rows are scored in a
    # single cosine_similarity call.
    stored_matrix = np.stack(data['Embedding'].values)
    scores = cosine_similarity([query_vector], stored_matrix)[0]

    best_row = data.iloc[scores.argmax()]
    return best_row['Disease'], best_row['Solution']
