In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# 1. Load and clean data
# The spreadsheet holds one row per case: symptom indicator columns plus a
# 'Disease' column that is used as the prediction target.
df = pd.read_excel('dataset/Specialist.xlsx')
df.columns = df.columns.str.strip()  # header cells may carry stray whitespace
# Collapse spelling variants of the same label so they encode to one class.
# NOTE(review): the captured evaluation output still lists a class spelled
# 'Osteoarthristis' -- confirm whether it should be normalized here as well.
df['Disease'] = df['Disease'].str.strip().replace({
    'Dermatologists': 'Dermatologist',
    'Rheumatologists': 'Rheumatologist',
    'Internal Medcine': 'Internal Medicine'
})

# 2. Prepare features and target
# errors='ignore' keeps this working when the index-like 'Unnamed: 0' column
# is absent from the sheet.
X = df.drop(columns=['Unnamed: 0', 'Disease'], errors='ignore')
y = df['Disease']

# Map the string labels to integer codes; `le` is needed later to decode
# predictions back to label names.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# NOTE(review): the split is not stratified; consider stratify=y_encoded once
# it is confirmed that every class has at least two rows.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# 3. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 4. Predict and evaluate
# NOTE(review): a perfect hold-out score is suspicious -- check whether
# identical rows appear in both train and test (e.g. df.duplicated().sum()).
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
# Pass the full list of encoded labels explicitly: without `labels`, the report
# raises when the random test split happens to miss a class, because the
# number of classes found would no longer match len(target_names).
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

# Function to predict specialist for new symptoms
def predict_specialist(symptoms_list):
    """Predict the specialist label for a set of symptom names.

    Parameters
    ----------
    symptoms_list : list of str, or a single str
        Symptom names. Names equal to a column of the training matrix ``X``
        are switched on (set to 1); names that match no column are ignored.
        A bare string is treated as one symptom name rather than being
        iterated character by character.

    Returns
    -------
    The class label decoded by ``le.inverse_transform`` for the model's
    prediction on the resulting one-row feature frame.
    """
    # Guard against predict_specialist('itching'): iterating a str would test
    # each character against the columns and silently classify an all-zero row.
    if isinstance(symptoms_list, str):
        symptoms_list = [symptoms_list]

    # One all-zero row laid out exactly like the training features.
    input_data = pd.DataFrame(0, index=[0], columns=X.columns)
    for s in symptoms_list:
        if s in input_data.columns:
            input_data[s] = 1

    pred_idx = model.predict(input_data)[0]
    return le.inverse_transform([pred_idx])[0]

# Example Usage:
# print(predict_specialist(['itching', 'skin_rash']))

Accuracy: 1.00
                    precision    recall  f1-score   support

         Allergist       1.00      1.00      1.00        48
      Cardiologist       1.00      1.00      1.00        48
       Common Cold       1.00      1.00      1.00        23
     Dermatologist       1.00      1.00      1.00       110
   Endocrinologist       1.00      1.00      1.00        92
Gastroenterologist       1.00      1.00      1.00       118
      Gynecologist       1.00      1.00      1.00        26
      Hepatologist       1.00      1.00      1.00       179
 Internal Medicine       1.00      1.00      1.00        73
       Neurologist       1.00      1.00      1.00        72
   Osteoarthristis       1.00      1.00      1.00        22
       Osteopathic       1.00      1.00      1.00        30
  Otolaryngologist       1.00      1.00      1.00        18
      Pediatrician       1.00      1.00      1.00        19
      Phlebologist       1.00      1.00      1.00        22
     Pulmonologist      

In [8]:
# Smoke test: classify a hand-written list of symptom names and print the
# decoded label (names are expected to match feature column names).
print(predict_specialist(['headache', 'cough', 'mild_fever']))

Neurologist


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1 - read the spreadsheet again; only its header row matters here.
df = pd.read_excel('dataset/Specialist.xlsx')
df.columns = df.columns.str.strip()
# Every column except the index-like one and the target is a symptom name.
dataset_symptoms = list(
    filter(lambda name: name not in ('Unnamed: 0', 'Disease'), df.columns)
)

# Step 2 - human-readable spelling of each symptom
# (underscores become spaces, e.g. 'skin_rash' -> 'skin rash').
clean_symptoms = [name.replace('_', ' ') for name in dataset_symptoms]

# Step 3 - character n-gram TF-IDF over the symptom phrases.
# Character 2- to 4-grams let near-spellings such as 'itchy' / 'itching'
# share features. Fitting and transforming happen in one call.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
symptom_vectors = vectorizer.fit_transform(clean_symptoms)

def nlp_to_dataset_symptoms(user_input, threshold=0.3):
    """Map free-text input to dataset symptom column names.

    The text is lower-cased and split on punctuation and common conjunctions.
    Each segment longer than two characters is vectorized with the
    module-level ``vectorizer`` and compared (cosine similarity) against
    ``symptom_vectors``; the best-scoring symptom is kept when its score
    exceeds ``threshold``.

    Parameters
    ----------
    user_input : str
        Free-text description of symptoms.
    threshold : float, default 0.3
        Minimum similarity (exclusive) for a segment's best match to count.

    Returns
    -------
    list of str
        Unique entries of ``dataset_symptoms`` in order of first appearance
        in the text, so the result is reproducible across runs.
    """
    user_input = user_input.lower()

    # Conjunctions may be surrounded by any whitespace (spaces, tabs, line
    # breaks); punctuation, ';' and newlines also separate segments.
    parts = re.split(
        r'\s+(?:and|with|having|including|plus)\s+|[,.!?;\n]', user_input
    )
    segments = [seg for seg in (part.strip() for part in parts) if len(seg) > 2]

    found_symptoms = []
    for seg in segments:
        seg_vec = vectorizer.transform([seg])
        scores = cosine_similarity(seg_vec, symptom_vectors).flatten()

        # Only the single best match per segment is considered.
        best_match_idx = int(np.argmax(scores))
        if scores[best_match_idx] > threshold:
            found_symptoms.append(dataset_symptoms[best_match_idx])

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # return the names in an order that can change between interpreter runs.
    return list(dict.fromkeys(found_symptoms))



# Now you can pass 'extracted' to the specialist prediction function:
# predicted_specialist = predict_specialist(extracted)

In [7]:
# Free-text example to run through the symptom extractor.
user_sentence = "I have fever and  cough with mild headache "
extracted = nlp_to_dataset_symptoms(user_sentence)

# Show the raw sentence next to the dataset symptom names it was mapped to.
print(
    f"User said: '{user_sentence}'",
    f"Detected Symptoms: {extracted}",
    sep="\n",
)
# Optional follow-up (left disabled): classify the detected symptoms.
# spec = predict_specialist(extracted)
# print(f"recommended specialist: {spec}")


User said: 'I have fever and  cough with mild headache '
Detected Symptoms: ['headache', 'cough', 'mild_fever']


In [4]:
import joblib

# Persist the fitted classifier and the label encoder so another process can
# reload them instead of retraining.
joblib.dump(model, "model.pkl")
joblib.dump(le, "label_encoder.pkl")
# Feature names are saved as a plain list, in training column order.
# As the cell's last expression, the return value of this call (the list of
# written file names) is echoed as the cell output.
joblib.dump(X.columns.tolist(), "feature_columns.pkl")


['feature_columns.pkl']

In [5]:
from fastapi import FastAPI
import joblib
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests

app = FastAPI()

# Restore the artifacts written by the training notebook cell.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# produced by a trusted source.
model = joblib.load("model.pkl")
le = joblib.load("label_encoder.pkl")
feature_columns = joblib.load("feature_columns.pkl")

# Build the matching vocabulary from the stored feature names: underscores
# become spaces, then the phrases are embedded as character 2- to 4-gram
# TF-IDF vectors (fit and transform in a single call).
clean_symptoms = [name.replace('_', ' ') for name in feature_columns]
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
symptom_vectors = vectorizer.fit_transform(clean_symptoms)


def nlp_to_dataset_symptoms(user_input, threshold=0.3):
    """Map free-text input to feature column names.

    The text is lower-cased and split on punctuation and common conjunctions.
    Each segment longer than two characters is vectorized with the
    module-level ``vectorizer`` and compared (cosine similarity) against
    ``symptom_vectors``; the best-scoring feature is kept when its score
    exceeds ``threshold``.

    Parameters
    ----------
    user_input : str
        Free-text description of symptoms.
    threshold : float, default 0.3
        Minimum similarity (exclusive) for a segment's best match to count.

    Returns
    -------
    list of str
        Unique entries of ``feature_columns`` in order of first appearance
        in the text, so the response is reproducible across runs.
    """
    user_input = user_input.lower()

    # Conjunctions may be surrounded by any whitespace (spaces, tabs, line
    # breaks); punctuation, ';' and newlines also separate segments.
    parts = re.split(
        r'\s+(?:and|with|having|including|plus)\s+|[,.!?;\n]', user_input
    )
    segments = [seg for seg in (part.strip() for part in parts) if len(seg) > 2]

    found_symptoms = []
    for seg in segments:
        seg_vec = vectorizer.transform([seg])
        scores = cosine_similarity(seg_vec, symptom_vectors).flatten()

        # Only the single best match per segment is considered.
        best_match_idx = int(np.argmax(scores))
        if scores[best_match_idx] > threshold:
            found_symptoms.append(feature_columns[best_match_idx])

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # return the names in an order that can change between interpreter runs.
    return list(dict.fromkeys(found_symptoms))


@app.post("/predict")
def predict(data: dict):
    """Recommend a specialist for a free-text symptom description.

    Request body
    ------------
    JSON object with a string field ``"sentence"``.

    Returns
    -------
    dict
        ``detected_symptoms``: feature names matched in the sentence.
        ``recommended_specialist``: decoded model prediction, or ``None``
        when no symptom was detected (the model is not queried in that case).

    Raises
    ------
    HTTPException (422)
        If ``"sentence"`` is missing or is not a string.
    """
    # Imported locally so this function does not depend on edits to the
    # import cell; fastapi is already a dependency of this module.
    from fastapi import HTTPException

    user_sentence = data.get("sentence")
    if not isinstance(user_sentence, str):
        # Previously a missing key surfaced as an unhandled KeyError (HTTP 500).
        raise HTTPException(
            status_code=422,
            detail="Request body must contain a string field 'sentence'.",
        )

    extracted = nlp_to_dataset_symptoms(user_sentence)

    # With nothing detected the feature row would be all zeros; the classifier
    # would still return some label, which would be a misleading recommendation.
    if not extracted:
        return {
            "detected_symptoms": [],
            "recommended_specialist": None
        }

    # One all-zero row in training column order, with detected symptoms set to 1.
    input_data = pd.DataFrame(0, index=[0], columns=feature_columns)
    for s in extracted:
        input_data[s] = 1

    pred_idx = model.predict(input_data)[0]
    specialist = le.inverse_transform([pred_idx])[0]

    return {
        "detected_symptoms": extracted,
        "recommended_specialist": specialist
    }
