## Prepare the model

### Import libraries

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib

### Read the dataset

In [7]:
# Load the raw disease/symptom table and preview its first rows.
dataset_path = 'Doctors_Specialty_Recommendation/Original_Dataset.csv'
data = pd.read_csv(dataset_path)
print("Dataset loaded successfully.")
print(data.head(n=5))

Dataset loaded successfully.
            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_

## Data Preprocessing

In [8]:
# Use 0 as the placeholder for every missing cell, then confirm that no
# nulls are left in any column.
data = data.fillna(0)
print("Missing values replaced with 0.")
print(data.isna().sum())

# Normalise the column labels: lower-case, with spaces turned into underscores.
data = data.rename(columns=lambda label: label.lower().replace(' ', '_'))
print("Column names preprocessed.")

Missing values replaced with 0.
Disease       0
Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64
Column names preprocessed.


### Extract each row's symptom values and store them as a list in the `symptoms` column

In [9]:
# Build a per-row list of symptoms from the symptom_* columns only.
# Selecting those columns by name (rather than slicing the whole row by
# position) keeps this cell re-runnable: once the list-valued "symptoms"
# column exists, a positional slice would pick it up for rows that contain
# no 0 placeholder, and the .strip() clean-up below would fail on it.
symptom_columns = [name for name in data.columns if name.lower().startswith("symptom_")]


def _symptoms_before_placeholder(row_values):
    """Return the entries of ``row_values`` that precede the first 0.

    Missing cells were replaced with 0 earlier in the notebook, so the first
    0 in a row marks the end of that row's symptoms. A row without any 0 is
    returned unchanged.
    """
    if 0 in row_values:
        return row_values[:row_values.index(0)]
    return row_values


symptoms = [
    _symptoms_before_placeholder(row_values)
    for row_values in data[symptom_columns].values.tolist()
]

# Add the Symptoms column to the dataset
data["symptoms"] = symptoms
print("Symptoms column added successfully.")
print(data.head(5))

# Clean the symptoms column: drop the whitespace surrounding each raw value
data["symptoms"] = data["symptoms"].apply(lambda x: [symptom.strip() for symptom in x])
print("Symptoms column cleaned.")
print(data["symptoms"].head())

Symptoms column added successfully.
            disease   symptom_1              symptom_2              symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              symptom_4 symptom_5 symptom_6 symptom_7 symptom_8 symptom_9  \
0   dischromic _patches         0         0         0         0         0   
1                     0         0         0         0         0         0   
2                     0         0         0         0         0         0   
3                     0         0         0         0         0         0   
4                     0         0         0         0         0         0   

  symptom_10 s

### Transforming Symptom Data into Binary Format

In [10]:
def _normalize_symptom(name):
    """Return the canonical feature name for a raw symptom string.

    Surrounding whitespace is removed, the text is lower-cased, inner spaces
    become underscores and any leading underscores are dropped.
    """
    return name.strip().lower().replace(' ', '_').lstrip('_')


# Collect every value that appears in any symptom_* column.
symptom_columns = [name for name in data.columns if name.lower().startswith("symptom_")]
column_values = data[symptom_columns].values.ravel()

# Keep only real symptom strings; the 0 placeholders written for missing
# cells (and any NaN) are not strings and are dropped here.
symps = [value for value in pd.unique(column_values).tolist() if isinstance(value, str)]
print("Filtered symps list (strings only):", symps)

# Preprocess symptoms. dict.fromkeys removes names that collapse to the same
# canonical form while keeping first-seen order, so no feature column is
# duplicated.
symps = list(dict.fromkeys(_normalize_symptom(symptom) for symptom in symps))
print("Unique symptoms extracted:", symps)

# Normalise each row's symptoms with the SAME function used for the column
# names. The row lists are only whitespace-stripped upstream, so a symptom
# with an inner space (e.g. "dischromic _patches") would otherwise never
# match its normalised column name and that feature would be 0 in every row.
row_symptom_sets = [
    {_normalize_symptom(symptom) for symptom in row_symptoms}
    for row_symptoms in data['symptoms']
]

# Create a binary DataFrame for symptoms: 1 if the row has the symptom, else 0
new_data = pd.DataFrame(
    {col: [int(col in present) for present in row_symptom_sets] for col in symps},
    index=data.index,
)
new_data["disease"] = data["disease"]

print("Binary DataFrame created successfully.")
print(new_data.head())

Filtered symps list (strings only): ['itching', ' skin_rash', ' nodal_skin_eruptions', ' dischromic _patches', ' continuous_sneezing', ' shivering', ' chills', ' watering_from_eyes', ' stomach_pain', ' acidity', ' ulcers_on_tongue', ' vomiting', ' cough', ' chest_pain', ' yellowish_skin', ' nausea', ' loss_of_appetite', ' abdominal_pain', ' yellowing_of_eyes', ' burning_micturition', ' spotting_ urination', ' passage_of_gases', ' internal_itching', ' indigestion', ' muscle_wasting', ' patches_in_throat', ' high_fever', ' extra_marital_contacts', ' fatigue', ' weight_loss', ' restlessness', ' lethargy', ' irregular_sugar_level', ' blurred_and_distorted_vision', ' obesity', ' excessive_hunger', ' increased_appetite', ' polyuria', ' sunken_eyes', ' dehydration', ' diarrhoea', ' breathlessness', ' family_history', ' mucoid_sputum', ' headache', ' dizziness', ' loss_of_balance', ' lack_of_concentration', ' stiff_neck', ' depression', ' irritability', ' visual_disturbances', ' back_pain', ' 

## Training the model

### Splitting the data into features (X) and target variable (y)

In [11]:
# Target is the disease label; every remaining column is a binary symptom feature.
y = new_data['disease']
X = new_data.drop(columns=['disease'])

# Hold out 20% of the rows for testing, stratified by disease, with a fixed seed.
# NOTE(review): the evaluation cell further down reports an accuracy of 1.0.
# If new_data contains repeated rows, identical samples can land in both
# splits; worth checking new_data.duplicated().sum() before trusting it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Data split into training and testing sets.")

Data split into training and testing sets.


### Initializing and fitting the logistic regression model

In [12]:
# L2-regularised logistic regression with balanced class weights, fitted on
# the training split (fit() returns the estimator itself).
model = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
).fit(X_train, y_train)
print("Model trained successfully.")

Model trained successfully.


## Save the trained model and the symptom list

In [13]:
# Persist the fitted model first, then the symptom (feature-name) list.
artifacts_to_save = (
    (model, 'doctor_specialty_model.joblib'),
    (symps, 'symptom_list.joblib'),
)
for artifact, target_file in artifacts_to_save:
    joblib.dump(artifact, target_file)
print("Model and symptom list saved successfully.")

Model and symptom list saved successfully.


## Calculating accuracy

### Calculate accuracy, precision, recall, and F1-score for each class.

In [14]:
# Overall accuracy on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1-score for each class.
# labels= pins the order and length of every score array to model.classes_.
# Without it scikit-learn derives the label set from y_test/y_pred, so a class
# missing from both would make the arrays shorter than model.classes_ and the
# DataFrame construction below would fail. zero_division=0 scores such a
# class as 0 instead of warning.
class_labels = model.classes_
precision = precision_score(y_test, y_pred, labels=class_labels, average=None, zero_division=0)
recall = recall_score(y_test, y_pred, labels=class_labels, average=None, zero_division=0)
f1 = f1_score(y_test, y_pred, labels=class_labels, average=None, zero_division=0)

# Display metrics, one row per class
metrics_df = pd.DataFrame({'Class': class_labels,
                           'Precision': precision,
                           'Recall': recall,
                           'F1-score': f1})
print("Model evaluation metrics:")
print(metrics_df)

Accuracy: 1.0
Model evaluation metrics:
                                      Class  Precision  Recall  F1-score
0   (vertigo) Paroymsal  Positional Vertigo        1.0     1.0       1.0
1                                      AIDS        1.0     1.0       1.0
2                                      Acne        1.0     1.0       1.0
3                       Alcoholic hepatitis        1.0     1.0       1.0
4                                   Allergy        1.0     1.0       1.0
5                                 Arthritis        1.0     1.0       1.0
6                          Bronchial Asthma        1.0     1.0       1.0
7                      Cervical spondylosis        1.0     1.0       1.0
8                               Chicken pox        1.0     1.0       1.0
9                       Chronic cholestasis        1.0     1.0       1.0
10                              Common Cold        1.0     1.0       1.0
11                                   Dengue        1.0     1.0       1.0
12         

In [15]:
# Sanity checks on the processed frame: remaining nulls per column, a preview
# of the per-row symptom lists, and the distinct disease labels.
sanity_reports = (
    ("Missing values in the dataset:\n", data.isna().sum()),
    ("Symptoms column data:\n", data["symptoms"].head()),
    ("Disease column unique values:\n", data["disease"].unique()),
)
for report_title, report_value in sanity_reports:
    print(report_title, report_value)

Missing values in the dataset:
 disease       0
symptom_1     0
symptom_2     0
symptom_3     0
symptom_4     0
symptom_5     0
symptom_6     0
symptom_7     0
symptom_8     0
symptom_9     0
symptom_10    0
symptom_11    0
symptom_12    0
symptom_13    0
symptom_14    0
symptom_15    0
symptom_16    0
symptom_17    0
symptoms      0
dtype: int64
Symptoms column data:
 0    [itching, skin_rash, nodal_skin_eruptions, dis...
1    [skin_rash, nodal_skin_eruptions, dischromic _...
2    [itching, nodal_skin_eruptions, dischromic _pa...
3            [itching, skin_rash, dischromic _patches]
4           [itching, skin_rash, nodal_skin_eruptions]
Name: symptoms, dtype: object
Disease column unique values:
 ['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 