## Prepare the model

### Import libraries

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
import joblib

### Read the dataset

In [20]:
data = pd.read_csv('Doctors_Specialty_Recommendation/Original_Dataset.csv')
data.head(5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


## Data Preprocessing

### extracts the values, and assigns the appropriate subset to the "Symptoms" column.

In [21]:
symptoms = []

for i in range(len(data)):
    value = data.iloc[i].values.tolist()
    if 0 in value:
        symptoms.append(value[1:value.index(0)])
    else:
        symptoms.append(value[1:])

data["Symptoms"] = symptoms

data.head(5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Symptoms
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,"[itching, skin_rash, nodal_skin_eruptions, ..."
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,"[ skin_rash, nodal_skin_eruptions, dischromi..."
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,"[itching, nodal_skin_eruptions, dischromic _..."
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,"[itching, skin_rash, dischromic _patches, na..."
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,"[itching, skin_rash, nodal_skin_eruptions, n..."


### Transforming Symptom Data into Binary Format

In [22]:
column_values = data[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17']].values.ravel()

symps = pd.unique(column_values).tolist()
symps = [i for i in symps if str(i) != "nan"]

# Creating a DataFrame with binary indicator columns for each symptom
new_data = pd.DataFrame(0, columns=symps, index=data.index)
new_data["Disease"] = data["Disease"]

# Setting the binary indicator values
for col in symps:
    new_data[col] = data['Symptoms'].apply(lambda x: 1 if col in x else 0)

print(new_data.head())

   itching   skin_rash   nodal_skin_eruptions   dischromic _patches  \
0        1           1                      1                     1   
1        0           1                      1                     1   
2        1           0                      1                     1   
3        1           1                      0                     1   
4        1           1                      1                     0   

    continuous_sneezing   shivering   chills   watering_from_eyes  \
0                     0           0        0                    0   
1                     0           0        0                    0   
2                     0           0        0                    0   
3                     0           0        0                    0   
4                     0           0        0                    0   

    stomach_pain   acidity  ...   foul_smell_of urine  \
0              0         0  ...                     0   
1              0         0  ...             

## Training the model

### Splitting the data into features (X) and target variable (y)

In [23]:
X = new_data.drop('Disease', axis=1)
y = new_data['Disease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Initializing and fitting the logistic regression model

In [24]:
model = LogisticRegression(max_iter=1000) #add max_iter to avoid warnings.
model.fit(X_train, y_train)

## Save the trained model and symps list

In [25]:
joblib.dump(model, 'doctor_specialty_model.joblib')
joblib.dump(symps, 'symptom_list.joblib')

['symptom_list.joblib']

## Calculating accuracy

### Calculate accuracy, precision, recall, and F1-score for each class.

In [26]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1-score for each class
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Create a DataFrame to display the metrics
metrics_df = pd.DataFrame({'Class': model.classes_,
                           'Precision': precision,
                           'Recall': recall,
                           'F1-score': f1})

print(metrics_df)

Accuracy: 1.0
                                      Class  Precision  Recall  F1-score
0   (vertigo) Paroymsal  Positional Vertigo        1.0     1.0       1.0
1                                      AIDS        1.0     1.0       1.0
2                                      Acne        1.0     1.0       1.0
3                       Alcoholic hepatitis        1.0     1.0       1.0
4                                   Allergy        1.0     1.0       1.0
5                                 Arthritis        1.0     1.0       1.0
6                          Bronchial Asthma        1.0     1.0       1.0
7                      Cervical spondylosis        1.0     1.0       1.0
8                               Chicken pox        1.0     1.0       1.0
9                       Chronic cholestasis        1.0     1.0       1.0
10                              Common Cold        1.0     1.0       1.0
11                                   Dengue        1.0     1.0       1.0
12                                Dia