In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/Healthcare.csv")  # adjust name if needed
df.head(), df.shape


(   Patient_ID  Age  Gender                                           Symptoms  \
 0           1   29    Male              fever, back pain, shortness of breath   
 1           2   76  Female                   insomnia, back pain, weight loss   
 2           3   78    Male                    sore throat, vomiting, diarrhea   
 3           4   58   Other  blurred vision, depression, weight loss, muscl...   
 4           5   55  Female                    swelling, appetite loss, nausea   
 
    Symptom_Count           Disease  
 0              3           Allergy  
 1              3  Thyroid Disorder  
 2              3         Influenza  
 3              4            Stroke  
 4              3     Heart Disease  ,
 (25000, 6))

In [3]:
def parse_symptoms(sym_str):
    return [s.strip().lower() for s in sym_str.split(",")]

df["Symptom_List"] = df["Symptoms"].astype(str).apply(parse_symptoms)
df[["Disease", "Symptom_List"]].head()


Unnamed: 0,Disease,Symptom_List
0,Allergy,"[fever, back pain, shortness of breath]"
1,Thyroid Disorder,"[insomnia, back pain, weight loss]"
2,Influenza,"[sore throat, vomiting, diarrhea]"
3,Stroke,"[blurred vision, depression, weight loss, musc..."
4,Heart Disease,"[swelling, appetite loss, nausea]"


In [4]:
symptoms = sorted({s for lst in df["Symptom_List"] for s in lst})
diseases = sorted(df["Disease"].unique())

len(symptoms), len(diseases)


(28, 30)

In [5]:
M = pd.DataFrame(0, index=diseases, columns=symptoms)

for disease, lst in zip(df["Disease"], df["Symptom_List"]):
    for s in lst:
        M.loc[disease, s] += 1

M.shape


(30, 28)

In [6]:
M_norm = M.div(M.sum(axis=1), axis=0).fillna(0)
M_norm.shape


(30, 28)

In [7]:
from sklearn.cluster import KMeans

k = 6
km = KMeans(n_clusters=k, random_state=42, n_init="auto")
labels = km.fit_predict(M_norm)

cluster_sizes = pd.Series(labels).value_counts().sort_index()
cluster_sizes


Unnamed: 0,count
0,9
1,6
2,1
3,1
4,8
5,5


In [8]:
from sklearn.metrics.pairwise import euclidean_distances

centroids = km.cluster_centers_

singleton_clusters = cluster_sizes[cluster_sizes == 1].index.tolist()
non_singleton_clusters = cluster_sizes[cluster_sizes > 1].index.tolist()

labels_merged = labels.copy()

for sc in singleton_clusters:
    outlier_disease = M_norm.index[labels == sc][0]
    outlier_vec = M_norm.loc[outlier_disease].values.reshape(1, -1)

    dists = euclidean_distances(outlier_vec, centroids)[0]
    target = min(non_singleton_clusters, key=lambda c: dists[c])

    print(f"Merging {outlier_disease} from cluster {sc} → cluster {target}")
    labels_merged[M_norm.index.get_loc(outlier_disease)] = target


Merging Arthritis from cluster 2 → cluster 1
Merging Hypertension from cluster 3 → cluster 4


In [9]:
unique_clusters = sorted(np.unique(labels_merged))
cluster_remap = {old:i for i, old in enumerate(unique_clusters)}

labels_final = np.array([cluster_remap[x] for x in labels_merged])

pd.Series(labels_final).value_counts().sort_index()


Unnamed: 0,count
0,9
1,7
2,9
3,5


In [10]:
disease_to_group = {
    disease: group
    for disease, group in zip(M_norm.index, labels_final)
}

df["Disease_Group"] = df["Disease"].map(disease_to_group)
df["Disease_Group"].value_counts()


Unnamed: 0_level_0,count
Disease_Group,Unnamed: 1_level_1
2,7462
0,7397
1,6015
3,4126


In [11]:
df["Symptom_Text"] = df["Symptom_List"].apply(
    lambda lst: " ".join(s.replace(" ", "_") for s in lst)
)


In [12]:
symptom_index = {s:i for i,s in enumerate(symptoms)}

def multi_hot(lst):
    v = np.zeros(len(symptoms), dtype=np.int32)
    for s in lst:
        v[symptom_index[s]] = 1
    return v

df["Symptom_MultiHot"] = df["Symptom_List"].apply(multi_hot)


In [13]:
df["Symptom_Seq"] = df["Symptom_List"].apply(
    lambda lst: [symptom_index[s] for s in lst]
)

max_len = df["Symptom_Seq"].apply(len).max()
max_len


7

In [14]:
from sklearn.model_selection import train_test_split

y = df["Disease_Group"]

train_idx, temp_idx = train_test_split(
    df.index, test_size=0.30, random_state=42, stratify=y
)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.50, random_state=42, stratify=y.loc[temp_idx]
)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

X_tfidf = tfidf.fit_transform(df["Symptom_Text"])
y = df["Disease_Group"]


In [16]:
X_train = X_tfidf[train_idx]
X_val   = X_tfidf[val_idx]
X_test  = X_tfidf[test_idx]

y_train = y.loc[train_idx]
y_val   = y.loc[val_idx]
y_test  = y.loc[test_idx]


In [17]:
from sklearn.svm import LinearSVC

svm = LinearSVC(C=1.0, max_iter=5000)
svm.fit(X_train, y_train)


In [18]:
from sklearn.metrics import accuracy_score

val_pred = svm.predict(X_val)
test_pred = svm.predict(X_test)

val_acc = accuracy_score(y_val, val_pred)
test_acc = accuracy_score(y_test, test_pred)

print("SVM (TF-IDF) Validation Accuracy:", val_acc)
print("SVM (TF-IDF) Test Accuracy:", test_acc)


SVM (TF-IDF) Validation Accuracy: 0.2850666666666667
SVM (TF-IDF) Test Accuracy: 0.2872


In [19]:
results = {}

results["SVM (TF-IDF)"] = {
    "val_acc": val_acc,
    "test_acc": test_acc
}

results


{'SVM (TF-IDF)': {'val_acc': 0.2850666666666667, 'test_acc': 0.2872}}

In [20]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Features
X_mh = np.stack(df["Symptom_MultiHot"].values)

# Labels
y_cat = to_categorical(df["Disease_Group"], num_classes=4)

X_train = X_mh[train_idx]
X_val   = X_mh[val_idx]
X_test  = X_mh[test_idx]

y_train = y_cat[train_idx]
y_val   = y_cat[val_idx]
y_test  = y_cat[test_idx]


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

ffnn = Sequential([
    Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(4, activation="softmax")
])

ffnn.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

ffnn.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
history = ffnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=128,
    verbose=1
)


Epoch 1/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.2778 - loss: 1.3823 - val_accuracy: 0.3133 - val_loss: 1.3602
Epoch 2/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3030 - loss: 1.3652 - val_accuracy: 0.3080 - val_loss: 1.3601
Epoch 3/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3056 - loss: 1.3609 - val_accuracy: 0.3099 - val_loss: 1.3600
Epoch 4/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3128 - loss: 1.3597 - val_accuracy: 0.3053 - val_loss: 1.3604
Epoch 5/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3133 - loss: 1.3595 - val_accuracy: 0.3069 - val_loss: 1.3605
Epoch 6/50
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3082 - loss: 1.3585 - val_accuracy: 0.3067 - val_loss: 1.3605
Epoch 7/50
[1m137/137[0m 

In [23]:
val_loss, val_acc = ffnn.evaluate(X_val, y_val, verbose=0)
test_loss, test_acc = ffnn.evaluate(X_test, y_test, verbose=0)

print("FFNN Validation Accuracy:", val_acc)
print("FFNN Test Accuracy:", test_acc)


FFNN Validation Accuracy: 0.2922666668891907
FFNN Test Accuracy: 0.29973334074020386


In [24]:
results["FFNN (Multi-hot)"] = {
    "val_acc": float(val_acc),
    "test_acc": float(test_acc)
}

results


{'SVM (TF-IDF)': {'val_acc': 0.2850666666666667, 'test_acc': 0.2872},
 'FFNN (Multi-hot)': {'val_acc': 0.2922666668891907,
  'test_acc': 0.29973334074020386}}

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_seq = df["Symptom_Seq"].tolist()

X_train = pad_sequences([X_seq[i] for i in train_idx], maxlen=max_len)
X_val   = pad_sequences([X_seq[i] for i in val_idx],   maxlen=max_len)
X_test  = pad_sequences([X_seq[i] for i in test_idx],  maxlen=max_len)

y_train = y_cat[train_idx]
y_val   = y_cat[val_idx]
y_test  = y_cat[test_idx]


In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

rnn = Sequential([
    Embedding(input_dim=len(symptoms), output_dim=16, input_length=max_len),
    SimpleRNN(32),
    Dropout(0.3),
    Dense(4, activation="softmax")
])

rnn.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

rnn.summary()




In [27]:
history = rnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=256,
    verbose=1
)


Epoch 1/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.2904 - loss: 1.3715 - val_accuracy: 0.2928 - val_loss: 1.3615
Epoch 2/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.3052 - loss: 1.3619 - val_accuracy: 0.2933 - val_loss: 1.3613
Epoch 3/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3008 - loss: 1.3631 - val_accuracy: 0.2893 - val_loss: 1.3615
Epoch 4/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3162 - loss: 1.3576 - val_accuracy: 0.2936 - val_loss: 1.3622
Epoch 5/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3122 - loss: 1.3588 - val_accuracy: 0.2899 - val_loss: 1.3625
Epoch 6/100
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3118 - loss: 1.3561 - val_accuracy: 0.2899 - val_loss: 1.3631
Epoch 7/100
[1m69/69[0m [32m━

In [28]:
val_loss, val_acc = rnn.evaluate(X_val, y_val, verbose=0)
test_loss, test_acc = rnn.evaluate(X_test, y_test, verbose=0)

print("RNN Validation Accuracy:", val_acc)
print("RNN Test Accuracy:", test_acc)


RNN Validation Accuracy: 0.2917333245277405
RNN Test Accuracy: 0.30533334612846375


In [29]:
results["RNN (Sequence)"] = {
    "val_acc": float(val_acc),
    "test_acc": float(test_acc)
}

results


{'SVM (TF-IDF)': {'val_acc': 0.2850666666666667, 'test_acc': 0.2872},
 'FFNN (Multi-hot)': {'val_acc': 0.2922666668891907,
  'test_acc': 0.29973334074020386},
 'RNN (Sequence)': {'val_acc': 0.2917333245277405,
  'test_acc': 0.30533334612846375}}

In [30]:
import tensorflow as tf
tf.keras.backend.clear_session()


In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Raw sequences
X_seq_all = df["Symptom_Seq"].tolist()

# Build splits
X_train = pad_sequences(
    [X_seq_all[i] for i in train_idx],
    maxlen=max_len,
    padding="post"
)

X_val = pad_sequences(
    [X_seq_all[i] for i in val_idx],
    maxlen=max_len,
    padding="post"
)

X_test = pad_sequences(
    [X_seq_all[i] for i in test_idx],
    maxlen=max_len,
    padding="post"
)

# Labels
y_train = y_cat[train_idx]
y_val   = y_cat[val_idx]
y_test  = y_cat[test_idx]


In [32]:
X_train = X_train.astype("int32")
X_val   = X_val.astype("int32")
X_test  = X_test.astype("int32")

y_train = y_train.astype("float32")
y_val   = y_val.astype("float32")
y_test  = y_test.astype("float32")


In [33]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_train dtype:", X_train.dtype)
print("Unique tokens in X_train:", np.unique(X_train[:5]))


X_train shape: (17500, 7)
y_train shape: (17500, 4)
X_train dtype: int32
Unique tokens in X_train: [ 0  1  3  6  9 11 14 15 21 25 26 27]


In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

lstm = Sequential()
lstm.add(
    Embedding(
        input_dim=len(symptoms),
        output_dim=16,
        input_length=max_len
    )
)
lstm.add(LSTM(32))
lstm.add(Dropout(0.3))
lstm.add(Dense(4, activation="softmax"))

lstm.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

lstm.summary()


In [35]:
history = lstm.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=128,
    verbose=1
)


Epoch 1/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.2925 - loss: 1.3696 - val_accuracy: 0.2963 - val_loss: 1.3613
Epoch 2/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2971 - loss: 1.3627 - val_accuracy: 0.2987 - val_loss: 1.3617
Epoch 3/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2951 - loss: 1.3606 - val_accuracy: 0.2984 - val_loss: 1.3612
Epoch 4/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3074 - loss: 1.3603 - val_accuracy: 0.2968 - val_loss: 1.3606
Epoch 5/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2997 - loss: 1.3609 - val_accuracy: 0.3016 - val_loss: 1.3607
Epoch 6/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3098 - loss: 1.3566 - val_accuracy: 0.3059 - val_loss: 1.3612
Epoch 7/20
[1m137/137[0m 

In [36]:
val_loss, val_acc = lstm.evaluate(X_val, y_val, verbose=0)
test_loss, test_acc = lstm.evaluate(X_test, y_test, verbose=0)

print("LSTM Validation Accuracy:", val_acc)
print("LSTM Test Accuracy:", test_acc)


LSTM Validation Accuracy: 0.29600000381469727
LSTM Test Accuracy: 0.30880001187324524


In [37]:
results["LSTM (Sequence)"] = {
    "val_acc": float(val_acc),
    "test_acc": float(test_acc)
}

results


{'SVM (TF-IDF)': {'val_acc': 0.2850666666666667, 'test_acc': 0.2872},
 'FFNN (Multi-hot)': {'val_acc': 0.2922666668891907,
  'test_acc': 0.29973334074020386},
 'RNN (Sequence)': {'val_acc': 0.2917333245277405,
  'test_acc': 0.30533334612846375},
 'LSTM (Sequence)': {'val_acc': 0.29600000381469727,
  'test_acc': 0.30880001187324524}}

In [38]:
group_to_diseases = (
    df.groupby("Disease_Group")["Disease"]
      .unique()
      .apply(list)
      .to_dict()
)

group_to_diseases


{0: ['Allergy',
  'Heart Disease',
  'Dermatitis',
  'Sinusitis',
  "Parkinson's",
  'Epilepsy',
  'Tuberculosis',
  'Common Cold',
  'Chronic Kidney Disease'],
 1: ['Food Poisoning',
  'Arthritis',
  'Dementia',
  'Liver Disease',
  'IBS',
  'Migraine',
  'Anxiety'],
 2: ['Thyroid Disorder',
  'Influenza',
  'Stroke',
  'Bronchitis',
  'Asthma',
  'Depression',
  'Pneumonia',
  'Ulcer',
  'Hypertension'],
 3: ['COVID-19', 'Diabetes', 'Obesity', 'Gastritis', 'Anemia']}

In [39]:
def predict_svm(symptom_text):
    X = tfidf.transform([symptom_text])
    group = int(svm.predict(X)[0])
    return group


In [40]:
def predict_ffnn(symptom_text):
    symptoms_input = [s.strip().lower() for s in symptom_text.split(",")]
    vec = np.zeros(len(symptoms), dtype=np.int32)
    for s in symptoms_input:
        if s in symptom_index:
            vec[symptom_index[s]] = 1

    probs = ffnn.predict(vec.reshape(1, -1), verbose=0)[0]
    return int(np.argmax(probs))


In [41]:
def predict_rnn(symptom_text):
    symptoms_input = [s.strip().lower() for s in symptom_text.split(",")]
    seq = [symptom_index[s] for s in symptoms_input if s in symptom_index]
    seq = pad_sequences([seq], maxlen=max_len, padding="post")
    seq = seq.astype("int32")

    probs = rnn.predict(seq, verbose=0)[0]
    return int(np.argmax(probs))


In [42]:
def predict_lstm(symptom_text):
    symptoms_input = [s.strip().lower() for s in symptom_text.split(",")]
    seq = [symptom_index[s] for s in symptoms_input if s in symptom_index]
    seq = pad_sequences([seq], maxlen=max_len, padding="post")
    seq = seq.astype("int32")

    probs = lstm.predict(seq, verbose=0)[0]
    return int(np.argmax(probs))


In [43]:
def demo_all_models(symptom_text):
    results = {}

    results["SVM (TF-IDF)"] = predict_svm(symptom_text)
    results["FFNN (Multi-hot)"] = predict_ffnn(symptom_text)
    results["RNN (Sequence)"] = predict_rnn(symptom_text)
    results["LSTM (Sequence)"] = predict_lstm(symptom_text)

    for model, group in results.items():
        print(f"\n{model}: Group {group}")
        print("Diseases:", ", ".join(group_to_diseases[group]))


In [44]:
demo_all_models("fever, cough, fatigue")



SVM (TF-IDF): Group 0
Diseases: Allergy, Heart Disease, Dermatitis, Sinusitis, Parkinson's, Epilepsy, Tuberculosis, Common Cold, Chronic Kidney Disease

FFNN (Multi-hot): Group 0
Diseases: Allergy, Heart Disease, Dermatitis, Sinusitis, Parkinson's, Epilepsy, Tuberculosis, Common Cold, Chronic Kidney Disease

RNN (Sequence): Group 0
Diseases: Allergy, Heart Disease, Dermatitis, Sinusitis, Parkinson's, Epilepsy, Tuberculosis, Common Cold, Chronic Kidney Disease

LSTM (Sequence): Group 0
Diseases: Allergy, Heart Disease, Dermatitis, Sinusitis, Parkinson's, Epilepsy, Tuberculosis, Common Cold, Chronic Kidney Disease
