In [42]:
import pandas as pd
data=pd.read_csv('dataset1.csv')
# Replace '0' with NaN to properly handle missing data
data.replace('0', pd.NA, inplace=True)

# Concatenate the symptom columns into one DataFrame
symptom_columns = [f'Symptom_{i}' for i in range(1, 18)]
symptom_data = data[symptom_columns]

# One-hot encode the symptoms
one_hot_encoded_data = pd.get_dummies(symptom_data.stack(), prefix='', prefix_sep='').groupby(level=0).max()

# Combine the one-hot encoded symptoms with the original 'Disease' column
final_data = pd.concat([data['Disease'], one_hot_encoded_data], axis=1)

# Display the first few rows of the final dataset
print(final_data.head())

            Disease   abdominal_pain   abnormal_menstruation   acidity  \
0  Fungal infection            False                   False     False   
1  Fungal infection            False                   False     False   
2  Fungal infection            False                   False     False   
3  Fungal infection            False                   False     False   
4  Fungal infection            False                   False     False   

    acute_liver_failure   altered_sensorium   anxiety   back_pain  \
0                 False               False     False       False   
1                 False               False     False       False   
2                 False               False     False       False   
3                 False               False     False       False   
4                 False               False     False       False   

    belly_pain   blackheads  ...   watering_from_eyes   weakness_in_limbs  \
0        False        False  ...                False          

In [43]:
final_data.columns

Index(['Disease', ' abdominal_pain', ' abnormal_menstruation', ' acidity',
       ' acute_liver_failure', ' altered_sensorium', ' anxiety', ' back_pain',
       ' belly_pain', ' blackheads',
       ...
       ' watering_from_eyes', ' weakness_in_limbs',
       ' weakness_of_one_body_side', ' weight_gain', ' weight_loss',
       ' yellow_crust_ooze', ' yellow_urine', ' yellowing_of_eyes',
       ' yellowish_skin', 'itching'],
      dtype='object', length=132)

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import xgboost as xgb
import joblib
import gzip


# Machine learning model: XGBoost 

# import the dataset
dataset_df = pd.read_csv('dataset1.csv')

# Preprocess
dataset_df = dataset_df.apply(lambda col: col.str.strip())

test = pd.get_dummies(dataset_df.filter(regex='Symptom'), prefix='', prefix_sep='')
test = test.groupby(test.columns, axis=1).agg(np.max)
clean_df = pd.merge(test,dataset_df['Disease'], left_index=True, right_index=True)

clean_df.to_csv('clean_dataset.tsv', sep='\t', index=False)

# Preprocessing
X_data = clean_df.iloc[:,:-1]
y_data = clean_df.iloc[:,-1]

# Convert y to categorical values
y_data = y_data.astype('category')

# Convert y categories tu numbers with encoder
le = preprocessing.LabelEncoder()
le.fit(y_data)

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3)

# Convert labels to numbers
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Init classifier
# model = xgb.XGBClassifier(use_label_encoder=False, reg_alpha=20, reg_lambda=0, eval_metric='mlogloss')

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

# Convert labels to one-hot encoding
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

from keras.layers import Dropout
from keras.regularizers import l2

model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001))),
model.add(Dropout(0.5)),
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.001))),
model.add(Dropout(0.5)),
model.add(Dense(y_train_enc.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_enc, epochs=20, batch_size=20, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_enc)
print(f"MLP Accuracy: {accuracy}")

# Predict
preds = model.predict(X_test)



  test = test.groupby(test.columns, axis=1).agg(np.max)
  test = test.groupby(test.columns, axis=1).agg(np.max)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.0483 - loss: 3.7255 - val_accuracy: 0.2384 - val_loss: 3.3908
Epoch 2/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1272 - loss: 3.3391 - val_accuracy: 0.5194 - val_loss: 2.7909
Epoch 3/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1865 - loss: 2.9723 - val_accuracy: 0.7306 - val_loss: 2.2773
Epoch 4/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2710 - loss: 2.6034 - val_accuracy: 0.8798 - val_loss: 1.8615
Epoch 5/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2833 - loss: 2.4770 - val_accuracy: 0.9244 - val_loss: 1.5901
Epoch 6/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3678 - loss: 2.2456 - val_accuracy: 0.9244 - val_loss: 1.3571
Epoch 7/20
[1m104/104[0m 

In [45]:
true_symptoms = ["weight_loss", "coma","chest_pain"]  # List the symptoms that are true

# Create an array for the test case
test_case = np.zeros(X_data.shape[1])  # Initialize all symptoms to False (0)
for symptom in true_symptoms:
    test_case[X_data.columns.get_loc(symptom)] = 1  # Set True (1) for specified symptoms

# Reshape for prediction
test_case = test_case.reshape(1, -1)

# Predict the disease
predicted_disease_prob= model.predict(test_case)

print("Predicted Disease:", predicted_disease_prob[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Predicted Disease: [0.00535808 0.00554972 0.00585217 0.01073302 0.00579366 0.00578833
 0.01580361 0.0088888  0.01232117 0.01693414 0.00281377 0.01188658
 0.03254469 0.00787353 0.02029233 0.0284342  0.01069858 0.0131532
 0.13548645 0.03148556 0.0144024  0.02526121 0.01782215 0.02838495
 0.06712464 0.04725146 0.01071616 0.04107678 0.11227002 0.00960565
 0.01163432 0.04048258 0.03517056 0.0105496  0.02473863 0.00398487
 0.0432893  0.0032376  0.00441751 0.04786875 0.01301927]


In [46]:
# Step 1: Get the index of the highest probability
disease_index = np.argmax(predicted_disease_prob)

# Step 2: Decode the prediction to get the disease name
predicted_disease = le.inverse_transform([disease_index])

print("Predicted Disease:", predicted_disease[0])

Predicted Disease: Heart attack


In [47]:
# Threshold for classification
threshold = 0.1

# Check if the predicted probability is above the threshold
predicted_probabilities = predicted_disease_prob # From the prediction you obtained earlier
max_probability = np.max(predicted_probabilities)
predicted_disease_index = np.argmax(predicted_probabilities)

if max_probability >= threshold:
    # predicted_disease = le.inverse_transform([predicted_disease_index])
    print(f"Predicted Disease: {predicted_disease[0]}. Please consult the doctor for further checkup.")
else:
    print("Prediction is uncertain; confidence is below the threshold.")


Predicted Disease: Heart attack. Please consult the doctor for further checkup.


In [50]:
import joblib
# joblib.dump(model,"prediction.joblib")
model.save("prediction.h5")  # Save the entire model in HDF5 format

joblib.dump(le,"le.joblib")



['le.joblib']