In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv")

label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le


X = df.drop(columns=["Outcome Variable"])
y = df["Outcome Variable"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


FileNotFoundError: [Errno 2] No such file or directory: 'Disease_symptom_and_patient_profile_dataset.csv'

In [5]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 5.7 MB/s eta 0:00:27
    --------------------------------------- 2.4/150.0 MB 7.9 MB/s eta 0:00:19
    --------------------------------------- 3.7/150.0 MB 8.1 MB/s eta 0:00:19
   - -------------------------------------- 5.0/150.0 MB 6.9 MB/s eta 0:00:22
   - -------------------------------------- 6.0/150.0 MB 6.5 MB/s eta 0:00:23
   -- ------------------------------------- 7.9/150.0 MB 6.7 MB/s eta 0:00:22
   -- ------------------------------------- 9.4/150.0 MB 6.8 MB/s eta 0:00:21
   -- ------------------------------------- 11.0/150.0 MB 7.0 MB/s eta 0:00:20
   --- ------------------------------------ 12.1/150.0 MB 6.7 MB/s eta 0:00:21
   --- ------------------------------------ 13.4/150.0 MB 6.5 MB/s eta 0:00:2

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE


df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv")


label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

X = df.drop(columns=["Outcome Variable"])
y = df["Outcome Variable"]

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


rf_params = {
    'n_estimators': [200, 300],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


best_rf = grid_search.best_estimator_

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)


y_pred_rf = best_rf.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)


accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"Optimized Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")

Optimized Random Forest Accuracy: 82.67%
XGBoost Accuracy: 81.33%


In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv").dropna()


label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le


feature_columns = [
    'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing',
    'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level'
]

X = df[feature_columns]
y = df["Disease"]

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)


options = {
    'Fever': ['Yes', 'No'],
    'Cough': ['Yes', 'No'],
    'Fatigue': ['Yes', 'No'],
    'Difficulty Breathing': ['Yes', 'No'],
    'Gender': ['Male', 'Female'],
    'Blood Pressure': ['Low', 'Normal', 'High'],
    'Cholesterol Level': ['Low', 'Normal', 'High']
}


print("\n🔍 Enter patient details to predict disease:")
manual_input = {}

for key, choices in options.items():
    print(f"\n{key} options: {choices}")
    value = input(f"Enter {key}: ").strip().title()
    while value not in choices:
        print(f"Invalid input. Please choose from {choices}")
        value = input(f"Enter {key}: ").strip().title()
    manual_input[key] = value

while True:
    try:
        manual_input["Age"] = int(input("\nEnter Age: "))
        break
    except ValueError:
        print("Please enter a valid number.")

input_df = pd.DataFrame([manual_input])
for col in input_df.columns:
    if col in label_encoders:
        input_df[col] = label_encoders[col].transform(input_df[col])

prediction = model.predict(input_df)
predicted_disease = label_encoders['Disease'].inverse_transform(prediction)

print(f"\n🩺 Predicted Disease: {predicted_disease[0]}")



🔍 Enter patient details to predict disease:

Fever options: ['Yes', 'No']


Enter Fever:  Yes



Cough options: ['Yes', 'No']


Enter Cough:  No



Fatigue options: ['Yes', 'No']


Enter Fatigue:  Yes



Difficulty Breathing options: ['Yes', 'No']


Enter Difficulty Breathing:  Yes



Gender options: ['Male', 'Female']


Enter Gender:  Male



Blood Pressure options: ['Low', 'Normal', 'High']


Enter Blood Pressure:  High



Cholesterol Level options: ['Low', 'Normal', 'High']


Enter Cholesterol Level:  Normal

Enter Age:  30


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
