In [6]:
import pandas as pd

# Step 1: Load your dataset
df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv")

# Display the first 5 rows
print("🔹 Sample Data:\n", df.head())

# Check shape
print("\n🔹 Dataset Shape:", df.shape)

# Column info
print("\n🔹 Data Types:\n", df.dtypes)

# Null values check
print("\n🔹 Missing Values:\n", df.isnull().sum())

# Unique values in each column
print("\n🔹 Unique values per column:\n")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

# Class balance
print("\n🔹 Class Distribution (Target):\n", df['Disease'].value_counts())


🔹 Sample Data:
        Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  

🔹 Dataset Shape: (349, 10)

🔹 Data Types:
 Disease                 object
Fever                   object
Cough                   object
Fatigue                 object
Difficulty Breathing    object
Age                      int

In [7]:
from sklearn.preprocessing import LabelEncoder

# Make a copy of the dataframe to keep the original safe
df_clean = df.copy()

# Step 2.1: Drop rows with missing values (you can also impute if needed)
df_clean.dropna(inplace=True)
print(f"✅ After dropping missing values, shape: {df_clean.shape}")

# Step 2.2: Initialize a dictionary to store LabelEncoders for later use (especially for inverse transforming predictions)
label_encoders = {}

# Step 2.3: Encode all categorical columns
for col in df_clean.columns:
    if df_clean[col].dtype == 'object':
        le = LabelEncoder()
        df_clean[col] = le.fit_transform(df_clean[col])
        label_encoders[col] = le

# Step 2.4: Final check
print("\n🔹 Cleaned and Encoded Dataset Preview:\n", df_clean.head())
print("\n🔹 Data Types After Encoding:\n", df_clean.dtypes)


✅ After dropping missing values, shape: (349, 10)

🔹 Cleaned and Encoded Dataset Preview:
    Disease  Fever  Cough  Fatigue  Difficulty Breathing  Age  Gender  \
0       56      1      0        1                     1   19       0   
1       24      0      1        1                     0   25       0   
2       37      0      1        1                     0   25       0   
3        6      1      1        0                     1   25       1   
4        6      1      1        0                     1   25       1   

   Blood Pressure  Cholesterol Level  Outcome Variable  
0               1                  2                 1  
1               2                  2                 0  
2               2                  2                 0  
3               2                  2                 1  
4               2                  2                 1  

🔹 Data Types After Encoding:
 Disease                 int32
Fever                   int32
Cough                   int32
Fatigue      

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Disease"])  # replace "Disease" with your target column
y = df["Disease"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [5]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.014009661835748793
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.00      0.00      0.00        14
           2       0.05      0.05      0.05        20
           3       0.04      0.05      0.05        19
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00        20
           6       0.12      0.07      0.09        27
           7       0.00      0.00      0.00        21
           8       0.00      0.00      0.00        10
           9       0.00      0.00      0.00        18
          10       0.00      0.00      0.00        19
          11       0.00      0.00      0.00        15
          12       0.00      0.00      0.00        14
          13       0.04      0.04      0.04        24
          14       0.06      0.06      0.06        16
          15       0.00      0.00      0.00        25
          16       0.05      0.06      0.06       