In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Load the first dataset file.
# The location defaults to the original workstation path, but it can be redirected
# by setting the DISEASE_DATASET_PATH environment variable so the notebook can be
# re-run on another machine without editing this cell.
data_file = os.environ.get(
    'DISEASE_DATASET_PATH',
    'C:\\Users\\Blake\\Desktop\\Machine_Learning_Project\\Datasets\\dataset.csv',
)
df = pd.read_csv(data_file)

In [3]:
# Preview the top five rows of the frame loaded from 'dataset.csv'
print(df.head(n=5))

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [4]:
# Remove leading and trailing whitespace from every text column of the main dataset.
# The check uses is_string_dtype on the column dtype so that both classic object
# columns and pandas' dedicated string dtypes are cleaned; comparing against
# "object" alone skips string-dtype columns and would leave padded values in place.
# Non-text columns are passed through unchanged.
df = df.apply(
    lambda column: column.str.strip()
    if pd.api.types.is_string_dtype(column.dtype)
    else column
)

In [5]:
# Keep only the first three symptom columns: discard Symptom_4 through Symptom_17
columns_to_drop = [f'Symptom_{position}' for position in range(4, 18)]

dropped_df = df.drop(columns_to_drop, axis="columns")

In [6]:
# Preview the frame that remains after the extra symptom columns were removed
dropped_df.head(n=5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches
3,Fungal infection,itching,skin_rash,dischromic _patches
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions


In [7]:
# One-hot encode the three retained symptom columns; only the listed columns are expanded
encoded_df = pd.get_dummies(
    dropped_df,
    columns=[f'Symptom_{position}' for position in (1, 2, 3)],
)

In [8]:
# Print the column index of the one-hot encoded frame to inspect the generated feature names
print(encoded_df.columns)

Index(['Disease', 'Symptom_1_acidity', 'Symptom_1_back_pain',
       'Symptom_1_bladder_discomfort', 'Symptom_1_breathlessness',
       'Symptom_1_burning_micturition', 'Symptom_1_chest_pain',
       'Symptom_1_chills', 'Symptom_1_constipation',
       'Symptom_1_continuous_sneezing',
       ...
       'Symptom_3_stomach_pain', 'Symptom_3_sweating',
       'Symptom_3_swelling_joints', 'Symptom_3_swelling_of_stomach',
       'Symptom_3_ulcers_on_tongue', 'Symptom_3_vomiting',
       'Symptom_3_watering_from_eyes', 'Symptom_3_weakness_of_one_body_side',
       'Symptom_3_weight_loss', 'Symptom_3_yellowish_skin'],
      dtype='object', length=137)


In [9]:
# Preview the first five rows of the one-hot encoded frame
encoded_df.head(n=5)

Unnamed: 0,Disease,Symptom_1_acidity,Symptom_1_back_pain,Symptom_1_bladder_discomfort,Symptom_1_breathlessness,Symptom_1_burning_micturition,Symptom_1_chest_pain,Symptom_1_chills,Symptom_1_constipation,Symptom_1_continuous_sneezing,...,Symptom_3_stomach_pain,Symptom_3_sweating,Symptom_3_swelling_joints,Symptom_3_swelling_of_stomach,Symptom_3_ulcers_on_tongue,Symptom_3_vomiting,Symptom_3_watering_from_eyes,Symptom_3_weakness_of_one_body_side,Symptom_3_weight_loss,Symptom_3_yellowish_skin
0,Fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Split the encoded frame into the label column (y) and the feature matrix (X)
y = encoded_df['Disease']
X = encoded_df.drop('Disease', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified by disease -- consider stratify=y if
# equal per-class support in the test set matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict disease labels for the held-out rows
y_pred = random_forest_model.predict(X_test)

# Summarise performance: overall accuracy plus per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)



Accuracy: 0.9522357723577236
Classification Report:
                                          precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       0.94      1.00      0.97        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00     

In [12]:
import joblib

# Persist the fitted classifier to disk; the dump call stays the last expression
# so the cell still displays the list of written file names.
model_filename = 'random_forest_model.pkl'
joblib.dump(value=random_forest_model, filename=model_filename)


['random_forest_model.pkl']