In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the first dataset file
data_file = 'C:\\Users\\Blake\\Desktop\\Machine_Learning_Project - Copy\\Datasets\\dataset.csv'
df = pd.read_csv(data_file)

# Load the second dataset file
severity_file = 'C:\\Users\\Blake\\Desktop\\Machine_Learning_Project - Copy\\Datasets\\Symptom-severity.csv'
severity_df = pd.read_csv(severity_file)


In [3]:
# Check the first few rows of the 'dataset.csv' DataFrame
print(df.head())

# Check the first few rows of the 'Symptom-severity.csv' DataFrame
print(severity_df.head())

# Get statistical summary of the 'dataset.csv' DataFrame
print(df.describe())


            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [4]:
# Removing leading and trailing whitespace from all columns in the main dataset
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Removing leading and trailing whitespace from symptom names in the symptom severity dataset
severity_df['Symptom'] = severity_df['Symptom'].str.strip()



In [5]:
# Drop symptom columns 4 to 17
columns_to_drop = ['Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
                   'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15',
                   'Symptom_16', 'Symptom_17']

dropped_df = df.drop(columns=columns_to_drop)


In [6]:
# Merge symptom severity information into the main dataset
merged_df = dropped_df.merge(severity_df, how='left', left_on='Symptom_1', right_on='Symptom')


In [7]:
# Perform one-hot encoding on symptom columns
encoded_df = pd.get_dummies(merged_df, columns=['Symptom_1', 'Symptom_2', 'Symptom_3'])

In [16]:
print(encoded_df.columns)

Index(['Disease', 'Symptom', 'weight', 'Symptom_1_acidity',
       'Symptom_1_back_pain', 'Symptom_1_bladder_discomfort',
       'Symptom_1_breathlessness', 'Symptom_1_burning_micturition',
       'Symptom_1_chest_pain', 'Symptom_1_chills',
       ...
       'Symptom_3_stomach_pain', 'Symptom_3_sweating',
       'Symptom_3_swelling_joints', 'Symptom_3_swelling_of_stomach',
       'Symptom_3_ulcers_on_tongue', 'Symptom_3_vomiting',
       'Symptom_3_watering_from_eyes', 'Symptom_3_weakness_of_one_body_side',
       'Symptom_3_weight_loss', 'Symptom_3_yellowish_skin'],
      dtype='object', length=139)


In [None]:
Symptom_3_foul_smell_of_urine


In [None]:
# Checking column names
column_names = merged_df.columns
print(column_names)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Splitting into features (X) and target (y)
X = encoded_df.drop(['Disease', 'Symptom'], axis=1)
y = encoded_df['Disease']

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the Random Forest model
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Predict the disease labels on the testing data
y_pred_rf = rf_model.predict(X_test)

# Calculate the accuracy of the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

# Calculate the classification report
classification_rep = classification_report(y_test, y_pred_rf, target_names=y.unique())

# Print the classification report
print("Classification Report:\n", classification_rep)


Random Forest Accuracy: 0.9491869918699187
Classification Report:
                                          precision    recall  f1-score   support

                       Fungal infection       1.00      1.00      1.00        18
                                Allergy       1.00      1.00      1.00        30
                                   GERD       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1.00        25
                          Drug Reaction       1.00      1.00      1.00        24
                    Peptic ulcer diseae       1.00      1.00      1.00        23
                                   AIDS       0.94      1.00      0.97        33
                               Diabetes       1.00      1.00      1.00        23
                        Gastroenteritis       1.00      1.00      1.00        21
                       Bronchial Asthma       1.00      1.00      1.00        15
                           Hypertension  

In [14]:
import joblib

# Save the trained model to a file
model_filename = 'random_forest_model.pkl'
joblib.dump(rf_model, model_filename)



['random_forest_model.pkl']

In [None]:
# Save the DataFrame to a CSV
merged_df.to_csv('merged_data.csv', index=False)
