In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the first dataset file
data_file = 'symptoms/dataset.csv'
df = pd.read_csv(data_file)

# Load the second dataset file
severity_file = 'symptoms/Symptom-severity.csv'
severity_df = pd.read_csv(severity_file)


In [3]:
# Check the first few rows of the 'dataset.csv' DataFrame
print(df.head())

# Check the first few rows of the 'Symptom-severity.csv' DataFrame
print(severity_df.head())

# Get statistical summary of the 'dataset.csv' DataFrame
print(df.describe())


            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [4]:
# Remove any leading and trailing whitespace
df['Symptom_1'] = df['Symptom_1'].str.strip()


In [5]:
# Merge severity information with the main dataset based on 'Symptom_1'
merged_df = df.merge(severity_df, left_on='Symptom_1', right_on='Symptom', how='left').drop(columns='Symptom')

In [6]:
# Dropping columns with lower amounts
merged_df = merged_df.drop(columns=['Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8',
                                    'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12',
                                    'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17'])


In [7]:
# Check for missing values in the dataset
print(merged_df.isnull().sum())

# Drop rows with missing values:
merged_df = merged_df.dropna()

Disease      0
Symptom_1    0
Symptom_2    0
Symptom_3    0
weight       0
dtype: int64


In [8]:
# Save the DataFrame to a CSV file named 'merged_data.csv'
# merged_df.to_csv('merged_data.csv', index=False)
    ## commenting out due to needing csv from after training making this irrelevent

In [9]:
# Checking column names
column_names = merged_df.columns
print(column_names)

Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'weight'], dtype='object')


In [10]:
# Make Dummies
merged_df = pd.get_dummies(merged_df, columns=['Symptom_1', 'Symptom_2', 'Symptom_3'])

In [11]:
# Checking column names
column_names = merged_df.columns
print(column_names)


Index(['Disease', 'weight', 'Symptom_1_acidity', 'Symptom_1_back_pain',
       'Symptom_1_bladder_discomfort', 'Symptom_1_breathlessness',
       'Symptom_1_burning_micturition', 'Symptom_1_chest_pain',
       'Symptom_1_chills', 'Symptom_1_constipation',
       ...
       'Symptom_3_ stomach_pain', 'Symptom_3_ sweating',
       'Symptom_3_ swelling_joints', 'Symptom_3_ swelling_of_stomach',
       'Symptom_3_ ulcers_on_tongue', 'Symptom_3_ vomiting',
       'Symptom_3_ watering_from_eyes', 'Symptom_3_ weakness_of_one_body_side',
       'Symptom_3_ weight_loss', 'Symptom_3_ yellowish_skin'],
      dtype='object', length=138)


In [12]:
# Importing for training
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Separate the features (X) and target (y)
X = merged_df.drop('Disease', axis=1)  
y = merged_df['Disease']  

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection and Training
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Model Evaluation
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9522357723577236
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       0.94      1.00      0.97        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      0.96      0.98        2

In [13]:
# Import joblib to be able to save model
import joblib

# Save the model to a file
model_filename = 'rf_model.joblib'
joblib.dump(rf_classifier, model_filename)


['rf_model.joblib']

In [14]:
# Save the DataFrame to a CSV
merged_df.to_csv('merged_data.csv', index=False)
