In [1]:
import pandas as pd 
import numpy as np 

In [2]:
data = pd.read_csv('DataSets/train.csv')
data.sample(5)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
2612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Heart attack
3279,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,Impetigo
2569,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Pneumonia
3287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Diabetes
3876,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Hepatitis D


In [3]:
X = data.iloc[:,0:132].values
Y = data.iloc[:,-1]

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [5]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

models = {
    'SVC': SVC(kernel='linear'),
    'RFC': RandomForestClassifier(n_estimators=100, random_state=42),
    'GBC': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42),
    'GaussianNB': GaussianNB(),
    'KNC': KNeighborsClassifier(n_neighbors=5),
    'LogisticRegression': LogisticRegression(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
}

best_model_name = None
best_model = None
best_accuracy = 0

for model_name, model in models.items():
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    accur = accuracy_score(Y_test, y_pred)
    cm = confusion_matrix(Y_test, y_pred)
    
    print(f"\n{model_name} - accuracy: {accur}")
    print(f"{model_name} - confusion_matrix:\n {cm}")
    
    if accur > best_accuracy:
        best_accuracy = accur
        best_model_name = model_name
        best_model = model

print(f"\nBest model is {best_model_name} with accuracy of {best_accuracy}")


SVC - accuracy: 1.0
SVC - confusion_matrix:
 [[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]

RFC - accuracy: 1.0
RFC - confusion_matrix:
 [[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]

GBC - accuracy: 1.0
GBC - confusion_matrix:
 [[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]

AdaBoost - accuracy: 0.2764227642276423
AdaBoost - confusion_matrix:
 [[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ... 25  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]

GaussianNB - accuracy: 1.0
GaussianNB - confusion_matrix:
 [[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [

In [7]:
prec = pd.read_csv('DataSets/precautions.csv')
work = pd.read_csv('DataSets/workout.csv')
descr = pd.read_csv('DataSets/description.csv')
medi = pd.read_csv('DataSets/medications.csv')
diet = pd.read_csv('DataSets/diets.csv')

In [8]:
symptom_dict = {column_name: index for index, column_name in enumerate(data.columns[:132])}
disease_dict = dict(zip(range(len(le.classes_)),le.classes_))
print(symptom_dict)
print('\n')
print(disease_dict)

{'itching': 0, 'skin_rash': 1, 'nodal_skin_eruptions': 2, 'continuous_sneezing': 3, 'shivering': 4, 'chills': 5, 'joint_pain': 6, 'stomach_pain': 7, 'acidity': 8, 'ulcers_on_tongue': 9, 'muscle_wasting': 10, 'vomiting': 11, 'burning_micturition': 12, 'spotting_ urination': 13, 'fatigue': 14, 'weight_gain': 15, 'anxiety': 16, 'cold_hands_and_feets': 17, 'mood_swings': 18, 'weight_loss': 19, 'restlessness': 20, 'lethargy': 21, 'patches_in_throat': 22, 'irregular_sugar_level': 23, 'cough': 24, 'high_fever': 25, 'sunken_eyes': 26, 'breathlessness': 27, 'sweating': 28, 'dehydration': 29, 'indigestion': 30, 'headache': 31, 'yellowish_skin': 32, 'dark_urine': 33, 'nausea': 34, 'loss_of_appetite': 35, 'pain_behind_the_eyes': 36, 'back_pain': 37, 'constipation': 38, 'abdominal_pain': 39, 'diarrhoea': 40, 'mild_fever': 41, 'yellow_urine': 42, 'yellowing_of_eyes': 43, 'acute_liver_failure': 44, 'fluid_overload': 45, 'swelling_of_stomach': 46, 'swelled_lymph_nodes': 47, 'malaise': 48, 'blurred_and

In [10]:
def predict_symptoms(symptom_input):
    ip_vector = np.zeros(len(symptom_dict))
    
    for symptom in symptom_input:
        ip_vector[symptom_dict[symptom]] = 1
        
    prediction_index = best_model.predict([ip_vector])[0]
    disease_name = disease_dict[prediction_index]
    
    description = descr[descr['Disease'] == disease_name]['Description'].values[0]
    
    precautions = prec[prec['Disease'] == disease_name].iloc[:, 2:].values.flatten().tolist()
    
    medications = medi[medi['Disease'] == disease_name]['Medication'].values[0]
    if isinstance(medications, str):
        medications = medications.split(", ")

    workouts = work[work['disease'] == disease_name]['workout'].values[0]
    if isinstance(workouts, str):
        workouts = workouts.split(", ")

    diets = diet[diet['Disease'] == disease_name]['Diet'].values[0]
    if isinstance(diets, str):
        diets = diets.split(", ")
    
    result = {
        'Disease': disease_name,
        'Description': description,
        'Precautions': precautions,
        'Medications': medications,
        'Workouts': workouts,
        'Diets': diets
    }
    return result

user_input = input("Enter symptoms separated by commas (e.g., cough,join_pain): ")
user_symptoms = [symptom.strip() for symptom in user_input.split(',')]
result = predict_symptoms(user_symptoms)

print("\nDisease:", result['Disease'])
print("\nDescription: ", result['Description'])

print("\nPrecautions:")
for i, precaution in enumerate(result['Precautions'], 1):
    print(f"{i}. {precaution}")

print("\nMedications:")
for i, medication in enumerate(result['Medications'], 1):
    print(f"{i}. {medication.strip('[]').strip('\'')}")

print("\nWorkouts:")
for i, workout in enumerate(result['Workouts'], 1):
    print(f"{i}. {workout}")

print("\nDiets:")
for i, diet in enumerate(result['Diets'], 1):
    print(f"{i}. {diet.strip('[]').strip('\'')}")

Enter symptoms separated by commas (e.g., cough,join_pain):  cough


TypeError: string indices must be integers, not 'str'

In [None]:
import pickle 
pickle.dump(best_model,open("Output_file/model.pkl","wb"))