<a href="https://colab.research.google.com/github/01fe22bec002/ml_model_3/blob/main/finalcode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# --- 1. INSTALL DEPENDENCIES (if not preinstalled) ---
!pip install -q pandas numpy scikit-learn tensorflow joblib

# --- 2. IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# --- 3. LOAD DATA ---
train_df = pd.read_csv("/content/Training.csv")
test_df = pd.read_csv("/content/Testing.csv")

# Drop unnamed column if exists
if 'Unnamed: 133' in train_df.columns:
    train_df.drop('Unnamed: 133', axis=1, inplace=True)

# --- 4. DATA CHECKS ---
print("Training Shape:", train_df.shape)
print("Test Shape:", test_df.shape)
print("Nulls in Training:", train_df.isnull().sum().sum())
print("Unique Diseases:", train_df['prognosis'].nunique())

# --- 5. ENCODING & SPLIT ---
X_train = train_df.drop("prognosis", axis=1)
y_train = train_df["prognosis"]
X_test = test_df.drop("prognosis", axis=1)
y_test = test_df["prognosis"]

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Save label encoder
os.makedirs("models", exist_ok=True)
joblib.dump(label_encoder, "models/label_encoder.pkl")

# --- 6. TRAIN-VAL SPLIT ---
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train_enc, test_size=0.2, stratify=y_train_enc, random_state=42
)

# --- 7. SCALING FOR NN ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

joblib.dump(scaler, "models/scaler.pkl")

# --- 8. RANDOM FOREST ---
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_split, y_train_split)

# Validation RF
rf_val_preds = rf_model.predict(X_val)
print("RF Val Accuracy:", accuracy_score(y_val, rf_val_preds))

# Save RF
joblib.dump(rf_model, "models/random_forest_model.pkl")

# --- 9. FEEDFORWARD NEURAL NETWORK ---
input_dim = X_train_scaled.shape[1]
num_classes = len(label_encoder.classes_)

fnn_model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')
])

fnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

fnn_model.fit(X_train_scaled, y_train_split, validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=32, callbacks=[early_stop], verbose=1)

fnn_model.save("models/fnn_model.keras")

# --- 10. TEST EVALUATION ---

# RF
rf_preds_test = rf_model.predict(X_test)
print("RF Test Accuracy:", accuracy_score(y_test_enc, rf_preds_test))
print(classification_report(y_test_enc, rf_preds_test, target_names=label_encoder.classes_))

# FNN
fnn_preds_test = fnn_model.predict(X_test_scaled)
fnn_preds_class = np.argmax(fnn_preds_test, axis=1)
print("FNN Test Accuracy:", accuracy_score(y_test_enc, fnn_preds_class))
print(classification_report(y_test_enc, fnn_preds_class, target_names=label_encoder.classes_))

# --- 11. SAVE SYMPTOM LIST ---
symptom_list = list(X_train.columns)
joblib.dump(symptom_list, "models/symptom_list.pkl")
# --- 1. INSTALL DEPENDENCIES (if not preinstalled) ---
!pip install -q pandas numpy scikit-learn tensorflow joblib

# --- 2. IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# --- 3. LOAD DATA ---
train_df = pd.read_csv("/content/Training.csv")
test_df = pd.read_csv("/content/Testing.csv")

# Drop unnamed column if exists
if 'Unnamed: 133' in train_df.columns:
    train_df.drop('Unnamed: 133', axis=1, inplace=True)

# --- 4. DATA CHECKS ---
print("Training Shape:", train_df.shape)
print("Test Shape:", test_df.shape)
print("Nulls in Training:", train_df.isnull().sum().sum())
print("Unique Diseases:", train_df['prognosis'].nunique())

# --- 5. ENCODING & SPLIT ---
X_train = train_df.drop("prognosis", axis=1)
y_train = train_df["prognosis"]
X_test = test_df.drop("prognosis", axis=1)
y_test = test_df["prognosis"]

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Save label encoder
os.makedirs("models", exist_ok=True)
joblib.dump(label_encoder, "models/label_encoder.pkl")

# --- 6. TRAIN-VAL SPLIT ---
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train_enc, test_size=0.2, stratify=y_train_enc, random_state=42
)

# --- 7. SCALING FOR NN ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

joblib.dump(scaler, "models/scaler.pkl")

# --- 8. RANDOM FOREST ---
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_split, y_train_split)

# Validation RF
rf_val_preds = rf_model.predict(X_val)
print("RF Val Accuracy:", accuracy_score(y_val, rf_val_preds))

# Save RF
joblib.dump(rf_model, "models/random_forest_model.pkl")

# --- 9. FEEDFORWARD NEURAL NETWORK ---
input_dim = X_train_scaled.shape[1]
num_classes = len(label_encoder.classes_)

fnn_model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')
])

fnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

fnn_model.fit(X_train_scaled, y_train_split, validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=32, callbacks=[early_stop], verbose=1)

fnn_model.save("models/fnn_model.keras")

# --- 10. TEST EVALUATION ---

# RF
rf_preds_test = rf_model.predict(X_test)
print("RF Test Accuracy:", accuracy_score(y_test_enc, rf_preds_test))
print(classification_report(y_test_enc, rf_preds_test, target_names=label_encoder.classes_))

# FNN
fnn_preds_test = fnn_model.predict(X_test_scaled)
fnn_preds_class = np.argmax(fnn_preds_test, axis=1)
print("FNN Test Accuracy:", accuracy_score(y_test_enc, fnn_preds_class))
print(classification_report(y_test_enc, fnn_preds_class, target_names=label_encoder.classes_))

# --- 11. SAVE SYMPTOM LIST ---
symptom_list = list(X_train.columns)
joblib.dump(symptom_list, "models/symptom_list.pkl")

Training Shape: (4920, 133)
Test Shape: (42, 133)
Nulls in Training: 0
Unique Diseases: 41
RF Val Accuracy: 1.0
Epoch 1/100
123/123 ━━━━━━━━━━━━━━━━━━━━ 3s 10ms/step - accuracy: 0.4194 - loss: 2.5978 - val_accuracy: 1.0000 - val_loss: 0.0553
Epoch 2/100
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - accuracy: 0.9830 - loss: 0.1593 - val_accuracy: 1.0000 - val_loss: 0.0061
Epoch 3/100
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.9963 - loss: 0.0460 - val_accuracy: 1.0000 - val_loss: 0.0024
Epoch 4/100
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.9981 - loss: 0.0267 - val_accuracy: 1.0000 - val_loss: 9.6114e-04
Epoch 5/100
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.9997 - loss: 0.0160 - val_accuracy: 1.0000 - val_loss: 4.8574e-04
Epoch 6/100
123/123 ━━━━━━━━━━━━━━━━━━━━



RF Test Accuracy: 0.9761904761904762
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00 



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
FNN Test Accuracy: 0.9761904761904762
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis  

['models/symptom_list.pkl']

In [6]:
# --- 12. LOAD DOCTOR SUGGESTION DATASET ---
# Only the 'Disease' and 'Specialist' columns of this sheet are read below
doctor_df = pd.read_excel("/content/diseaseandspeci.xlsx")

# Normalize disease names for consistency: trimmed and lower-cased, so that
# lookups do not depend on case or surrounding whitespace
doctor_df['Disease'] = doctor_df['Disease'].str.strip().str.lower()

# --- 13. CREATE MAPPING: Disease -> Specialist ---
# Rows without a disease name can never be looked up, so they are left out
# instead of ending up as a NaN key in the saved mapping
_named_rows = doctor_df.dropna(subset=['Disease'])

# Column-wise zip avoids building a Series per row as iterrows() does; when a
# disease appears more than once the last row still wins
disease_to_doctor = dict(zip(_named_rows['Disease'], _named_rows['Specialist']))

# Save mapping
joblib.dump(disease_to_doctor, "models/disease_to_doctor_mapping.pkl")

# --- 14. DOCTOR SUGGESTION FUNCTION ---
def suggest_doctor(predicted_disease, mapping=None):
    """Return the specialist mapped to a disease name.

    The name is trimmed and lower-cased before the lookup, so matching is
    insensitive to case and surrounding whitespace.

    Args:
        predicted_disease: Disease name to look up. Anything that is not a
            string (for example ``None``) is treated as an unknown disease.
        mapping: Optional dict from normalized (trimmed, lower-case) disease
            name to specialist. Defaults to the module-level
            ``disease_to_doctor`` mapping.

    Returns:
        The value stored for the disease, or the string
        ``"No doctor found for this disease"`` when there is no match.
    """
    not_found = "No doctor found for this disease"
    # Guard first: .strip() on a non-string would raise AttributeError
    if not isinstance(predicted_disease, str):
        return not_found
    lookup = disease_to_doctor if mapping is None else mapping
    disease_key = predicted_disease.strip().lower()
    return lookup.get(disease_key, not_found)

# --- 15. SAMPLE USAGE AFTER PREDICTION ---
# Decode the random forest's prediction for the first test row back into its
# disease name, then look up the matching specialist
predicted_disease = label_encoder.inverse_transform([rf_preds_test[0]])[0]
doctor = suggest_doctor(predicted_disease)

# Show the result; without this the sample produces no visible output
print(f"Predicted disease: {predicted_disease}")
print(f"Suggested specialist: {doctor}")


# New section