<a href="https://colab.research.google.com/github/01fe22bec002/ml_model_3/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. INSTALL DEPENDENCIES (Colab-only) ---
!pip install -q pandas numpy scikit-learn joblib openpyxl

# --- 2. IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# --- 3. LOAD DATA ---
# Both CSVs are expected to hold one column per symptom plus a "prognosis"
# label column (see the ENCODING step below).
train_df = pd.read_csv("/content/Training.csv")
test_df = pd.read_csv("/content/Testing.csv")

# Remove extra column(s)
# pandas labels columns that have no header text "Unnamed: N". Drop every such
# column from BOTH frames (not just 'Unnamed: 133' in the training set) so the
# train and test feature matrices keep identical columns.
for _frame in (train_df, test_df):
    _junk_columns = [c for c in _frame.columns if str(c).startswith("Unnamed:")]
    _frame.drop(columns=_junk_columns, inplace=True)

# --- 4. CHECK DATA ---
# Quick sanity report: frame dimensions, total count of missing cells in the
# training data, and the number of distinct labels in the "prognosis" column.
train_shape = train_df.shape
print("Training Shape:", train_shape)

test_shape = test_df.shape
print("Test Shape:", test_shape)

missing_cells = train_df.isna().sum().sum()
print("Nulls in Training:", missing_cells)

disease_count = train_df['prognosis'].nunique()
print("Unique Diseases:", disease_count)

# --- 5. ENCODING ---
# Separate features from the "prognosis" label column in each frame.
X_train = train_df.drop(columns="prognosis")
y_train = train_df["prognosis"]
X_test = test_df.drop(columns="prognosis")
y_test = test_df["prognosis"]

# Fit the label -> integer mapping on the training labels only, then reuse
# that same mapping for the test labels.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Persist the fitted encoder; the output directory is created on first use.
os.makedirs("models", exist_ok=True)
joblib.dump(label_encoder, "models/label_encoder.pkl")

# --- 6. TRAIN-VALIDATION SPLIT ---
# Hold out 20% of the training rows for validation. The split is stratified on
# the encoded labels and seeded so it is repeatable.
split_options = dict(test_size=0.2, stratify=y_train_enc, random_state=42)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train_enc, **split_options
)

# --- 7. SCALING ---
# The scaler is fitted on the training split only; the same fitted transform
# is then applied to the training, validation and test features in turn.
scaler = StandardScaler().fit(X_train_split)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train_split, X_val, X_test)
)
joblib.dump(scaler, "models/scaler.pkl")

# --- 8. TRAIN SVM ---
# probability=True enables predict_proba, which scikit-learn obtains through
# an internal cross-validated calibration that shuffles the data. Fixing
# random_state (same seed as the train/validation split) makes the trained
# and saved model reproducible from run to run.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train_scaled, y_train_split)

# --- 9. EVALUATE ---
# Predict once per split and reuse the predictions for every metric instead
# of re-running the model for the classification report.
train_pred = svm_model.predict(X_train_scaled)
val_pred = svm_model.predict(X_val_scaled)
test_pred = svm_model.predict(X_test_scaled)

# Accuracies are expressed as percentages.
train_acc = accuracy_score(y_train_split, train_pred) * 100
val_acc = accuracy_score(y_val, val_pred) * 100
test_acc = accuracy_score(y_test_enc, test_pred) * 100

print(f"✅ Train Accuracy: {train_acc:.2f}%")
print(f"✅ Validation Accuracy: {val_acc:.2f}%")
print(f"📊 Test Accuracy: {test_acc:.2f}%")

# Pass the full set of encoded labels explicitly: without `labels`,
# classification_report raises when the test data does not contain every
# class named in `target_names`. zero_division=0 reports 0 (without a
# warning) for classes that have no true or predicted samples.
print(classification_report(
    y_test_enc,
    test_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))

joblib.dump(svm_model, "models/svm_model.pkl")

# --- 10. SAVE SYMPTOM LIST ---
# Store the feature column names, in the order they appear in X_train.
symptom_list = X_train.columns.tolist()
joblib.dump(symptom_list, "models/symptom_list.pkl")

# --- 11. LOAD DOCTOR DATASET ---
doctor_df = pd.read_excel("/content/diseaseandspeci.xlsx")

# Normalise the disease names: trim surrounding whitespace, then lower-case.
trimmed_names = doctor_df['Disease'].str.strip()
doctor_df['Disease'] = trimmed_names.str.lower()

# Lookup table from normalised disease name to specialist. If a disease name
# appears on several rows, the last row wins.
# NOTE(review): keys are lower-cased here; callers presumably need to
# normalise their lookup key the same way - confirm.
disease_to_doctor = dict(zip(doctor_df['Disease'], doctor_df['Specialist']))

FileNotFoundError: [Errno 2] No such file or directory: '/content/Training.csv'