In [1]:
# 1. Import Required Libraries
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import LinearSVC


In [3]:
# 2. Load the Preprocessed Data
# Paths are resolved relative to the notebook's working directory, the same
# convention the '../models/...' artifact paths in this notebook follow, so the
# notebook is not tied to one machine's drive layout.
DATA_DIR = "../data"
train_data = pd.read_csv(f"{DATA_DIR}/train_data.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_data.csv")


In [4]:
# Confirm both CSVs were read and report the (rows, columns) of each split.
print(" Train and Test Data Loaded Successfully!")
for split_name, frame in (("Train", train_data), ("Test", test_data)):
    print(f"{split_name} shape: {frame.shape}")


 Train and Test Data Loaded Successfully!
Train shape: (580, 2)
Test shape: (145, 2)


In [5]:
# Split Features and Labels
# 'Symptoms' is the model input and 'Disease' is the prediction target,
# taken from the same two columns in both splits.
X_train, y_train = train_data['Symptoms'], train_data['Disease']
X_test, y_test = test_data['Symptoms'], test_data['Disease']

In [6]:
# TF-IDF Vectorization
# Unigrams and bigrams, capped at 5000 features. The vectorizer is fitted on
# the training text only; the test text is transformed with that fitted state.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [8]:
# 4. Persist the fitted TF-IDF vectorizer
# The artifact is written to '../models' (one level above the notebook's
# working directory), not to a 'models' folder next to the notebook.
MODELS_DIR = '../models'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(MODELS_DIR, exist_ok=True)
joblib.dump(vectorizer, f'{MODELS_DIR}/tfidf_vectorizer.joblib')

print(" TF-IDF Vectorizer saved successfully!")

 TF-IDF Vectorizer saved successfully!


In [9]:
# 5. Initialize Models
# One shared seed for every estimator that accepts one here, so a fresh
# Restart & Run All fits the same models. LinearSVC was previously unseeded
# while RandomForestClassifier was seeded.
RANDOM_STATE = 42
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "SVM (Linear)": LinearSVC(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
}

In [11]:
# 6. Train, evaluate and persist every candidate model
MODELS_DIR = "../models"
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(MODELS_DIR, exist_ok=True)

# Maps each model's display name to its accuracy on the test split.
results = {}
for name, model in models.items():
    print(f"\nüîπ Training {name}...")
    model.fit(X_train_tfidf, y_train)

    y_pred = model.predict(X_test_tfidf)

    # NOTE(review): the recorded run reports 100.00% for all three models;
    # worth confirming that test rows do not also appear in the training CSV.
    acc = accuracy_score(y_test, y_pred)
    print(f"‚úÖ {name} Accuracy: {acc*100:.2f}%")

    results[name] = acc

    # Derive a file name from the display name: lower-case, spaces become
    # underscores, parentheses dropped ("SVM (Linear)" -> "svm_linear.joblib").
    file_name = name.lower().replace(" ", "_").replace("(", "").replace(")", "") + ".joblib"
    joblib.dump(model, f"{MODELS_DIR}/{file_name}")
    print(f"üíæ Model saved: {MODELS_DIR}/{file_name}")





üîπ Training Logistic Regression...
‚úÖ Logistic Regression Accuracy: 100.00%
üíæ Model saved: ../models/logistic_regression.joblib

üîπ Training SVM (Linear)...
‚úÖ SVM (Linear) Accuracy: 100.00%
üíæ Model saved: ../models/svm_linear.joblib

üîπ Training Random Forest...
‚úÖ Random Forest Accuracy: 100.00%
üíæ Model saved: ../models/random_forest.joblib


In [None]:
# 7. Display Summary of Results
# NOTE(review): this cell has no execution count and repeats the summary cell
# that follows it; one of the two can probably be removed.
print("\nüìä Model Performance Summary:")
for model_name, acc in results.items():
    print("{}: {:.2f}%".format(model_name, acc * 100))

In [15]:
# 7. Display Summary of Results
# One line per trained model, in the order the models were trained,
# with accuracy shown as a percentage to two decimals.
print("\nüìä Model Performance Summary:")
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc * 100:.2f}%")




üìä Model Performance Summary:
Logistic Regression: 100.00%
SVM (Linear): 100.00%
Random Forest: 100.00%


In [17]:
# 8. Best Model Selection (Optional)
# Highest test accuracy wins; when several models tie, the one that appears
# first in `results` is kept.
best_model_name = max(results.keys(), key=lambda candidate: results[candidate])
print(f"\n Best Performing Model: {best_model_name}")



 Best Performing Model: Logistic Regression


In [18]:
# 9. Save Best Model Name
# Stores the display name (the key used in `models` / `results`), not the
# .joblib file name.
MODELS_DIR = '../models'
# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs(MODELS_DIR, exist_ok=True)
# Explicit encoding so the file content does not depend on the platform default.
with open(f'{MODELS_DIR}/best_model.txt', 'w', encoding='utf-8') as f:
    f.write(best_model_name)
print("‚úÖ Best model name saved to models/best_model.txt")

‚úÖ Best model name saved to models/best_model.txt


In [19]:

# 10. Classification Report for the Best Model
# Reuses the estimator object already fitted in this session (looked up by its
# display name) rather than reloading it from disk.
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test_tfidf)

print("\nüìã Detailed Classification Report:")
# Per-class precision / recall / F1 plus the overall averages on the test split.
report_text = classification_report(y_test, y_pred_best)
print(report_text)


üìã Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00         8
           4       1.00      1.00      1.00        20
           5       1.00      1.00      1.00        17
           6       1.00      1.00      1.00        17
           7       1.00      1.00      1.00        16
           8       1.00      1.00      1.00        18
           9       1.00      1.00      1.00        17

    accuracy                           1.00       145
   macro avg       1.00      1.00      1.00       145
weighted avg       1.00      1.00      1.00       145

