In [9]:
import joblib
from sklearn.linear_model import LogisticRegression

In [10]:
# Pull in the fitted vectorizer and the three preprocessed splits in one pass;
# the names on the left line up one-to-one with the paths listed below.
# NOTE(review): joblib.load unpickles its input, so these paths should only
# ever point at artifacts produced by this project.
vectorizer, df_train, df_validation, df_test = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../artifacts/vectorizer.joblib",
        "../artifacts/df_train_preprocessed.joblib",
        "../artifacts/df_val_preprocessed.joblib",
        "../artifacts/df_test_preprocessed.joblib",
    )
)

Load the fitted vectorizer and the preprocessed train, validation, and test splits produced by `1_eda_preprocessing.ipynb`.

In [None]:

from sklearn.pipeline import Pipeline

# Turn the text column into feature matrices with the vectorizer loaded above.
# Only `transform` is called: the vectorizer arrives already fitted and is not
# refit here.
X_train_vec = vectorizer.transform(df_train["combined_text"])
# Not used again in this cell; kept in case a later cell reads it.
X_val_vec = vectorizer.transform(df_validation["combined_text"])

# Train model. class_weight="balanced" weights each class inversely to its
# frequency in the training labels.
clf = LogisticRegression(max_iter=1000, class_weight="balanced")
clf.fit(X_train_vec, df_train["medical_specialty"])

# Bundle the fitted vectorizer and classifier so the saved object accepts raw
# text directly (vectorizer + classifier).
final_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', clf)
])

# Save final model pipeline. The cell's recorded output is the path list that
# joblib.dump returns, but the call itself was missing, so a fresh
# Restart & Run All never wrote the artifact. Left as the last expression so
# the written path is displayed.
joblib.dump(final_pipeline, "../artifacts/model.joblib")



['../artifacts/model.joblib']

The model is trained here and then saved as an artifact.
- The model was evaluated thoroughly before settling on this version.

In [13]:
# Score the pipeline on the validation split and then the test split.
# The pipeline takes the raw text column, so no manual vectorizing is needed;
# the printed labels treat the returned score as accuracy.
val_acc, test_acc = (
    final_pipeline.score(split["combined_text"], split["medical_specialty"])
    for split in (df_validation, df_test)
)

print(f"📊 Validation Accuracy: {val_acc:.4f}")
print(f"📊 Test Accuracy: {test_acc:.4f}")


📊 Validation Accuracy: 0.8243
📊 Test Accuracy: 0.8135


The accuracy scores for the model are shown here.
- The validation accuracy of 0.8243 showed the best potential within the given time period, so this model was selected for deployment.

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV


# -----------------------------
# Step 1: Define Pipeline
# -----------------------------
# Manual up-weighting for three specialties; classes that are not listed keep
# the default weight of 1.0.
class_weights = {
    "Orthopedic": 2.0,
    "Neurology": 1.5,
    "Neurosurgery": 1.5,
}

# TF-IDF features feeding a softmax (multinomial) logistic regression.
# `multi_class='multinomial'` is intentionally not passed: the argument is
# deprecated since scikit-learn 1.5, and with the lbfgs solver a target with
# three or more classes is already fitted as a multinomial model.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    ('clf', LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        class_weight=class_weights
    ))
])

# -----------------------------
# Step 2: Cross-Validation
# -----------------------------
# Baseline for the untuned pipeline: 5-fold accuracy computed on the training
# split only, reported as mean and standard deviation across folds.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=df_train['combined_text'],
    y=df_train['medical_specialty'],
    scoring='accuracy',
    cv=5,
)
print(f"📊 Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# -----------------------------
# Step 3: Hyperparameter Tuning (GridSearch)
# -----------------------------
# Search space: 2 * 3 * 3 * 3 * 4 = 216 combinations, each fitted on 3 folds.
param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],          # unigrams vs. unigrams + bigrams
    'tfidf__max_df': [0.8, 0.9, 1.0],              # upper document-frequency cutoff
    'tfidf__min_df': [2, 5, 10],                   # lower document-frequency cutoff
    'tfidf__max_features': [5000, 10000, 20000],   # vocabulary size cap
    'clf__C': [0.01, 0.1, 1, 10],                  # inverse regularization strength
}

# Candidates are ranked by macro-averaged recall (helps with rare classes), so
# `best_score_` printed below is a macro recall, not an accuracy.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

grid.fit(X=df_train['combined_text'], y=df_train['medical_specialty'])

print("✅ Best Parameters:", grid.best_params_)
print(f"📊 Best CV Score: {grid.best_score_:.4f}")

# -----------------------------
# Step 4: Evaluate on Validation & Test Sets
# -----------------------------
# The grid search's best pipeline, as exposed by `best_estimator_`.
best_model = grid.best_estimator_

# Score the validation split first, then the test split. These assignments
# rebind the `val_acc` / `test_acc` names used earlier in the notebook.
val_acc, test_acc = (
    best_model.score(split['combined_text'], split['medical_specialty'])
    for split in (df_validation, df_test)
)

print(f"📊 Validation Accuracy (best model): {val_acc:.4f}")
print(f"📊 Test Accuracy (best model): {test_acc:.4f}")

# -----------------------------
# Step 5: Save Best Model
# -----------------------------
# Persist the tuned pipeline to disk.
# NOTE(review): the notebook text names the 0.8243-validation model as the one
# chosen for deployment, yet this writes the tuned model to the same
# model.joblib path shown in the earlier training cell's output. Confirm which
# model is meant to ship.
joblib.dump(value=best_model, filename="../artifacts/model.joblib")
print("💾 Model saved to ../artifacts/model.joblib")



📊 Cross-Validation Accuracy: 0.8167 (+/- 0.0140)
Fitting 3 folds for each of 216 candidates, totalling 648 fits




✅ Best Parameters: {'clf__C': 10, 'tfidf__max_df': 0.9, 'tfidf__max_features': 5000, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 1)}
📊 Best CV Score: 0.7902
📊 Validation Accuracy (best model): 0.8459
📊 Test Accuracy (best model): 0.8324
💾 Model saved to ../artifacts/model.joblib
