In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Plot appearance for the whole notebook.
# The style reset must run first: it restores matplotlib's stock rcParams,
# so the palette and figure size have to be applied after it.
plt.style.use('default')
plt.rc('figure', figsize=(12, 8))   # default canvas size (width, height) in inches
sns.set_palette("husl")             # seaborn "husl" colour cycle

In [2]:
# Read the artifacts written by the preprocessing step.
# NOTE: joblib.load unpickles its input, so only point this at artifact
# files you produced yourself or otherwise trust.
artifacts_dir = '../artifacts'

print("Loading preprocessed data...")
data = joblib.load(os.path.join(artifacts_dir, 'preprocessed_data.joblib'))
vectorizer = joblib.load(os.path.join(artifacts_dir, 'tfidf_vectorizer.joblib'))

# Unpack the stored entries into the names used by the rest of the notebook
X_train, X_val, X_test = (data[key] for key in ('X_train', 'X_val', 'X_test'))
y_train, y_val, y_test = (data[key] for key in ('y_train', 'y_val', 'y_test'))
feature_names, class_names = data['feature_names'], data['class_names']

# Summarise what was loaded: shape of each split, then the label set
print("Data loaded successfully!")
for split_label, split_matrix in (
    ("Training set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
):
    print(f"{split_label}: {split_matrix.shape}")
print(f"Number of classes: {len(class_names)}")
print(f"Classes: {class_names}")

Loading preprocessed data...
Data loaded successfully!
Training set: (1724, 5000)
Validation set: (370, 5000)
Test set: (370, 5000)
Number of classes: 13
Classes: ['Cardiovascular / Pulmonary', 'ENT - Otolaryngology', 'Gastroenterology', 'Hematology - Oncology', 'Nephrology', 'Neurology', 'Neurosurgery', 'Obstetrics / Gynecology', 'Ophthalmology', 'Orthopedic', 'Pediatrics - Neonatal', 'Psychiatry / Psychology', 'Radiology']


In [3]:
# Initialize Softmax Regression model
print("Initializing Softmax Regression (Multinomial Logistic Regression)...")

# Configure the model for multi-class classification.
#
# `multi_class` is deliberately NOT passed. scikit-learn deprecated the
# parameter in 1.5 and is removing it, so passing it emits a FutureWarning
# today and will fail once it is gone. It is also redundant here: with the
# lbfgs solver and more than two classes, the default behaviour already
# optimises the multinomial (softmax) loss, so the fitted model is the same.
# (Assumes scikit-learn >= 0.22, where that default was introduced.)
softmax_model = LogisticRegression(
    solver='lbfgs',            # Quasi-Newton solver; supports the multinomial loss
    max_iter=1000,             # Raised from the default (100) to give lbfgs room to converge
    random_state=42,           # Unused by lbfgs; kept so results stay seeded if the solver is changed
    class_weight='balanced'    # Reweight classes inversely to their frequency
)

print("Model Configuration:")
# Reported as a literal: the `multi_class` attribute is no longer meaningful
# (or present) on recent scikit-learn releases, but the strategy for this
# solver / class count is multinomial.
print("- Multi-class strategy: multinomial")
print(f"- Solver: {softmax_model.solver}")
print(f"- Class weight: {softmax_model.class_weight}")
print(f"- Max iterations: {softmax_model.max_iter}")

Initializing Softmax Regression (Multinomial Logistic Regression)...
Model Configuration:
- Multi-class strategy: multinomial
- Solver: lbfgs
- Class weight: balanced
- Max iterations: 1000


In [4]:
# Fit the baseline classifier on the training split and report what it learned
print("Training baseline Softmax Regression model...")

softmax_model.fit(X_train, y_train)

# Fitted attributes: the label array and the coefficient matrix
fitted_classes = softmax_model.classes_
training_report = [
    "Model training completed!",
    f"Number of classes learned: {len(fitted_classes)}",
    f"Feature coefficient shape: {softmax_model.coef_.shape}",
    f"Classes: {fitted_classes}",
]
print(*training_report, sep="\n")

Training baseline Softmax Regression model...
Model training completed!
Number of classes learned: 13
Feature coefficient shape: (13, 5000)
Classes: ['Cardiovascular / Pulmonary' 'ENT - Otolaryngology' 'Gastroenterology'
 'Hematology - Oncology' 'Nephrology' 'Neurology' 'Neurosurgery'
 'Obstetrics / Gynecology' 'Ophthalmology' 'Orthopedic'
 'Pediatrics - Neonatal' 'Psychiatry / Psychology' 'Radiology']
