# In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt
import numpy as np

# Load the adverse-event / cluster pairs.  Rows missing either the text or the
# label are dropped up front: a NaN in "Adverse Events" attached to a valid
# cluster would otherwise pass the label filter below and be handed to the
# sentence encoder as a float instead of a string.
df = pd.read_csv("cluster_adverse_event.csv")
df = df.dropna(subset=["Adverse Events", "Cluster"])

# Normalise the free text (trim + lowercase) and trim the cluster labels.
df["ae_text"] = df["Adverse Events"].str.strip().str.lower()
df["label"] = df["Cluster"].str.strip()

# Keep only labels that occur at least twice (presumably so the stratified
# train/test split later in the script has enough samples per class).
label_counts = df["label"].value_counts()
valid_labels = label_counts[label_counts >= 2].index
# .copy() gives an independent frame, so later column assignments do not act
# on a slice of the unfiltered data.
df = df[df["label"].isin(valid_labels)].copy()

# Sentence-BERT checkpoint used to embed the normalised adverse-event text.
model_name = "all-MiniLM-L6-v2"
sbert = SentenceTransformer(model_name)

# Feature matrix: the encoder output for every ae_text entry, in row order.
X = sbert.encode(
    df["ae_text"].tolist(),
    show_progress_bar=True,
)

# Integer class ids for the string cluster labels; `le` is kept so the ids can
# be mapped back to label names when reporting.
le = LabelEncoder()
y = le.fit_transform(df["label"])

# Stratified hold-out split (default test fraction) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Hyper-parameter grids searched for each candidate classifier.
rf_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
logreg_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}
xgb_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
}

# Display name -> (unfitted estimator, parameter grid).  Insertion order is the
# order in which the models are iterated later on.
models = {
    "Random Forest": (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        rf_grid,
    ),
    "Logistic Regression": (
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
        logreg_grid,
    ),
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        xgb_grid,
    ),
}

# Imported once here instead of being re-executed on every loop iteration.
from sklearn.utils.multiclass import unique_labels

# Hold-out metrics keyed by model display name.
metrics_summary = {}

for name, (model, param_grid) in models.items():
    print(f"\n🔵 Model: {name}")

    # 3-fold cross-validated grid search on the training split, ranked by
    # macro-averaged F1.
    grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, scoring='f1_macro', verbose=1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f" Best Parameters: {grid_search.best_params_}")

    y_pred = best_model.predict(X_test)

    # zero_division=0 yields the same scores as the default setting but without
    # an UndefinedMetricWarning for every class the model never predicts.
    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    metrics_summary[name] = {
        'Accuracy': acc,
        'F1 Macro': f1_macro,
        'F1 Weighted': f1_weighted
    }

    # Restrict the report to classes that actually occur in the hold-out labels
    # or the predictions, so target_names lines up with the reported rows.
    labels_in_test = unique_labels(y_test, y_pred)
    print("\nClassification Report:\n")
    print(classification_report(
        y_test,
        y_pred,
        labels=labels_in_test,
        target_names=le.inverse_transform(labels_in_test),
        zero_division=0,
    ))

# One row per model, one column per metric.
metrics_df = pd.DataFrame(metrics_summary).T

fig, axs = plt.subplots(1, 2, figsize=(14, 6))

# Left-to-right panel specs: (metric column, title, y-axis label, extra bar
# styling).  The accuracy panel uses the default bar colour.
panels = [
    ('Accuracy', 'Model Accuracy Comparison', 'Accuracy', {}),
    ('F1 Macro', 'Model F1 Macro Comparison', 'F1 Macro Score', {'color': 'lightblue'}),
]

for ax, (column, title, ylabel, bar_style) in zip(axs, panels):
    ax.bar(metrics_df.index, metrics_df[column], edgecolor='black', **bar_style)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_ylim(0, 1)
    # Write each score just above its bar.
    for i, v in enumerate(metrics_df[column]):
        ax.text(i, v + 0.01, f"{v:.2f}", ha='center', fontsize=10)

plt.tight_layout()
plt.show()
