In [None]:
# Install necessary packages
!pip install transformers torch scikit-learn sentence-transformers matplotlib seaborn

# Import necessary libraries
import torch
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, silhouette_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set up plotting style
sns.set(style='whitegrid')

# Initialize the SciBERT model using SentenceTransformer.
# NOTE(review): this checkpoint is a plain transformer rather than a trained
# sentence-embedding model; presumably sentence-transformers adds a default
# pooling layer on top of it -- confirm that this is the intended setup.
model_name = "allenai/scibert_scivocab_uncased"
model = SentenceTransformer(model_name)

# Generate SciBERT embeddings for the entire dataset.
# encode() is called once with the full list of descriptions so the library
# can batch them, instead of paying per-call overhead for every single row.
# The result is a 2-D array with one embedding row per description.
descriptions = df['Processed_Description'].tolist()
embeddings = model.encode(descriptions)

# Convert labels to numpy array
y = df['MI_Incident'].values

# Split the dataset into training and testing sets.
# The split shuffles the rows, so positions inside X_test / y_test do not
# correspond to row positions in df.
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

# Sweep the number of neighbors K and record, for every K, the accuracy on the
# test split and a silhouette score of the predicted labels over the test
# embeddings (used by the Elbow / Silhouette plots).
# NOTE(review): K is tuned here against the same test split that is used for
# the final report, so the reported accuracy is optimistically biased; a
# validation split or cross-validation on X_train would avoid that.
k_values = range(1, 21)
accuracies = []
silhouette_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    knn.fit(X_train, y_train)
    y_preds = knn.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_preds))

    # No silhouette score is recorded for K = 1 (placeholder keeps the list
    # aligned with k_values).
    if k == 1:
        silhouette_scores.append(None)
        continue

    # silhouette_score raises ValueError unless the number of distinct labels
    # lies in [2, n_samples - 1]. A classifier that predicts a single class for
    # every test sample would otherwise abort the whole sweep, so record NaN
    # (rendered as a gap in the plot) for that K instead.
    n_labels = len(np.unique(y_preds))
    if 2 <= n_labels <= len(y_preds) - 1:
        silhouette_scores.append(silhouette_score(X_test, y_preds, metric='cosine'))
    else:
        silhouette_scores.append(np.nan)

# Elbow plot: test accuracy as a function of the number of neighbors K.
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 6))
elbow_ax.plot(k_values, accuracies, marker='o')
elbow_ax.set_xticks(k_values)  # one tick per evaluated K
elbow_ax.set_xlabel('Number of Neighbors K')
elbow_ax.set_ylabel('Accuracy')
elbow_ax.set_title('Elbow Plot: K vs. Accuracy')
elbow_ax.grid(True)
plt.show()

# Silhouette plot: the first entry (K = 1) is a None placeholder, so both the
# K axis and the scores are sliced from the second element onwards.
plotted_k = k_values[1:]
plotted_scores = silhouette_scores[1:]

sil_fig, sil_ax = plt.subplots(figsize=(12, 6))
sil_ax.plot(plotted_k, plotted_scores, marker='o')
sil_ax.set_xticks(plotted_k)
sil_ax.set_xlabel('Number of Neighbors K')
sil_ax.set_ylabel('Silhouette Score')
sil_ax.set_title('Silhouette Scores for Different K')
sil_ax.grid(True)
plt.show()

# Final model: K is fixed by hand to 5 (chosen by reading the plots above).
best_k = 5

# fit() returns the fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=best_k, metric='cosine').fit(X_train, y_train)
y_preds = knn.predict(X_test)

# Compute all evaluation metrics on the test split first ...
accuracy = accuracy_score(y_test, y_preds)
conf_matrix = confusion_matrix(y_test, y_preds)
class_report = classification_report(y_test, y_preds)

# ... then report them: accuracy, confusion matrix, per-class report.
print(f'Accuracy with K={best_k}: {accuracy * 100:.2f}%')

print("Confusion Matrix:")
print(conf_matrix)

print("Classification Report:")
print(class_report)

# Render the confusion matrix as an annotated heatmap.
# NOTE(review): the tick labels assume the matrix rows/columns are ordered
# Non-MI first, MI second -- confirm against the label encoding of MI_Incident.
class_names = ["Non-MI", "MI"]

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix")
plt.show()

# Identify misclassified sentences.
#
# train_test_split shuffles the rows, so position i in y_test / y_preds is NOT
# row i of df; indexing df with i would pair each prediction with an unrelated
# sentence. The split depends only on the number of samples, test_size and
# random_state, so repeating it on positional indices with the same settings
# as the embeddings split (test_size=0.2, random_state=42) recovers the
# original row position of every test sample. Keep these settings in sync with
# the split used to build X_test / y_test.
_, test_positions = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Sanity check: the labels at the recovered positions must match y_test,
# otherwise the split settings have diverged and the texts would be misaligned.
if not np.array_equal(df['MI_Incident'].values[test_positions], y_test):
    raise ValueError(
        "Recovered test-row positions do not match y_test; "
        "the split parameters are out of sync with the original split."
    )

test_texts = df['Processed_Description'].values[test_positions]

# Collect (sentence, predicted label, true label) for every wrong prediction.
misclassified = [
    (text, pred, true)
    for text, pred, true in zip(test_texts, y_preds, y_test)
    if pred != true
]

# Print the misclassified sentences with their predicted and true labels
print("Misclassified Sentences:\n")
for text, pred, true in misclassified:
    print(f"Sentence: {text}")
    print(f"Predicted Label: {pred}, True Label: {true}\n")
