In [None]:
# Install necessary packages
!pip install transformers torch scikit-learn plotly

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Device configuration: use the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and SciBERT model
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# `.to()` and `.eval()` both return the module itself, so the chain leaves
# `model` on `device` and in evaluation mode.
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(device).eval()

# Tokenization and Embedding Generation Function
def get_embeddings(texts, model, tokenizer, max_len=128, batch_size=32):
    """Embed each text as the final-layer hidden state of its first token.

    Texts are tokenized and passed through ``model`` in mini-batches with
    gradient tracking disabled. With a BERT-style tokenizer and special
    tokens enabled, the first position holds the ``[CLS]`` token.

    Args:
        texts: Iterable of strings to embed (list, NumPy array, Series, ...).
        model: Model whose output exposes ``last_hidden_state``. Inputs are
            moved to the device the model's parameters live on; the model's
            train/eval mode is left unchanged.
        tokenizer: Callable Hugging Face-style tokenizer matching ``model``.
        max_len: Maximum number of tokens per text; longer inputs are
            truncated.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A list with one 1-D NumPy array per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so slicing works for arrays, Series and generators.
    texts = list(texts)

    # Follow the device of the model that was actually passed in instead of
    # relying on a module-level `device` global that may not match it.
    model_device = next(model.parameters()).device

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]

        # Pad only to the longest text in the batch: the attention mask hides
        # padding from the model, so shorter padding just saves compute.
        inputs = tokenizer(
            batch,
            add_special_tokens=True,
            max_length=max_len,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        input_ids = inputs['input_ids'].to(model_device)
        attention_mask = inputs['attention_mask'].to(model_device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Position 0 of every sequence -> (batch, hidden); extending the list
        # with the 2-D array appends one 1-D row per text.
        first_token = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(first_token)

    return embeddings

# Load your data (replace 'df' with your DataFrame)
# NOTE: `df` is not created anywhere in this script; it must already exist.
X = df['Processed_Description'].values
y = df['MI_Incident'].values

# Generate SciBERT embeddings (one vector per description)
embeddings = get_embeddings(X, model, tokenizer)

# Convert to DataFrame for easy manipulation: one row per text, one column
# per embedding dimension
embeddings_df = pd.DataFrame(embeddings)
n_samples, n_features = embeddings_df.shape

# Perform PCA to reduce dimensionality to 2D.
# random_state only matters when scikit-learn selects a randomized solver;
# fixing it keeps the plots reproducible, matching the t-SNE step below.
pca_2d = PCA(n_components=2, random_state=42)
pca_2d_result = pca_2d.fit_transform(embeddings_df)

# Perform PCA to reduce dimensionality to 3D
pca_3d = PCA(n_components=3, random_state=42)
pca_3d_result = pca_3d.fit_transform(embeddings_df)

# Plot 2D PCA with Plotly (labels cast to str so they are treated as
# discrete categories rather than a continuous color scale)
pca_2d_df = pd.DataFrame(pca_2d_result, columns=['PCA1', 'PCA2'])
pca_2d_df['label'] = y
fig_2d = px.scatter(pca_2d_df, x='PCA1', y='PCA2', color=pca_2d_df['label'].astype(str),
                    title="2D PCA Visualization of SciBERT Embeddings",
                    labels={'color': 'Label'})
fig_2d.show()

# Plot 3D PCA with Plotly
pca_3d_df = pd.DataFrame(pca_3d_result, columns=['PCA1', 'PCA2', 'PCA3'])
pca_3d_df['label'] = y
fig_3d = px.scatter_3d(pca_3d_df, x='PCA1', y='PCA2', z='PCA3', color=pca_3d_df['label'].astype(str),
                       title="3D PCA Visualization of SciBERT Embeddings",
                       labels={'color': 'Label'})
fig_3d.show()

# Perform PCA to reduce dimensionality before applying t-SNE.
# Target 50 dimensions, but never more than the data can support:
# n_components may not exceed min(n_samples, n_features).
pca = PCA(n_components=min(50, n_samples, n_features), random_state=42)
pca_result = pca.fit_transform(embeddings_df)

# Apply t-SNE for 2D visualization.
# The iteration count is left at the library default of 1000: the keyword was
# `n_iter` before scikit-learn 1.5 and is `max_iter` afterwards (`n_iter` was
# removed in 1.7), so naming it explicitly breaks on one side or the other.
# Perplexity must be smaller than the number of samples.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_samples - 1))
tsne_result = tsne.fit_transform(pca_result)

# Create a DataFrame with t-SNE results and labels
tsne_df = pd.DataFrame(tsne_result, columns=['TSNE1', 'TSNE2'])
tsne_df['label'] = y

# Visualize using Plotly
fig_tsne = px.scatter(tsne_df, x='TSNE1', y='TSNE2', color=tsne_df['label'].astype(str),
                      title="t-SNE Visualization of SciBERT Embeddings",
                      labels={'color': 'Label'})
fig_tsne.show()
