In [1]:
import pandas as pd

# Load the CSV file into a DataFrame to be used in the clustering process
df = pd.read_csv('Data/Partitioned_Abstracts.csv', sep=',', header=0, encoding='utf-8')

#verify that the DataFrame is loaded correctly
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1228 entries, 0 to 1227
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Title                 1228 non-null   object
 1   Year                  1228 non-null   int64 
 2   Authors               1228 non-null   object
 3   Label                 1228 non-null   object
 4   Partitioned Abstract  1228 non-null   object
 5   Target                1228 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 57.7+ KB


In [2]:
from sentence_transformers import SentenceTransformer

#Create the combined text by concatenating the "Title" and "Partitioned Abstract" columns
combined_texts = df["Title"].astype(str) + "\n\n" + df["Partitioned Abstract"].astype(str)
# Convert the "Target" column to integer type for classification purposes
targets = df["Target"].astype(int)

# Load the SentenceTransformer model for generating embeddings
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Define the task type for the model
task = "clustering"

# Encode the combined texts using the SentenceTransformer model
embeddings = model.encode(
    combined_texts.tolist(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
)
embeddings.shape

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

torch.Size([1228, 1024])

In [None]:
import evoc
import torch

# Initialize the EVoC clustering algorithm
clusterer = evoc.EVoC(
    #min_num_clusters=10
    n_neighbors = 60
)
# Fit the EVoC clustering algorithm to the embeddings and predict the cluster labels
labels = clusterer.fit_predict(embeddings.cpu().to(dtype=torch.float32).numpy())


In [51]:
import evoc
import torch
import pandas as pd
import numpy as np
import umap.umap_ as umap
import plotly.express as px
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# Run UMAP to reduce the dimensionality of the embeddings to 3D for visualization
umap_3d = umap.UMAP(
    n_components=3,
    random_state=42,
    n_neighbors=5,
    min_dist=0.0,
    metric='cosine'
).fit_transform(embeddings.cpu().to(dtype=torch.float32).numpy())

# Prepare the DataFrame for plotting
df_plot = pd.DataFrame(umap_3d, columns=["x", "y", "z"])
df_plot["Cluster"] = labels
df_plot["Title"] = df["Title"]
df_plot["FullText"] = combined_texts  # List of concatenated title+abstract, or similar

# Generate topic labels with BERTopic
topic_labels = {}
vectorizer = CountVectorizer(
    stop_words=None,
    lowercase=True,
    max_features=3000
)
cluster_texts = df_plot[df_plot["Cluster"] != -1].groupby("Cluster")["FullText"].apply(list)


for label, texts in cluster_texts.items():
    try:
        topic_model = BERTopic(
            vectorizer_model=vectorizer,
            calculate_probabilities=False,
            # representation_model=KeyBERTInspired(),
            verbose=False
        )
        topics, _ = topic_model.fit_transform(texts)
        
        

        # FIXED: convert dict_keys to list before indexing
        top_topic_id = list(topic_model.get_topics().keys())[0]
        top_words = topic_model.get_topic(top_topic_id)

        label_text = ", ".join([word for word, _ in top_words[:3]])
        topic_labels[label] = label_text
    except Exception as e:
        print(f"[WARN] Cluster {label} failed: {e}")
        topic_labels[label] = f"Cluster {label}"

# 🔧 Assign ClusterLabel using the topic labels
df_plot["ClusterLabel"] = df_plot["Cluster"].apply(
    lambda x: "Noise" if x == -1 else topic_labels.get(x, f"Cluster {x}")
)


# 🎨 Create a color map
unique_labels = df_plot["ClusterLabel"].unique()
base_colors = px.colors.qualitative.Dark2
color_map = {}
color_index = 0

for label in unique_labels:
    if label == "Noise":
        color_map[label] = "darkgrey"
    else:
        color_map[label] = base_colors[color_index % len(base_colors)]
        color_index += 1

# Sort so noise is plotted last
df_plot = df_plot.sort_values(by="Cluster", key=lambda col: col == -1)

# 📊 Create interactive 3D scatter plot
fig = px.scatter_3d(
    df_plot,
    x="x", y="y", z="z",
    color="ClusterLabel",
    color_discrete_map=color_map,
    hover_name="Title",
    title="EVoC Clusters in 3D",
    opacity=0.8
)

fig.update_layout(scene=dict(
    xaxis_title="X",
    yaxis_title="Y",
    zaxis_title="Z",
    bgcolor="lightgrey"
))


fig.write_html("evoc_clusters_plot.html")
print("Plot saved as 'evoc_clusters_plot.html'")



Plot saved as 'evoc_clusters_plot.html'


In [30]:
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
import numpy as np

# Filter to remove noise
mask = labels != -1

# Ground truth
true_labels = df["Target"].values  # shape: (1228,)

cluster_labels = labels[mask]
gt_labels = true_labels[mask]

assert len(cluster_labels) == len(gt_labels)

# Hungarian matching to align clusters with ground truth
def align_clusters(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    aligned = np.array([mapping.get(p, -1) for p in y_pred])
    return aligned

aligned_preds = align_clusters(gt_labels, cluster_labels)

# Compute Cohen's Kappa
kappa = cohen_kappa_score(gt_labels, aligned_preds)
print(f"Cohen’s Kappa (EVoC vs. Target): {kappa:.4f}")

Cohen’s Kappa (EVoC vs. Target): 0.7922
