# SBERT Clustering Directions
1.   Select "Runtime" at the top menu >> "Change Runtime Type" >> "GPU"
2.   Run the first code cell below by clicking the icon on the left of the cell (a triangle inside a circle). You'll only need to run this once at the beginning of your session.
3.   To upload your data, click the "Files" icon on the left menu >> "Upload to session storage" icon and upload your spreadsheet.
4.   Follow the directions below (starting at the second code cell) to run SBERT Clustering.



In [None]:
# install sentence transformers
!pip install -U sentence-transformers

# load libraries
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from google.colab import files

# the pretrained model that SBERT embeddings are computed with
# (created once at module level; SBERTCluster calls embedder.encode() on this shared object)
# NOTE(review): the sentence-transformers model listing appears to mark
# 'bert-base-nli-mean-tokens' as deprecated in favour of newer models -- confirm before reuse
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# function for SBERT Cluster
def SBERTCluster(df, k, text_column='final_reviewed_text', random_state=None):
    """Cluster the text in one column of ``df`` using SBERT embeddings and k-means.

    Args:
        df: pandas DataFrame holding the text to cluster. A 'Cluster' column
            is added to (or overwritten on) this dataframe in place.
        k: number of k-means clusters to fit.
        text_column: name of the column that holds the text. Defaults to
            'final_reviewed_text'; pass another name instead of renaming
            the spreadsheet column.
        random_state: forwarded to KMeans. The default (None) leaves the
            initialisation unseeded, so repeated runs may give different
            clusters; pass an int for reproducible results.

    Returns:
        A tuple ``(cluster_df, df)``:
          * cluster_df -- one row per cluster with the columns 'Cluster'
            (1-based cluster number), 'Sentence_Count' and 'Text' (the list
            of texts assigned to that cluster, in their original order).
          * df -- the input dataframe, now carrying the 1-based 'Cluster'
            number for every row.
    """
    # Bracket access works for any column name (spaces, names that clash
    # with DataFrame attributes), unlike attribute access.
    corpus = list(df[text_column])

    # Generate sentence embeddings with SBERT (module-level `embedder`)
    corpus_embeddings = embedder.encode(corpus)

    # Perform k-means clustering on the embeddings
    clustering_model = KMeans(n_clusters=k, random_state=random_state)
    clustering_model.fit(corpus_embeddings)
    # one 0-based cluster label per embedded sentence, in corpus order
    cluster_assignment = clustering_model.labels_

    # Group the sentences by their assigned cluster
    clustered_sentences = [[] for _ in range(k)]
    for sentence, cluster_id in zip(corpus, cluster_assignment):
        clustered_sentences[cluster_id].append(sentence)

    # One summary row per cluster (cluster numbers are reported 1-based)
    cluster_df = pd.DataFrame([
        {
            'Cluster': cluster_number,
            'Sentence_Count': len(sentences),
            'Text': sentences,
        }
        for cluster_number, sentences in enumerate(clustered_sentences, start=1)
    ])

    # Save the cluster number for each row in the original dataframe
    # (adding 1 so it matches the 1-based numbers in cluster_df)
    df['Cluster'] = cluster_assignment + 1

    return cluster_df, df



In [None]:
# install sentence transformers
!pip install -U sentence-transformers

# load libraries
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from google.colab import files

# the pretrained model that SBERT embeddings are computed with
# (created once at module level; SBERTCluster calls embedder.encode() on this shared object)
# NOTE(review): the sentence-transformers model listing appears to mark
# 'bert-base-nli-mean-tokens' as deprecated in favour of newer models -- confirm before reuse
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

def SBERTCluster(df, k, text_column='final_reviewed_text', random_state=42):
    """Cluster text with SBERT embeddings + k-means and report silhouette scores.

    Args:
        df: pandas DataFrame holding the text to cluster. The columns
            'Cluster' and 'Silhouette_Score' are added to (or overwritten
            on) this dataframe in place.
        k: number of k-means clusters to fit. The silhouette coefficient is
            only defined for 2 <= number of clusters <= number of rows - 1
            (see the scikit-learn documentation), so k must be at least 2.
        text_column: name of the column that holds the text. Defaults to
            'final_reviewed_text'.
        random_state: seed forwarded to KMeans. Defaults to 42 so repeated
            runs use the same k-means initialisation; pass None for an
            unseeded run.

    Returns:
        A tuple ``(cluster_df, df, avg_silhouette)``:
          * cluster_df -- one row per cluster with the columns 'Cluster'
            (1-based cluster number), 'Sentence_Count' and 'Text' (the list
            of texts assigned to that cluster, in their original order).
          * df -- the input dataframe with the 1-based 'Cluster' number and
            the per-row 'Silhouette_Score'.
          * avg_silhouette -- mean silhouette score over all rows.
    """
    # Bracket access works for any column name, unlike attribute access.
    corpus = list(df[text_column])

    # Generate SBERT embeddings for the corpus (module-level `embedder`)
    corpus_embeddings = embedder.encode(corpus)

    # Perform k-means clustering
    clustering_model = KMeans(n_clusters=k, random_state=random_state)
    clustering_model.fit(corpus_embeddings)
    # one 0-based cluster label per sentence, in corpus order
    cluster_assignment = clustering_model.labels_

    # Group the sentences by their assigned cluster
    clustered_sentences = [[] for _ in range(k)]
    for sentence, cluster_id in zip(corpus, cluster_assignment):
        clustered_sentences[cluster_id].append(sentence)

    # One summary row per cluster (cluster numbers are reported 1-based)
    cluster_df = pd.DataFrame([
        {
            'Cluster': cluster_number,
            'Sentence_Count': len(sentences),
            'Text': sentences,
        }
        for cluster_number, sentences in enumerate(clustered_sentences, start=1)
    ])

    # Save the cluster number for each sentence in the original dataframe
    # (adding 1 so it matches the 1-based numbers in cluster_df)
    df['Cluster'] = cluster_assignment + 1

    # Compute the per-sample silhouette values once and average them.
    # silhouette_score() is the mean of silhouette_samples(), so calling both
    # would repeat the expensive pairwise-distance computation.
    sample_silhouette_values = silhouette_samples(corpus_embeddings, cluster_assignment)
    avg_silhouette = float(sample_silhouette_values.mean())

    # Add the per-sample silhouette scores back to the dataframe
    df['Silhouette_Score'] = sample_silhouette_values

    print("Average silhouette score: {:.3f}".format(avg_silhouette))

    return cluster_df, df, avg_silhouette



In [None]:
# Load in data (change to the name/path of your spreadsheet file)
# if it's a csv file: uncomment the line below and comment out the pd.read_excel line further down
# df = pd.read_csv('CHANGE TO NAME OF YOUR SPREADSHEET.csv')
# otherwise, for an excel file stored on Google Drive: mount Drive, then read the file with pd.read_excel
from google.colab import drive
drive.mount('/content/drive')

# the spreadsheet needs a 'final_reviewed_text' column -- that is the column SBERTCluster reads
df = pd.read_excel('/content/drive/MyDrive/Holocaust and Genocide Studies Digital Research Lab/Current Projects/Aditya and Billy (URW)/Final Disambiguated Questions.xlsx')

# Run SBERT Clustering on your data
# - change "k" depending on how many clusters you want
# - depending on the size of your spreadsheet and the number of clusters you define, this step may take 1-15ish minutes
# - the SBERTCluster version that prints the silhouette score fixes the k-means random_state,
#   so re-running it on the same data is expected to give the same clusters; the earlier version
#   without random_state may produce new results each time it is run
# - results is the tuple returned by SBERTCluster (it also adds its result columns to df itself)
results = SBERTCluster(df, k=95)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Average silhouette score: 0.119


In [None]:
# see the results of SBERT Clustering
# results is the whole tuple returned by SBERTCluster; to view one table, index it:
#   results[0] = one row per cluster
#   results[1] = one row per question, with its cluster number
results

(    Cluster  Sentence_Count                                               Text
 0         1              44  [Yes… . Do they write here to… ’Huit et quaran...
 1         2              85  [ Mr. Barzilai, can you tell me if you were bo...
 2         3             172  [How old were you when the Soviets came to Est...
 3         4             148  [Did you have a ration card?, Did the Spaniard...
 4         5             899  [Yes?, Well?, Yes. And?, Yes, and?, Yes?, Yes....
 ..      ...             ...                                                ...
 90       91             208  [Were you born in Portugal?,  Now, Jürgen, wil...
 91       92             104  [Who was in the forest during the deportations...
 92       93             274  [Why is it so 'of course' that the Jewish Comm...
 93       94             119  [What do you mean when you said you ate dirt f...
 94       95             174  [Did you go over to the side where the women a...
 
 [95 rows x 3 columns],
        file_p

In [None]:
# Download your cluster spreadsheet
# change to the file name you want the spreadsheet to have
download_file = 'DATA_Clusters.xlsx'

# change the 0 to 1 in the results[0] line below, depending on which spreadsheet you want to download
# 0 = spreadsheet where each row is a cluster
# 1 = spreadsheet where each row is a question (with its corresponding cluster)
# (if SBERTCluster also returned the average silhouette score, that is results[2] -- a number, not a spreadsheet)
results[0].to_excel(download_file)
files.download(download_file)

# the file is also written to session storage, so you can download it from the "Files" panel
# (the "Files" icon on the left menu) by hovering over the spreadsheet and clicking the three dots

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>