In [8]:
import pandas as pd
import numpy as np
import umap
import hdbscan
import torch
from transformers import AutoTokenizer, AutoModel

# Load the preprocessed data
data = pd.read_csv("../processedData/processedData.csv")

# Convert the timePeriod column to datetime format
data["timePeriod"] = pd.to_datetime(data["timePeriod"])

# Extract the processedText and timePeriod columns
docs = data["processedText"].tolist()
time_periods = data["timePeriod"].tolist()

# Initialize the BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True, output_hidden_states=True)
model.half()  # convert model to half-precision format

# Define a forward function for checkpointing
def forward(input_ids, attention_mask):
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    last_hidden_state = outputs.last_hidden_state
    attentions = outputs.attentions
    hidden_states = outputs.hidden_states
    return last_hidden_state, attentions, hidden_states

# Encode the documents using BERT with gradient checkpointing and half-precision format
batch_size = 16
outputs_list = []

for i in range(0, len(docs), batch_size):
    docs_batch = docs[i:i+batch_size]
    inputs = tokenizer(docs_batch, padding=True, truncation=True, return_tensors="pt")
    inputs = {k: v.half() for k, v in inputs.items()}  # convert input tensors to half-precision format
    outputs = model.checkpoint(forward, inputs["input_ids"], inputs["attention_mask"])
    outputs_list.append(outputs[0].detach().float().numpy())

outputs = np.concatenate(outputs_list)

# Project the document embeddings to a lower-dimensional space using UMAP
umap_model = umap.UMAP(n_components=2, random_state=42)
umap_embeddings = umap_model.fit_transform(outputs)

# Cluster the documents using HDBSCAN
hdbscan_model = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=50)
hdbscan_clusters = hdbscan_model.fit_predict(umap_embeddings)

# Create a new dataframe with the UMAP embeddings and HDBSCAN clusters
umap_df = pd.DataFrame(umap_embeddings, columns=["UMAP 1", "UMAP 2"])
umap_df["HDBSCAN Cluster"] = hdbscan_clusters
umap_df["Time Period"] = time_periods

# Plot the UMAP embeddings and HDBSCAN clusters by time period
import seaborn as sns
import matplotlib.pyplot as plt

sns.scatterplot(data=umap_df, x="UMAP 1", y="UMAP 2", hue="HDBSCAN Cluster", style="HDBSCAN Cluster", alpha=0.5)
plt.title("Topic Modeling Over Time")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'BertModel' object has no attribute 'checkpoint'