In [1]:
"""
Load the pre-trained BERT model and tokenizer using a library such as Hugging Face's Transformers.
"""
import torch
from transformers import AutoTokenizer, AutoModel


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
"""
Load the text data and split it into sentences using a library such as NLTK.
"""
import nltk

with open("data/simple_text.txt", "r") as f:
    transcript = f.read()

sentences = nltk.sent_tokenize(transcript)
sentences

['Chairman Wormsley: Each of you has received the agenda.',
 'I will entertain a motion that the agenda be approved.',
 'Commissioner Brown: So moved.',
 'Commissioner Hobbs: Seconded\n\nChairman Wormsley: It has been moved and seconded that the agenda be approved as received by the members.',
 'All those in favor signify by saying "Aye"?...Opposed by saying "No"?...The agenda is approved.',
 'You have received a copy of the minutes of the last meeting.',
 'Are there any corrections or additions to the meeting?',
 'Commissioner McCroskey: Mister Chairman, my name has been omitted from the Special Committee on Indigent Care.',
 'Chairman Wormsley: Thank you.',
 'If there are no objections, the minutes will be corrected to include the name of Commissioner McCroskey.',
 'Will the clerk please make this correction.',
 'Any further corrections?',
 'Seeing none, without objection the minutes will stand approved as read.',
 '(This is sort of a short cut way that is commonly used for approval 

In [3]:
"""
Generate BERT embeddings for each sentence by passing them through the pre-trained BERT model.
"""
embeddings = []
for sentence in sentences:
    input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    embeddings.append(last_hidden_states.mean(dim=1).squeeze().detach().numpy())

In [4]:
len(embeddings)

91

In [5]:
len(embeddings[0])

768

In [6]:
"""
Use a clustering algorithm, to group the sentences into clusters based on their embeddings.
"""
from sklearn.cluster import AgglomerativeClustering

clusterer = AgglomerativeClustering(n_clusters=5, affinity='cosine', linkage='average')
clusters = clusterer.fit_predict(embeddings)




In [14]:
"""
Apply heuristics to merge or split the clusters into coherent and meaningful topical segments.
"""
segments = []
prev_cluster = clusters[0]
prev_idx = 0
segment_ids = []

for i in range(1, len(clusters)):
    if clusters[i] != prev_cluster:
        segments.append(" ".join(sentences[prev_idx:i]))
        segment_ids.append((prev_idx, i))
        prev_cluster = clusters[i]
        prev_idx = i

segments.append(" ".join(sentences[prev_idx:]))
segment_ids.append((prev_idx, len(sentences)))

print('\n\n<-- Topic Change -->\n\n'.join(segments[:5]))

Chairman Wormsley: Each of you has received the agenda. I will entertain a motion that the agenda be approved.

<-- Topic Change -->

Commissioner Brown: So moved. Commissioner Hobbs: Seconded

Chairman Wormsley: It has been moved and seconded that the agenda be approved as received by the members.

<-- Topic Change -->

All those in favor signify by saying "Aye"?...Opposed by saying "No"?...The agenda is approved. You have received a copy of the minutes of the last meeting. Are there any corrections or additions to the meeting? Commissioner McCroskey: Mister Chairman, my name has been omitted from the Special Committee on Indigent Care.

<-- Topic Change -->

Chairman Wormsley: Thank you.

<-- Topic Change -->

If there are no objections, the minutes will be corrected to include the name of Commissioner McCroskey.


In [8]:
from scipy.spatial.distance import cdist
import numpy as np

# Calculate the centroid sentence for each cluster
centroid_sentences = []
for start, end in set(segment_ids):
    sentences_in_cluster = sentences[start: end]
    sentence_embeddings = np.array(embeddings[start: end])
    centroid_idx = cdist([s for s in sentence_embeddings], [sentence_embeddings.mean(axis=0)], 'euclidean').argmin()
    centroid_sentences.append(sentences_in_cluster[centroid_idx])

# Print the topic and its corresponding headline or summary
for i, centroid_sentence in enumerate(centroid_sentences):
    print(f"Topic {i+1}: {centroid_sentence}")

Topic 1: Commissioner Adams: Move for a roll call vote.
Topic 2: Nine votes for, nine votes against, one not voting.
Topic 3: Commissioner Carmical.
Topic 4: Chairman Wormsley: Each of you has received the agenda.
Topic 5: Chairman Wormsley: Thank you.
Topic 6: You should vote for this motion if you wish to cut off further debate of the wheel tax increase at this point.
Topic 7: Chairman Wormsley: This resolution has a motion and second.
Topic 8: Commissioner Carmical: I second the motion.
Topic 9: The increase fails.
Topic 10: Will the clerk please take the vote.
Topic 11: Commissioner Hailey.
Topic 12: Chairman Wormsley: Commissioner Thompson

Commissioner Thompson: I second.
Topic 13: Commissioner McCroskey: Mister Chairman, my name has been omitted from the Special Committee on Indigent Care.
Topic 14: Commissioner Carmical: There will be a chili supper at County Elementary School on August 16 at 6:30 p.m. Everyone is invited.
Topic 15: Since no member is seeking recognition, are t