In [None]:
import pandas as pd
import numpy as np
import wordninja
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from pprint import pprint
import os
import re

In [None]:
# Fetch the NLTK data packages used by the cleaning helpers below:
#   punkt / punkt_tab -> tokenizer models behind `word_tokenize`
#                        (recent NLTK releases look up `punkt_tab` instead of `punkt`)
#   stopwords         -> the English stop-word list
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
def get_statuses(fname: str):
    """Parse the bill status and its date out of a summary file name.

    The name is expected to end with
    ``_H.R.<digits>_<status> (<NN_NN_NNNN>).txt``.

    Args:
        fname: File name (not the file content) to parse.

    Returns:
        A ``(status, date)`` tuple of stripped strings, where ``date`` keeps the
        underscore-separated form found in the name, or ``None`` when the name
        does not follow the expected pattern.
    """
    # `\d+` rather than a fixed width so bill numbers of any length are parsed.
    # NOTE(review): the dots in `H.R.` are unescaped and therefore match any
    # character; left as-is in case some file names use another separator.
    pattern = re.compile(
        r'_H.R.\d+_(?P<status>.+?) \((?P<date>\d{2}_\d{2}_\d{4})\)\.txt$'
    )

    m = pattern.search(fname)
    if m is None:
        # Explicit so callers can see that a non-matching name yields None.
        return None
    return m.group('status').strip(), m.group('date').strip()

In [None]:
def remove_boilerplate(text: str) -> str:
    """Strip the leading "Shown Here:" banner and every parenthesised span.

    Args:
        text: Raw summary text.

    Returns:
        The stripped text, or ``None`` when nothing is left after stripping
        (note that the ``-> str`` annotation does not reflect the ``None`` case).
    """
    # Banner: "Shown Here:" at the very start of the text, up to and including
    # the first "(NN/NN/NNNN)" date that follows on the same line.
    banner_re = re.compile(r'^Shown Here:.*?\((\d{2}/\d{2}/\d{4})\)')
    # Any "( ... )" group with no closing parenthesis inside it.
    parenthetical_re = re.compile(r'\([^)]*\)')

    without_banner = banner_re.sub('', text)
    stripped = parenthetical_re.sub('', without_banner)
    return stripped or None

In [None]:
def remove_divisions_heading(text: str) -> str:
    """Remove ``DIVISION X--ALL CAPS TITLE`` headings from ``text``.

    Only whole all-caps words are treated as part of the title, so the capital
    letter that starts the following sentence is left in place, and a heading
    that directly follows another one is removed as a separate heading.

    Args:
        text: Text that may contain division headings.

    Returns:
        ``text`` with every heading (and the separators after it) removed.
    """
    header_re = re.compile(
        r'DIVISION\s+'                # the word DIVISION and some space
        r'[A-Z]--\s*'                 # one uppercase letter, two hyphens
        r'(?:'
        r'(?!DIVISION\s+[A-Z]--)'     # stop before the next heading
        r'[A-Z]+\b[,\s]*'             # a whole all-caps word plus commas/space
        r')+'
    )
    # Limitation: a one-letter capitalised word right after the title
    # (e.g. "A") is indistinguishable from a title word and is removed too.
    return header_re.sub('', text)

In [None]:
def remove_stopwords(text : str) -> str:
    """Tokenise ``text`` and keep lower-cased alphabetic, non-stop-word tokens.

    Division headings are removed before tokenising.

    Returns:
        A list of lower-cased tokens, or ``None`` when no token survives
        (the ``-> str`` annotation does not reflect either case).
    """
    english_stops = set(stopwords.words('english'))
    body = remove_divisions_heading(text)

    kept = []
    for token in word_tokenize(body):
        lowered = token.lower()
        # Drop stop words as well as anything that is not purely alphabetic
        # (punctuation, numbers, mixed tokens).
        if lowered in english_stops or not token.isalpha():
            continue
        kept.append(lowered)

    return kept or None


In [None]:
# Directory (relative to the working directory) that the loading cell scans
# for summary text files.
folder_path = 'summaries118'
# Start from an empty frame; the loading cell below populates `data`.
data = pd.DataFrame()

In [None]:
# Load every summary file in `folder_path` into `data`: one row per file with
# the status/date parsed from the file name and the raw file text.
records = []       # one dict per successfully parsed file
row_labels = []    # enumerate position of each kept entry, used as the row index
skipped = []       # file names that do not follow the expected naming scheme

# `with` closes the directory handle. Sorting by name makes the row order
# independent of the order in which the OS happens to list the directory.
with os.scandir(folder_path) as entries:
    ordered_entries = sorted(entries, key=lambda e: e.name)

for idx, entry in enumerate(ordered_entries):

    if not entry.is_file():
        continue

    parsed = get_statuses(entry.name)
    if parsed is None:
        # Stray files (hidden/system files, notes, ...) would otherwise make
        # the tuple unpacking below raise TypeError.
        skipped.append(entry.name)
        continue
    status, date = parsed

    with open(entry.path, 'r', encoding='utf-8') as f:
        content = f.read()

    records.append({
        'status': status,
        'date': date.replace('_', '/'),
        'content': content
    })
    row_labels.append(idx)

# Build the frame once instead of concatenating inside the loop; this also
# makes the cell safe to re-run without duplicating rows.
data = pd.DataFrame(records, index=row_labels, columns=['status', 'date', 'content'])

if skipped:
    print(f'Skipped {len(skipped)} file(s) with unrecognised names: {skipped}')

In [None]:
def clean_text(content: str) -> str:
    """Run the full cleaning pipeline on one document.

    Boilerplate is stripped first; what remains is tokenised with stop words
    removed.

    Returns:
        The list of tokens produced by ``remove_stopwords``, or ``None`` when
        either step leaves nothing (the ``-> str`` annotation does not reflect
        either case).
    """
    stripped = remove_boilerplate(content)
    if stripped:
        return remove_stopwords(stripped) or None
    return None

In [None]:
# Token list per document (None where cleaning leaves nothing).
data['cleaned_content'] = data['content'].map(clean_text)

# **1. Topic Modeling**

## **A. LDA**

In [None]:
# Keep only documents that survived cleaning: `clean_text` yields None when no
# usable token is left, and gensim cannot iterate over None. Note that the
# resulting list (and `corpus`) can therefore be shorter than `data`.
tokenized_docs = [doc for doc in data['cleaned_content'] if isinstance(doc, list) and doc]

dictionary = Dictionary(tokenized_docs)

# `no_below` is an absolute number of documents, whereas `no_above` is a
# fraction of the corpus. The 0.01 used here reads as a 1% threshold, so it is
# converted into a document count (at least 1).
min_doc_count = max(1, int(0.01 * len(tokenized_docs)))
dictionary.filter_extremes(no_below=min_doc_count, no_above=0.25)

# Built from the same list as the dictionary so both see the same documents.
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Looking up any id makes gensim build its lazily-populated `id2token`
# mapping, which is read on the next line.
temp = dictionary[0]

id2word = dictionary.id2token

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus built above.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,    # id -> token mapping used to label topic words
    num_topics=10,
    iterations=1000,
    random_state=42,    # fixed seed for repeatable topics
)

In [None]:
# Text summary of the LDA topics, limited to 10 words per topic.
topics = model.print_topics(num_words=10)
topics

In [None]:
# Number of topic summaries returned by the LDA model.
len(topics)

In [None]:
# Inspect the first entry of the LDA topic summaries.
topics[0]

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# Render the interactive pyLDAvis view of the LDA model inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the trained model, its corpus and dictionary.
vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(vis)

## **B. BERTopic**

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# 1) Start from a pretrained sentence-embedding model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2) Build training pairs: each document is paired with the row that follows it
#    in `data`, with a fixed similarity target of 0.8.
# NOTE(review): row order comes from the directory listing, so these pairs are
# not known to be similar -- confirm this is the intended supervision signal.
examples = [
    InputExample(texts=[doc_a, doc_b], label=0.8)
    for doc_a, doc_b in zip(data['content'].tolist(), data['content'][1:].tolist())
]
loader = DataLoader(examples, batch_size=16, shuffle=True)

# 3) Choose a loss (e.g. cosine similarity)
# The loss has to wrap the SentenceTransformer being fine-tuned, not `model`,
# which is the gensim LDA model from the LDA section.
train_loss = losses.CosineSimilarityLoss(embedding_model)

# 4) Fine-tune
embedding_model.fit(train_objectives=[(loader, train_loss)], epochs=2, warmup_steps=100)

In [None]:
# Token counting used for the topic word representations.
# NOTE(review): BERTopic appears to fit this vectorizer on one merged document
# per topic, so `min_df`/`max_df` would count topics rather than original
# documents -- confirm these thresholds still work when few topics are found.
vectorizer_model = CountVectorizer(
    ngram_range=(1,2),      # unigrams and bigrams
    stop_words="english",
    min_df=5,
    max_df=0.75,
)

In [None]:
# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(
    n_neighbors=30,
    n_components=10,
    min_dist=0.1,
    metric='cosine',
    random_state=42,    # fixed seed for repeatable projections
)

In [None]:
# Density-based clustering of the reduced embeddings; each cluster becomes a topic.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    cluster_selection_epsilon=0.0,
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

In [None]:
# Assemble the BERTopic pipeline from the components configured above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

In [None]:
# BERTopic is fitted on the raw file text (`content`), not on `cleaned_content`.
docs = data['content'].tolist()
# NOTE: this rebinds `topics`, which held the LDA topic summaries in the LDA section.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Size of the `topics` result returned by `fit_transform`.
len(topics)

In [None]:
# Display the full `topics` result returned by `fit_transform`.
topics

In [None]:
# Build BERTopic's topic visualisation and save it as an HTML file in the
# current working directory.
fig = topic_model.visualize_topics()
fig.write_html("./topics.html")

# **2. Semantic Similarity**