In [20]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from pathlib import Path
import os


# Get the project root directory and data path
project_root = Path.cwd().parent  # Go up one level from notebooks directory
data_dir = project_root / "data" / "actions"

# Get the first text file
text_files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
first_file = os.path.join(data_dir, text_files[0])

# Read the contents
with open(first_file, 'r') as f:
    text = f.read()



# Initialize the SBERT model - using a larger, more accurate model
model = SentenceTransformer('all-mpnet-base-v2')

# Split document into sentences (this example assumes sentences are separated by newlines)
sentences = text.split(".")
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Generate embeddings for each sentence
sentence_embeddings = model.encode(sentences)

# Perform clustering on sentence embeddings to find semantically similar sentences
# Adjust `n_clusters` or `distance_threshold` based on desired chunking
clustering_model = AgglomerativeClustering(n_clusters=8)
clustering_model.fit(sentence_embeddings)

# Group sentences by cluster
clusters = {}
for sentence_id, cluster_id in enumerate(clustering_model.labels_):
    if cluster_id not in clusters:
        clusters[cluster_id] = []
    clusters[cluster_id].append(sentences[sentence_id])

# # Display the semantic chunks
# for cluster_id, cluster_sentences in clusters.items():
#     print(f"Chunk {cluster_id + 1}:")
#     for sentence in cluster_sentences:
#         print(f" - {sentence}")
#     print("\n")


# from rake_nltk import Rake
# # Initialize RAKE
# rake = Rake()

# # Display the semantic chunks with keyword topics
# for cluster_id, cluster_sentences in clusters.items():
#     # Join all sentences in cluster
#     cluster_text = " ".join(cluster_sentences)
    
#     # Extract keywords
#     rake.extract_keywords_from_text(cluster_text)
#     keywords = rake.get_ranked_phrases()[:3]  # Get top 3 keyword phrases
#     topic = " | ".join(keywords)
    
#     print(f"\nChunk {cluster_id + 1} - Keywords: {topic}")
#     print("-" * 80)
#     for sentence in cluster_sentences:
#         print(f" - {sentence}")
#     print()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
from openai import OpenAI
client = OpenAI()

def get_topic_summary(text):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """You are a topic modeling assistant which determines the main message from a set of similar sentences extracted from a Defra policy document. 
                Generate a very brief topic that captures the main theme of the text.
                
                Example:
                Input: ' - 5 years £215 per hectare (ha) per year This actions aim is that there’s a well-managed, intact grass sward growing over the historic or archaeological feature throughout the year, with minimal scrub cover and bare ground'
                Output: 'Pricing and Policy Aims'
                """
            },
            {
                "role": "user",
                "content": f"Generate a brief topic for this text: {text}"
            }
        ],
        temperature=0.3,
        max_tokens=20
    )
    return response.choices[0].message.content.strip()

# Display chunks with LLM-generated topics and optionally save to file and prepare for vector DB
def display_chunks(clusters, save_to_file=False):
    documents = []
    for cluster_id, cluster_sentences in clusters.items():
        cluster_text = " ".join(cluster_sentences)
        topic = get_topic_summary(cluster_text)
        
        output = f"\nChunk {cluster_id + 1} - Topic: {topic}\n"
        output += "-" * 80 + "\n"
        for sentence in cluster_sentences:
            output += f" - {sentence}\n"
        output += "\n"
        
        print(output)  # Print to console
        
        if save_to_file:
            with open('chunk_topics.txt', 'a') as f:
                f.write(output)  # Write to file
                
        # Prepare document for vector DB
        doc = {
            "id": f"chunk_{cluster_id + 1}", # Add 1 to match chunk numbering
            "content": cluster_text,
            "metadata": {
                "topic": topic,
                "chunk_id": cluster_id
            }
        }
        documents.append(doc)
    
    return documents  # Return documents ready for Chroma ingestion

# Call with save_to_file=True to save output
documents = display_chunks(clusters, save_to_file=False)


Chunk 2 - Topic: 'Sustainable Farming Incentive Scheme Guidelines'
--------------------------------------------------------------------------------
 - This is an action in the Sustainable Farming Incentive (SFI) scheme: expanded offer for 2024
 - You must read the SFI scheme information to understand the scheme rules and how to apply
 - Read section 6 ‘Eligible land in other funding schemes’ for more information



Chunk 3 - Topic: Land Management and Conservation Policy
--------------------------------------------------------------------------------
 - 5 years £215 per hectare (ha) per year This action’s aim is that there’s a well-managed, intact grass sward growing over the historic or archaeological feature throughout the year, with minimal scrub cover and bare ground
 - The purpose of this is to: You can do this action on land located above and below the moorland line that’s: Total or part of the available area in a land parcel
 - You must manage the area containing the historic o

In [25]:
documents

[{'id': 'chunk_2',
  'content': 'This is an action in the Sustainable Farming Incentive (SFI) scheme: expanded offer for 2024 You must read the SFI scheme information to understand the scheme rules and how to apply Read section 6 ‘Eligible land in other funding schemes’ for more information',
  'metadata': {'topic': "'Sustainable Farming Incentive Scheme Guidelines'",
   'chunk_id': np.int64(1)}},
 {'id': 'chunk_3',
  'content': '5 years £215 per hectare (ha) per year This action’s aim is that there’s a well-managed, intact grass sward growing over the historic or archaeological feature throughout the year, with minimal scrub cover and bare ground The purpose of this is to: You can do this action on land located above and below the moorland line that’s: Total or part of the available area in a land parcel You must manage the area containing the historic or archaeological feature in a way that can reasonably be expected to achieve this action’s aim This includes: You must not: Before yo