In [None]:

import os
import torch
import requests
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
from docling.document_converter import DocumentConverter
load_dotenv() 


# PDFs to benchmark; the first entry is also the document the later cells refer to.
files = ["Hotel Marketing.pdf"]
pdf_filename = files[0]

# Testing: markdown extraction with docling.
# Report whether PyTorch can see a CUDA device before the conversion runs.
print(torch.cuda.is_available())
 

def write_file(filename: str, content: str):
    """Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file."""
    with open(filename, mode="w", encoding="utf-8") as handle:
        handle.write(content)

 
# timedelta is needed as the start value for sum() below; the imports at the top of
# this cell only bring in `datetime`, so the averaging line raised NameError.
from datetime import timedelta

# Number of timed conversion runs per PDF.
attempt = 5

FOLDER = "output(docling)"
Path(FOLDER).mkdir(parents=True, exist_ok=True)

# One (filename, per-run durations, mean duration) tuple per PDF.
table_data = []

for filename in files:
    filepath = f"pdf/{filename}"

    time_taken_list = []
    for _ in range(attempt):
        start_time = datetime.now()

        # The converter is built inside the timed region, so every measurement
        # covers converter construction as well as conversion and export.
        converter = DocumentConverter()
        result = converter.convert(filepath)
        markdown_content = result.document.export_to_markdown()

        end_time = datetime.now()
        time_taken = end_time - start_time
        time_taken_list.append(time_taken)

        # Written after the clock stops, so file I/O is not part of the timing.
        md_filename = filename.replace(".pdf", ".md")
        write_file(f"{FOLDER}/{md_filename}", markdown_content)

    # sum() defaults to an int start value, which cannot be added to a timedelta.
    avg_time_taken = sum(time_taken_list, timedelta()) / len(time_taken_list)
    table_data.append((filename, time_taken_list, avg_time_taken))

 


In [None]:
# Render the benchmark results as a Markdown table: one row per PDF, one column per
# timed run, plus the average.
# NOTE(review): get_time_str is not defined in any visible cell; it must already
# exist in the kernel for this cell to run. The header is fixed at five run columns.
table_lines = [
    "| File Name                                    | Time Taken (1st) | Time Taken (2nd) | Time Taken (3rd) | Time Taken (4th) | Time Taken (5th) | Average Time Taken |\n",
    "|----------------------------------------------|------------------|------------------|------------------|------------------|------------------|--------------------|\n",
]

for file, time_taken_list, avg_time_taken in table_data:
    time_taken_str = " | ".join(get_time_str(time) for time in time_taken_list)
    table_lines.append(f"| {file} | {time_taken_str} | {get_time_str(avg_time_taken)} |\n")

markdown_table = "".join(table_lines)
print(markdown_table)

write_file(f"{FOLDER}/time_taken_summary.md", markdown_table)

Parsing hasn't finished yet. Please try later


In [None]:
import os
import boto3
import json
import torch 
import openai
from igraph import Graph, plot
from typing import Tuple, Optional, Any
import nltk
from nltk.tokenize import sent_tokenize
from dotenv import load_dotenv
 
# Load settings from a local .env file into the process environment.
load_dotenv()

# Fetch the punkt_tab NLTK package (sent_tokenize is used by the helpers below).
nltk.download("punkt_tab")

if torch.cuda.is_available():
    print("Using GPU with PyTorch")

# Optional lookups: each of these is None when the variable is unset.
region_name, aws_access_key_id, aws_secret_access_key = (
    os.environ.get(variable)
    for variable in ("AWS_REGION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
)
bedrock_model = os.environ.get("BEDROCK_MODEL_NAME")

# Required settings: a missing variable raises KeyError here.
modelId = os.environ["BEDROCK_MODEL_NAME"]
LLM_BASE_URL = os.environ["LLM_BASE_URL"]
LLM_MODEL = os.environ["LLM_MODEL"]

# Bedrock runtime client used by the Bedrock-backed helpers in this notebook.
client = boto3.client(
    "bedrock-runtime",
    region_name=os.environ["AWS_REGION"],
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)

 
# System prompt for relationship extraction. The example "source" values must have
# exactly the form of the sentence keys built by the callers ("sentence_<n>"),
# because those values are later used verbatim as dictionary keys; the second
# example previously ended in a stray period ("sentence_2.").
system_message = """You are a text network analyst working on a project to build a knowledge graph from the given text.
The text will be passed as a dictionary where the key is the sentence number and the value is the sentence content.
Identify at least 10 entities and 10 relationships from the given text. Must include PERSON, TECH, ORG, TEAM, and other relevant entities.
Do not include dates and numbers. Ensure that all entities are in their lemma and lower case forms.
The source parameter should indicate the sentence number from which the relationship is extracted.
Provide the output strictly in the following JSON format with **only the JSON object** (no explanation or extra text):
   {
    "relationships": [
        {
            "subject": "Entity1",
            "subjectType": "Person",
            "predicate": "works_for",
            "object": "Entity2",
            "objectType": "Organization",
            "source": "sentence_1"
        },
        {
            "subject": "Policy1",
            "subjectType": "Policy",
            "predicate": "focuses_on",
            "object": "Objective1",
            "objectType": "Location",
            "source": "sentence_2"
        },
        ...
    ]
}
"""

def save_txt_file(filename:str, text:str):
    """Write ``text`` to ``filename`` as UTF-8, overwriting any existing file.

    The encoding is explicit so the result does not depend on the platform's
    default locale encoding, matching ``write_file`` earlier in this notebook.

    Args:
        filename: Destination path.
        text: Content to write.
    """
    with open(filename, mode="w", encoding="utf-8") as f:
        f.write(text)

def save_json_file(filename:str, data:dict):
    """Serialize ``data`` as JSON with a four-space indent and write it to ``filename``."""
    with open(filename, mode="w") as handle:
        json.dump(data, handle, indent=4)
 

def generate_topic_with_deepseek(text:str): 
    """Ask the chat-completions endpoint at ``{LLM_BASE_URL}/v1`` for a short topic phrase.

    A single user message is sent to model ``LLM_MODEL`` without streaming, and the
    content of the first choice is returned as-is (no post-processing is applied).
    """
    llm_client = openai.Client(base_url=f"{LLM_BASE_URL}/v1", api_key="ollama")

    user_message = f"""
        Summarize the main topic of this paragarph: {text} in one short phrase, not exceeding ten words.
        For example: 'Guidance on implementing Artificial Intelligence (AI) within a government organization.'
    """

    completion = llm_client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": user_message}],
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def generate_topic_with_bedrock(text:str): 
    """Ask the Bedrock model ``modelId`` for a short topic phrase describing ``text``.

    The input is echoed to stdout, then a single user message is sent through the
    module-level ``client`` and the text of the first content block is returned.
    """
    print("Input text: ", text)

    user_message = f"""
        Summarize the main topic of this paragarph: {text} in one short phrase, not exceeding ten words.
        For example: 'Guidance on implementing Artificial Intelligence (AI) within a government organization.'
    """

    reply = client.converse(
        modelId=modelId,
        messages=[{"role": "user", "content": [{"text": user_message}]}],
        inferenceConfig={"maxTokens": 512, "temperature": 0.5, "topP": 0.9},
    )

    first_block = reply["output"]["message"]["content"][0]
    return first_block["text"]


def get_json_text(text: str) -> str:
    """Extract a best-effort JSON object string from a model reply.

    The reply is cut to start at its first ``{``. The text is then scanned while
    tracking string literals (including backslash escapes) and the stack of open
    ``{`` / ``[``:

    * If the first object closes, everything after it is dropped (for example a
      trailing Markdown fence or commentary).
    * If the text ends early (truncated reply), an unterminated string is closed
      with ``"`` and every still-open bracket is closed in the right order.

    Braces, brackets and quotes that appear inside string values do not affect
    the balancing.

    Args:
        text: Raw reply text.

    Returns:
        The candidate JSON text. When ``text`` contains no ``{`` it is returned
        unchanged, so the caller's ``json.loads`` reports the failure as a
        ``json.JSONDecodeError`` instead of this function raising ``ValueError``.
        The result is not guaranteed to be valid JSON.
    """
    start = text.find("{")
    if start == -1:
        return text
    text = text[start:]

    closer_for = {"{": "}", "[": "]"}
    pending_closers = []  # closers still owed, innermost last
    in_string = False
    escaped = False

    for pos, char in enumerate(text):
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char in closer_for:
            pending_closers.append(closer_for[char])
        elif char in "}]":
            # A closer that does not match the innermost opener is ignored here;
            # json.loads will reject the text later.
            if pending_closers and pending_closers[-1] == char:
                pending_closers.pop()
            if not pending_closers:
                # The top-level object is complete: discard anything after it.
                return text[: pos + 1]

    # Reached the end with structures still open: the reply was truncated.
    if in_string:
        text += '"'
    return text + "".join(reversed(pending_closers))


def generate_relationships_deepseek(text:str, chunk_size:int = 2400, max_tokens:int = 1000):
    """Extract relationships from ``text`` through the endpoint at ``LLM_BASE_URL``.

    The text is split into sentences keyed ``sentence_<n>`` (1-based position in
    the tokenizer output). Sentences are packed into chunks whose combined sentence
    length stays within ``chunk_size`` characters, and each chunk is sent as a JSON
    dictionary together with ``system_message``.

    Args:
        text: Source text.
        chunk_size: Maximum summed sentence length (characters) per request. A
            single sentence longer than this is still sent, alone in its chunk.
        max_tokens: ``max_tokens`` passed to every completion request.

    Returns:
        A flat list of relationship dictionaries gathered from all chunks, the same
        shape ``generate_relationships_chunk`` produces and the graph builder
        iterates over. Chunks whose reply cannot be parsed are skipped.
    """
    client = openai.Client(
        base_url=f"{LLM_BASE_URL}/v1",
        api_key="ollama"
    )

    sentence_dict = {}
    for i, sentence in enumerate(sent_tokenize(text)):
        stripped = sentence.strip()
        if stripped:
            sentence_dict[f"sentence_{i + 1}"] = stripped

    user_messages = []
    chunk = {}
    current_chunk_len = 0
    for key, sentence in sentence_dict.items():
        sentence_len = len(sentence)
        if current_chunk_len + sentence_len <= chunk_size:
            chunk[key] = sentence
            current_chunk_len += sentence_len
        else:
            # The chunk is still empty when the very first sentence is already
            # longer than chunk_size; do not send an empty "{}" request.
            if chunk:
                user_messages.append(json.dumps(chunk))
            chunk = {key: sentence}
            current_chunk_len = sentence_len

    if chunk:
        user_messages.append(json.dumps(chunk))

    relationships = []
    fails = []  # unparsable replies, kept locally for debugging
    total_chunks = len(user_messages)

    for indx, message in enumerate(user_messages):
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": message},
            ],
            stream=False,
            max_tokens=max_tokens
        )

        response_text = response.choices[0].message.content
        if not response_text:
            continue

        try:
            response_text = get_json_text(response_text)
            response_json = json.loads(response_text)
            # extend (not append) so the result stays a flat list of dicts.
            relationships.extend(response_json["relationships"])
        except (ValueError, KeyError):
            # ValueError covers json.JSONDecodeError; KeyError covers a reply that
            # parsed but has no "relationships" key. Either way skip this chunk.
            print("Error occurred: Failed to decode JSON response")
            fails.append(response_text) # for debugging
            continue

        print(f"Successfuly prossed {indx + 1}/{total_chunks}...")

    return relationships

def generate_relationships_chunk_batch(text: str, chunk_size: int = 2400, max_tokens:int = 1000) -> list[dict[str, Any]]:
    """
    Extract entities and relationships from the provided text with one Bedrock request.

    The text is split into sentences, which are concatenated into chunks of at most
    2000 characters; all chunks are sent as separate text blocks of a single user
    message, with ``system_message`` as the system prompt.

    Reference: https://bluetickconsultants.medium.com/dual-approaches-to-building-knowledge-graphs-traditional-techniques-or-llms-400fee0f5ac9

    Args:
        text: Source text.
        chunk_size: Currently unused; the chunk limit is the local
            ``sentence_token_limit`` (2000 characters).
        max_tokens: ``maxTokens`` for the request.

    Returns:
        The list under the reply's ``"relationships"`` key. An empty list is
        returned when ``text`` yields no content to send. If the reply is empty or
        cannot be parsed, the reply text (a ``str``) is returned instead, so callers
        must be prepared for that despite the annotation.
    """

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    current_chunk = ""
    user_messages = []

    # Limit (in characters) per text block, to keep the response complete
    sentence_token_limit = 2000
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= sentence_token_limit:
            current_chunk += " " + sentence
        else:
            # Skip blank chunks (e.g. when the first sentence alone exceeds the
            # limit) rather than sending an empty text block.
            if current_chunk.strip():
                user_messages.append({"text": current_chunk.strip()})
            current_chunk = sentence

    if current_chunk.strip():
        user_messages.append({"text": current_chunk.strip()})

    if not user_messages:
        # Nothing to analyse: avoid a request with an empty content list.
        return []

    response = client.converse(
        modelId=modelId,
        messages=[{"role": "user", "content": user_messages}],
        inferenceConfig={
            "maxTokens": max_tokens,
            "temperature": 0.5,
            "topP": 0.9,
        },
        system= [{"text": system_message}]
    )
    response_text = response["output"]["message"]["content"][0]["text"]

    if response_text:
        try:
            response_text = get_json_text(response_text)
            response_json = json.loads(response_text)
            return response_json["relationships"]
        except (ValueError, KeyError):
            # ValueError covers json.JSONDecodeError; KeyError covers a parsed
            # reply without a "relationships" key.
            print("Error occurred: Failed to decode JSON response")

    return response_text

def generate_relationships_chunk(text: str, chunk_size: int = 3000, max_tokens:int = 2000) ->tuple[list[dict[str, Any]], list[str]]:
    """
    Extract entities and relationships from the provided text, one Bedrock request per chunk.

    The text is split into sentences keyed ``sentence_<n>`` (1-based position in the
    tokenizer output). Sentences are packed into chunks whose combined sentence
    length stays within ``chunk_size`` characters; each chunk is sent as a JSON
    dictionary with ``system_message`` as the system prompt.

    Reference: https://bluetickconsultants.medium.com/dual-approaches-to-building-knowledge-graphs-traditional-techniques-or-llms-400fee0f5ac9

    Args:
        text: Source text.
        chunk_size: Maximum summed sentence length (characters) per request. A
            single sentence longer than this is still sent, alone in its chunk.
        max_tokens: ``maxTokens`` for every request.

    Returns:
        ``(relationships, fails)``: the flat list of relationship dictionaries from
        all chunks that parsed, and the reply texts of the chunks that did not.
    """

    # Tokenize the text into sentences
    sentence_dict = {}
    for i, sentence in enumerate(sent_tokenize(text)):
        stripped = sentence.strip()
        if stripped:
            sentence_dict[f"sentence_{i + 1}"] = stripped

    user_messages = []
    chunk = {}
    current_chunk_len = 0
    for key, sentence in sentence_dict.items():
        sentence_len = len(sentence)
        if current_chunk_len + sentence_len <= chunk_size:
            chunk[key] = sentence
            current_chunk_len += sentence_len
        else:
            # The chunk is still empty when the very first sentence is already
            # longer than chunk_size; do not send an empty "{}" request.
            if chunk:
                user_messages.append({"text": json.dumps(chunk)})
            chunk = {key: sentence}
            current_chunk_len = sentence_len

    if chunk:
        user_messages.append({"text": json.dumps(chunk)})

    relationships = []
    fails = []
    total_chunks = len(user_messages)

    for indx, message in enumerate(user_messages):
        response = client.converse(
            modelId=modelId,
            messages=[{"role": "user", "content": [message]}],
            inferenceConfig={
                "maxTokens": max_tokens,
                "temperature": 0.5,
                "topP": 0.9,
            },
            system= [{"text": system_message}]
        )
        response_text = response["output"]["message"]["content"][0]["text"]
        if not response_text:
            continue

        try:
            response_text = get_json_text(response_text)
            response_json = json.loads(response_text)
            relationships.extend(response_json["relationships"])
        except (ValueError, KeyError):
            # ValueError covers json.JSONDecodeError; KeyError covers a reply that
            # parsed but has no "relationships" key. Record the reply and keep
            # going so one bad chunk does not discard the others.
            print("Error occurred: Failed to decode JSON response")
            fails.append(response_text)
            continue

        print(f"Chunk {indx + 1}/{total_chunks} processed successfully")

    return relationships, fails


def generate_graph_with_community_detection_from_json(
    data: list[dict[str, Any]],
    size_ratio: int = 1,
    min_size: int = 10,
    max_size: int = 100,
    color_range=None,
    hide_edge_label: bool = False,
    max_clusters: Optional[int] = None,
) -> Tuple[Graph, str, dict]:
    """
    Build an undirected igraph graph from relationship records, detect communities
    on it, and colour/size the vertices accordingly.

    Parameters:
    data (list): relationship dictionaries; each must provide the keys
        'subject', 'predicate', 'object' and 'source'.
    size_ratio (int): Multiplier applied to every vertex size after clamping.
    min_size (int): Minimum vertex size (before size_ratio is applied).
    max_size (int): Maximum vertex size (before size_ratio is applied).
    color_range (list, optional): Custom colors for communities; a built-in
        palette is used when this is None or empty. Colors repeat when there are
        more communities than colors.
    hide_edge_label (bool): When True, no 'label' attribute is set on the edges.
    max_clusters (Optional[int]): Keep only this many communities, largest (by
        vertex count) first. If None, show all clusters.

    Returns:
    Tuple[Graph, str, dict]:
        - the final graph, with vertex attributes 'name', 'color', 'size', edge
          attributes 'weight', 'color' (and 'label' unless hidden) and graph
          attribute 'title';
        - the name of the vertex with the largest accumulated weight. This is
          computed over all entities, so with max_clusters set it is not
          guaranteed to be a vertex of the returned graph;
        - a dict keyed 'cluster_<n>' whose values hold 'edges' (readable edge
          strings), 'sentences' (the 'source' values of those edges, not sentence
          text), 'color' and 'topic' (always an empty string here).
    """

    entities = set()
    relationships = set()
    
    
    # Sets drop exact duplicate (subject, predicate, object, source) records. Their
    # iteration order is not guaranteed, so vertex/edge order may differ between runs.
    for rel in data:
        relationships.add((rel['subject'], rel['predicate'], rel['object'], rel['source']))
        entities.add(rel['subject'])
        entities.add(rel['object'])
    relationships = list(relationships)
    entities = list(entities)
    
    vertex_sizes = {vertex: 0 for vertex in entities}  # accumulated edge weight per vertex
    edge_count = {}        # occurrences seen so far per ordered (subject, object) pair
    weighted_edges = []    # (subject, object, predicate, weight) per relationship
    edge_sentences = {}    # (subject, object, predicate) -> list of 'source' values

    # Working graph used only for community detection; entity strings become the
    # vertices' "name" attribute.
    temp_graph = Graph(directed=False)
    temp_graph.add_vertices(entities)
    
    # Process the relationships and calculate weights
    for subject, verb, obj, source in relationships:
        # The key is the ordered pair, so (a, b) and (b, a) are counted separately
        # even though the graph itself is undirected.
        edge_count[(subject, obj)] = edge_count.get((subject, obj), 0) + 1
        
        # The weight is the running count at this point: the n-th relationship
        # between the same ordered pair gets weight n.
        weight = edge_count[(subject, obj)]
        weighted_edges.append((subject, obj, verb, weight))
        
        vertex_sizes[subject] += weight
        vertex_sizes[obj] += weight
        
        if (subject, obj, verb) not in edge_sentences:
            edge_sentences[(subject, obj, verb)] = []
        edge_sentences[(subject, obj, verb)].append(source)
    
    # Add edges to the graph (one edge per relationship, so parallel edges can occur)
    temp_graph.add_edges([(subject, obj) for subject, obj, _, _ in weighted_edges])
    temp_graph.es['weight'] = [weight for _, _, _, weight in weighted_edges]
    

    # Weighted community detection with igraph's multilevel method.
    communities = temp_graph.community_multilevel(weights=temp_graph.es['weight'])

    if max_clusters is not None:
        # Keep the largest communities; each community is a list of vertex indices
        # into temp_graph.
        sorted_communities = sorted(communities, key=len, reverse=True)[:max_clusters]
        
        filtered_nodes = set()
        for community in sorted_communities:
            filtered_nodes.update(temp_graph.vs[node]["name"] for node in community)

        # Only edges with both endpoints in a kept community survive.
        filtered_edges = [
            (subject, obj, label, weight)
            for subject, obj, label, weight in weighted_edges
            if subject in filtered_nodes and obj in filtered_nodes
        ]
        final_vertices = list(filtered_nodes)
        communities = sorted_communities
    else:
        filtered_edges = weighted_edges
        final_vertices = entities
    

    # Graph that is actually returned, rebuilt from the kept vertices and edges.
    final_graph = Graph(directed=False)
    final_graph.add_vertices(final_vertices)
    
    labels = []
    weights = []
    for subject, obj, label, weight in filtered_edges:
        final_graph.add_edge(subject, obj)
        labels.append(label)
        weights.append(weight)
        
    if not hide_edge_label:
        final_graph.es['label'] = labels
    final_graph.es['weight'] = weights

    # Assign colors to communities
    colors = color_range or [
        "green", "cyan", "orange", "purple", "magenta", "yellow",
        "lime", "teal", "pink", "gold", "blue", "red", "maroon", "olive"
    ]
    
    cluster_data = {}     # 'cluster_<n>' -> {"edges", "sentences", "color"}
    community_map = {}    # vertex name -> community color
    
    for i, community in enumerate(communities):
        color = colors[i % len(colors)]  # palette wraps around when exhausted
        cluster_name = f"cluster_{i + 1}"
        cluster_data[cluster_name] = {"edges": [], "sentences": [], "color": color}

        for node in community:
            community_map[temp_graph.vs[node]["name"]] = color
            node_name = temp_graph.vs[node]["name"]
            # Collect every kept edge touching this vertex. An edge whose endpoints
            # sit in two different communities is therefore listed under both.
            for edge in filtered_edges:
                if edge[0] == node_name or edge[1] == node_name:
                    edge_str = f"({edge[0]}) -> {edge[2]} -> ({edge[1]})"
                    if edge_str not in cluster_data[cluster_name]["edges"]:
                        sentence = edge_sentences.get((edge[0], edge[1], edge[2]), [])
                        cluster_data[cluster_name]["edges"].append(edge_str)
                        cluster_data[cluster_name]["sentences"].extend(sentence)
    
    # Set color and size properties for final graph
    final_graph.vs["color"] = [community_map.get(v["name"], "gray") for v in final_graph.vs]
    # Each edge takes the color of its subject's community.
    final_graph.es['color'] = [community_map.get(final_graph.vs.find(name=subject)["name"], "gray") for subject, _, _, _ in filtered_edges]
    
    # Compute the most influential vertex based on vertex sizes
    # NOTE(review): max() over an empty dict raises ValueError, so empty `data`
    # cannot get past this line even though the next line guards for it.
    most_influential_vertex = max(vertex_sizes, key=vertex_sizes.get)
    max_count = max(vertex_sizes.values()) if vertex_sizes else 1
    # Scale relative to the largest vertex, clamp to [min_size, max_size], then
    # apply size_ratio.
    scaled_sizes = {
        vertex: max(min_size, min(int((size / max_count) * max_size), max_size)) * size_ratio
        for vertex, size in vertex_sizes.items()
    }
    final_graph.vs["size"] = [scaled_sizes.get(v["name"], min_size) for v in final_graph.vs]
    final_graph["title"] = "Entity Relationship Graph"
    
    # NOTE(review): the loop variable `data` shadows the `data` parameter from here
    # on, and combined_text is computed but not used.
    for cluster_name, data in cluster_data.items():
        combined_text = " ".join(data["sentences"])
        topic = ""  # Placeholder for topic generation logic
        data["topic"] = topic

    return final_graph, most_influential_vertex, cluster_data


def visualize_graph(graph: Graph, influential_vertex: str, layout_algorithm:str = "fruchterman_reingold", output_file="graph_with_center.png", bbox:int= 1200):
    """
    Render the graph to an image file using the requested layout algorithm.

    Args:
        graph (Graph): iGraph graph object carrying "size", "color" and "name" vertex
            attributes, "weight" and "color" edge attributes, and a "title" graph attribute.
        influential_vertex (str): The name of the most influential vertex. Currently
            unused: no re-centering of the layout on this vertex is performed.
        layout_algorithm (str): Layout name passed to ``graph.layout``.
        output_file (str): Path to save the visualization.
        bbox (int): Width and height of the square drawing area.
    """
    stretch = 1.5  # every layout coordinate is multiplied by this factor

    positions = graph.layout(layout_algorithm)
    scaled_layout = [[coord * stretch for coord in point] for point in positions]

    plot_options = {
        "layout": scaled_layout,
        "vertex_size": graph.vs["size"],
        "vertex_color": graph.vs["color"],
        "vertex_label": graph.vs["name"],
        "edge_width": list(graph.es["weight"]),
        "edge_color": graph.es['color'],
        "bbox": (bbox, bbox),
        "margin": 80,
        "target": output_file,
        "main": graph["title"],
    }
    plot(graph, **plot_options)

    print(f"Graph saved as {output_file}")


Using GPU with PyTorch


[nltk_data] Downloading package punkt_tab to /home/hm3/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# reuse local extracted text file

import glob
import os

def read_file(filepath: str) -> str:
    """Return the full contents of ``filepath`` decoded as UTF-8.

    The encoding is explicit so the result does not depend on the platform's
    default locale encoding, matching ``write_file`` earlier in this notebook.
    """
    with open(filepath, mode="r", encoding="utf-8") as f:
        return f.read()

# Locate the previously extracted "(org)" text file for the current PDF.
txt_folder = pdf_filename.replace(".pdf", "")

file_pattern = f"All_clusters/{txt_folder}/*(org).txt"
matching_files = glob.glob(file_pattern)

if not matching_files:
    raise FileNotFoundError(f"No file matching the pattern {file_pattern} found.")
if len(matching_files) > 1:
    # Previously reported as "No file matching ...", which hid the real problem.
    raise ValueError(
        f"Expected exactly one file matching the pattern {file_pattern}, "
        f"found {len(matching_files)}: {sorted(matching_files)}"
    )

print(f"Found {matching_files[0]}")
text = read_file(matching_files[0])

Found All_clusters/public-benefits-and-ai/public-benefits-and-ai(Gliner + Spacy)(org).txt


In [None]:
# Larger chunks mean fewer model calls (faster processing) but lower extraction accuracy.
relationships, fails = generate_relationships_chunk(text, chunk_size=3000, max_tokens=1500)

In [None]:
# Dump the raw relationships for debugging. The stem is computed outside the
# f-string because reusing double quotes inside a double-quoted f-string is a
# SyntaxError on Python versions before 3.12.
debug_stem = pdf_filename.replace(".pdf", "")
save_txt_file(f"{debug_stem}(debug).py", str(relationships))

In [191]:
# Build the community-coloured graph, keeping only the largest clusters.
max_clusters = 5
graph, most_influential_node, topic_clusters = generate_graph_with_community_detection_from_json(
    relationships,
    max_clusters=max_clusters,
    min_size=20,
    max_size=100,
    size_ratio=1,
    hide_edge_label=True,
)

layout_algorithms = ["fruchterman_reingold"]

# Output folder: "<pdf stem>(LLM-based)".
folder_name = pdf_filename.replace(".pdf", "") + "(LLM-based)"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

clusters = "All Clusters" if max_clusters is None else f"Top {max_clusters} Clusters"

# The output path does not include the layout name, so each layout overwrites the
# previous image; only one layout is listed above.
for layout_algorithm in layout_algorithms:
    output_file = f"{folder_name}/knowledge-graph ({clusters}).png"
    visualize_graph(
        graph,
        most_influential_node,
        layout_algorithm=layout_algorithm,
        output_file=output_file,
        bbox=7500,
    )

Graph saved as public-benefits-and-ai(LLM-based)/knowledge-graph (Top 5 Clusters).png


In [None]:
"""Run to save analysis outcome"""
 
# Upper bound on topic-generation attempts per cluster; the retry loop was
# previously unbounded and could spin forever on repeated timeouts or empty replies.
MAX_TOPIC_ATTEMPTS = 3

entities = set()
entities_types = set()
readable_edges = []


# Rebuild the same "sentence_<n>" -> sentence mapping that the extraction step used
# as input, so relationship sources can be turned back into text.
sentences = sent_tokenize(text)
user_messages = []
sentence_dict = {}


for i, sentence in enumerate(sentences):
    if len(sentence.strip()) > 0:
        sentence_dict[f"sentence_{i + 1}"] = sentence.strip()

# Debug dump of the sentence mapping.
save_txt_file("urgent.py", str(sentence_dict))

for edge in relationships:
    subj = edge["subject"]
    obj = edge["object"]
    entities.add(subj)
    entities.add(obj)
    entities_types.add(edge["subjectType"])
    entities_types.add(edge["objectType"])
    predicate = edge["predicate"]
    readable_edges.append(f"({subj})->{predicate}->({obj})")


topic_md = ""

# Sentence texts, used to recognise entries already resolved by an earlier run of
# this cell (the cluster dicts are updated in place below).
known_sentences = set(sentence_dict.values())

for cluster_name, cluster in topic_clusters.items():
    resolved_sentences = []
    for key in cluster["sentences"]:
        if key in sentence_dict:
            resolved_sentences.append(sentence_dict[key])
        elif key in known_sentences:
            # Already a sentence text: this cell was run before on the same clusters.
            resolved_sentences.append(key)
        else:
            # The model returned a source that is not one of the sentence keys.
            print(f"Skipping unknown sentence reference {key!r} in {cluster_name}")
    # replace the sentences' keys with the actual sentences
    cluster["sentences"] = resolved_sentences

    # remove duplicates to generate a topic; dict.fromkeys keeps first-seen order so
    # the prompt text is the same on every run (a set would shuffle it)
    unique_sentences = list(dict.fromkeys(resolved_sentences))
    combined_text = ". ".join(unique_sentences)

    topic = ""
    if combined_text:
        print(f"Generating topic for {cluster_name}")
        for attempt_no in range(1, MAX_TOPIC_ATTEMPTS + 1):
            try:
                topic = generate_topic_with_bedrock(combined_text)
            except TimeoutError:
                print(f"Timed out generating topic for {cluster_name} (attempt {attempt_no}/{MAX_TOPIC_ATTEMPTS})")
                continue
            if topic:
                break

        if topic:
            topic_md += f"# {topic}\n"
            topic_md += f"### {cluster_name} ({cluster['color']})\n"
            topic_md += f"- {combined_text}"
            topic_md += "\n\n"
        else:
            print(f"No topic generated for {cluster_name} after {MAX_TOPIC_ATTEMPTS} attempts")
    cluster["topic"] = topic


metadata = {
    "vertices" : list(entities),
    "edges": readable_edges,
    "most_influential_node": most_influential_node,
    "clusters": {**topic_clusters},
    "captured_entities": list(entities_types)
}

save_txt_file(f"{folder_name}/{folder_name}(org).txt", text)
try:
    save_txt_file(f"{folder_name}/{folder_name}({clusters}' topics).md", topic_md)
except Exception as exc:
    # some text are only included when working with neuralcoref
    # Still best effort, but report the failure instead of hiding it.
    print(f"Could not save the topics file: {exc}")

save_json_file(f"{folder_name}/{folder_name}-metadata ({clusters}).json", metadata)

print(f"All data are saved at {folder_name}")


In [2]:


# Smoke test of the topic helper on a short hand-written paragraph.
sample_text = (
    "This pull request implements a new caching mechanism for metadata retrieval in the application. "
    "The goal is to reduce database load and speed up response times for frequently accessed metadata."
)
topic = generate_topic_with_deepseek(sample_text)
print(topic)



<think>
Alright, I need to help this user by summarizing a paragarph into a single phrase. The original paragraph talks about a pull request introducing a new caching mechanism for metadata retrieval. The main goal is reduce database load and speed up response times.

Hmm, the user specifically asked for it in one short phrase, not exceeding ten words. They also gave an example: 'Guidance on implementing Artificial Intelligence (AI) within a government organization.' So I should follow that structure but instead use the relevant terms for caching and metadata.

Let me look at the key points again: new caching mechanism, reduces database load, speeds up response times, frequently accessed metadata. The focus is on performance improvements through caching. Maybe something like "Implementing a caching mechanism to reduce database load..." That captures the action and the benefit clearly.
</think>

Implementing a caching mechanism to reduce database load and speed up response times for fre

In [None]:
import re

response = topic

# Pull out the text between <think> and </think> (None when the reply has no such block).
think_match = re.compile(r'<think>(.*?)</think>', re.DOTALL).search(response)
think_section = None if think_match is None else think_match.group(1).strip()

# What remains once every <think>...</think> block is removed.
translated_result = re.compile(r'<think>.*?</think>', flags=re.DOTALL).sub('', response).strip()

# The think section is extracted but intentionally not printed:
# print("Think Section:")
# print(think_section)
print("\nTranslated Result:")
print(translated_result)