# This notebook uses the dataset we created to index the data and convert it into a knowledge graph

## Step 1: Load the JSON file containing the 24 profiles

In [1]:
import json

# Location of the JSON export that holds the profiles
filename = 'linkedin_profiles_24.json'

# Parse the complete file into Python objects
with open(filename, 'r', encoding='utf-8') as handle:
    data = json.load(handle)

# A list is previewed through its first 5 entries; any other JSON value
# (for example a dictionary) is shown in full
sample = data[:5] if isinstance(data, list) else data

# Pretty-print the preview with 4-space indentation
preview = json.dumps(sample, indent=4)
print(preview)


[
    {
        "Name": "Alexis Ryan",
        "Headline": "software developer at Ford motors",
        "About": "I am currently doing 6 months of internship at Ford. I am a enthusiast for development , be it web development or  game development. I have developed a 2d platformer game using Unity engine , 3d FPS game using Unreal engine and I along with my team of 3 created a virtual autonomous drone for a IEEE competition , all this in the duration of my B.Tech program .",
        "Articles": [],
        "Activities": [
            {
                "Title": "Hard-coded a leetCode question today with Aditya Rai which beats 100% of users \ud83d\ude02 \ud83d\ude02 \n\n#leetcode #leetcodechallenge"
            },
            {
                "Title": "Achieved the Google Digital Marketing & E-Commerce Professional Certificate!!"
            },
            {
                "Title": "Our 3 years of effort is now open-source\n\nThe course which has been built by mentors from Amazon , DeSha

## Step 2: Build rule-based entity–relation triplets using simple `re`-based logic (each node holds a chunk of text instead of a single word)

In [2]:
import json
import re
import os

# Read every profile from the source JSON export
with open("linkedin_profiles_24.json", encoding="utf-8") as source:
    profiles = json.loads(source.read())

# Ensure the destination directory is present before deriving the output path
output_folder = "rule based extraction"
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "rule based triplets.json")

def clean_text(text, fallback="Not provided"):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    Args:
        text: Value to normalise. Normally a string. ``None`` (what
            ``dict.get`` returns for a key that is present but JSON ``null``)
            yields ``fallback``; any other non-string value is converted with
            ``str`` before cleaning.
        fallback: Text returned when ``text`` is ``None``.

    Returns:
        The cleaned string.
    """
    # dict.get(key, default) only applies its default when the key is missing,
    # so a stored null still reaches this function and must be handled here.
    if text is None:
        return fallback
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'\s+', ' ', text).strip()

# Extraction rules for the list-valued profile sections. Each rule is
# (profile key, ((entry field, relation label), ...)); the order of the rules
# and of the pairs inside a rule is the order in which triplets are emitted.
TRIPLET_RULES = (
    ("Experiences", (
        ("Company", "worked at"),
        ("Job Title", "served as"),
        ("Description", "experience description"),
    )),
    ("Activities", (
        ("Title", "interacted on"),
    )),
    ("Articles", (
        ("Title", "published article"),
        ("Description", "article context"),
    )),
    ("Awards", (
        ("Award Organization", "awarded by"),
        ("Award Title", "award title"),
        ("Award Description", "award description"),
    )),
    ("Languages", (
        ("Language", "speaks"),
    )),
    ("Educations", (
        ("Institution", "studied at"),
        ("Degree", "obtained degree"),
        ("Field of Study", "field of study"),
    )),
    ("Certifications", (
        ("Issuer", "certified by"),
        ("Certification Title", "certification"),
    )),
    ("Volunteer", (
        ("Organization", "volunteered at"),
        ("Position", "volunteer position"),
        ("Description", "volunteer description"),
    )),
    ("Courses", (
        ("Course Name", "completed course"),
    )),
    ("Projects", (
        ("Project Title", "worked on project"),
        ("Description", "project description"),
    )),
)

# Initialize the list for storing triplets
triplets = []

for profile in profiles:
    name = profile.get("Name", "Unknown")

    # Headline and About are merged into a single "describes" triplet
    headline = clean_text(profile.get("Headline", "Not provided"))
    about = clean_text(profile.get("About", "Not provided"))
    triplets.append({
        "entity1": name,
        "relation": "describes",
        "entity2": f"{headline} and {about}"
    })

    # Every remaining section is a list of entries; each entry contributes one
    # triplet per (field, relation) pair of its rule.
    for section, field_relations in TRIPLET_RULES:
        # "or []" also covers a section that is present but stored as null
        for entry in profile.get(section) or []:
            # The list is built completely before extending, so an entry that
            # fails to clean never leaves a partial set of triplets behind.
            triplets.extend([
                {
                    "entity1": name,
                    "relation": relation,
                    "entity2": clean_text(entry.get(field, "Not provided"))
                }
                for field, relation in field_relations
            ])

# Save the rule-based entity triplets as JSON
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(triplets, outfile, indent=4)

print(f"Successfully created rule based entity triplets in JSON file: {output_file}")


Successfully created rule based entity triplets in JSON file: rule based extraction\rule based triplets.json


## Step 3: Convert the JSON into sentences, turning each segment of a profile's data into a segment-wise chunk of generated sentences

In [3]:
import json
import re
import os
import hashlib

# Read every profile from the source JSON export
with open("linkedin_profiles_24.json", encoding="utf-8") as source:
    profiles = json.loads(source.read())

# Ensure the destination directory is present before deriving the output path
output_folder = "segment wise chunks"
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "segments.json")

def clean_text(text, fallback="Not provided"):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    Args:
        text: Value to normalise. Normally a string. ``None`` (what
            ``dict.get`` returns for a key that is present but JSON ``null``)
            yields ``fallback``; any other non-string value is converted with
            ``str`` before cleaning.
        fallback: Text returned when ``text`` is ``None``.

    Returns:
        The cleaned string.
    """
    # dict.get(key, default) only applies its default when the key is missing,
    # so a stored null still reaches this function and must be handled here.
    if text is None:
        return fallback
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'\s+', ' ', text).strip()

def compute_checksum(text):
    """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# Sentence rules for the list-valued profile sections. Each rule is
# (segment name, profile key, {template placeholder: entry field}, template).
# Segments are produced in the order the rules are listed.
SEGMENT_RULES = (
    ("experiences", "Experiences",
     {"job_title": "Job Title", "company": "Company", "description": "Description"},
     "{name} worked at {company} served as {job_title} and has done {description}."),
    ("activities", "Activities",
     {"title": "Title"},
     "{name} interacted on post {title}."),
    ("articles", "Articles",
     {"title": "Title", "description": "Description"},
     "{name} published article {title} on the context {description}."),
    ("awards", "Awards",
     {"award_org": "Award Organization", "award_title": "Award Title",
      "award_desc": "Award Description"},
     "{name} got awarded by {award_org} for {award_title} with description {award_desc}."),
    ("languages", "Languages",
     {"language": "Language"},
     "{name} speaks {language}."),
    ("educations", "Educations",
     {"institution": "Institution", "degree": "Degree", "field": "Field of Study"},
     "{name} studied at {institution} for the degree {degree} in the field of {field}."),
    ("certifications", "Certifications",
     {"cert_title": "Certification Title", "issuer": "Issuer"},
     "{name} got certified by {issuer} in {cert_title}."),
    ("volunteer", "Volunteer",
     {"position": "Position", "organization": "Organization", "description": "Description"},
     "{name} volunteered at {organization}, served as {position}, with description {description}."),
    ("courses", "Courses",
     {"course_title": "Course Name"},
     "{name} completed the course {course_title}."),
    ("projects", "Projects",
     {"project_title": "Project Title", "description": "Description"},
     "{name} worked on project {project_title} with description {description}."),
)

all_chunks = []

for profile in profiles:
    name = profile.get("Name", "Unknown")

    # Maps segment name -> sentence text for this profile (insertion ordered)
    segments = {}

    # Headline and About always produce a segment, even when both are missing
    headline = clean_text(profile.get("Headline", "Not provided"))
    about = clean_text(profile.get("About", "Not provided"))
    segments["headline_about"] = f"{name} describes about himself: {headline} and {about}."

    # Every other section becomes one segment: one sentence per entry, joined
    # with single spaces. Sections without entries produce no segment.
    for segment_name, section, fields, template in SEGMENT_RULES:
        sentences = [
            template.format(
                name=name,
                **{
                    slot: clean_text(entry.get(field, "Not provided"))
                    for slot, field in fields.items()
                }
            )
            # "or []" also covers a section that is present but stored as null
            for entry in profile.get(section) or []
        ]
        if sentences:
            segments[segment_name] = " ".join(sentences)

    # One chunk per segment. The id is the checksum of profile name, segment
    # name and text joined with underscores, so it changes whenever any of the
    # three changes.
    for segment_name, text in segments.items():
        all_chunks.append({
            "id": compute_checksum(f"{name}_{segment_name}_{text}"),
            "profile_name": name,
            "segment": segment_name,
            "text": text
        })

# Save all the segment-wise chunks in JSON format
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(all_chunks, outfile, indent=4)

print(f"Successfully created segment-wise chunks in JSON file: {output_file}")


Successfully created segment-wise chunks in JSON file: segment wise chunks\segments.json


## Step 4: Create segment-wise custom prompts and save them for future use

In [4]:
import json
import os

# -------------------------------
# Define the PROMPTS dictionary exactly as provided
# -------------------------------
# NOTE(review): presumably the separator used when several values share one
# graph field — confirm against the code that consumes it.
GRAPH_FIELD_SEP = "<SEP>"

# Registry of prompt templates and prompt-related defaults, keyed by name
PROMPTS = {}

PROMPTS["DEFAULT_LANGUAGE"] = "English"
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]

# Entity types: the five generic types first, followed by types grouped by
# profile segment. (A generic-only list used to be assigned here and was
# overwritten immediately by this one, so only this assignment is kept.)
PROMPTS["DEFAULT_ENTITY_TYPES"] = [
    # Generic types
    "organization",
    "person",
    "geo",
    "event",
    "category",

    # Headline / about
    "headline_about", "self-description", "professional identity",

    # Experiences
    "experience", "job", "role", "employment", "company",

    # Activities
    "activity", "post", "update", "interaction", "social engagement",

    # Articles
    "article", "publication", "blog",

    # Awards
    "award", "recognition", "honor",

    # Languages
    "language", "communication", "multilingual",

    # Educations
    "education", "degree", "university", "academic",

    # Certifications
    "certification", "credential", "issuer",

    # Volunteer experience
    "volunteer", "non-profit", "service",

    # Courses
    "course", "training", "workshop",

    # Projects
    "project", "initiative", "collaboration", "innovation"
]



# Entity/relationship extraction prompt template. Placeholders appearing in
# the text: {language}, {entity_types}, {tuple_delimiter}, {record_delimiter},
# {completion_delimiter}, {examples} and {input_text}.
# NOTE(review): presumably rendered with str.format by the consuming code — confirm.
PROMPTS["entity_extraction"] = """-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
Use {language} as output language.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, use same language as input text. If English, capitalize the name.
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)

3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)

4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.

5. When finished, output {completion_delimiter}

######################
-Examples-
######################
{examples}

#############################
-Real Data-
#############################
Entity_types: {entity_types}
Text: {input_text}
#############################
Output:
"""

# Few-shot examples for the entity extraction prompt. Each example lists its
# entity types, an input text and the expected output. Within an output every
# record ends with {record_delimiter}; only the final "content_keywords"
# record ends with {completion_delimiter}. Example 2's last relationship
# record previously ended with {completion_delimiter}, which put the
# completion marker before the final record; it now uses {record_delimiter}
# like the other examples.
PROMPTS["entity_extraction_examples"] = [
    """Example 1:

Entity_types: [person, technology, mission, organization, location]
Text:
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.

Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”

The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.

It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
################
Output:
("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}
("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}
("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}
("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}
("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}
("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter}
("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
#############################""",
    """Example 2:

Entity_types: [person, technology, mission, organization, location]
Text:
They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve.

Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril.

Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly
#############
Output:
("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter}
("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter}
("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter}
("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){record_delimiter}
("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
#############################""",
    """Example 3:

Entity_types: [person, role, technology, organization, event, location, concept]
Text:
their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.

"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning."

Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."

Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.

The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation
#############
Output:
("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}
("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}
("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}
("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}
("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter}
("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter}
("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
#############################"""]
    
# Prompt asking for several descriptions of the same entity (or pair of
# entities) to be merged into one summary. Placeholders: {language},
# {entity_name} and {description_list}.
PROMPTS["summarize_entity_descriptions"] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
Make sure it is written in third person, and include the entity names so we the have full context.
Use {language} as output language.

#######
-Data-
Entities: {entity_name}
Description List: {description_list}
#######
Output:
"""

# Follow-up message asking for entities missed by the previous extraction.
# NOTE(review): the key is spelled "entiti_..."; keep it in sync with whatever
# code looks this prompt up.
PROMPTS["entiti_continue_extraction"] = """MANY entities were missed in the last extraction.  Add them below using the same format:
"""

# Follow-up question answered with YES or NO: are entities still missing?
PROMPTS["entiti_if_loop_extraction"] = """It appears some entities may have still been missed.  Answer YES | NO if there are still entities that need to be added.
"""

# Fixed reply text for the case where no answer can be given
PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question."

# Answer-generation prompt that works from data tables. Placeholders:
# {response_type} (target length and format) and {context_data} (the tables).
PROMPTS["rag_response"] = """---Role---

You are a helpful assistant responding to questions about data in the tables provided.

---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
If you don't know the answer, just say so. Do not make anything up.
Do not include information where the supporting evidence for it is not provided.

---Target response length and format---

{response_type}

---Data tables---

{context_data}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""

# Prompt asking for a JSON object with "high_level_keywords" and
# "low_level_keywords" extracted from a user query. Placeholders: {examples}
# and {query}.
PROMPTS["keywords_extraction"] = """---Role---

You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.

---Goal---

Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.

---Instructions---

- Output the keywords in JSON format.
- The JSON should have two keys:
  - "high_level_keywords" for overarching concepts or themes.
  - "low_level_keywords" for specific entities or details.

######################
-Examples-
######################
{examples}

#############################
-Real Data-
######################
Query: {query}
##############################
The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
Output:
"""

# Few-shot examples for the keyword extraction prompt: three queries, each
# followed by the expected JSON output.
# NOTE(review): these strings contain literal "{" and "}" (the JSON braces),
# so they would break if passed through str.format themselves — presumably
# they are only substituted as values into the {examples} placeholder; confirm.
PROMPTS["keywords_extraction_examples"] = [
    """Example 1:

Query: "How does international trade influence global economic stability?"
################
Output:
{
  "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
  "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
}
#############################""",
    """Example 2:

Query: "What are the environmental consequences of deforestation on biodiversity?"
################
Output:
{
  "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
  "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
}
#############################""",
    """Example 3:

Query: "What is the role of education in reducing poverty?"
################
Output:
{
  "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
  "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
}
#############################"""
]

# Answer-generation prompt template for questions over plain documents.
# Placeholders to be filled by the caller:
#   {response_type} - the target response length and format
#   {content_data}  - the documents the answer must be grounded in
# Note the placeholder here is {content_data}, whereas "rag_response" uses
# {context_data}; the two are not interchangeable when calling .format().
PROMPTS["naive_rag_response"] = """---Role---

You are a helpful assistant responding to questions about documents provided.

---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
If you don't know the answer, just say so. Do not make anything up.
Do not include information where the supporting evidence for it is not provided.

---Target response length and format---

{response_type}

---Documents---

{content_data}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""

# Prompt template asking the model to score, as a single number between 0 and 1,
# how similar two questions are and whether the answer to the second can be
# reused for the first.
# Placeholders to be filled by the caller:
#   {original_prompt} - inserted as "Question 1"
#   {cached_prompt}   - inserted as "Question 2"
PROMPTS["similarity_check"] = """Please analyze the similarity between these two questions:

Question 1: {original_prompt}
Question 2: {cached_prompt}

Please evaluate the following two points and provide a similarity score between 0 and 1 directly:
1. Whether these two questions are semantically similar
2. Whether the answer to Question 2 can be used to answer Question 1
Similarity score criteria:
0: Completely unrelated or answer cannot be reused, including but not limited to:
   - The questions have different topics
   - The locations mentioned in the questions are different
   - The times mentioned in the questions are different
   - The specific individuals mentioned in the questions are different
   - The specific events mentioned in the questions are different
   - The background information in the questions is different
   - The key conditions in the questions are different
1: Identical and answer can be directly reused
0.5: Partially related and answer needs modification to be used
Return only a number between 0-1, without any additional content.
"""

# -------------------------------
# Define segment-specific instructions variants
# -------------------------------
# Maps a chunk's "segment" value to an instruction paragraph. The code further
# down in this cell looks the paragraph up by segment name and concatenates it
# directly in front of the formatted entity-extraction prompt, which is why
# every paragraph ends with a trailing space. Segment names without an entry
# here get a generic fallback sentence at lookup time.
segment_instructions = {
    # Headline + "About" section of the profile
    "headline_about": (
        "This chunk represents the personal introduction of the profile, including the headline and about section. "
        "It typically contains information about the individual's current role, professional interests, and a brief summary about themselves. "
        "Extract all entities and relationships that help to characterize the individual's self-description and professional identity. "
    ),
    # Work experience entries
    "experiences": (
        "This chunk encapsulates the professional experiences of the individual. "
        "It includes details about past and current employment, job roles, companies, and descriptions of responsibilities or achievements. "
        "Extract entities such as company names, job titles, and any relevant descriptors of the work experience, along with their relationships. "
    ),
    # Posts and updates
    "activities": (
        "This chunk captures the individual's recent LinkedIn activities, such as posts and updates. "
        "It often reflects interactions, achievements, and social engagement. "
        "Extract entities and relationships that describe social interactions, professional networking, and activity highlights. "
    ),
    # Articles / publications
    "articles": (
        "This chunk includes information about articles or publications shared by the individual. "
        "It contains details such as article titles, descriptions, and thematic content. "
        "Extract entities and relationships that relate to content creation, subject matter expertise, and publication details. "
    ),
    # Awards and recognitions
    "awards": (
        "This chunk contains details about any awards or recognitions received by the individual. "
        "It typically includes information about awarding bodies, award titles, and descriptions of the achievement. "
        "Extract entities and relationships that capture the recognition, including who awarded the honor and for what merit. "
    ),
    # Spoken languages
    "languages": (
        "This chunk details the languages the individual is proficient in. "
        "It may also reflect cultural or regional associations. "
        "Extract entities and relationships that identify language skills and the related attributes of communication and cultural context. "
    ),
    # Education history
    "educations": (
        "This chunk provides information about the individual's educational background. "
        "It includes details such as institution names, degrees obtained, and fields of study. "
        "Extract entities and relationships that depict academic credentials and educational achievements. "
    ),
    # Certifications
    "certifications": (
        "This chunk describes any certifications the individual has acquired. "
        "It usually lists certification titles, issuing organizations, and sometimes descriptions of the certification process. "
        "Extract entities and relationships that detail the validation of professional skills and credentials through certification. "
    ),
    # Volunteer work
    "volunteer": (
        "This chunk contains information about the individual's volunteer experiences. "
        "It details roles, organizations, and descriptions of volunteer work. "
        "Extract entities and relationships that capture the nature of volunteer contributions and the organizations involved. "
    ),
    # Completed courses
    "courses": (
        "This chunk details any courses the individual has completed. "
        "It often includes course titles and may mention institutions or training programs. "
        "Extract entities and relationships that reveal continuing education, skill development, and course-related details. "
    ),
    # Projects
    "projects": (
        "This chunk contains information about projects the individual has worked on. "
        "It typically includes project titles and descriptions, along with any notable outcomes or methodologies used. "
        "Extract entities and relationships that represent project work, innovations, and collaborative efforts. "
    )
}

# -------------------------------
# Read the segment chunks from "segment wise chunks/segments.json"
# -------------------------------
segments_file = os.path.join("segment wise chunks", "segments.json")
with open(segments_file, "r", encoding="utf-8") as segments_in:
    chunks = json.load(segments_in)


# -------------------------------
# Turn every chunk into a record carrying its own customized prompt
# -------------------------------
def build_segment_prompt(chunk):
    """Return the chunk's metadata plus a prompt tailored to its segment type.

    The prompt is the segment-specific instruction paragraph (or a generic
    sentence when the segment has no dedicated variant) placed directly in
    front of the fully formatted entity-extraction template.
    """
    segment = chunk.get("segment", "default")
    instruction = segment_instructions.get(
        segment,
        "This segment requires extraction of entities and relationships from the text. ",
    )
    base_prompt = PROMPTS["entity_extraction"].format(
        language=PROMPTS["DEFAULT_LANGUAGE"],
        entity_types=", ".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
        examples="\n".join(PROMPTS["entity_extraction_examples"]),
        input_text=chunk.get("text", ""),
    )
    # Keep the original chunk fields alongside the prompt so later steps can
    # trace a response back to its source text.
    return {
        "id": chunk.get("id"),
        "profile_name": chunk.get("profile_name"),
        "segment": segment,
        "text": chunk.get("text"),
        "prompt": instruction + base_prompt,
    }


output_chunks = [build_segment_prompt(chunk) for chunk in chunks]

# -------------------------------
# Write the prompt-enriched chunks into the folder "segment prompts variants"
# -------------------------------
output_folder = "segment prompts variants"
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "segments_with_prompts_variants.json")
with open(output_file, "w", encoding="utf-8") as prompts_out:
    json.dump(output_chunks, prompts_out, indent=4)

print(f"Successfully created segment-wise prompt variants in JSON file: {output_file}")


Successfully created segment-wise prompt variants in JSON file: segment prompts variants\segments_with_prompts_variants.json


## Step 5 : Sending prompts with chunks to extract relations and entities

In [5]:
! pip install -U google-generativeai



In [1]:
# include token boost 

In [6]:
import json
import os
import google.generativeai as genai
import time

# Configure the Gemini API key from the environment ONLY.
# SECURITY: a literal key must never be embedded in the notebook. The key that
# was previously hard-coded here has been exposed and should be revoked and
# replaced in the Google AI console.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment before running this cell."
    )
genai.configure(api_key=api_key)

# Define the model to use
model = genai.GenerativeModel('gemini-1.5-flash')

# Pause (seconds) after every API attempt, successful or not, to stay under rate limits
REQUEST_DELAY_SECONDS = 3

# Paths
input_file = os.path.join("segment prompts variants", "segments_with_prompts_variants.json")
output_folder = "gemini_responses"
log_file = os.path.join(output_folder, "retry_log.json")

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load chunks (each with "id", "profile_name", "segment", "text", "prompt")
with open(input_file, "r", encoding="utf-8") as infile:
    chunks = json.load(infile)

# Load chunk ids logged for retry by earlier runs, if any
if os.path.exists(log_file):
    with open(log_file, "r", encoding="utf-8") as log_in:
        retry_log = json.load(log_in)
else:
    retry_log = []

# Chunk ids that failed during this run
new_retry_log = []

# Process each chunk
for chunk in chunks:
    chunk_id = chunk.get("id")
    # One response file per chunk makes the cell resumable: existing files are skipped
    response_file = os.path.join(output_folder, f"response_{chunk_id}.json")
    if os.path.exists(response_file):
        print(f"Skipping chunk {chunk_id} (response already exists).")
        continue

    prompt_sent = chunk.get("prompt")
    print(f"Processing chunk {chunk_id} ...")

    try:
        # Call Gemini model with the prompt
        response = model.generate_content(prompt_sent)
        response_text = response.text.strip() if response and hasattr(response, "text") else ""

        # If no valid response, log chunk id for retry and move on
        if not response_text:
            print(f"No response for chunk {chunk_id}. Logging for retry.")
            new_retry_log.append(chunk_id)
            continue

        # Build response data
        output_data = {
            "chunk_id": chunk_id,
            "profile_name": chunk.get("profile_name"),
            "segment": chunk.get("segment"),
            "chunk_text": chunk.get("text"),
            "prompt_sent": prompt_sent,
            "response": response_text
        }

        # Write to a temporary file and rename it into place, so an interrupted
        # write can never leave a truncated response file that the
        # "already exists" check above would treat as done.
        tmp_file = response_file + ".tmp"
        with open(tmp_file, "w", encoding="utf-8") as outfile:
            json.dump(output_data, outfile, indent=4)
        os.replace(tmp_file, response_file)
        print(f"Saved response for chunk {chunk_id}.")

    except Exception as e:
        # Best-effort batch job: record the failure for a later retry and keep going
        print(f"Error processing chunk {chunk_id}: {e}")
        new_retry_log.append(chunk_id)
    finally:
        # Throttle after every attempt; skipping the pause on failures would
        # hammer the API exactly when it is rate-limiting us.
        time.sleep(REQUEST_DELAY_SECONDS)

# Merge with previously logged ids, then drop every id whose response file now
# exists so the log only lists chunks that are genuinely still outstanding.
pending_ids = set(retry_log) | set(new_retry_log)
all_retry = sorted(
    (
        cid for cid in pending_ids
        if not os.path.exists(os.path.join(output_folder, f"response_{cid}.json"))
    ),
    key=str,  # stable order even if an id is missing (None)
)

# Save the retry log
with open(log_file, "w", encoding="utf-8") as log_out:
    json.dump(all_retry, log_out, indent=4)

print(f"\nProcessing complete. {len(all_retry)} chunks need to be retried. See log: {log_file}")


Processing chunk 0d572b5c29aa2225e7d88ac039ef2132 ...
Saved response for chunk 0d572b5c29aa2225e7d88ac039ef2132.
Processing chunk 5b70211aaef28cb176b913a41060be3a ...
Saved response for chunk 5b70211aaef28cb176b913a41060be3a.
Processing chunk baf603122069ef036efe473e35c13c06 ...
Saved response for chunk baf603122069ef036efe473e35c13c06.
Processing chunk 7a14d64cdd63b849c981afab119f45fc ...
Saved response for chunk 7a14d64cdd63b849c981afab119f45fc.
Processing chunk e5d4a604a0d775c08b18765eb39737db ...
Saved response for chunk e5d4a604a0d775c08b18765eb39737db.
Processing chunk 5a81a5afab0114d6f383c080dd22759b ...
Saved response for chunk 5a81a5afab0114d6f383c080dd22759b.
Processing chunk 7e379e8d40278c7a8954be5e55f4cdaa ...
Saved response for chunk 7e379e8d40278c7a8954be5e55f4cdaa.
Processing chunk d0f2bd6cb354ffa05fcf76174b132ef4 ...
Saved response for chunk d0f2bd6cb354ffa05fcf76174b132ef4.
Processing chunk e7946d6d3c365aab47be8745536351e5 ...
Saved response for chunk e7946d6d3c365aab4

# Step 6 : Using regular expressions to extract data from the LLM responses

In [7]:
import json
import os
import re
import time

# -------------------------------
# Define folders
# -------------------------------
gemini_folder = "gemini_responses"
extracted_folder = "llm extracted triplets"
entity_folder = "entity_relation_entity"

os.makedirs(gemini_folder, exist_ok=True)
os.makedirs(extracted_folder, exist_ok=True)
os.makedirs(entity_folder, exist_ok=True)

# -------------------------------
# Define regex patterns for extraction from Gemini responses
# -------------------------------
# Pattern for entity records: ("entity"<|>entity_name<|>"entity_type"<|>"entity_description")
entity_pattern = r'\("entity"<\|>(.*?)<\|>"(.*?)"<\|>"(.*?)"\)'
# Pattern for relationship records: ("relationship"<|>source_entity<|>target_entity<|>"relationship_description"<|>"relationship_keywords"<|>relationship_strength)
relationship_pattern = r'\("relationship"<\|>(.*?)<\|>(.*?)<\|>"(.*?)"<\|>"(.*?)"<\|>(\d+)\)'
# Pattern for content keywords: ("content_keywords"<|>keywords)
# Kept for reference only; content keywords are deliberately not extracted.
content_keywords_pattern = r'\("content_keywords"<\|>(.*?)\)'

# Compile once instead of re-resolving the patterns for every response file
_ENTITY_RE = re.compile(entity_pattern)
_RELATIONSHIP_RE = re.compile(relationship_pattern)


def _unquote(value):
    """Trim surrounding whitespace, then any surrounding double quotes."""
    return value.strip().strip('"')


def parse_gemini_response(response_text):
    """Extract entity and relationship records from one raw LLM response.

    Args:
        response_text: the raw text returned by the model.

    Returns:
        A list of dicts: first every entity record (keys "type",
        "entity_name", "entity_type", "entity_description"), then every
        relationship record (keys "type", "source_entity", "target_entity",
        "relationship_description", "relationship_keywords",
        "relationship_strength" as int). Records that do not match the
        expected tuple syntax are silently ignored.
    """
    entity_triplets = [
        {
            "type": "entity",
            "entity_name": _unquote(name),
            "entity_type": _unquote(entity_type),
            "entity_description": _unquote(description),
        }
        for name, entity_type, description in _ENTITY_RE.findall(response_text)
    ]
    relationship_triplets = [
        {
            "type": "relationship",
            "source_entity": _unquote(source),
            "target_entity": _unquote(target),
            "relationship_description": _unquote(description),
            "relationship_keywords": _unquote(keywords),
            "relationship_strength": int(strength.strip()),
        }
        for source, target, description, keywords, strength
        in _RELATIONSHIP_RE.findall(response_text)
    ]
    return entity_triplets + relationship_triplets


def to_uniform_triplets(extracted):
    """Convert parsed records into uniform entity1 / relation / entity2 dicts.

    Entity records become (name, "is a <entity_type>", description);
    relationship records become (source, relationship description, target).
    Records of any other type (e.g. content keywords) are dropped.
    """
    uniform_triplets = []
    for item in extracted:
        ttype = item.get("type", "").lower()
        if ttype == "entity":
            uniform_triplets.append({
                "entity1": item.get("entity_name", "").strip(),
                "relation": f"is a {item.get('entity_type', '').strip()}",
                "entity2": item.get("entity_description", "").strip()
            })
        elif ttype == "relationship":
            uniform_triplets.append({
                "entity1": item.get("source_entity", "").strip(),
                "relation": item.get("relationship_description", "").strip(),
                "entity2": item.get("target_entity", "").strip()
            })
    return uniform_triplets


# -------------------------------
# Part 1: Process each Gemini response file to extract triplets
# -------------------------------
# os.listdir returns entries in arbitrary order; sort so reruns produce identical output.
response_files = sorted(
    f for f in os.listdir(gemini_folder)
    if f.startswith("response_") and f.endswith(".json")
)

for rf in response_files:
    response_path = os.path.join(gemini_folder, rf)
    with open(response_path, "r", encoding="utf-8") as infile:
        response_data = json.load(infile)

    chunk_id = response_data.get("chunk_id")
    output_data = {
        "chunk_id": chunk_id,
        "profile_name": response_data.get("profile_name"),
        "segment": response_data.get("segment"),
        # "or ''" guards against a stored null response, which re cannot scan
        "extracted_triplets": parse_gemini_response(response_data.get("response") or "")
    }

    # Save the extracted triplets to JSON file in the "llm extracted triplets" folder
    extracted_path = os.path.join(extracted_folder, f"extracted_triplets_{chunk_id}.json")
    with open(extracted_path, "w", encoding="utf-8") as outfile:
        json.dump(output_data, outfile, indent=4)

    print(f"Extracted triplets saved in {extracted_path}")
    # No delay here: this loop only does local regex work, there is no API to rate-limit.

# -------------------------------
# Part 2: Aggregate all extracted triplets into a uniform "entity relation entity" format
# -------------------------------
extracted_files = sorted(
    f for f in os.listdir(extracted_folder)
    if f.startswith("extracted_triplets_") and f.endswith(".json")
)

all_results = []  # One aggregated entry per extracted file

for filename in extracted_files:
    filepath = os.path.join(extracted_folder, filename)
    with open(filepath, "r", encoding="utf-8") as infile:
        data = json.load(infile)

    # Each file holds chunk metadata plus its records under "extracted_triplets"
    all_results.append({
        "chunk_id": data.get("chunk_id"),
        "profile_name": data.get("profile_name"),
        "segment": data.get("segment"),
        "triplets": to_uniform_triplets(data.get("extracted_triplets", []))
    })

# Save the aggregated results to one JSON file in the "entity_relation_entity" folder
final_output_file = os.path.join(entity_folder, "all_extracted_triplets.json")
with open(final_output_file, "w", encoding="utf-8") as outfile:
    json.dump(all_results, outfile, indent=4)

print(f"\nAll extracted uniform triplets have been saved in {final_output_file}")


Extracted triplets saved in llm extracted triplets\extracted_triplets_0247c5c4920ff836e11e665a2e1601e2.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_02acd4c823ea7d3ad25bfe9d300ff46b.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_02bc85c31a32aaed8869f961f02e48fd.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_054ecc08a688e1753f65d4ee8f88b576.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_0851d1bf4bb683269277d4dfb31178f6.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_0c2fa176d66101675a99766c5d1fcba5.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_0ccf3d6a7f79a8c6d3a47c774b067790.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_0d572b5c29aa2225e7d88ac039ef2132.json
Extracted triplets saved in llm extracted triplets\extracted_triplets_0d7ac62359950fdbf73dace0ce17dc20.json
Extracted triplets saved in 

## Step 7 : Merging rule-based triplets and all_extracted_triplets.json

In [8]:
import json
import os

# -------------------------------
# Step 7A: Merge LLM and Rule-Based Triplets
# -------------------------------

# Where the two triplet sources live
llm_triplets_file = os.path.join("entity_relation_entity", "all_extracted_triplets.json")
rule_based_file = os.path.join("rule based extraction", "rule based triplets.json")

# Where the side-by-side merge is written
merged_folder = "merged_triplets"
os.makedirs(merged_folder, exist_ok=True)
merged_output_file = os.path.join(merged_folder, "merged_triplets.json")

# Read both sources
with open(llm_triplets_file, "r", encoding="utf-8") as llm_in:
    llm_triplets = json.load(llm_in)

with open(rule_based_file, "r", encoding="utf-8") as rules_in:
    rule_based_triplets = json.load(rules_in)

# Keep the two sources under separate keys so their origin stays visible
merged_data = {
    "llm_triplets": llm_triplets,
    "rule_based_triplets": rule_based_triplets
}

with open(merged_output_file, "w", encoding="utf-8") as merged_out:
    json.dump(merged_data, merged_out, indent=4)

print(f"Merged triplets have been saved in {merged_output_file}")


# -------------------------------
# Step 7B: Extract and Combine All Triplets into a Final List
# -------------------------------

# Destination of the flattened list
final_output_folder = "final_triplets"
os.makedirs(final_output_folder, exist_ok=True)
final_output_file = os.path.join(final_output_folder, "final_entity_relation_triplets.json")

# Re-read the merged file written in Step 7A
with open(merged_output_file, "r", encoding="utf-8") as merged_in:
    merged_data = json.load(merged_in)

llm_triplets_list = merged_data.get("llm_triplets", [])
rule_based_triplets = merged_data.get("rule_based_triplets", [])

# LLM entries are normally per-chunk records holding their triplets under
# "triplets"; an entry without that key is taken to be a triplet itself.
extracted_llm = []
for entry in llm_triplets_list:
    if "triplets" not in entry:
        extracted_llm.append(entry)
        continue
    extracted_llm += entry["triplets"]

# LLM triplets first, rule-based triplets after
final_triplets = extracted_llm + rule_based_triplets

with open(final_output_file, "w", encoding="utf-8") as final_out:
    json.dump(final_triplets, final_out, indent=4)

print(f"Final entity-relation-entity triplets saved in {final_output_file}")


Merged triplets have been saved in merged_triplets\merged_triplets.json
Final entity-relation-entity triplets saved in final_triplets\final_entity_relation_triplets.json


# Looking for duplicate (entity, relation, entity) triplets

In [9]:
import json
import os

# Final merged triplets produced by Step 7
final_file = os.path.join("final_triplets", "final_entity_relation_triplets.json")

with open(final_file, "r", encoding="utf-8") as final_in:
    triplets = json.load(final_in)

# Tally how often each whitespace-trimmed (entity1, relation, entity2) key occurs
triplet_counts = {}
for triplet in triplets:
    key = tuple(
        triplet.get(field, "").strip()
        for field in ("entity1", "relation", "entity2")
    )
    if key in triplet_counts:
        triplet_counts[key] += 1
    else:
        triplet_counts[key] = 1

# Keys seen more than once, in first-seen order
duplicates = {}
for key, count in triplet_counts.items():
    if count > 1:
        duplicates[key] = count

# Summary numbers
total_triplets = len(triplets)
unique_triplets = len(triplet_counts)
# Every duplicated key contributes (count - 1) surplus occurrences
duplicate_count = sum(duplicates.values()) - len(duplicates)

print(f"Total triplets: {total_triplets}")
print(f"Unique triplets: {unique_triplets}")
print(f"Total duplicate occurrences (excluding first instance): {duplicate_count}\n")

if not duplicates:
    print("No duplicate triplets found.")
else:
    print("Duplicate triplets (entity1, relation, entity2) and their counts:")
    for key, count in duplicates.items():
        print(f"{key}: {count}")


Total triplets: 2417
Unique triplets: 2281
Total duplicate occurrences (excluding first instance): 136

Duplicate triplets (entity1, relation, entity2) and their counts:
('Kattankulathur', 'is a geo', 'Kattankulathur is a location where SRMIST is situated.'): 3
('Chennai', 'is a geo', 'Chennai is a city in Tamil Nadu, India, where SRMIST is located.'): 4
('Tamil Nadu', 'is a geo', 'Tamil Nadu is a state in India, where Chennai and SRMIST are located.'): 3
('Ashish Kumar', 'experience description', 'Not Found'): 2
('Ashish Kumar', 'studied at', 'Kendriya Vidyalaya'): 2
('Ashish Kumar', 'certified by', '365 Data Science'): 2
('Bala Rakesh', 'worked at', 'Sankar Foundation Eye Hospital'): 2
('Bala Rakesh', 'experience description', 'Not Found'): 2
('Cheerayu Chowhan', 'experience description', 'Not Found'): 2
('Devshri Rao', 'served as', 'Human Resources Manager'): 2
('Devshri Rao', 'experience description', 'Not Found'): 3
('Greeshma G A', 'experience description', 'Not Found'): 3
('Gree

# Deleting the duplicate ones 

In [10]:
import json
import os

# Final merged triplets produced by Step 7
final_file = os.path.join("final_triplets", "final_entity_relation_triplets.json")

with open(final_file, "r", encoding="utf-8") as final_in:
    triplets = json.load(final_in)

# Map each whitespace-trimmed (entity1, relation, entity2) key to the FIRST
# triplet carrying it; later occurrences of the same key are ignored.
unique_triplet_dict = {}
for triplet in triplets:
    key = tuple(
        triplet.get(field, "").strip()
        for field in ("entity1", "relation", "entity2")
    )
    unique_triplet_dict.setdefault(key, triplet)

# Dict order is insertion order, so the survivors keep their original sequence
unique_triplets = list(unique_triplet_dict.values())

print(f"Total triplets before deduplication: {len(triplets)}")
print(f"Unique triplets after deduplication: {len(unique_triplets)}")

# Write the survivors next to the original file
output_file = os.path.join("final_triplets", "final_entity_relation_triplets_dedup.json")
with open(output_file, "w", encoding="utf-8") as dedup_out:
    json.dump(unique_triplets, dedup_out, indent=4)

print(f"Deduplicated triplets saved in {output_file}")


Total triplets before deduplication: 2417
Unique triplets after deduplication: 2281
Deduplicated triplets saved in final_triplets\final_entity_relation_triplets_dedup.json


In [11]:
import json
import os

# Deduplicated triplets written by the previous cell
dedup_file = os.path.join("final_triplets", "final_entity_relation_triplets_dedup.json")

with open(dedup_file, "r", encoding="utf-8") as dedup_in:
    triplets = json.load(dedup_in)

# Every distinct, non-empty, whitespace-trimmed value found in either entity slot
entities = {
    name
    for triplet in triplets
    for name in (
        triplet.get("entity1", "").strip(),
        triplet.get("entity2", "").strip(),
    )
    if name
}

# Case-insensitive alphabetical order
sorted_entities = sorted(entities, key=str.lower)

print("Entities in alphabetical order:")
for entity in sorted_entities:
    print(entity)


Entities in alphabetical order:
"Hey Siri, update my LinkedIn headline from Upcoming SDE Intern @ Flipkart to SDE Intern @ Flipkart !" I have officially started my…
"Jo dikhta hai, wahi bikta hai" – Visibility is key, and Shark Tank India proves that. Here’s proof: • Paradyes saw a 20x spike in website traffic.…
"Leadership Lessons from Karate: Beyond the Dojo" In the disciplined world of the traditional karate system, leadership is not just taught—it is…
"The Future of Cost Auditing is Here! The Ministry of Corporate Affairs (MCA) is encouraging cost auditors to leverage Artificial Intelligence (AI)…
#BHASHINIRAJYAM Sai Prakash, Lead AI/ML, showcases the voice-based payment system in his native language, #Telugu, at the Co-creation Workshop for…
#BREAKING | #HindenburgResearch, known for its role in the #Adani and #Nikola sell-offs, has announced its closure, citing reasons in a blog post by…
#CMA inter / CA inter Urgent requirement Account Manager in Caprihans India Ltd; Location: Na

# Correcting the node annotations 

In [12]:
import json
import os
import re

# Location of the deduplicated triplets produced earlier in the notebook
dedup_file = os.path.join("final_triplets", "final_entity_relation_triplets_dedup.json")

# Read the full triplet list into memory
with open(dedup_file, mode="r", encoding="utf-8") as dedup_in:
    triplets = json.load(dedup_in)

def clean_entity(entity):
    """
    Normalise an entity label by trimming decoration from its start.

    The steps run once each, in this order:
    1. Surrounding whitespace is stripped.
    2. One leading '<|>' delimiter is dropped (and the rest re-stripped).
    3. Any run of leading double quotes is dropped (and the rest re-stripped).
    4. A leading run of bullet characters ('•' or '-') together with the
       whitespace that follows it is dropped.

    Trailing quotes and anything inside the text are left untouched. Because
    the steps are not repeated, a quote that only becomes leading after a
    bullet was removed stays in place.
    """
    delimiter = "<|>"
    text = entity.strip()
    if text[:len(delimiter)] == delimiter:
        text = text[len(delimiter):].strip()
    # text is already whitespace-trimmed here, so this changes nothing unless
    # it actually starts with a double quote.
    text = text.lstrip('"').strip()
    return re.sub(r'^[•\-]+\s*', '', text)

# Gather the distinct entity labels from both entity slots, cleaned of leading
# decoration. Empty values are dropped both before and after cleaning.
raw_values = (
    triplet.get(field, "").strip()
    for triplet in triplets
    for field in ["entity1", "entity2"]
)
cleaned_values = (clean_entity(value) for value in raw_values if value)
entities = {value for value in cleaned_values if value}

# Case-insensitive alphabetical order
sorted_entities = sorted(entities, key=str.lower)

# NOTE: the cleaned labels are only displayed here; the triplets file itself is not rewritten.
print("Entities in alphabetical order:")
for entity in sorted_entities:
    print(entity)


Entities in alphabetical order:
#BHASHINIRAJYAM Sai Prakash, Lead AI/ML, showcases the voice-based payment system in his native language, #Telugu, at the Co-creation Workshop for…
#BREAKING | #HindenburgResearch, known for its role in the #Adani and #Nikola sell-offs, has announced its closure, citing reasons in a blog post by…
#CMA inter / CA inter Urgent requirement Account Manager in Caprihans India Ltd; Location: Nashik JOB SPECIALIZATIONS: Accounting, Taxation…
#ETEDNXT Pune
#ETEDNXT Pune is an event where Dr. Shilpa Joshi spoke.
*Reflecting on an Incredible 2024 at Intellize* As we wrap up an extraordinary year at Intellize, I’m filled with pride and gratitude for the…
1-month internship
1. Improved conversion rate and click rate of social media campaigns by 40% by designing Channel Affinity models for Facebook, Email and Pinterest. The models helped the business target audience most likely to respond to the ad on that platform 2. Optimised the runtime of Model Inferencing workfl

## Combining the Outputs 

In [13]:
import json
import os
import pandas as pd

# --- File paths ---
# Segments file: one record per chunk with "id", "profile_name", "segment", "text" and "prompt".
segments_file = os.path.join("segment prompts variants", "segments_with_prompts_variants.json")

# LLM extraction file: aggregated LLM results per chunk. Records are read from its
# "llm_triplets" key; each may carry "chunk_id", "profile_name", "segment",
# "extracted_triplets" (or "triplets") and, optionally, the raw "response".
llm_file = os.path.join("merged_triplets", "merged_triplets.json")

# Rule-based triplets file: a flat list of triplets with "entity1", "relation", "entity2".
rule_based_file = os.path.join("rule based extraction", "rule based triplets.json")

# Destination of the combined per-chunk table.
output_csv = os.path.join("final_triplets", "chunks_with_triplets.csv")

# --- Load data ---
with open(segments_file, "r", encoding="utf-8") as f:
    segments_data = json.load(f)

with open(llm_file, "r", encoding="utf-8") as f:
    merged_data = json.load(f)
llm_data = merged_data.get("llm_triplets", [])

with open(rule_based_file, "r", encoding="utf-8") as f:
    rule_based_triplets = json.load(f)

# --- Organize rule-based triplets by profile name ---
# The grouping key is the stripped "entity1" value, which is assumed to be the
# profile name for rule-based triplets. Triplets with an empty "entity1" are dropped.
rule_based_by_profile = {}
for trip in rule_based_triplets:
    profile = trip.get("entity1", "").strip()
    if profile:
        rule_based_by_profile.setdefault(profile, []).append(trip)

# --- Build a dictionary for LLM data keyed by chunk_id ---
# Entries without a chunk_id are skipped; a later entry with the same id wins.
llm_by_chunk = {
    item.get("chunk_id"): item
    for item in llm_data
    if item.get("chunk_id")
}

# --- Create one row per chunk ---
rows = []
for seg in segments_data:
    chunk_id = seg.get("id")
    profile_name = seg.get("profile_name", "").strip()
    segment = seg.get("segment", "").strip()
    chunk_text = seg.get("text", "").strip()
    prompt_used = seg.get("prompt", "").strip()

    # LLM result for this chunk, if any; the raw response is None when absent.
    llm_entry = llm_by_chunk.get(chunk_id, {})
    llm_response = llm_entry.get("response", None)
    llm_triplets_list = llm_entry.get("triplets") or llm_entry.get("extracted_triplets") or []

    # NOTE: rule-based triplets are looked up per profile, so every chunk of the
    # same profile receives the same rule-based triplets.
    rule_based_for_profile = rule_based_by_profile.get(profile_name, [])

    # Tag each triplet copy with where it came from; the loaded dicts are not mutated.
    llm_triplets_labeled = [{**t, "source": "LLM"} for t in llm_triplets_list]
    rule_based_labeled = [{**t, "source": "Rule-Based"} for t in rule_based_for_profile]
    combined_triplets = llm_triplets_labeled + rule_based_labeled

    rows.append({
        "chunk_id": chunk_id,
        "profile_name": profile_name,
        "segment": segment,
        "chunk_text": chunk_text,
        "prompt": prompt_used,
        "llm_response": llm_response,
        "triplets": combined_triplets,
    })

# --- Create the DataFrame ---
# One row per chunk; the "triplets" column holds the list of combined triplets.
df = pd.DataFrame(rows)

print("DataFrame with one row per chunk:")
print(df.head())

# --- Save the DataFrame ---
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, index=False)
print(f"\nDataFrame saved to {output_csv}")


DataFrame with one row per chunk:
                           chunk_id   profile_name         segment  \
0  0d572b5c29aa2225e7d88ac039ef2132    Alexis Ryan  headline_about   
1  5b70211aaef28cb176b913a41060be3a    Alexis Ryan     experiences   
2  baf603122069ef036efe473e35c13c06    Alexis Ryan      activities   
3  7a14d64cdd63b849c981afab119f45fc    Alexis Ryan      educations   
4  e5d4a604a0d775c08b18765eb39737db  Anchal Baheti  headline_about   

                                          chunk_text  \
0  Alexis Ryan describes about himself: software ...   
1  Alexis Ryan worked at Ford Motor Company serve...   
2  Alexis Ryan interacted on post Hard-coded a le...   
3  Alexis Ryan studied at Queen Mary University o...   
4  Anchal Baheti describes about himself: NMIMS M...   

                                              prompt llm_response  \
0  This chunk represents the personal introductio...         None   
1  This chunk encapsulates the professional exper...         None   
2

## Saving the KG in NanoVectorDB

In [14]:
! pip install qdrant-client sentence-transformers pandas


Collecting qdrant-client
  Downloading qdrant_client-1.13.3-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.71.0-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant-client)
  Downloading h2-4.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting hyperframe<7,>=6.1 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client)
  Downloading hyperframe-6.1.0-py3-none-any.whl.metadata (4.3 kB)
Collecting hpack<5,>=4.1 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client)
  Downloading hpack-4.1.0-py3-none-any.whl.metadata (4.6 kB)
Downloading qdrant_client-1.13.3-py3-none-any.whl (306 kB)
Downloading grpcio_tools-1.71.0-cp312-cp312-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0

In [15]:
%%writefile setup_vector_db.py
import os
import ast
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import time
from nano_vectordb import NanoVectorDB

# --------- Load DataFrame from CSV ---------
# Read the per-chunk table stored under final_triplets/.
df_path = os.path.join(
    "final_triplets",
    "chunks_with_triplets.csv",
)
df = pd.read_csv(filepath_or_buffer=df_path)

def parse_triplets(val):
    """Turn a stringified ``triplets`` cell back into a Python object.

    The value is evaluated with ``ast.literal_eval``. If evaluation fails for
    any reason, the error is printed and an empty list is returned instead.
    """
    try:
        parsed = ast.literal_eval(val)
    except Exception as e:
        print(f"Error parsing triplets: {e}")
        parsed = []
    return parsed

# Cells that are strings are parsed back into Python objects; any other value
# is left untouched.
df['triplets'] = df['triplets'].apply(lambda x: parse_triplets(x) if isinstance(x, str) else x)

# --------- Initialize Embedding Model ---------
model = SentenceTransformer('all-MiniLM-L6-v2')
# Take the vector width from the loaded model so the vector stores created with
# embedding_dim stay in sync if the model name is changed; fall back to 384
# (the previously hard-coded value) if the model does not report a width.
embedding_dim = model.get_sentence_embedding_dimension() or 384

# --------- Prepare Records for Each Collection ---------
chunk_points = []      # one record per chunk
relation_points = []   # one record per triplet
entity_points = {}     # keyed by normalized entity id so each entity appears once

def normalize_entity(ent):
    """Build an entity id: trim the text, lower-case it and turn spaces into underscores."""
    trimmed = ent.strip()
    lowered = trimmed.lower()
    return "_".join(lowered.split(" "))

for idx, row in df.iterrows():
    # Each row of the table describes one chunk.
    chunk_id = row["chunk_id"]
    profile_name = row["profile_name"]
    segment = row["segment"]
    chunk_text = row["chunk_text"]

    # Embed the chunk text and queue the chunk record.
    chunk_emb = model.encode(chunk_text)
    chunk_points.append({
        "id": chunk_id,
        "__vector__": chunk_emb,  # Store as NumPy array
        "payload": {
            "profile_name": profile_name,
            "segment": segment,
            "text": chunk_text,
        },
    })

    # Process triplets (if any) in the row.
    triplets = row.get("triplets", [])
    for i, trip in enumerate(triplets):
        entity1 = trip.get("entity1", "")
        relation = trip.get("relation", "")
        entity2 = trip.get("entity2", "")
        source = trip.get("source", "LLM")

        # The triplet vector is the plain average of its three component embeddings.
        e1_emb = model.encode(entity1)
        rel_emb = model.encode(relation)
        e2_emb = model.encode(entity2)
        combined_emb = (e1_emb + rel_emb + e2_emb) / 3.0

        # The triplet id is the chunk id plus the triplet's position within the chunk.
        relation_points.append({
            "id": f"{chunk_id}_{i}",
            "__vector__": combined_emb,  # Store as NumPy array
            "payload": {
                "chunk_id": chunk_id,
                "profile_name": profile_name,
                "segment": segment,
                "source": source,
                "entity1": entity1,
                "relation": relation,
                "entity2": entity2,
            },
        })

        # Register each non-empty entity once. The embeddings computed above are
        # reused here instead of encoding the same string a second time.
        for ent, ent_emb in ((entity1, e1_emb), (entity2, e2_emb)):
            if not ent:
                continue
            ent_id = normalize_entity(ent)
            if ent_id not in entity_points:
                entity_points[ent_id] = {
                    "id": ent_id,
                    "__vector__": ent_emb,  # Store as NumPy array
                    "payload": {"text": ent, "chunk_ids": [chunk_id]},
                }
            else:
                # Known entity: just remember that this chunk mentions it too.
                seen_chunks = entity_points[ent_id]["payload"].setdefault("chunk_ids", [])
                if chunk_id not in seen_chunks:
                    seen_chunks.append(chunk_id)

# Convert entity_points dict to a list
entity_points_list = list(entity_points.values())

# --------- Initialize NanoVectorDB Instances ---------
# One store per collection, each backed by its own JSON file.
chunk_db = NanoVectorDB(embedding_dim, storage_file="chunk_db.json")
relation_db = NanoVectorDB(embedding_dim, storage_file="relation_db.json")
entity_db = NanoVectorDB(embedding_dim, storage_file="entity_db.json")

# --------- Upsert the Prepared Records ---------
# Chunks are always upserted; relations and entities only when there are any.
print("Upserting chunk points...")
chunk_upsert_result = chunk_db.upsert(chunk_points)
print("Upserted chunks:", chunk_upsert_result)

print("Upserting relation points...")
if not relation_points:
    print("No relation points to upsert.")
else:
    relation_upsert_result = relation_db.upsert(relation_points)
    print("Upserted relations:", relation_upsert_result)

print("Upserting entity points...")
if not entity_points_list:
    print("No entity points to upsert.")
else:
    entity_upsert_result = entity_db.upsert(entity_points_list)
    print("Upserted entities:", entity_upsert_result)

# --------- Save the Databases ---------
for store in (chunk_db, relation_db, entity_db):
    store.save()

print(
    "\nAll embeddings have been stored in NanoVectorDB:",
    " - Chunk embeddings saved in 'chunk_db.json'",
    " - Relation embeddings saved in 'relation_db.json'",
    " - Entity embeddings saved in 'entity_db.json'",
    sep="\n",
)


Writing setup_vector_db.py


In [16]:
!python setup_vector_db.py

Upserting chunk points...
Upserted chunks: {'update': [], 'insert': ['2289fc76aa6fb31ead8050940e58980d', 'd66f9d478400414588b650fc359bdc17', '65bc6215ffc7d7244dbc1a645b59cc7c', '31a571cb699cc6eb003cd78322b59765', '91cdcc2a596b698df18023adf3a061e3', 'a6ca4ba25f38c12113cdfadc54c7fb77', '5cdabd3addb0006b6277ea4daa4c7f1c', '8751d3c5dbd042ba23634840485509f6', 'f83540e1beacb8634c42a04a8a80a685', '395b2e80eef04f5fc6201f5b57036b4b', '12aadbf127cb51bf88496d1b00d07956', 'f9d2ea6de29c8f59f3dab5e7d157ad26', '3650467f5ecfb7bdbde8d7f3d9a8cb19', '298efb998c469d8fdc5979434bf4a519', '13fbc2cb2e7ef99c2a52b4e1eef5ef83', '6c588a250ed1010c2eb8aba976b3b98e', 'cc1cf7063fdac37007be22e7c623f27b', '6346f20f7fb784adb5a48572829b08ba', '0831b9193b2cabf0d86fcf193de6906b', 'fd247ecee032c52802d0b4d20d62d780', '769546425a03561ab868117db6429122', '4f7728d8e5cd04c57181fc3f995837c5', '71f42cbb7dcac17cd9eb7e58797b3645', '5667e1372d783b1b3f880715b210fabe', 'f448454dde10c78467d2b522c86c28d2', 'bfbfa8049a24dde450a40f814eee6d

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00,  1.14it/s]
Batches: 100%|##########| 1/1 [00:00<00:00,  1.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 49.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 82.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 38.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 99.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 42.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 99.40it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
## [REDACTED] An access token was pasted here. Never commit credentials to a notebook: revoke this token and load secrets from an environment variable instead.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 105.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 98.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 99.95it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 94.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 101.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 78.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 80.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 81.73it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00, 103.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|########

In [18]:


# The generated script (setup_vector_db.py) was already run in an earlier cell via `!python setup_vector_db.py`.


# After running the script, load each DB file and print the first record for verification.
def load_db(db_file):
    """Read a saved DB JSON file and return its list of records.

    A list is returned as-is, a dict contributes the value stored under its
    "data" key, and anything else yields an empty list.
    """
    with open(db_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, list):
        return data
    # The saved file is expected to be a dict holding its records under "data".
    if isinstance(data, dict) and "data" in data:
        return data["data"]
    return []

def print_first_record(name, db_file):
    """Print how many records ``db_file`` holds and pretty-print the first one.

    ``name`` is only used as the heading of the printed section.
    """
    records = load_db(db_file)
    print(f"\n{name} (Total {len(records)} records):")
    if not records:
        print("No records found.")
        return

    rec = records[0]
    vector = rec.get("__vector__")
    # A vector that is not already a list but offers tolist() is converted so
    # that json.dumps can serialize it.
    if not isinstance(vector, list) and hasattr(vector, "tolist"):
        rec["__vector__"] = vector.tolist()
    print(json.dumps(rec, indent=2))


for title, path in (
    ("Chunk DB", "chunk_db.json"),
    ("Relation DB", "relation_db.json"),
    ("Entity DB", "entity_db.json"),
):
    print_first_record(title, path)



Chunk DB (Total 128 records):
{
  "id": "0d572b5c29aa2225e7d88ac039ef2132",
  "payload": {
    "profile_name": "Alexis Ryan",
    "segment": "headline_about",
    "text": "Alexis Ryan describes about himself: software developer at Ford motors and I am currently doing 6 months of internship at Ford. I am a enthusiast for development , be it web development or game development. I have developed a 2d platformer game using Unity engine , 3d FPS game using Unreal engine and I along with my team of 3 created a virtual autonomous drone for a IEEE competition , all this in the duration of my B.Tech program .."
  },
  "__id__": "2289fc76aa6fb31ead8050940e58980d"
}

Relation DB (Total 2280 records):
{
  "id": "0d572b5c29aa2225e7d88ac039ef2132_0",
  "payload": {
    "chunk_id": "0d572b5c29aa2225e7d88ac039ef2132",
    "profile_name": "Alexis Ryan",
    "segment": "headline_about",
    "source": "LLM",
    "entity1": "Alexis Ryan",
    "relation": "is a person",
    "entity2": "Alexis Ryan is a so

# Visualization 

In [19]:
import os
import json
import networkx as nx
from pyvis.network import Network
import webbrowser

def load_db(db_file):
    """Return the records stored in a NanoVectorDB JSON file.

    The parsed file may be a dict with a "data" key (its value is returned) or
    a plain list (returned unchanged); any other shape gives an empty list.
    """
    with open(db_file, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)

    has_data_key = isinstance(parsed, dict) and "data" in parsed
    if has_data_key:
        return parsed["data"]
    return parsed if isinstance(parsed, list) else []

def normalize_entity(ent):
    """Derive a node id from entity text: stripped, lower-cased, spaces as underscores."""
    cleaned = ent.strip().lower()
    node_id = cleaned.replace(" ", "_")
    return node_id

from pathlib import Path

# --- File paths for your saved JSON databases ---
entity_db_file = "entity_db.json"
relation_db_file = "relation_db.json"

# --- Load data from the entity and relation JSON files ---
entity_records = load_db(entity_db_file)
relation_records = load_db(relation_db_file)

# --- Build a dictionary of nodes (entity id -> display label) ---
nodes = {}
# First, add nodes from the entity DB if available.
for rec in entity_records:
    # Prefer the record's "id"; fall back to "__id__". Records with neither are skipped.
    eid = rec.get("id") or rec.get("__id__")
    if not eid:
        continue
    # Use the payload text as the label, or fall back to the id.
    nodes[eid] = rec.get("payload", {}).get("text", eid)

# Then, make sure every entity mentioned in a relation has a node as well.
for rec in relation_records:
    payload = rec.get("payload", {})
    for raw_text in (payload.get("entity1", ""), payload.get("entity2", "")):
        norm = normalize_entity(raw_text)
        if norm and norm not in nodes:
            nodes[norm] = raw_text  # use the raw text as label

# --- Build a NetworkX graph using these nodes and relation records as edges ---
# NOTE: a DiGraph holds a single edge per ordered node pair, so when several
# relations connect the same two entities the last one read sets the edge label.
G = nx.DiGraph()

# Add entity nodes to the graph.
for eid, label in nodes.items():
    G.add_node(eid, label=label)

# Add edges based on relation records; relations with an empty endpoint are skipped.
for rec in relation_records:
    payload = rec.get("payload", {})
    ent1 = payload.get("entity1", "")
    ent2 = payload.get("entity2", "")
    rel_text = payload.get("relation", "")
    norm1 = normalize_entity(ent1)
    norm2 = normalize_entity(ent2)
    if norm1 and norm2:
        # Edge from entity1 to entity2 with the relation text as label and tooltip.
        G.add_edge(norm1, norm2, label=rel_text, title=rel_text)

# --- Visualize the Knowledge Graph using PyVis ---
net = Network(height="750px", width="100%", directed=True, cdn_resources="remote", notebook=False)
net.from_nx(G)

# Customize node appearance: the tooltip shows the node label.
for node in net.nodes:
    node["title"] = node.get("label", "")
    node["label"] = node.get("label", "")

# Customize edge appearance: make sure both tooltip and label are present.
for edge in net.edges:
    edge["title"] = edge.get("title", "")
    edge["label"] = edge.get("label", "")

# Generate the HTML file for the visualization.
output_path = "kg_entities_relations.html"
html_str = net.generate_html(notebook=False)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(html_str)

# Open the generated HTML in the default web browser. as_uri() builds a valid,
# percent-encoded file URI on every platform; concatenating "file://" with an
# OS path yields a malformed URI for Windows paths and for paths with spaces.
webbrowser.open(Path(os.path.realpath(output_path)).as_uri())
print("Graph visualization saved as:", output_path)


Graph visualization saved as: kg_entities_relations.html


In [20]:
import os
import json
import networkx as nx

def load_db(db_file):
    """Load a NanoVectorDB JSON file and hand back its records as a list."""
    with open(db_file, "r", encoding="utf-8") as src:
        content = json.load(src)

    # Dict with a "data" entry -> that entry; bare list -> the list; else nothing.
    if isinstance(content, dict):
        if "data" in content:
            return content["data"]
        return []
    if isinstance(content, list):
        return content
    return []

def normalize_entity(ent):
    """Return the id form of an entity string (trimmed, lower case, underscores for spaces)."""
    pieces = ent.strip().lower().split(" ")
    return "_".join(pieces)

# --- File paths for your saved JSON databases ---
entity_db_file = "entity_db.json"
relation_db_file = "relation_db.json"

# --- Load data from the entity and relation JSON files ---
entity_records = load_db(entity_db_file)
relation_records = load_db(relation_db_file)

# --- Build a dictionary of nodes (entities) with full metadata ---
nodes = {}
# Entity DB records become nodes: a shallow copy of the record plus a "label"
# taken from the payload text (or the id when there is no text).
for rec in entity_records:
    node_id = rec.get("id") or rec.get("__id__")
    if node_id:
        nodes[node_id] = {**rec, "label": rec.get("payload", {}).get("text", node_id)}

# Any relation endpoint that is still unknown gets a minimal node of its own.
for rec in relation_records:
    payload = rec.get("payload", {})
    for raw_text in (payload.get("entity1", ""), payload.get("entity2", "")):
        key = normalize_entity(raw_text)
        if key and key not in nodes:
            nodes[key] = {"label": raw_text, "payload": {"text": raw_text}}

# --- Build a directed graph using NetworkX ---
G = nx.DiGraph()

# Every node carries its metadata as graph attributes.
for node_id, meta in nodes.items():
    G.add_node(node_id, **meta)

# Every relation with two non-empty endpoints becomes an edge whose attributes
# are a copy of the record plus a "label" holding the relation text.
for rec in relation_records:
    payload = rec.get("payload", {})
    norm1 = normalize_entity(payload.get("entity1", ""))
    norm2 = normalize_entity(payload.get("entity2", ""))
    if not (norm1 and norm2):
        continue
    G.add_edge(norm1, norm2, **{**rec, "label": payload.get("relation", "")})

# --- Pre-process graph attributes for GraphML export ---
def convert_attrs(attrs):
    """Replace every dict-valued attribute with its JSON string, in place.

    The same ``attrs`` mapping is returned; values that are not dicts are
    left as they are.
    """
    nested_keys = [key for key, value in attrs.items() if isinstance(value, dict)]
    for key in nested_keys:
        attrs[key] = json.dumps(attrs[key])
    return attrs

# Stringify dict-valued attributes on every node...
for node_id, node_attrs in G.nodes(data=True):
    G.nodes[node_id].update(convert_attrs(node_attrs))

# ...and on every edge.
for src, dst, edge_attrs in G.edges(data=True):
    G.edges[src, dst].update(convert_attrs(edge_attrs))

# --- Write the graph, with its converted attributes, as GraphML ---
graphml_path = "kg_entities_relations.graphml"
nx.write_graphml(G, graphml_path)
print("GraphML file saved as:", os.path.realpath(graphml_path))


GraphML file saved as: C:\Users\cheer\rv_v1.3\Indexer\kg_entities_relations.graphml


In [21]:
import os
import json
import networkx as nx
from sentence_transformers import SentenceTransformer
import numpy as np

# --- Settings ---
model_name = "all-MiniLM-L6-v2"  # MiniLM sentence-transformers model
graphml_path = "kg_entities_relations.graphml"
output_json_path = "kg_entities_word_embeddings.json"

# --- Load the MiniLM model ---
model = SentenceTransformer(model_name)

def compute_word_embedding(word: str):
    """
    Encode a single word with the loaded model.
    The result is converted with tolist() so it can be written out as JSON.
    """
    # The model is built for sentences, but it is applied to one word at a time here.
    return model.encode(word).tolist()

def process_text(text: str):
    """
    Embed every whitespace-separated word of ``text``.
    Returns a list of {"word": ..., "embedding": ...} dictionaries in the order
    the words appear; punctuation is not stripped.
    """
    return [
        {"word": token, "embedding": compute_word_embedding(token)}
        for token in text.split()
    ]

def safe_parse_json(data_str):
    """
    Parse ``data_str`` as JSON and return the parsed value.
    Any exception raised while parsing results in an empty dict instead.
    """
    try:
        parsed = json.loads(data_str)
    except Exception:
        parsed = {}
    return parsed

# --- Load the graph from GraphML ---
G = nx.read_graphml(graphml_path)

# --- Process Nodes and Edges ---
output_data = {"nodes": {}, "edges": {}}

# Nodes: the "payload" attribute is parsed as a JSON string; its text is
# embedded word by word.
for node_id, attrs in G.nodes(data=True):
    payload = safe_parse_json(attrs.get("payload", "{}"))
    text = payload.get("text", "")
    output_data["nodes"][node_id] = {
        "original_text": text,
        "chunk_ids": payload.get("chunk_ids", []),
        "words": process_text(text),
        # Remaining attributes are carried along unchanged.
        "meta": {k: v for k, v in attrs.items() if k not in ["payload", "label"]},
    }

# Edges: keyed by source, target and a running index so every key is unique.
for edge_index, (u, v, attrs) in enumerate(G.edges(data=True)):
    payload = safe_parse_json(attrs.get("payload", "{}"))
    relation_text = payload.get("relation", "")
    output_data["edges"][f"{u}--{v}--{edge_index}"] = {
        "source": u,
        "target": v,
        "relation_text": relation_text,
        # Taken from the payload's "chunk_id"; None when it is absent.
        "chunk_ids": payload.get("chunk_id", None),
        "words": process_text(relation_text),
        "meta": {k: v for k, v in attrs.items() if k not in ["payload", "label"]},
    }

# --- Save the structured output as JSON ---
with open(output_json_path, "w", encoding="utf-8") as out_file:
    json.dump(output_data, out_file, indent=2)

print("Embeddings for every word in nodes and edges have been saved to:", os.path.realpath(output_json_path))


Embeddings for every word in nodes and edges have been saved to: C:\Users\cheer\rv_v1.3\Indexer\kg_entities_word_embeddings.json


In [22]:
import os
import json
from sentence_transformers import SentenceTransformer

# --- Settings ---
model_name = "all-MiniLM-L6-v2"                  # MiniLM model name
chunk_db_file = "chunk_db.json"                  # chunk database to read
output_json_path = "chunk_word_embeddings.json"  # JSON file to write

# --- Load the MiniLM model ---
model = SentenceTransformer(model_name_or_path=model_name)

def load_db(db_file):
    """Open a NanoVectorDB JSON file and return the list of records it holds."""
    with open(db_file, "r", encoding="utf-8") as fp:
        loaded = json.load(fp)

    records = []
    if isinstance(loaded, dict) and "data" in loaded:
        # Records are expected under the "data" key.
        records = loaded["data"]
    elif isinstance(loaded, list):
        records = loaded
    return records

def compute_word_embedding(word: str):
    """Encode ``word`` with the loaded model and return the vector converted via tolist()."""
    return model.encode(word).tolist()

def process_text(text: str):
    """
    Break ``text`` on whitespace and embed each resulting word.
    Returns one {"word": ..., "embedding": ...} dictionary per word, in order.
    """
    tokens = text.split()
    return [{"word": tok, "embedding": compute_word_embedding(tok)} for tok in tokens]

# --- Load chunk database ---
chunks = load_db(chunk_db_file)

# --- Process each chunk ---
# Each output entry mirrors the chunk record (ids and payload fields) and adds
# the word-level embeddings of the chunk text.
output_data = {"data": []}

for rec in chunks:
    payload = rec.get("payload", {})
    text = payload.get("text", "")
    output_data["data"].append({
        "id": rec.get("id"),
        "__id__": rec.get("__id__"),
        "payload": {
            "profile_name": payload.get("profile_name"),
            "segment": payload.get("segment"),
            "text": text,
            "words": process_text(text),  # each word with its embedding
        },
    })

# --- Save the structured word embeddings along with metadata as JSON ---
with open(output_json_path, "w", encoding="utf-8") as out_file:
    json.dump(output_data, out_file, indent=2)

print("Chunk word embeddings have been saved to:", os.path.realpath(output_json_path))


Chunk word embeddings have been saved to: C:\Users\cheer\rv_v1.3\Indexer\chunk_word_embeddings.json
