In [1]:
import sys, os
import json
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


# NOTE(review): this appends a second copy of sys.path[0] to the end of sys.path,
# so it adds no new import location. Presumably the intent was to make the `src`
# package importable — confirm, or point this at the project root explicitly.
sys.path.append(sys.path[0])
from src.aws_download import download_from_s3
# NOTE(review): wildcard import — the helpers used in later cells (e.g.
# structure_dharma, filter_threads_by_token_length) presumably come from here or
# from src.openai_inference; explicit imports would make their origin visible.
from src.dharma_preprocessing import *
from src.agent import MyAgent
from src.openai_inference import *

# Single agent instance shared by the cells below (accessed as Agent.KG).
Agent = MyAgent()

## Step 0: Download `Dharma Overground` exported data from AWS S3 Bucket
Exported and cleaned data from the [Dharma Overground](https://www.dharmaoverground.org/), publicly hosted in an AWS S3 bucket.

In [2]:
### DOWNLOAD THE CLEANED DHARMA OVERGROUND DATA
# One path shared by the download and the read so the two cannot drift apart.
file_path = "./DHO_CLEANED.csv"
download_from_s3(
    bucket_name="dharma-overground-export",
    object_key="DHO_CLEANED.csv",
    local_file_path=file_path,
)

### FIT DATA INTO PANDAS DF
# The file is read as tab-separated despite its .csv extension; rows that fail
# to parse are skipped (on_bad_lines='skip') instead of raising.
dharma_df = pd.read_csv(file_path, sep='\t', low_memory=False, on_bad_lines='skip')

### REPORT DATA INSIGHTS
# Series.nunique() counts distinct non-missing ids. len(set(series)) may count
# missing ids (NaN) as extra distinct values, because NaN never compares equal.
print(f"n message threads == {dharma_df['thread_id'].nunique()}")
print(f"n unique users == {dharma_df['user_id'].nunique()}")
print(f"n total posts == {len(dharma_df)}")


./DHO_CLEANED.csv already exists. Skipping download.
n message threads == 16824
n unique users == 4472
n total posts == 171261


## Step 1: Organizing the data into message threads
Taking this table, we'll create message threads tagged with user IDs in the order of conversation flow...

In [3]:
### BUILD THE COLLECTION OF TEXT MESSAGE THREADS FROM THE POSTS TABLE
# force_rebuild=False is passed as-is; presumably it lets the helper reuse a
# previously saved result instead of rebuilding — confirm in src.
dharma_threads = structure_dharma(
    dharma_df,
    force_rebuild=False,
)

### OPTIONAL: PLOT HOW TOKEN LENGTHS ARE DISTRIBUTED ACROSS THREADS
# visualize_token_length_distribution(dharma_threads)

### KEEP ONLY THE THREADS WHOSE TOKEN COUNT FALLS IN THE CHOSEN RANGE
selected_dharma_threads = filter_threads_by_token_length(
    dharma_threads,
    min_tokens=1000,
    max_tokens=9000,
)

./dho_message_threads.json already exists. Reading data.
Filtered 6503 threads out of 16824 total message threads.
38.7% of original threads remain in filtered dataset.
Token length range: 1000 to 9000


## Step 2: Pull a subset from the dataset based on token count of the complete thread
This step demonstrates the following methods on a quality excerpt of the complete Dharma Overground export and ensures that every selected thread fits within GPT-4o's context window.

## Step 3: Knowledge Graph Generation
Given the message threads that have now been structured and selected, use GPT-4o with structured API outputs to generate a knowledge graph of each thread.

In [4]:
### SELECT A TESTING SUBSET OF MESSAGES
# Disabled: uncomment to draw a random subset (subset_size=99) of the filtered threads.
# subselected_message_threads = Agent.KG.select_random_threads(selected_dharma_threads, subset_size=99)

### WEAVE THE META-NETWORK
# Disabled: uncomment (together with the subset selection above) to build the
# network; the call also returns a cost value alongside the network.
# dharma_network, network_cost = await Agent.KG.weave_meta_net(subselected_message_threads, network_name=None)

### VISUALIZE THE NETWORK
# NOTE(review): with the two steps above commented out, no network is built in
# this notebook run. Presumably this call renders a previously saved graph
# identified by 'knowledge_graph' — confirm it works on a fresh kernel.
graph_html = Agent.KG.generate_knowledge_graph_html('knowledge_graph')