# Imports

In [16]:
# !pip install langchain

from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

In [17]:
## parse .env yaml file
import yaml
import os
import json

with open(".env.yaml", 'r') as stream:
    ## add to env variables OPENAI_API_KEY
    obj = yaml.safe_load(stream)
    os.environ["OPENAI_API_KEY"] = obj["OPENAI_API_KEY"]


## Helper Methods

In [62]:
import os
import re
import json
from langchain import LLMChain


class AtlasBuilder():

    template = '''
Please help me create a hierarchical taxonomy for the Atlas of Knowledge, an organized and very comprehensive map of all topics/categories/areas/techniques/items within "{topic}". 

Start with systematic and thorough enumeration of the broad topics, then break them down into subtopics where applicable. Each topic should have roughly 3-8 subitems. 
First brainstorm the response and then systematically build out the taxonomy where only the items and sub-items are provided and nothing else.

---- Sample Output for "All Knowledge/[Domain]/[Parent Topic]" ----
Brainstorming : 
- [Include unstructured thoughts/text here... Some relevant pieces of info within provided topic are : Topic1, SubTopic2, Topic3, etc...]

Taxonomy of Knowledge : 
1. Topic1
    1.1. SubTopic1
        1.1.1. SubSubTopic1
        1.1.2. SubSubTopic2
    1.2. SubTopic2
        1.2.1. SubSubTopic1
2. Topic2
    2.1. SubTopic1
        2.1.1. SubSubTopic1
        2.1.2. SubSubTopic2
        2.1.3. SubSubTopic3
        2.1.4. SubSubTopic4
        2.1.5. SubSubTopic5
        2.1.6. SubSubTopic6
        2.1.7. SubSubTopic7
        2.1.8. SubSubTopic8
    2.2. SubTopic2
        ...
        ...
        ...
'''

    def __init__(self, model_name="gpt-3.5-turbo", root_folder_path='public/knowledge_atlas'):
        self.root_folder_path = root_folder_path

        self.llm = ChatOpenAI(model_name=model_name, temperature=0.0, max_tokens=3000, top_p=1, frequency_penalty=0.0, presence_penalty=0.0)
        prompt_template = PromptTemplate(input_variables=["topic"], template=self.template)
        self.answer_chain = LLMChain(llm=self.llm, prompt=prompt_template)
        
    def query_openai(self, topic_path):
        answer = self.answer_chain.run(topic=topic_path)
        return answer
    
    def get_topic_hierarchy(self, topic_path):
        answer = self.query_openai(topic_path)
        return self.parse_hierarchical_list(answer, start_path=topic_path)

    def write_topic_hierarchy(self, topic_json):
        leaf_folder_path = os.path.join(self.root_folder_path, *topic_json['path'].split(' > '), topic_json['name'])
        self.add_json_to_folder_structure(topic_json, leaf_folder_path)

    def get_leaf_paths(self):
        lowest_dirs = []
        for root,dirs,files in os.walk(self.root_folder_path):
            if not dirs:lowest_dirs.append(root)
        lowest_dirs.sort(key=lambda x: x.count('/'))
        return lowest_dirs

# WRITING TO FILES

    @staticmethod
    def parse_hierarchical_list(text, start_path=''):
        lines = text.strip().split('\n')
        stack = []
        start_name = start_path.split(' > ')[-1]
        root = {
            'name': start_name, 
            'children': {}, 
            'path': ' > '.join(start_path.split(' > ')[:-1])
        }
        stack.append(root)

        for line in lines:
            if not line: continue      # Note : this stops parsing at any empty lines... Not sure why, hence this fix
            if not re.match(r'^\s*\d+\.', line): continue

            level = len(re.findall(r'\d\.', line))
            name = re.sub(r'\d+\.', '', line).strip()
            name = re.sub(r'[^ a-zA-Z0-9]', '_', name)        # remove extra spaces
            node = {'name': name, 'children': {}}

            # replace special characters in line
            line = re.sub(r'\d+\.', '', line).strip()
            while len(stack) > level:
                stack.pop()

            if level > 0:
                node['path'] = stack[-1]['path'] + ' > ' + stack[-1]['name']
                node['path'] = re.sub( r'^ > ', '', node['path'])
                stack[-1]['children'][name] = node

            stack.append(node)

        return root

    @staticmethod
    def add_json_to_folder_structure(json_object, root_folder_path):
        if not os.path.exists(root_folder_path):
            os.mkdir(root_folder_path)

        # If the JSON object has children, create subfolders for each child
        if 'children' in json_object:
            for child_name, child_obj in json_object['children'].items():
                child_folder_path = os.path.join(root_folder_path, child_name)
                AtlasBuilder.add_json_to_folder_structure(child_obj, child_folder_path)

        # Write the JSON object to a file in the current folder
        json_file_path = os.path.join(root_folder_path, 'data.json')
        with open(json_file_path, 'w') as f:
            json.dump(AtlasBuilder.trim_json(json_object, 4), f)

    @staticmethod
    def trim_json(json_object, max_level=3, current_level=0):
        """
        Trims a JSON object to only X levels down.

        Args:
            json_object (dict): The JSON object to trim.
            current_level (int): The current level of the JSON object.
            max_level (int): The maximum level to include in the trimmed JSON object.

        Returns:
            dict: The trimmed JSON object.
        """
        # Base case: if we've reached the maximum level, return an empty dictionary
        if (current_level == max_level) or (not json_object):
            return {}

        # If the JSON object has children, recursively trim them and add them to a new dictionary
        trimmed_object = {}
        if 'children' in json_object:
            for child_name, child_obj in json_object['children'].items():
                trimmed_child = AtlasBuilder.trim_json(child_obj, current_level=current_level+1, max_level=max_level)
                if trimmed_child:
                    trimmed_object[child_name] = trimmed_child

        # Include the current object's name and any non-children attributes in the trimmed object
        # print(json_object)
        trimmed_object['name'] = json_object['name']
        for key, value in json_object.items():
            if key != 'name' and key != 'children':
                trimmed_object[key] = value

        return trimmed_object


# Build Knowledge Atlas!

## Start With "All Knowledge"

In [97]:
# !pip install -q openai
TOPIC_PATH = "All Knowledge"
# TOPIC_PATH = "Natural Sciences > Biology > Genetics"

atlas = AtlasBuilder(model_name='gpt-4')
# atlas = AtlasBuilder(model_name='gpt-3.5-turbo')


# RUN WITH CAUTION -- OVERWRITES EXISTING FILES
# if True:
if False:
	topic_hierarchy = atlas.get_topic_hierarchy(TOPIC_PATH)
	atlas.write_topic_hierarchy(topic_hierarchy)


## Get Topics to Expand 
(Leaf Nodes - Breadth-first search)

In [100]:
import time
ntopics = 10
# filter_for_topic = '/'
filter_for_topic = '/Biology/'

all_leaf_paths = atlas.get_leaf_paths()

# Filter for a specific topic
leaf_paths = [path for path in all_leaf_paths if filter_for_topic in path]

# Depth vs Breadth First Search
leaf_paths = leaf_paths[:min(ntopics, len(leaf_paths)-1)]   # Breadth
# leaf_paths = leaf_paths[-min(ntopics, len(leaf_paths)-1):]    # Depth

# leaf_paths = random.shuffle(leaf_paths)

for leaf_path in leaf_paths:
    # leaf_path = random.choice(atlas.get_leaf_paths()[-100:])  # Depth First Search
    
    leaf_topic_json = json.load(open(leaf_path+'/data.json'))
    print(leaf_topic_json)
    leaf_subtopics = atlas.get_topic_hierarchy(leaf_topic_json['path'] + ' > ' + leaf_topic_json['name'])
    atlas.write_topic_hierarchy(leaf_subtopics)
    print('sleeping for 5 seconds...')
    time.sleep(5)

{'name': 'Neuroscience', 'path': 'All Knowledge > Natural Sciences > Biology'}


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


sleeping for 5 seconds...
{'name': 'Anatomy', 'path': 'All Knowledge > Natural Sciences > Biology'}
sleeping for 5 seconds...
{'name': 'Physiology', 'path': 'All Knowledge > Natural Sciences > Biology'}


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 16:21:55 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9da205aa210764-MAN', 'alt-svc': 'h3=":443"; ma=86400, h3-29=":443"; ma=86400'}.
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised APIError: Gateway timeout. {"error":{"code":524,"message":"Gateway timeo

sleeping for 5 seconds...
{'name': 'Botany', 'path': 'All Knowledge > Natural Sciences > Biology'}


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: Gateway timeout. {"error":{"code":524,"message":"Gateway timeout.","param":null,"type":"cf_gateway_timeout"}} 524 {'error': {'code': 524, 'message': 'Gateway timeout.', 'param': None, 'type': 'cf_gateway_timeout'}} {'Date': 'Fri, 19 May 2023 16:46:54 GMT', 'Content-Type': 'application/json', 'Content-Length': '92', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9dc2f2297c0764-MAN'}.
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised APIError: Gateway timeout. {"error":{"code":524,"message":"Gateway timeout.","param":null,"type":"cf_gateway_t

APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 17:42:46 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9e181c08a335fb-MAN', 'alt-svc': 'h3=":443"; ma=86400, h3-29=":443"; ma=86400'}

# TESTING

In [None]:
STOP_HERE

In [59]:
resp = atlas.query_openai(leaf_topic_json['path'] + ' > ' + leaf_topic_json['name'])
print(resp)

'Brainstorming: \n- International Relations Theories\n- International Organizations\n- Diplomacy and Negotiation\n- Global Governance\n- International Security\n- International Political Economy\n- Foreign Policy Analysis\n- Conflict and Peace Studies\n\nTaxonomy of Knowledge:\n1. International Relations Theories\n    1.1. Realism\n    1.2. Liberalism\n    1.3. Constructivism\n    1.4. Marxism\n    1.5. Feminism\n2. International Organizations\n    2.1. United Nations\n        2.1.1. General Assembly\n        2.1.2. Security Council\n        2.1.3. International Court of Justice\n        2.1.4. Secretariat\n    2.2. Regional Organizations\n        2.2.1. European Union\n        2.2.2. African Union\n        2.2.3. Organization of American States\n        2.2.4. Association of Southeast Asian Nations\n3. Diplomacy and Negotiation\n    3.1. Diplomatic History\n    3.2. Diplomatic Protocol\n    3.3. International Law\n    3.4. Mediation and Conflict Resolution\n4. Global Governance\n    4

In [64]:
atlas.parse_hierarchical_list(resp, start_path=leaf_topic_json['path'] + ' > ' + leaf_topic_json['name'])

{'name': 'International Relations',
 'children': {'International Relations Theories': {'name': 'International Relations Theories',
   'children': {'Realism': {'name': 'Realism',
     'children': {},
     'path': 'All Knowledge > Social Sciences > Political Science > International Relations > International Relations Theories'},
    'Liberalism': {'name': 'Liberalism',
     'children': {},
     'path': 'All Knowledge > Social Sciences > Political Science > International Relations > International Relations Theories'},
    'Constructivism': {'name': 'Constructivism',
     'children': {},
     'path': 'All Knowledge > Social Sciences > Political Science > International Relations > International Relations Theories'},
    'Marxism': {'name': 'Marxism',
     'children': {},
     'path': 'All Knowledge > Social Sciences > Political Science > International Relations > International Relations Theories'},
    'Feminism': {'name': 'Feminism',
     'children': {},
     'path': 'All Knowledge > Socia

In [25]:
leaf_paths

['public/knowledge_atlas/All Knowledge/Social Sciences/Sociology/Social Theory',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Sociology/Economic Sociology',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Sociology/Social Research',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Sociology/Cultural Sociology',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Clinical Psychology',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Developmental Psychology',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Social Psychology',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Cognitive Psychology',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Political Science/Public Administration',
 'public/knowledge_atlas/All Knowledge/Social Sciences/Political Science/Public Policy']

In [26]:
leaf_subtopics

{'name': 'Social Theory',
 'children': {'Social Theory': {'name': 'Social Theory',
   'children': {'Classical Social Theory': {'name': 'Classical Social Theory',
     'children': {'Karl Marx': {'name': 'Karl Marx',
       'children': {},
       'path': 'All Knowledge > Social Sciences > Sociology > Social Theory > Social Theory > Classical Social Theory'},
      'Max Weber': {'name': 'Max Weber',
       'children': {},
       'path': 'All Knowledge > Social Sciences > Sociology > Social Theory > Social Theory > Classical Social Theory'},
      'Emile Durkheim': {'name': 'Emile Durkheim',
       'children': {},
       'path': 'All Knowledge > Social Sciences > Sociology > Social Theory > Social Theory > Classical Social Theory'}},
     'path': 'All Knowledge > Social Sciences > Sociology > Social Theory > Social Theory'},
    'Contemporary Social Theory': {'name': 'Contemporary Social Theory',
     'children': {'Critical Theory': {'name': 'Critical Theory',
       'children': {},
      

# OLD

In [None]:
STOP_HERE

In [None]:

import os
import openai
# openai.organization = "org-lmbVjLtK0ogCDQASTvLynITa"
openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.Model.list()

models = openai.Model.list()
list(map(lambda a : a['id'], models['data']))

In [219]:
# OLD PROMPT
topic = "Natural Language Processing (NLP)"
result = llm(f'''
In broad terms provide a 1 sentence description of "{topic}" then provide at the highest level, what are the main topics or subcategories within "{topic}"
Provie outputs in the following format :
- {topic} : <brief description of {topic}>
- <topic 1> : <brief description of topic 1>
- <topic 2> : <brief description of topic 2>
etc
''')
result

'\nNatural Language Processing (NLP) : The field of Artificial Intelligence that deals with analyzing, understanding, and generating natural language.\n- Text Analysis : The process of extracting and understanding meaningful patterns from text data.\n- Natural Language Generation : The process of automatically generating natural language from structured data.\n- Natural Language Understanding : The process of recognizing the meaning of an utterance and deriving an appropriate response.\n- Text Summarization : The process of condensing a text document into a shorter version that preserves the most important information.\n- Sentiment Analysis : The process of identifying and classifying opinions expressed in text data.'

In [220]:
resultsText = result

resArr = resultsText.split('\n')

import re
arrTopics = []
for i in resArr:
    if not len(i): continue
    topicObj = {
        
        # get first group from match
        "topic": re.search(r'^\-? *([^:]+?) ?:', i).group(1),
        "description": re.search(r': ?(.+)', i).group(1)
    }
    arrTopics.append(topicObj)


# resultsText
# resArr
# arrTopics


In [221]:
# find original topic
topicObj = [i for i in arrTopics if i["topic"] == topic][0]
subTopics = [i for i in arrTopics if i["topic"] != topic]
topicObj['subtopics'] = subTopics
topicObj

{'topic': 'Natural Language Processing (NLP)',
 'description': 'The field of Artificial Intelligence that deals with analyzing, understanding, and generating natural language.',
 'subtopics': [{'topic': 'Text Analysis',
   'description': 'The process of extracting and understanding meaningful patterns from text data.'},
  {'topic': 'Natural Language Generation',
   'description': 'The process of automatically generating natural language from structured data.'},
  {'topic': 'Natural Language Understanding',
   'description': 'The process of recognizing the meaning of an utterance and deriving an appropriate response.'},
  {'topic': 'Text Summarization',
   'description': 'The process of condensing a text document into a shorter version that preserves the most important information.'},
  {'topic': 'Sentiment Analysis',
   'description': 'The process of identifying and classifying opinions expressed in text data.'}]}

In [222]:
# write to json file
with open(f'public/knowledge-tree/{topic.lower().replace(" ", "-")}.json', 'w') as outfile:
    json.dump([topicObj], outfile, indent=4)


# Previous Fetching Topics

In [None]:
# For Leaf node, get the JSON

# randomly select a leaf path
import random
leaf_path = random.choice(atlas.get_leaf_paths()[30:100])
leaf_topic_json = json.load(open(leaf_path+'/data.json'))
leaf_topic_json

## Fetch Subtopics of a leaf node

# subtopics_name = leaf_topic_json['path']
leaf_subtopics = atlas.get_topic_hierarchy(leaf_topic_json['path'] + ' > ' + leaf_topic_json['name'])
# leaf_subtopics
# Write the JSON to the folder structure
atlas.write_topic_hierarchy(leaf_subtopics)
# Validate Write : !ls 'public/knowledge_atlas/All Knowledge/Technology/Engineering/Mechanical Engineering'
leaf_subtopics
leaf_subtopics = atlas.get_topic_hierarchy(leaf_topic_json['path'] + ' > ' + leaf_topic_json['name'])
atlas.write_topic_hierarchy(leaf_subtopics)
print('sleeping for 10 seconds...')
time.sleep(10)
atlas.get_leaf_paths()[-10:]
leaf_path = random.choice(atlas.get_leaf_paths()[:100])

In [37]:
map_prompt = """Below is a section of a website about {prospect}

Write a concise summary about {prospect}. If the information is not about {prospect}, exclude it from your summary.

{text}

CONCISE SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text", "prospect"])


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 800,
    chunk_overlap  = 0
)
docs = text_splitter.split_documents(UnstructuredURLLoader("https://en.wikipedia.org/wiki/Artificial_intelligence").load())
# docs = text_splitter.split_documents("Hello, this is a test. Can you hear me?")
# docs = ["Hello, this is a test. Can you hear me?"]

chain = load_summarize_chain(llm,
    chain_type="map_reduce",
    map_prompt=map_prompt_template,
    # combine_prompt=combine_prompt_template,
)

[nltk_data] Downloading package punkt to /Users/amclean/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amclean/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


MissingSchema: Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?

In [33]:
output = chain({"input_documents": docs, # The seven docs that were created before
    "company": "RapidRoad",
    "sales_rep" : "Greg", \
    "prospect" : "GitLab"
})

AttributeError: 'str' object has no attribute 'page_content'

In [24]:
res = llm.agenerate("This is a test")

chain = load_summarize_chain(llm, "map_reduce", map_prompt = PromptTemplate("This is a test"))

loader = UnstructuredURLLoader("https://www.nytimes.com/2020/06/23/us/politics/trump-russia-bounties.html")

doc = loader.load()

# print(doc)
# output results ..


  res = llm.agenerate("This is a test")


TypeError: __init__() takes exactly 1 positional argument (2 given)

## Delete Deeply Nested Folders "All Knowledge" Folders

In [96]:
STOP_HERE
import glob
res = glob.glob('public/knowledge_atlas/All Knowledge/**/*/All Knowledge/*', recursive=True)
res

[]

In [95]:
import shutil


for i in res:
    folderToDelete = './'+'/All Knowledge'.join(i.split('/All Knowledge')[:-1])
    print(folderToDelete)
    try: 
        shutil.rmtree(folderToDelete)
    except:
        pass

./public/knowledge_atlas/All Knowledge/Social Sciences/Tourism/Tourism Research and Education/Tourism Studies/Tourism Economics
./public/knowledge_atlas/All Knowledge/Social Sciences/Tourism/Tourism Research and Education/Tourism Studies/Tourism Economics
./public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Clinical Psychology
./public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Clinical Psychology
./public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Developmental Psychology
./public/knowledge_atlas/All Knowledge/Social Sciences/Psychology/Developmental Psychology
./public/knowledge_atlas/All Knowledge/Social Sciences/Political Science/Public Administration/Public Administration/Public Administration and Public Finance
./public/knowledge_atlas/All Knowledge/Social Sciences/Political Science/Public Administration/Public Administration/Public Administration and Public Finance
./public/knowledge_atlas/All Knowledge/Social Sciences/Political Science/I