# Get Wiki info of OpenAI key stakeholders

In [1]:
# !pip install langchain
# !pip install wikipedia

In [2]:
# Load the OpenAI Wikipedia page
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
# WikipediaLoader runs a Wikipedia search for the query, so the result can
# contain several related pages, not only the "OpenAI" article itself.
raw_documents = WikipediaLoader(query="OpenAI").load()

# Define chunking strategy: chunk size and overlap are measured with the
# tiktoken encoder.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=20
)
# Chunk the documents
documents = text_splitter.split_documents(raw_documents)
for d in documents:
    # Drop the page summary from each chunk's metadata. pop() with a default
    # keeps this loop from raising KeyError when a document carries no
    # "summary" entry.
    d.metadata.pop("summary", None)

In [3]:
# List the source URL recorded in each chunk's metadata, one per line.
for source_url in (chunk.metadata['source'] for chunk in documents):
    print(source_url)

https://en.wikipedia.org/wiki/OpenAI
https://en.wikipedia.org/wiki/OpenAI_o1
https://en.wikipedia.org/wiki/OpenAI_o3
https://en.wikipedia.org/wiki/Sora_(text-to-video_model)
https://en.wikipedia.org/wiki/Removal_of_Sam_Altman_from_OpenAI
https://en.wikipedia.org/wiki/OpenAI_Five
https://en.wikipedia.org/wiki/OpenAI_Codex
https://en.wikipedia.org/wiki/ChatGPT
https://en.wikipedia.org/wiki/Sam_Altman
https://en.wikipedia.org/wiki/SearchGPT
https://en.wikipedia.org/wiki/Generative_artificial_intelligence
https://en.wikipedia.org/wiki/Whisper_(speech_recognition_system)
https://en.wikipedia.org/wiki/Artificial_general_intelligence
https://en.wikipedia.org/wiki/Anthropic
https://en.wikipedia.org/wiki/AI_boom
https://en.wikipedia.org/wiki/GPT-4
https://en.wikipedia.org/wiki/Greg_Brockman
https://en.wikipedia.org/wiki/Microsoft_Copilot
https://en.wikipedia.org/wiki/Gemini_(chatbot)
https://en.wikipedia.org/wiki/XAI_(company)
https://en.wikipedia.org/wiki/Jan_Leike
https://en.wikipedia.org/wik

In [4]:
# Drop two entries from `documents` in place.
# list.remove(x) deletes the first element that compares equal to x, and each
# removal shifts the later elements down by one. Assuming no equal duplicate
# sits earlier in the list, the second line therefore removes the element that
# was at index 4 before this cell ran, not the one originally at index 3.
# NOTE(review): confirm those are the two intended documents. Re-running this
# cell removes two more entries, because it mutates the list in place.
documents.remove(documents[2])
documents.remove(documents[3])

# Enable Neo4j database

In [5]:
# !pip install pypdf

# News Articles

In [6]:
!pip install pypdf

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
    tinycss2 (>=1.1.0<1.2) ; extra == 'css'
             ~~~~~~~~^[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
import os

# Directory containing your PDF files
directory_path = 'TCs'

# Build one PyPDFLoader per PDF in the directory. os.listdir() returns entries
# in arbitrary order, so the names are sorted to keep the document order (and
# anything derived from list positions) stable between runs.
pdf_names = sorted(f for f in os.listdir(directory_path) if f.endswith('.pdf'))
loaders = [PyPDFLoader(os.path.join(directory_path, f)) for f in pdf_names]

# Load documents from PDFs
news_docs = []
for loader in loaders:
    news_docs.extend(loader.load())

# Re-wrap each loaded document so that only a "source" metadata entry is kept.
news_articles_data = [
    Document(
        page_content=doc.page_content,
        metadata={
            # Strip the directory name from the front of the recorded source.
            # Only the directory name itself is removed; assuming 'source' is
            # the path handed to the loader, the path separator that followed
            # the directory name stays at the start of the value.
            "source": doc.metadata['source'].removeprefix(directory_path),
        },
    )
    for doc in news_docs
]

# Later, when you are ready to add them to the database:
# Call add_documents and construct Document objects inline
# Assuming news_articles_data is already a list of Document objects
# neo4j_db.add_documents(
#     news_articles_data,
#     ids=[f"news_article_{i}" for i in range(len(news_articles_data))]

# )

In [8]:
# !pip install spacy-llm
# !pip install --upgrade jupyter ipywidgets

In [9]:
# Merge the Wikipedia chunks and the news articles into one list (Wikipedia first).
all_data = [*documents, *news_articles_data]

In [10]:
# Preview only the first few documents; displaying the whole list floods the
# notebook output with full article text.
all_data[:3]

[Document(metadata={'title': 'OpenAI', 'source': 'https://en.wikipedia.org/wiki/OpenAI'}, page_content='OpenAI is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco, California. Its stated mission is to develop "safe and beneficial" artificial general intelligence (AGI), which it defines as "highly autonomous systems that outperform humans at most economically valuable work". As a leading organization in the ongoing AI boom, OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora. Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI.\nThe organization consists of the non-profit OpenAI, Inc., registered in Delaware, and its for-profit subsidiary introduced in 2019, OpenAI Global, LLC. Microsoft owns roughly 49% of OpenAI\'s equity, having invested US$13 billion. It also provi

# Summarize Articles for the Relationship Extraction Database

In [13]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Map-reduce summarisation: every chunk is summarised on its own (map), then
# the partial summaries are merged into one final summary (reduce).

# Recursive splitter; it is created here but not used by the rest of this cell.
rtext_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Initialize LLM
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# Map prompt. "{all_data}" is the prompt's placeholder name (see
# document_variable_name below); it is filled by the chain and is unrelated to
# the Python variable `all_data`.
map_template = """The following is a set of documents
{all_data}
Based on this list of docs, please perform concise summaries while extracting essential relationships for relationships analysis later, please do include dates of actions or events, which are very important for timeline analysis later. Example: "Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, Friday)", which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)

# Define the map_chain
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce prompt
reduce_template = """The following is set of summaries:
{all_data}
Take these and distill it into concise summaries of the articles while containing important relationships and events (including the timeline). Example: "Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, Friday)", which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="all_data"  # This should match the variable name in reduce_prompt
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="all_data",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

# Documents to summarise: news articles first, then the Wikipedia chunks.
all_data = news_articles_data + documents

# Re-chunk for the map step (no overlap between chunks).
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(all_data)

# Run the MapReduce Chain. Chain.run() is deprecated in LangChain; invoke()
# takes the inputs as a dict and returns a dict, and the final summary string
# is stored under "output_text".
summarization_results = map_reduce_chain.invoke({"input_documents": split_docs})["output_text"]


  map_chain = LLMChain(llm=llm, prompt=map_prompt)
  combine_documents_chain = StuffDocumentsChain(
  reduce_documents_chain = ReduceDocumentsChain(
  map_reduce_chain = MapReduceDocumentsChain(
  summarization_results = map_reduce_chain.run(split_docs)


KeyboardInterrupt: 

In [290]:
# Display the summary returned by the map-reduce chain.
summarization_results

"1. Sam Altman was fired as CEO of OpenAI on November 17, 2023, leading to a power struggle within the company. Over 730 employees threatened to quit and join Altman at Microsoft unless the board resigned and reappointed Altman and co-founder Greg Brockman. Despite initial talks of reinstatement, the board later confirmed that Altman would not be returning. Altman and Brockman subsequently joined Microsoft to head a new advanced AI research unit.\n\n2. Following Altman's departure, the OpenAI board underwent a reshuffle, with Mira Murati appointed as interim CEO, only to be later replaced by Emmett Shear, the former CEO of Twitch. The board's actions were criticized for lack of transparency and communication, with the exact reasons for Altman's removal remaining unclear.\n\n3. The OpenAI board, composed of Ilya Sutskever, Adam D’Angelo, Helen Toner, and Tasha McCauley, faced calls for resignation. Potential replacements included Bret Taylor and Will Hurd. The board's composition was cr

In [16]:
# Store summarization_results to a text file for future use
# Timeline will further be added into the summaries
# The encoding is given explicitly because the summary can contain non-ASCII
# characters (for example typographic apostrophes), which the platform's
# default encoding may not be able to write.
with open('summary.txt', 'w', encoding='utf-8') as file:
    file.write(str(summarization_results))

NameError: name 'summarization_results' is not defined

In [None]:
# !pip install openai

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting openai
  Using cached openai-1.59.7-py3-none-any.whl.metadata (27 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp39-cp39-macosx_10_12_x86_64.whl.metadata (5.2 kB)
Using cached openai-1.59.7-py3-none-any.whl (454 kB)
Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.8.2-cp39-cp39-macosx_10_12_x86_64.whl (304 kB)
    tinycss2 (>=1.1.0<1.2) ; extra == 'css'
             ~~~~~~~~^[0m[33m
[0mInstalling collected packages: jiter, distro, openai
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer wo

# Entity and Relationship Extraction

In [None]:
import os
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble

# Sentence segmentation with the small English spaCy pipeline
def split_document_sent(text):
    """Split `text` into sentences.

    Loads the "en_core_web_sm" spaCy pipeline, runs it over `text`, and
    returns a list holding the whitespace-stripped text of every sentence
    the pipeline found, in document order.
    """
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text)
    sentences = []
    for sentence in parsed.sents:
        sentences.append(sentence.text.strip())
    return sentences

# Relation extraction on one piece of text via a spacy-llm pipeline
def process_text(nlp, text, verbose=False):
    """Run `nlp` over `text` and return the resulting doc.

    When `verbose` is true, the doc's text, its (text, label) entity pairs and
    every relation in `doc._.rel` are also reported through `msg.text`. Each
    relation is shown as "<entity at dep> [<relation>] <entity at dest>",
    where `dep` and `dest` index into `doc.ents`.
    """
    doc = nlp(text)
    if not verbose:
        return doc

    msg.text(f"Text: {doc.text}")
    labelled_entities = [(entity.text, entity.label_) for entity in doc.ents]
    msg.text(f"Entities: {labelled_entities}")
    msg.text("Relations:")
    for rel in doc._.rel:
        head = doc.ents[rel.dep]
        tail = doc.ents[rel.dest]
        msg.text(f"  - {head} [{rel.relation}] {tail}")
    return doc

def run_pipeline(config_path, examples_path=None, verbose=False, text=None):
    """Extract entities and relations sentence by sentence and export them.

    The input text is split into sentences with `split_document_sent`, each
    sentence is run through the pipeline assembled from `config_path`, and the
    per-sentence results are written to 'processed_data.json' in the current
    working directory. Entity-label and relation counts are reported through
    `msg.text`.

    Args:
        config_path: Path to the spacy-llm config passed to `assemble`.
        examples_path: Optional path to few-shot examples. When given, it
            overrides "paths.examples" in the config.
        verbose: Forwarded to `process_text`.
        text: The text to process. When None, the notebook-level
            `summarization_results` variable is used, which was the only
            behaviour before this parameter existed.

    Returns:
        A list with one dict per sentence, each holding the keys 'text',
        'entities' ((text, label) pairs) and 'relations'
        ((head text, relation, tail text) triples).

    Raises:
        SystemExit: via `msg.fail(..., exits=1)` when OPENAI_API_KEY is unset.
        NameError: when `text` is None and `summarization_results` is not
            defined in the notebook.
    """
    if not os.getenv("OPENAI_API_KEY"):
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again.", exits=1)

    if text is None:
        # Backward-compatible default. Resolved before the pipeline is
        # assembled so that a missing variable fails fast.
        text = summarization_results

    nlp = assemble(config_path, overrides={} if examples_path is None else {"paths.examples": str(examples_path)})

    # Initialize counters and storage
    processed_data = []
    entity_counts = Counter()
    relation_counts = Counter()

    for sent in split_document_sent(text):
        doc = process_text(nlp, sent, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        # r.dep / r.dest are positions in doc.ents
        relations = [(doc.ents[r.dep].text, r.relation, doc.ents[r.dest].text) for r in doc._.rel]

        # Store processed data
        processed_data.append({'text': doc.text, 'entities': entities, 'relations': relations})

        # Update counters: entities are counted by label, relations by type
        entity_counts.update(label for _, label in entities)
        relation_counts.update(relation for _, relation, _ in relations)

    # Export to JSON
    with open('processed_data.json', 'w', encoding='utf-8') as f:
        json.dump(processed_data, f)

    # Display summary
    msg.text(f"Entity counts: {entity_counts}")
    msg.text(f"Relation counts: {relation_counts}")

    # Hand the records back so callers do not have to re-read the JSON file.
    return processed_data

# Set your configuration paths and flags
config_path = Path("zeroshot.cfg")
examples_path = None  # path to a few-shot examples file, or None for zero-shot
verbose = True

# Run the pipeline. examples_path is passed through (rather than a literal
# None) so that changing the setting above actually takes effect.
file = run_pipeline(config_path, examples_path, verbose)



[38;5;1m✘ OPENAI_API_KEY env variable was not found. Set it and try again.[0m



SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
