# Install Dependencies

In [None]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.18.0.tar.gz (198 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/198.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/198.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m143.4/198.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.0/198.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.18.0-py3-none-any.whl size=273862 sha

# Connect to the Database

In [None]:
import os
from getpass import getpass

from neo4j import GraphDatabase

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
# Connection settings are read from the environment so that no secret is stored
# in the notebook. If NEO4J_PASSWORD is not set, the password is prompted for.
# NOTE: the password that was previously committed in this cell should be rotated.
URI = os.environ.get("NEO4J_URI", "neo4j+s://bc8b6e15.databases.neo4j.io")
AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: "),
)

# The driver is deliberately NOT created in a `with` block: later cells call
# driver.execute_query(), and a `with` block would close the driver as soon as
# this cell finishes. It is closed explicitly in the last cell of the notebook.
driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    driver.verify_connectivity()
except Exception:
    # Do not leave a half-initialised driver open when the server is unreachable
    # or the credentials are rejected.
    driver.close()
    raise

# Query the Database

In [None]:
# Construct the schema
# Hand-written example graph for a single earnings call. It illustrates the
# intended shape: company -> earnings call -> three sections (participants,
# presentation, Q & A), two participants, one piece of presentation content,
# one question/answer pair and a topic both of them mention.

sample_schema = """
CREATE (BNY:COMPANY {name: 'BNY Mellon'})
CREATE (q1_2023:EARNINGSCALL {name: 'q1_2023'})
CREATE (BNY) -[:HAS_EARNINGS] -> (q1_2023)
CREATE (q1_2023) -[:HAS_SECTION]-> (p:PARTICIPANT_SECTION {name: 'Participants'})
CREATE (q1_2023) -[:HAS_SECTION]-> (presentation:PRESENTATION_SECTION {name: 'Presentation'})
CREATE (q1_2023) -[:HAS_SECTION]-> (qa:QA_SECTION {name: "Q & A"})

CREATE (p) - [:HAS_PARTICIPANT] -> (rv:PARTICIPANT {name: "Robin Vince", title: "CEO of BNY Mellon"})
CREATE (p) - [:HAS_PARTICIPANT] -> (mm:PARTICIPANT {name: "Mike Mayo", title: "Analyst Wells Fargo"})

CREATE (presentation) -[:PRESENT_BY]-> (rv)
CREATE (presentation) -[:HAS_CONTENT]-> (content:CONTENT {name: "content", text: "The fourth quarter afsdhijbafsdhiuasdijb hiusdafihuj hiuasdijafdsihj ihu iuhjaihuj ijhdhij hiujahjkf dhij uih  asdf"})
CREATE (rv) -[:ANNOUNCE] -> (content)


CREATE (ans: ANSWER {text: "Where do you exp...", sentiment: "+ve"}) -[:ANSWER_TO] -> (question:QUESTION {text: "Where do you exp...", sentiment: "+ve"})
CREATE (mm)-[:ASKED]->(question)
CREATE (rv)-[:ANSWERED]->(ans)
CREATE (qa) -[:HAS_QUESTION]->(question)

CREATE (topic:Topic {name: "Revenue"})
CREATE (ans)-[:MENTIONED]->(topic)
CREATE (question)-[:MENTIONED]->(topic)

"""

# Run all statements as one query. They are CREATE (not MERGE) clauses, so
# re-running this cell adds another copy of the sample graph.
records, summary, keys = driver.execute_query(sample_schema)

  records, summary, keys = driver.execute_query(schema)


# Parsing


In [None]:
import xml.etree.ElementTree as ET
# Parse the transcript XML; the relative path 'data.xml' is resolved against
# the kernel's current working directory.
tree = ET.parse('data.xml')
root = tree.getroot()  # top-level element; the later cells search below it

In [None]:
# Look up the <header> child of the root element (the same lookup is repeated
# in the cell that builds the Header object).
header_element = root.find('header')

In [None]:
class Transcript:
    """A parsed earnings-call transcript: the header plus its three sections."""

    def __init__(self, header, participants_section, presentation_section, qa_section):
        # The arguments are stored as given; nothing is copied or validated.
        self.header, self.participants_section = header, participants_section
        self.presentation_section, self.qa_section = presentation_section, qa_section

class Header:
    """Metadata taken from the transcript's header element."""

    def __init__(self, company, title, time, currency, note):
        # Plain value holder: each argument becomes an attribute of the same name.
        self.company, self.title = company, title
        self.time, self.currency = time, currency
        self.note = note

class ParticipantsSection:
    """The participants section of a transcript, addressable by participant id.

    Args:
        participants: Mapping of participant id to participant object.
            ``None`` (the default) is treated as an empty mapping.
        name: Section name; defaults to ``"call participants"``.
    """

    def __init__(self, participants=None, name="call participants"):
        self.name = name
        # The default None used to crash on ``participants.values()``; fall
        # back to an empty mapping instead. A mapping that is passed in is
        # stored as-is (not copied), exactly as before.
        self.__participantsDict__ = participants if participants is not None else {}
        # Live view: reflects later additions to the underlying mapping.
        self.participants = self.__participantsDict__.values()

    def get_participant(self, participants_id):
        """Return the participant stored under ``participants_id``.

        Raises:
            KeyError: if no participant with that id is registered.
        """
        return self.__participantsDict__[participants_id]


class Person:
    """A call participant.

    Args:
        id: Participant identifier.
        position: Optional position/title.
        group: Optional group the participant belongs to.
        name: Optional display name.
    """

    def __init__(self, id, position=None, group=None, name=None):
        self.position = position
        self.group = group
        self.id = id
        self.name = name

    def __repr__(self):
        # name and position both default to None, so plain string concatenation
        # raised TypeError for such instances. Join only the parts that are
        # present; with both set the result is "<name> <position>" as before.
        parts = [part for part in (self.name, self.position) if part is not None]
        if not parts:
            return f"Person(id={self.id!r})"
        return ' '.join(parts)

class Statement:  # Content in the schema
    """A piece of text together with the person who spoke it."""

    def __init__(self, speaker: Person, text):
        self.speaker, self.text = speaker, text


class PresentationSection:
    """The presentation section: a named list of statements."""

    def __init__(self, statements=None, name="Presentation"):
        self.name = name
        # Any falsy argument (None, an empty list, ...) is replaced by a new empty list.
        self.statements = statements or []


class QASection:
    """The question-and-answer section of a transcript.

    Args:
        transitions: Optional list of transition objects; a falsy value
            becomes a new empty list.
        questions: Optional mapping of question id to question object;
            ``None`` is treated as an empty mapping.
        answers: Optional list of answer objects; a falsy value becomes a new
            empty list.
    """

    def __init__(self, transitions=None, questions=None, answers=None):
        self.transitions = transitions if transitions else []
        self.__questionsDict__ = questions if questions is not None else {}
        # Take the view from the stored mapping rather than from the raw
        # argument: ``questions.values()`` crashed for the default None, and
        # this keeps ``questions`` and ``get_question`` backed by the same dict.
        self.questions = self.__questionsDict__.values()
        self.answers = answers if answers else []

    def get_question(self, question_id):
        """Return the question stored under ``question_id``.

        Raises:
            KeyError: if no question with that id is registered.
        """
        return self.__questionsDict__[question_id]

class Transition:
    """A transition remark in the Q&A section and the person it is attributed to."""

    def __init__(self, speaker, text):
        self.speaker, self.text = speaker, text

class Question:
    """A question from the Q&A section: its id, who asked it, and its text."""

    def __init__(self, id, speaker, text):
        self.id, self.speaker, self.text = id, speaker, text

class Answer:
    """An answer from the Q&A section, linked to the question it responds to."""

    def __init__(self, id, question, speaker, text):
        self.id, self.question = id, question
        self.speaker, self.text = speaker, text


In [None]:
def extract_participants(section_element, header):
    """Build a ParticipantsSection from the <person> children of ``section_element``.

    A synthetic operator entry, named after ``header.company``, is always
    registered under id "-1". Each <person> element then contributes one
    Person keyed by its ``id`` attribute, with ``position`` and ``group`` read
    from attributes and the name taken from the element text.
    """
    participants = {"-1": Person(id="-1", name=f"{header.company} operator")}
    for person_element in section_element.findall('person'):
        person_id = person_element.get('id')
        participants[person_id] = Person(
            id=person_id,
            position=person_element.get('position'),
            group=person_element.get('group'),
            name=person_element.text,
        )
    return ParticipantsSection(participants)

def extract_presentation(section_element, participants):
    """Build a PresentationSection from the <statement> children of ``section_element``.

    ``participants`` must provide ``get_participant(id)``; it is used to
    resolve the ``id`` attribute of each statement's <speaker> element. The
    statement text is read from the <text> child of that <speaker> element.
    """
    def to_statement(statement_element):
        # One <statement> element -> one Statement object.
        speaker_element = statement_element.find('speaker')
        return Statement(
            speaker=participants.get_participant(speaker_element.get('id')),
            text=speaker_element.find('text').text,
        )

    return PresentationSection(
        [to_statement(element) for element in section_element.findall('statement')]
    )

def extract_qanda(section_element, participants):
    """Build a QASection from the children of the Q&A ``section_element``.

    Questions are collected first (keyed by their ``id`` attribute), then
    transitions, then answers. Every answer is linked to the question that
    has the same ``id``; a missing question raises KeyError. Speakers are
    resolved through ``participants.get_participant``.
    """
    # Questions, indexed by id so that answers can be linked to them below.
    questions = {}
    for question_element in section_element.findall('question'):
        asker_element = question_element.find('speaker')
        question_id = question_element.get('id')
        questions[question_id] = Question(
            id=question_id,
            speaker=participants.get_participant(asker_element.get('id')),
            text=asker_element.find('text').text,
        )

    # NOTE(review): the tag name 'trainsition' is misspelled and was flagged
    # as a typo by the original author; presumably it mirrors the tag used in
    # the XML data - confirm against the data before changing it.
    # Transitions are always attributed to the operator (participant "-1"),
    # regardless of the speaker id in the element.
    transitions = [
        Transition(
            speaker=participants.get_participant("-1"),
            text=transition_element.find('speaker').find('text').text,
        )
        for transition_element in section_element.findall('trainsition')
    ]

    # Answers reuse the id of the question they respond to.
    answers = []
    for answer_element in section_element.findall('answer'):
        responder_element = answer_element.find('speaker')
        answer_id = answer_element.get('id')
        answers.append(
            Answer(
                id=answer_id,
                question=questions[answer_id],
                speaker=participants.get_participant(responder_element.get('id')),
                text=responder_element.find('text').text,
            )
        )

    return QASection(transitions=transitions, questions=questions, answers=answers)


In [None]:

# Extract header information: each Header field is the text of the child
# element of <header> with the same tag name.
header_element = root.find('header')
header = Header(**{
    field: header_element.find(field).text
    for field in ('company', 'title', 'time', 'currency', 'note')
})

# Dispatch every body section on its `name` attribute. The presentation and
# Q&A extractors need participants_section, so the participants section has
# to appear before them in the document.
for section_element in root.findall('body/section'):
    section_name = section_element.get('name')
    if section_name == "call participants":
        participants_section = extract_participants(section_element, header)
    elif section_name == "Presentation ":  # the trailing space is part of the name being matched
        presentation_section = extract_presentation(section_element, participants_section)
    elif section_name == "Question and Answer":
        qa_section = extract_qanda(section_element, participants_section)
    else:
        print(f"Skip tag {section_name}")

transcript = Transcript(header, participants_section, presentation_section, qa_section)

Skip tag financial tables


In [None]:
# Display the parsed call title as a quick check of the header extraction.
header.title

'FQ2 2023 Earnings Call Transcripts'

In [None]:
# Identifiers for the fixed nodes of the generated Cypher. Each value is used
# as a Cypher variable name in the query text; the three *_SECTION values are
# additionally written as the `name` property of their section node.
COMPANY = "COMPANY"
EARNINGSCALL = "EARNINGSCALL"
PARTICIPANT_SECTION = "PARTICIPANT_SECTION"
PRESENTATION_SECTION = "PRESENTATION_SECTION"
QA_SECTION = "QA_SECTION"
def add_query(cyper, query):
    """Return ``cyper`` with ``query`` and a trailing newline appended."""
    return "".join((cyper, query, "\n"))

def make_participant_id(id):
    """Return the Cypher variable name for a participant id.

    The numeric id is shifted up by one, so the operator's id "-1" maps to
    "participant0".
    """
    shifted = int(id) + 1  # offset -1
    return "participant{}".format(shifted)

def make_content_id(id):
    """Return the Cypher variable name for the content item numbered ``id``."""
    return "content{}".format(id)

def make_question_id(id):
    """Return the Cypher variable name for the question with the given id."""
    return "question{}".format(id)
def make_answer_id(id):
    """Return the Cypher variable name for the answer numbered ``id``."""
    return "answer{}".format(id)

def _cypher_escape(value):
    """Return ``value`` as text that is safe inside a double-quoted Cypher string.

    Backslashes and double quotes are backslash-escaped so that free text from
    the transcript cannot end the string literal early and break (or alter)
    the generated query. Non-string values go through ``str`` first, so
    ``None`` still becomes the text ``None`` exactly as with plain ``%s``.
    """
    return str(value).replace("\\", "\\\\").replace('"', '\\"')


# Build one multi-clause Cypher query for the whole transcript. Variable names
# bound by earlier MERGE clauses are referenced by later ones, so everything is
# accumulated into a single string (`cypher`) and executed in one go.
# NOTE(review): optional Person fields that are None (e.g. the operator's
# position and group) are stored as the literal text "None".
cypher = ""
# process_header:
header = transcript.header
query = (
    'MERGE (%s:COMPANY {name: "%s"}) \n' % (COMPANY, _cypher_escape(header.company))
    + 'MERGE (%s:EARNINGSCALL {name: "%s", time: "%s"}) \n' % (EARNINGSCALL, _cypher_escape(header.title), _cypher_escape(header.time))
    + 'MERGE (%s) -[:HAS_EARNINGS] -> (%s)' % (COMPANY, EARNINGSCALL)
)

cypher = add_query(cypher, query)

# process_participants:
participants_section = transcript.participants_section
query = 'MERGE (%s) -[:HAS_SECTION]-> (%s:SECTION {name: "%s"}) \n' % (EARNINGSCALL, PARTICIPANT_SECTION, PARTICIPANT_SECTION)
for participant in participants_section.participants:
    # The section variable comes from the constant, like every other template
    # (it used to be spelled out literally here; the generated text is the same).
    q = 'MERGE (%s) - [:HAS_PARTICIPANT] -> (%s:PARTICIPANT {name: "%s", position: "%s", group: "%s", id: "%s"})' % (
        PARTICIPANT_SECTION,
        make_participant_id(participant.id),
        _cypher_escape(participant.name),
        _cypher_escape(participant.position),
        _cypher_escape(participant.group),
        _cypher_escape(participant.id),
    )
    query = add_query(query, q)

cypher = add_query(cypher, query)


# process_presentation:
presentation_section = transcript.presentation_section
query = 'MERGE (%s) -[:HAS_SECTION]-> (%s:SECTION {name: "%s"}) \n' % (EARNINGSCALL, PRESENTATION_SECTION, PRESENTATION_SECTION)
content_id = 0
for content in presentation_section.statements:
    q = (
        'MERGE (%s) -[:HAS_CONTENT]-> (%s:CONTENT {text: "%s"}) \n' % (PRESENTATION_SECTION, make_content_id(content_id), _cypher_escape(content.text))
        + 'MERGE (%s) -[:ANNOUNCE] -> (%s) \n' % (make_participant_id(content.speaker.id), make_content_id(content_id))
    )
    content_id += 1
    query = add_query(query, q)

cypher = add_query(cypher, query)

# process_qa_section:
# Questions
qa_section = transcript.qa_section
query = 'MERGE (%s) -[:HAS_SECTION]-> (%s:SECTION {name: "%s"}) \n' % (EARNINGSCALL, QA_SECTION, QA_SECTION)
for question in qa_section.questions:
    # TODO: add sentiment analysis!
    q = (
        'MERGE (%s) -[:HAS_QUESTION]->(%s:QUESTION {text: "%s", sentiment: ""}) \n' % (QA_SECTION, make_question_id(question.id), _cypher_escape(question.text))
        + 'MERGE (%s)-[:ASKED]->(%s)' % (make_participant_id(question.speaker.id), make_question_id(question.id))
    )
    # TODO: add topic modeling and associate question to topics
    query = add_query(query, q)
cypher = add_query(cypher, query)

# Answer
query = ""
answer_id = 0
for answer in qa_section.answers:
    # TODO: add sentiment analysis!
    answer_var = make_answer_id(answer_id)
    question_var = make_question_id(answer.question.id)
    speaker_var = make_participant_id(answer.speaker.id)
    q = (
        'MERGE (%s: ANSWER {text: "%s", sentiment: ""}) -[:ANSWER_TO] -> (%s) \n' % (answer_var, _cypher_escape(answer.text), question_var)
        + 'MERGE (%s)-[:ANSWERED]->(%s) \n' % (speaker_var, answer_var)
        # This clause used to lack its trailing newline, which glued it to
        # the next MERGE keyword.
        + 'MERGE (%s) -[:HAS_ANSWER] -> (%s) \n' % (question_var, answer_var)
        + 'MERGE (%s)-[:WAS_ANSWERED_BY]->(%s) \n' % (answer_var, speaker_var)
    )
    query = add_query(query, q)
    answer_id += 1
cypher = add_query(cypher, query)





In [None]:
# Send the generated transcript Cypher to the database as a single query.
records, summary, keys = driver.execute_query(cypher)

  records, summary, keys = driver.execute_query(cypher)


In [None]:
def addParticipantsSection():
    """Placeholder for a participants-section helper that has not been written.

    The cell previously held only a ``def`` line with no colon and no body,
    which is a SyntaxError and stops a full notebook run. Defining the stub is
    harmless; calling it fails loudly until a real implementation exists.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("addParticipantsSection is not implemented yet")

'Transcript'

In [None]:
# Fetch up to 25 (Product)<-[:CONTAINS]-(Order) matches.
# NOTE(review): no cell in this notebook creates Product or Order nodes, so
# against the transcript graph this query is expected to return no records.
# The unused `age=42` query parameter and the comment about "42 year-olds"
# were removed because the query never referenced $age.
records, summary, keys = driver.execute_query(
    """MATCH (n:Product)<-[r:CONTAINS]-(s:Order)
    RETURN n,r,s
    LIMIT 25""",
    database_="neo4j",
)

# Loop through results and do something with them
for r in records:
    print(r['n']['productName'], r['s']['orderDate'])

# Summary information
print("The query `{query}` returned {records_count} records in {time} ms.".format(
    query=summary.query, records_count=len(records),
    time=summary.result_available_after,
))

  records, summary, keys = driver.execute_query(


The query `MATCH (n:Product)<-[r:CONTAINS]-(s:Order)
    RETURN n,r,s
    LIMIT 25` returned 0 records in 31 ms.


# Close the Database Connection

In [None]:
# Close the driver once all queries are done so its connections are released.
driver.close()