In this short project I use Selenium to scrape and clean the text from this HTML page: 
https://www.fool.com/earnings/call-transcripts/2021/11/03/kaltura-inc-kltr-q3-2021-earnings-call-transcript/

This page contains the transcript of an earnings call between a company (Kaltura, traded on NASDAQ as KLTR) and Wall Street analysts for the 3rd quarter of 2021.

Further, I split the earnings call into 2 text files (using a Python script): 1. What the company says, including its answers to the analysts' questions (without who said it). 2. The analysts' questions.

This is a general script that should be able to work with any such earnings call transcript.

In [1]:
import chromedriver_autoinstaller


# Set up a chromedriver binary for Selenium to drive Chrome with; the call
# returns the path of the driver it installed (shown as this cell's output).
chromedriver_autoinstaller.install()

'/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/chromedriver_autoinstaller/98/chromedriver'

In [2]:
%pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

# Presumably the endpoint of a remote Selenium server. NOTE(review): nothing in
# the visible code references this constant (get_earnings starts a local Chrome
# driver instead) — confirm whether it is still needed.
SELENIUM_URL = 'http://127.0.0.1:4444/wd/hub'
# Transcript page that get_earnings() loads and parses.
EARNINGS_URL = 'https://www.fool.com/earnings/call-transcripts/2021/11/03/kaltura-inc-kltr-q3-2021-earnings-call-transcript/'

def _non_empty_texts(elements):
    """Return the visible text of each element, skipping blank ones."""
    # Read `.text` once per element: every access is a WebDriver round trip.
    texts = (element.text for element in elements)
    return [text for text in texts if text.strip()]


def get_earnings():
    """Scrape the transcript at EARNINGS_URL and split it into two text files.

    Writes, in the current working directory:

    * ``company.txt``   -- the company's prepared remarks followed by its
      answers from the Q&A section (speaker names omitted).
    * ``questions.txt`` -- the analysts' questions from the Q&A section.

    Paragraphs are joined with ``"\\r\\n"`` and written as UTF-8.

    Returns:
        None.

    Raises:
        Whatever Selenium raises when the page cannot be loaded or an expected
        element (the ``article-content`` container or one of the two section
        headings) is missing. The browser is shut down in that case as well.
    """
    browser = webdriver.Chrome(options=webdriver.ChromeOptions())
    try:
        browser.maximize_window()
        browser.implicitly_wait(10)

        browser.get(EARNINGS_URL)

        # Find element with article-content class
        article_content = browser.find_element(By.CLASS_NAME, "article-content")

        # Find h2 element with text "Prepared Remarks:" as starting point for
        # parsing company remarks
        prepared_remarks = article_content.find_element(
            By.XPATH, "./h2[text()='Prepared Remarks:']"
        )
        # Select the following siblings of that heading which:
        # * are <p> elements
        # * don't contain a <strong> tag (i.e. are not speaker-name lines)
        # * whose nearest preceding speaker line (<p> with <strong>) has a
        #   direct text node different from 'Operator'
        # * have exactly two <h2> siblings before them, which limits the match
        #   to this section and stops at the next <h2> (Q&A starts)
        statements = prepared_remarks.find_elements(
            By.XPATH,
            "./following-sibling::p[not(strong) and preceding-sibling::p[strong][1][text()!='Operator'] and count(preceding-sibling::h2)=2]",
        )
        statements_text = _non_empty_texts(statements)

        # Find h2 element with text "Questions & Answers:" as starting point
        # for parsing Q&A
        questions_answers = article_content.find_element(
            By.XPATH, "./h2[text()='Questions & Answers:']"
        )
        # For both queries below, select the following siblings which:
        # * are <p> elements without a <strong> tag
        # * whose nearest preceding speaker line (<p> with <strong> and <em>)
        #   is not the Operator
        # Questions: that speaker line's <em> contains the word "Analyst".
        question_elements = questions_answers.find_elements(
            By.XPATH,
            "./following-sibling::p[not(strong) and preceding-sibling::p[strong and em][1][not(contains(strong/text(), 'Operator')) and contains(em/text(), 'Analyst')]]",
        )
        questions_text = _non_empty_texts(question_elements)

        # Answers: that speaker line's <em> does NOT contain "Analyst".
        # The test must look at em/text(), exactly like the questions query:
        # the paragraph's own text() nodes never hold the speaker's title, so
        # testing them would let analyst questions through as "answers".
        answer_elements = questions_answers.find_elements(
            By.XPATH,
            "./following-sibling::p[not(strong) and preceding-sibling::p[strong and em][1][not(contains(strong/text(), 'Operator')) and not(contains(em/text(), 'Analyst'))]]",
        )
        answers_text = _non_empty_texts(answer_elements)
    finally:
        # quit() closes every window and ends the driver session, so it also
        # covers what a separate close() call would do; running it in
        # `finally` prevents a leaked driver process when scraping fails.
        browser.quit()

    # Explicit UTF-8 so typographic quotes/dashes in the transcript always
    # encode; newline="" keeps the "\r\n" separators exactly as written instead
    # of letting the platform translate them again.
    with open("company.txt", "w", encoding="utf-8", newline="") as company_file:
        company_file.write("\r\n".join(statements_text + answers_text))

    with open("questions.txt", "w", encoding="utf-8", newline="") as questions_file:
        questions_file.write("\r\n".join(questions_text))


# Run the scrape; this writes company.txt and questions.txt to the current
# working directory.
get_earnings()


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
