# Sweet n Sour Sentiment on the Street
Georgia Tech Data Science Bootcamp - Cohort 6
Final Project
Team Members:
* Joseph Ayala
* Andrew Behrman
* Michael Fox
* Michael Hankinson

### Transcript Scraper

#### This notebook is designed to scrape the most recent Earnings Call Transcript for each of 10 S&P500 companies, chunk the call into 4 categories (Operator dialogue, Company presentation, Analyst Questions, Company Responses), and produce a dataset of the constituent sentences in each category.

In [2]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
import re
from splinter import Browser

#### Scrape call list from SeekingAlpha website
Get list of transcript news for specified ticker and find url of most recent earnings call.

Source: SeekingAlpha.com

In [42]:
ticker = 'WFC'
base_url = 'https://seekingalpha.com'
articles_url = base_url + '/symbol/' + ticker + '/earnings/transcripts'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
    'pragma': 'no-cache',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1'
}

http_session = requests.session()

# Retrieve page with the requests module
#response = http_session.get(articles_url, headers=headers)

# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

# collect news articles
articles = soup.find_all('div', class_="symbol_article")

# build list of article titles and urls, but only for call transcripts (last 3 words of title are "Earnings Call Transcript")
articles_details = []
for article in articles:
    a_tag = article.find('a')
    article_title = a_tag.text
    article_url = base_url + a_tag.get('href') + '?part=single'
    if (article_title[-24:] == 'Earnings Call Transcript'):
        articles_details.append({
            'ticker': ticker,
            'title': article_title,
            'url': article_url
        })

print(f'Most Recent Call Found: {articles_details[0]}')


Most Recent Call Found: {'ticker': 'WFC', 'title': 'Wells Fargo & Company (WFC) on Q3 2019 Results - Earnings Call Transcript', 'url': 'https://seekingalpha.com/article/4296629-wells-fargo-and-company-wfc-q3-2019-results-earnings-call-transcript?part=single'}


#### Scrape call transcript from SeekingAlpha website
Get html from call transcript and scrape into MongoDb

Source: SeekingAlpha.com

In [43]:
# get url of most recent call
call_url = articles_details[0]['url']

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
    'pragma': 'no-cache',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1'
}

# Retrieve page with the requests module
response = http_session.get(call_url, headers=headers)

# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

# collect paragraphs
paragraphs = soup.find_all('p')



In [44]:
## DEBUG -- regex experiments
# test = 'Ashish Kohli – Global Head-Investor Relations'
# test_arr = re.split('\s[–-]\s',test)
# test_arr

In [45]:
# prep dictionary for transcript
call_dict = {
    'ticker': ticker,
    'eps_info': '',
    'revenue_info': '',
    'call_title': '',
    'company_participants': [],
    'call_participants': [],
    'paragraphs': []
}

# setup expected sequence of paragraphs in transcript
p_sequence = ['title','participant_hdr1','company_participants','particpant_hdr2','call_participants',
              'speaker_name1','text1','qa_hdr','speaker_name2','text2']

# keep track of progress through sequence
sequence_index = 0

# setup dict for speaker/paragraph pairs
current_p = {
    'speaker': '',
    'content': '',
    'call_section': ''
}

for p in paragraphs:
    
    if (p_sequence[sequence_index] == 'title'):
        
        # grab call title
        call_dict['call_title'] = p.text
        
        # advance our sequence tracker
        sequence_index += 1
        
    elif (p_sequence[sequence_index] == 'participant_hdr1'):
        
        # check that we found the header
        if (p.text == 'Company Participants'):
            
            # advance our sequence tracker
            sequence_index += 1
            
    elif (p_sequence[sequence_index] == 'company_participants'):
        
        # check that we did not find the next header yet
        if (p.text != 'Conference Call Participants'):
            
            # we found a name, parse name and title
            #print(p.text) # DEBUG
            name, title = re.split('\s[–-]\s',p.text)
            
            # populate participant name and title into call dictionary
            call_dict['company_participants'].append({
                'name': name,
                'title': title,
                'affiliation': 'host'
            })
            
            # don't advance sequence tracker since we may have more names
        
        else:
            
            # found next participant header, skip ahead by 2
            sequence_index += 2
        
    elif (p_sequence[sequence_index] == 'participant_hdr2'):
        # should never get here, do nothing
        continue
        
    elif (p_sequence[sequence_index] == 'call_participants'):
        
        # check that we did not find first speaker yet

        if (p.contents[0].name != 'strong'):

            # we found a name, parse name and company
            print(p.text) # DEBUG
            name, company = re.split('\s[–-]\s',p.text)
            
            # populate participant name and company into call dictionary
            call_dict['call_participants'].append({
                'name': name,
                'company': company,
                'affiliation': 'guest'
            })
            
            # don't advance sequence tracker since we may have more names
        
        else:

            # found first speaker reference, capture name, section, and increment sequence tracker by 2
            current_p['speaker'] = p.text
            if (p.text == 'Operator'):
                current_p['call_section'] = 'operator_instruction'
            else:
                current_p['call_section'] = 'presentation'
            sequence_index += 2
        
    elif (p_sequence[sequence_index] == 'speaker_name1'):
        # should never get here
        continue
    elif (p_sequence[sequence_index] == 'text1'):
        
        # make sure we have not found next section indicated by <strong>
        if (p.contents[0].name != 'strong'):
            
            # append content to current_p
            current_p['content'] += ' ' + p.text
            
        else:
            
            # we found another section, check for Q/A Header
            if (p.text == 'Question-and-Answer Session'):
                
                # add current_p to call_dict
                call_dict['paragraphs'].append(current_p)
                
                # reset current_p
                current_p = {
                    'speaker': '',
                    'content': '',
                    'call_section': ''
                }
                
                # we are entering Q/A, force tracker to index 8 - speaker2
                sequence_index = 8
                
            else:
                
                # new speaker, not Q/A yet
                
                # add currrent_p to call_dict
                call_dict['paragraphs'].append(current_p)
                
                # reset current_p
                current_p = {
                    'speaker': p.text,
                    'content': '',
                    'call_section': 'presentation'
                }
                
                # no need to advance tracker as we were on text1 but found another speaker1, next section 
                # should again be text1
                        
    elif (p_sequence[sequence_index] == 'qa_hdr'):
        # should never get here
        continue
        
    elif (p_sequence[sequence_index] == 'speaker_name2'):
        
        # first speaker in Q/A section
        current_p['speaker'] = p.text
   
        # set call section appropriately...
        span = p.find('span')
        span_class = ''
        if (span):
            span_class = span['class'][0]

        # check if operator
        if (p.text == 'Operator'):   
            current_p['call_section'] = 'operator_instruction'
        # check if question
        elif (span_class == 'question'):
            current_p['call_section'] = 'question'
        # check if answer
        elif (span_class == 'answer'):
            current_p['call_section'] = 'answer'
        # default to presentation, in case of closing remarks for example
        else:
            current_p['call_section'] = 'presentation'     
            
        # advance sequence tracker
        sequence_index += 1
        
    elif (p_sequence[sequence_index] == 'text2'):

        # make sure we have not found next speaker indicated by <strong>
        if (p.contents[0].name != 'strong'):
            
            # append content to current_p
            current_p['content'] += ' ' + p.text
            
        else:
            
            # we found another speaker, add current_p to call_dict and reset
            call_dict['paragraphs'].append(current_p)
            
            # reset current_p
            current_p = {
                'speaker': p.text,
                'content': '',
                'call_section': ''
            }
            
            # set call section appropriately...
            span = p.find('span')
            span_class = ''
            if (span):
                span_class = span['class'][0]

            # check if operator
            if (p.text == 'Operator'):   
                current_p['call_section'] = 'operator_instruction'
            # check if question
            elif (span_class == 'question'):
                current_p['call_section'] = 'question'
            # check if answer
            elif (span_class == 'answer'):
                current_p['call_section'] = 'answer'
            # default to presentation, in case of closing remarks for example
            else:
                current_p['call_section'] = 'presentation'

            # no need to advance tracker as we were on text2 but found another speaker2, next section 
            # should again be text2
        
    else:
        # should never get here
        continue

# append final p to call_dict
call_dict['paragraphs'].append(current_p)

Erika Najarian - Bank of America Merrill Lynch
Scott Siefers - Sandler O'Neill
John McDonald - Autonomous Research
Ken Usdin - Jefferies
John Pancari - Evercore ISI
Matt O'Connor - Deutsche Bank
David Long - Raymond James
Vivek Juneja - JPMorgan
Eric Compton - Morningstar
Saul Martinez - UBS
Betsy Graseck - Morgan Stanley


In [46]:
### DEBUG
import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(call_dict)
### END DEBUG

{   'call_participants': [   {   'affiliation': 'guest',
                                 'company': 'Bank of America Merrill Lynch',
                                 'name': 'Erika Najarian'},
                             {   'affiliation': 'guest',
                                 'company': "Sandler O'Neill",
                                 'name': 'Scott Siefers'},
                             {   'affiliation': 'guest',
                                 'company': 'Autonomous Research',
                                 'name': 'John McDonald'},
                             {   'affiliation': 'guest',
                                 'company': 'Jefferies',
                                 'name': 'Ken Usdin'},
                             {   'affiliation': 'guest',
                                 'company': 'Evercore ISI',
                                 'name': 'John Pancari'},
                             {   'affiliation': 'guest',
                                 'company':

                                     'investigations that began in 2006, we '
                                     "really don't have any update beyond what "
                                     'we said in our prepared remarks and what '
                                     'we previously disclosed in our last '
                                     'quarter 10-Q, our discussions with the '
                                     'DOJ and SEC are ongoing and when we have '
                                     "more information to disclose, we'll of "
                                     'course do so.',
                          'speaker': 'John Shrewsberry'},
                      {   'call_section': 'question',
                          'content': ' Absolutely, you said 2006 you meant 16?',
                          'speaker': 'Eric Compton'},
                      {   'call_section': 'answer',
                          'content': " Yes, I'm sorry. There is nothing from "
             

In [47]:
import pymongo

# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [48]:
# Define database and collection
db = client.sweet_n_sour
collection = db.call_transcripts

In [49]:
# insert call transcript into db
collection.insert_one(call_dict)

<pymongo.results.InsertOneResult at 0x10ad92cc8>