In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Request headers passed to requests.get() by the scraping functions below.
# The User-Agent is a desktop Chrome string; it is split across lines only
# for readability (adjacent literals are concatenated into one string).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# URL template for a symbol's transcript listing page; `{ticker}` is filled in
# with str.format(ticker=...).
base_transcripts_url = "https://seekingalpha.com/symbol/{ticker}/earnings/transcripts"

In [3]:
# Headers to mimic a real browser.
# NOTE(review): this assigns `headers` the same value that the previous cell
# already set, so this cell is redundant and one of the two definitions can go.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [4]:
# Fetch the site's robots.txt so its crawl rules can be inspected before scraping.
robots_url = "https://seekingalpha.com/robots.txt"
# requests applies no timeout by default; without one a stalled connection
# would block this cell (and the kernel) indefinitely.
response = requests.get(robots_url, timeout=30)

In [5]:
# Show the robots.txt body on success; otherwise report the HTTP status.
if response.status_code != 200:
    print(f"Failed to fetch robots.txt: Status {response.status_code}")
else:
    print(response.text)

User-agent: *
Disallow: /api/v3/account
Disallow: /articles/enqueue_tracking
Disallow: /authentication
Disallow: /clean
Disallow: /iphone_data/check_update
Disallow: /market_news/enqueue_tracking
Disallow: /mone$
Disallow: /mone_event
Disallow: /mone_v2
Disallow: /mpw_count
Disallow: /research/enqueue_tracking
Disallow: /_sa_track/
Disallow: /xgCxM9By/init.js
Disallow: /zuora

Sitemap: https://seekingalpha.com/sitemap_news.xml
Sitemap: https://seekingalpha.com/instablog/index.xml
Sitemap: https://seekingalpha.com/news/index.xml
Sitemap: https://seekingalpha.com/article/index.xml
Sitemap: https://seekingalpha.com/author/index.xml
Sitemap: https://seekingalpha.com/checkout/index.xml
Sitemap: https://seekingalpha.com/symbol/sitemap_index.xml
Sitemap: https://seekingalpha.com/evergreen_sitemap.xml



In [6]:
# Ticker symbol -> company display name.
# main() iterates this mapping in insertion order, so the order below is the
# order in which tickers are scraped.
ticker_company_map = {
    "AAPL": "Apple",
    "MSFT": "Microsoft",
    "GOOG": "Alphabet",
    "AMZN": "Amazon",
    "TSLA": "Tesla",
    "META": "Meta",
    "NFLX": "Netflix",
    "NVDA": "NVIDIA",
    "AMD": "Advanced Micro Devices",
    "BA": "Boeing",
    "V": "Visa",
    # NOTE(review): an ETF rather than a company; it may have no earnings-call
    # transcripts -- confirm whether it belongs in this list.
    "SPY": "SPDR S&P 500 ETF",
    "SPCE": "Virgin Galactic",
    # NOTE(review): "FB" appears alongside "META" above; presumably the same
    # company under an older symbol -- confirm whether both are wanted.
    "FB": "Facebook",
    "TWTR": "Twitter",
    "BABA": "Alibaba",
    "MSTR": "MicroStrategy",
    "DIS": "Disney",
    "PYPL": "PayPal",
    "SHOP": "Shopify",
    "COIN": "Coinbase",
    "SQ": "Block",
    "INTC": "Intel",
    "CSCO": "Cisco",
    "IBM": "IBM",
    "GE": "General Electric",
    "WMT": "Walmart",
    "T": "AT&T",
}

In [7]:
def fetch_transcript_links(ticker, max_links=3, timeout=30):
    """
    Fetch up to ``max_links`` transcript links for the given ticker from Seeking Alpha.

    Parameters
    ----------
    ticker : str
        Symbol substituted into ``base_transcripts_url``.
    max_links : int, optional
        Maximum number of links to return. Defaults to 3.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    list of str
        Absolute URLs of anchors whose ``href`` contains
        ``"earnings-call-transcript"``, in page order with duplicates removed.
        An empty list is returned when the request raises or the response
        status is not 200.
    """
    url = base_transcripts_url.format(ticker=ticker)
    print(f"\nFetching transcript links for {ticker} from {url}")

    try:
        # requests has no default timeout; pass one so a stalled connection
        # cannot hang the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A network failure for one ticker should not abort the other tickers.
        print(f"Error fetching page for {ticker} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Error fetching page for {ticker} (Status code: {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    # urljoin resolves relative hrefs against the page URL and leaves hrefs
    # that are already absolute untouched; plain string concatenation would
    # turn an absolute href into "https://seekingalpha.comhttps://...".
    links = [
        urljoin(url, a["href"])
        for a in soup.find_all("a", href=True)
        if "earnings-call-transcript" in a["href"]
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_links = list(dict.fromkeys(links))
    return unique_links[:max_links]

In [8]:
def scrape_transcript(transcript_url, timeout=30):
    """
    Scrape the transcript text from the transcript URL.

    Parameters
    ----------
    transcript_url : str
        Address of the transcript page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    str or None
        The text of every ``<p>`` element on the page joined with newlines
        (an empty string if the page has no ``<p>`` elements), or ``None``
        when the request raises or the response status is not 200.
    """
    print(f"\nFetching transcript from: {transcript_url}")
    try:
        # requests has no default timeout; pass one so a stalled connection
        # cannot hang the whole run.
        response = requests.get(transcript_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure like a bad status so the caller can carry on.
        print(f"Error fetching transcript ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error fetching transcript (Status code: {response.status_code})")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    # Every paragraph tag on the page is included, not only the transcript body.
    paragraphs = soup.find_all("p")
    transcript_text = "\n".join(p.get_text() for p in paragraphs)
    return transcript_text

In [9]:
def main():
    """
    Scrape transcripts for every ticker in ``ticker_company_map`` and save them.

    For each ticker the transcript links are fetched, each linked page is
    scraped, and one record per successfully scraped transcript is collected.
    The records are written to ``earnings_transcripts.csv`` in the working
    directory. The function pauses 5 seconds after each transcript request and
    10 seconds after each ticker.

    Returns
    -------
    pandas.DataFrame
        One row per scraped transcript with the columns ``ticker``,
        ``company``, ``transcript_number``, ``transcript_url`` and
        ``transcript_text``. The frame is empty (but still has these columns)
        when nothing could be scraped.
    """
    # Fixed column order so the frame and CSV have a schema even with zero rows.
    columns = [
        "ticker",
        "company",
        "transcript_number",
        "transcript_url",
        "transcript_text",
    ]

    # List to store transcript data for the DataFrame
    data = []

    for ticker, company_name in ticker_company_map.items():
        print(f"\n--- Processing {ticker} ({company_name}) ---")
        transcript_links = fetch_transcript_links(ticker)

        if not transcript_links:
            print(f"No transcripts found for {ticker}.")
        else:
            for idx, transcript_url in enumerate(transcript_links, start=1):
                transcript_text = scrape_transcript(transcript_url)
                if transcript_text:
                    # Store the transcript data as a dictionary
                    data.append({
                        "ticker": ticker,
                        "company": company_name,
                        "transcript_number": idx,
                        "transcript_url": transcript_url,
                        "transcript_text": transcript_text
                    })
                    # Optionally, print a preview of the transcript
                    print(f"\nTranscript {idx} Preview for {ticker}:\n{transcript_text[:500]}...\n")
                else:
                    print(f"Transcript {idx} for {ticker} could not be fetched.")

                print("Sleeping for 5 seconds before next transcript request...\n")
                time.sleep(5)

        # This pause runs for every ticker, including ones with no links.
        print("Sleeping for 10 seconds before processing the next ticker...\n")
        time.sleep(10)

    # Passing `columns` matters when `data` is empty: a frame built from an
    # empty list alone has no columns, so the CSV would carry no header and
    # could not be parsed back with pd.read_csv.
    df = pd.DataFrame(data, columns=columns)

    # Display the DataFrame
    print("\n--- Final DataFrame ---")
    print(df.head())

    if df.empty:
        print("\nWarning: no transcripts were collected; the CSV will contain only the header row.")

    # Save the DataFrame to a CSV file
    df.to_csv("earnings_transcripts.csv", index=False)
    print("\nData saved to earnings_transcripts.csv!")

    # Hand the frame back so later cells can use it without re-reading the CSV.
    return df

In [10]:
# Entry point: run the full scrape when this code is executed directly.
# The guard keeps main() from firing if the code is imported as a module.
if __name__ == "__main__":
    main()


--- Processing AAPL (Apple) ---

Fetching transcript links for AAPL from https://seekingalpha.com/symbol/AAPL/earnings/transcripts
Error fetching page for AAPL (Status code: 403)
No transcripts found for AAPL.
Sleeping for 10 seconds before processing the next ticker...


--- Processing MSFT (Microsoft) ---

Fetching transcript links for MSFT from https://seekingalpha.com/symbol/MSFT/earnings/transcripts
Error fetching page for MSFT (Status code: 403)
No transcripts found for MSFT.
Sleeping for 10 seconds before processing the next ticker...


--- Processing GOOG (Alphabet) ---

Fetching transcript links for GOOG from https://seekingalpha.com/symbol/GOOG/earnings/transcripts
Error fetching page for GOOG (Status code: 403)
No transcripts found for GOOG.
Sleeping for 10 seconds before processing the next ticker...


--- Processing AMZN (Amazon) ---

Fetching transcript links for AMZN from https://seekingalpha.com/symbol/AMZN/earnings/transcripts
Error fetching page for AMZN (Status code