In [1]:

# Web requests and data handling
import requests
import json
import pandas as pd
import numpy as np

# Date and time utilities
from datetime import datetime, timedelta
import time

# File handling
import os
import sys

# Adding our src directory to Python path so we can import our custom functions later
sys.path.append('../src')

# Displaying settings for better notebook output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)



In [3]:
print ("We need to test the connection to the SEC EDGAR API Connection...It's a free and open database"  )
print("-" * 100)



# Cell 2: Test SEC EDGAR API Connection
print("🔌 Testing connection to SEC EDGAR database...")
print("-" * 50)

# SEC requires us to identify ourselves - this is mandatory!
headers = {
    'User-Agent': 'M&A Intelligence Platform (dhruvb363@gmail.com.com)'
}

# Test with a simple API endpoint - get list of companies
test_url = "https://www.sec.gov/files/company_tickers.json"

try:
    print("📡 Attempting to connect to SEC EDGAR...")
    
    # Make the request with a timeout
    response = requests.get(test_url, headers=headers, timeout=10)
    
    # Check if the request was successful
    if response.status_code == 200:
        print("Connected to the SEC EDGAR database")
        
        # Parse the JSON response
        company_data = response.json()
        
        # Show some basic info about what we got
        print(f"Retrieved data for {len(company_data)} companies")
        print(f"Response time: {response.elapsed.total_seconds():.2f} seconds")
        
        # Show a few example companies to verify data quality
        print("\n🏢 Sample companies from SEC database:")
        count = 0
        for key, company in company_data.items():
            if count < 5:  # Show first 5 companies
                ticker = company.get('ticker', 'N/A')
                title = company.get('title', 'N/A')
                print(f"   • {ticker}: {title}")
                count += 1
        
        print(f"\n🎯 SEC API is working! We can access {len(company_data)} companies.")
        
    else:
        print(f"❌ ERROR: Failed to connect. Status code: {response.status_code}")
        print("This might be a temporary issue. Try again in a few minutes.")
        
except requests.exceptions.RequestException as e:
    print(f"❌ CONNECTION ERROR: {str(e)}")
    print("Check your internet connection and try again.")
    
except Exception as e:
    print(f"❌ UNEXPECTED ERROR: {str(e)}")

print("\n" + "=" * 50)
print("🔄 Connection test complete. Ready for next step...")



We need to test the connection to the SEC EDGAR API Connection...It's a free and open database
----------------------------------------------------------------------------------------------------
🔌 Testing connection to SEC EDGAR database...
--------------------------------------------------
📡 Attempting to connect to SEC EDGAR...
Connected to the SEC EDGAR database
Retrieved data for 10069 companies
Response time: 0.42 seconds

🏢 Sample companies from SEC database:
   • NVDA: NVIDIA CORP
   • MSFT: MICROSOFT CORP
   • AAPL: Apple Inc.
   • GOOGL: Alphabet Inc.
   • AMZN: AMAZON COM INC

🎯 SEC API is working! We can access 10069 companies.

🔄 Connection test complete. Ready for next step...


### **Getting the Data:** 

- ### I want to check out whether we can get the SEC filings, which will be crucial for our NLP tasks later in the project.     Let's run a test to check this out! 

- ### After that, I will use feedparser to go through a bunch of RSS news feeds, which will later help me track daily news and updates 

- ### I'm going to try out multiple sources at once...Eeven if one fall shorts, something will work at least

- ### I'm also going to test out API's for financial data, to get information on stocks and so on. 



In [None]:

# We'll test with Apple Inc. (everyone knows them, lots of filings)
test_company = "Apple Inc"
test_ticker = "AAPL" 
apple_cik = "0000320193"  # Apple's official SEC identifier

# SEC API endpoint for company filings
filings_url = f"https://data.sec.gov/submissions/CIK{apple_cik}.json"

# Set up headers (SEC requirement)
headers = {
    'User-Agent': 'M&A Intelligence Platform (dhruv.student@example.com)',  # Update with your email
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

try:
    print(f"🔍 Looking up recent filings for {test_company} ({test_ticker})...")
    
    # Get company's filing information
    response = requests.get(filings_url, headers=headers, timeout=15)
    
    if response.status_code == 200:
        print("✅ Successfully downloaded company data!")
        
        # Parse the JSON response
        company_info = response.json()
        
        # Extract basic company information
        company_name = company_info.get('name', 'Unknown')
        sic_description = company_info.get('sicDescription', 'Unknown')
        
        print(f"🏢 Company: {company_name}")
        print(f"📊 Industry: {sic_description}")
        
        # Get recent filings
        recent_filings = company_info.get('filings', {}).get('recent', {})
        
        if recent_filings:
            filing_forms = recent_filings.get('form', [])
            filing_dates = recent_filings.get('filingDate', [])
            accession_numbers = recent_filings.get('accessionNumber', [])
            
            print(f"\n📋 Found {len(filing_forms)} recent filings")
            
            # Show the 5 most recent filings
            print("\n🗂️ Most Recent Filings:")
            for i in range(min(5, len(filing_forms))):
                form_type = filing_forms[i]
                filing_date = filing_dates[i]
                
                # Highlight M&A-relevant filing types
                if form_type in ['10-K', '10-Q', '8-K', 'DEF 14A']:
                    marker = "🎯"  # These often contain M&A signals
                else:
                    marker = "📄"
                    
                print(f"   {marker} {form_type} filed on {filing_date}")
            
            # Test downloading one actual filing
            print(f"\n🔬 Testing download of most recent 10-K or 8-K filing...")
            
            # Find a 10-K or 8-K filing (most likely to have M&A content)
            target_filing = None
            for i in range(len(filing_forms)):
                if filing_forms[i] in ['10-K', '8-K']:
                    target_filing = {
                        'form': filing_forms[i],
                        'date': filing_dates[i],
                        'accession': accession_numbers[i].replace('-', '')
                    }
                    break
            
            if target_filing:
                # Construct URL for the actual filing document
                accession_clean = target_filing['accession']
                accession_formatted = f"{accession_clean[:10]}-{accession_clean[10:12]}-{accession_clean[12:]}"
                
                filing_url = f"https://www.sec.gov/Archives/edgar/data/{apple_cik}/{accession_clean}/{accession_formatted}.txt"
                
                print(f"📥 Downloading {target_filing['form']} from {target_filing['date']}...")
                
                # Add a small delay to be respectful to SEC servers
                time.sleep(0.1)
                
                filing_response = requests.get(filing_url, headers=headers, timeout=15)
                
                if filing_response.status_code == 200:
                    filing_text = filing_response.text
                    word_count = len(filing_text.split())
                    
                    print(f"✅ SUCCESS: Downloaded {target_filing['form']} filing!")
                    print(f"📊 Document length: {word_count:,} words")
                    
                    # Quick test: look for M&A-related keywords
                    ma_keywords = ['acquisition', 'merger', 'strategic', 'divest', 'spin-off', 'restructur']
                    keyword_counts = {}
                    
                    for keyword in ma_keywords:
                        count = filing_text.lower().count(keyword)
                        if count > 0:
                            keyword_counts[keyword] = count
                    
                    if keyword_counts:
                        print(f"\n🎯 M&A-related keywords found:")
                        for word, count in keyword_counts.items():
                            print(f"   • '{word}': {count} mentions")
                    else:
                        print(f"\n📝 No major M&A keywords in this filing (normal for {target_filing['form']})")
                    
                    print(f"\n🚀 Ready to process SEC filings! System is working perfectly.")
                    
                else:
                    print(f"⚠️ Could not download filing. Status: {filing_response.status_code}")
                    
            else:
                print("📋 No 10-K or 8-K filings found in recent submissions")
                
        else:
            print("⚠️ No recent filings data available")
            
    else:
        print(f"❌ Failed to get company data. Status code: {response.status_code}")
        print("SEC might be busy - try again in a few minutes")
        
except requests.exceptions.RequestException as e:
    print(f"❌ Network error: {str(e)}")
    
except Exception as e:
    print(f"❌ Error: {str(e)}")



🔍 Looking up recent filings for Apple Inc (AAPL)...
✅ Successfully downloaded company data!
🏢 Company: Apple Inc.
📊 Industry: Electronic Computers

📋 Found 1007 recent filings

🗂️ Most Recent Filings:
   📄 4 filed on 2025-08-12
   📄 144 filed on 2025-08-08
   🎯 10-Q filed on 2025-08-01
   🎯 8-K filed on 2025-07-31
   📄 SCHEDULE 13G/A filed on 2025-07-29

🔬 Testing download of most recent 10-K or 8-K filing...
📥 Downloading 8-K from 2025-07-31...
⚠️ Could not download filing. Status: 404

📋 SEC filing download test complete!
🎯 Next: We'll test news API connections...


In [5]:
# Getting in the RSS news feeds

# Install feedparser if not already installed
try:
    import feedparser
except ImportError:
    print("📦 Installing feedparser for RSS feeds...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "feedparser"])
    import feedparser

# Test multiple free news sources
news_sources = {
    "Reuters Business": "http://feeds.reuters.com/reuters/businessNews",
    "MarketWatch": "http://feeds.marketwatch.com/marketwatch/topstories/", 
    "Yahoo Finance": "https://finance.yahoo.com/news/rssindex",
    "SEC Press Releases": "https://www.sec.gov/news/pressreleases.rss"
}

print("🔍 Testing RSS news feeds...")

successful_sources = []
all_articles = []

for source_name, rss_url in news_sources.items():
    try:
        print(f"\n📡 Testing {source_name}...")
        
        # Parse RSS feed
        feed = feedparser.parse(rss_url)
        
        if feed.entries:
            article_count = len(feed.entries)
            print(f"✅ Success! Found {article_count} recent articles")
            
            # Look for M&A related articles
            ma_articles = []
            ma_keywords = ['merger', 'acquisition', 'buyout', 'takeover', 'deal', 'acquire', 'divest']
            
            for entry in feed.entries[:10]:  # Check first 10 articles
                title = entry.get('title', '').lower()
                summary = entry.get('summary', '').lower()
                
                # Check if article contains M&A keywords
                for keyword in ma_keywords:
                    if keyword in title or keyword in summary:
                        ma_articles.append({
                            'title': entry.get('title', 'No title'),
                            'published': entry.get('published', 'No date'),
                            'link': entry.get('link', ''),
                            'source': source_name,
                            'keyword': keyword
                        })
                        break
            
            if ma_articles:
                print(f"🎯 Found {len(ma_articles)} M&A-related articles:")
                for article in ma_articles[:3]:  # Show first 3
                    print(f"   • {article['title'][:80]}...")
                    
                all_articles.extend(ma_articles)
            else:
                print("📋 No M&A articles in recent headlines (normal - deals are rare)")
                
            successful_sources.append(source_name)
            
        else:
            print(f"⚠️ No articles found in {source_name} feed")
            
        # Small delay to be respectful
        time.sleep(0.2)
        
    except Exception as e:
        print(f"❌ Error accessing {source_name}: {str(e)}")

# Summary of results
print(f"\n" + "=" * 60)
print("📊 NEWS SOURCES SUMMARY:")
print(f"✅ Working sources: {len(successful_sources)}/{len(news_sources)}")
print(f"🎯 Total M&A articles found: {len(all_articles)}")

if successful_sources:
    print(f"\n🚀 Active news sources:")
    for source in successful_sources:
        print(f"   • {source}")

# Test web scraping backup (if RSS fails)
if len(successful_sources) < 2:
    print(f"\n🔧 Testing backup: Web scraping MarketWatch M&A section...")
    
    try:
        # Test scraping MarketWatch M&A page
        marketwatch_url = "https://www.marketwatch.com/markets"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(marketwatch_url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            print("✅ Web scraping backup working!")
            print("💡 Can scrape financial news sites directly if RSS feeds fail")
        else:
            print(f"⚠️ Web scraping test failed: Status {response.status_code}")
            
    except Exception as e:
        print(f"⚠️ Web scraping test error: {str(e)}")

# Show sample M&A article if found
if all_articles:
    print(f"\n📰 SAMPLE M&A ARTICLE:")
    sample = all_articles[0]
    print(f"Title: {sample['title']}")
    print(f"Source: {sample['source']}")  
    print(f"Date: {sample['published']}")
    print(f"M&A Keyword: '{sample['keyword']}'")

print(f"\n🎯 News collection system ready!")
print("📋 Next: We'll test financial data APIs...")

📦 Installing feedparser for RSS feeds...
🔍 Testing RSS news feeds...

📡 Testing Reuters Business...
⚠️ No articles found in Reuters Business feed

📡 Testing MarketWatch...
✅ Success! Found 10 recent articles
🎯 Found 1 M&A-related articles:
   • EchoStar’s stock is surging. Why AT&T just struck a $23 billion spectrum deal wi...

📡 Testing Yahoo Finance...
✅ Success! Found 45 recent articles
🎯 Found 1 M&A-related articles:
   • MARA Holdings Signs Investment Agreement with EDF Plus Ventures to Acquire Exaio...

📡 Testing SEC Press Releases...
✅ Success! Found 25 recent articles
🎯 Found 1 M&A-related articles:
   • Staff Issues FAQs to Help Broker-Dealers Implement Financial Responsibility Requ...

📊 NEWS SOURCES SUMMARY:
✅ Working sources: 3/4
🎯 Total M&A articles found: 3

🚀 Active news sources:
   • MarketWatch
   • Yahoo Finance
   • SEC Press Releases

📰 SAMPLE M&A ARTICLE:
Title: EchoStar’s stock is surging. Why AT&T just struck a $23 billion spectrum deal with the company.
Source: 