In [1]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from bs4 import BeautifulSoup
import pandas as pd
import requests

def get_website_html(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block forever on an unresponsive
            host, so a finite default is used.

    Returns:
        The response text when the server answers with status 200;
        otherwise ``None`` (a diagnostic message is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Covers connection errors, timeouts, invalid URLs, etc.
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None
    return response.text

def parse_mainrow(row):
    """Extract the name, AUM text, and link from a single table row.

    Args:
        row: An element exposing ``find`` and ``find_all``; it must
            contain an ``<a>`` with an ``href`` and at least one ``<td>``.

    Returns:
        A ``(name, aum, link)`` tuple: the stripped anchor text, the
        stripped text of the last ``<td>``, and the anchor's ``href``.
    """
    anchor = row.find('a')
    cells = row.find_all('td')

    firm_name = anchor.text.strip()
    href = anchor['href']
    aum_text = cells[-1].text.strip()  # the value is taken from the final column
    return firm_name, aum_text, href

def extract_mainrow_data(html_content):
    """Build a DataFrame from every ``<tr class="mainrow">`` in the page.

    Args:
        html_content: HTML markup of the page as a string.

    Returns:
        A ``pandas.DataFrame`` with columns ``Firm Name``, ``AUM`` and
        ``Link``, one row per matching table row (empty if none match).
    """
    document = BeautifulSoup(html_content, 'html.parser')
    parsed_rows = [
        parse_mainrow(table_row)
        for table_row in document.find_all('tr', class_='mainrow')
    ]
    return pd.DataFrame(parsed_rows, columns=['Firm Name', 'AUM', 'Link'])

def upload_to_mongodb(df):
    """Upsert every row of *df* into the ``investment_firms`` collection.

    The connection string is read from the ``13F_MongoDB_URI`` environment
    variable (loaded from a ``.env`` file if present). Each row is matched
    on its ``Firm Name`` value and either updated or inserted.

    Args:
        df: DataFrame whose rows are the records to store; it must have a
            ``Firm Name`` column.

    Returns:
        ``True`` when all records were processed, ``False`` when the URI
        is missing or any error occurred (a message is printed).
    """
    load_dotenv()  # Load environment variables from .env file

    mongodb_uri = os.getenv('13F_MongoDB_URI')
    if not mongodb_uri:
        print("MongoDB URI not found in .env file")
        return False

    # Bind the name before the try block: if MongoClient() itself raises,
    # the finally clause must not fail with UnboundLocalError and mask the
    # real error.
    client = None
    try:
        client = MongoClient(mongodb_uri)
        db = client['13f_filings']  # You can change the database name if needed
        collection = db['investment_firms']

        # Convert DataFrame to list of dictionaries
        records = df.to_dict('records')

        # Insert or update records in MongoDB
        for record in records:
            collection.update_one(
                {'Firm Name': record['Firm Name']},  # Query to find existing record
                {'$set': record},  # Update with new data
                upsert=True,  # Insert if not found
            )

        print(f"Successfully processed {len(records)} records in MongoDB")
        return True
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"An error occurred while uploading to MongoDB: {e}")
        return False
    finally:
        if client is not None:
            client.close()

# Main execution: fetch the listing page, save the parsed rows to CSV,
# then push the same rows to MongoDB.
url = "https://www.holdingschannel.com/13f/latest-filings/"
html_content = get_website_html(url)

if not html_content:
    print("Failed to retrieve HTML content. Cannot proceed with extraction.")
else:
    df = extract_mainrow_data(html_content)
    df.to_csv('investment_firms.csv', index=False)
    print("Data extracted and saved to 'investment_firms.csv'")
    print(df.head())  # Display the first few rows

    # Upload to MongoDB and report the outcome
    uploaded = upload_to_mongodb(df)
    print(
        "Data successfully uploaded to MongoDB"
        if uploaded
        else "Failed to upload data to MongoDB"
    )

Data extracted and saved to 'investment_firms.csv'
                           Firm Name       AUM  \
0                 Creekside Partners  $180,578   
1  Gordian Capital Singapore Pte Ltd  $189,888   
2           Brown Financial Advisors  $223,770   
3                           NCP Inc.  $158,071   
4       SW Investment Management LLC  $314,407   

                                                Link  
0  https://www.holdingschannel.com/13f/creekside-...  
1  https://www.holdingschannel.com/13f/gordian-ca...  
2  https://www.holdingschannel.com/13f/brown-fina...  
3  https://www.holdingschannel.com/13f/ncp-inc-to...  
4  https://www.holdingschannel.com/13f/sw-investm...  
Successfully processed 100 records in MongoDB
Data successfully uploaded to MongoDB
