In [1]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from bs4 import BeautifulSoup
import pandas as pd
import requests

def get_website_html(url, timeout=30):
    """Download a web page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive host.

    Returns:
        The response body as a string when the server answers with HTTP 200.
        ``None`` when the request fails or any other status code is returned;
        a diagnostic message is printed in both of those cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None
    except requests.RequestException as e:
        # Covers connection errors and timeouts alike.
        print(f"An error occurred: {e}")
        return None

def parse_mainrow(row):
    """Pull the firm name, AUM and link out of a single table row.

    Args:
        row: Parsed row element exposing ``find`` and ``find_all``.

    Returns:
        A ``(name, aum, link)`` tuple: the stripped text of the row's first
        ``<a>`` element, the stripped text of its last ``<td>`` cell, and the
        ``href`` attribute of that same ``<a>`` element.
    """
    anchor = row.find('a')
    firm_name, href = anchor.text.strip(), anchor['href']
    last_cell = row.find_all('td')[-1]
    return firm_name, last_cell.text.strip(), href

def extract_mainrow_data(html_content):
    """Build a DataFrame from every ``<tr class="mainrow">`` in the page.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        A DataFrame with the columns ``Firm Name``, ``AUM`` and ``Link``,
        one row per matching table row, as produced by ``parse_mainrow``.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')
    rows = [
        parse_mainrow(table_row)
        for table_row in parsed_page.find_all('tr', class_='mainrow')
    ]
    return pd.DataFrame(rows, columns=['Firm Name', 'AUM', 'Link'])

def upload_to_mongodb(df):
    """Upsert every row of ``df`` into the ``investment_firms`` collection.

    The connection string is read from the ``13F_MongoDB_URI`` environment
    variable after loading the ``.env`` file. Rows are matched on their
    ``Firm Name`` value: an existing document is updated, otherwise a new one
    is inserted.

    Args:
        df: DataFrame whose rows are uploaded; it must have a ``Firm Name``
            column.

    Returns:
        ``True`` when all records were processed, ``False`` when the URI is
        missing or any error occurred (a message is printed in both cases).
    """
    load_dotenv()  # Load environment variables from .env file

    mongodb_uri = os.getenv('13F_MongoDB_URI')
    if not mongodb_uri:
        print("MongoDB URI not found in .env file")
        return False

    # Bound before the try block so the finally clause can tell whether a
    # client was actually created; otherwise a failure while constructing the
    # client would surface as an UnboundLocalError raised from finally.
    client = None
    try:
        client = MongoClient(mongodb_uri)
        db = client['13f_filings']  # You can change the database name if needed
        collection = db['investment_firms']

        # Convert DataFrame to list of dictionaries
        records = df.to_dict('records')

        # Insert or update records in MongoDB
        for record in records:
            collection.update_one(
                {'Firm Name': record['Firm Name']},  # Query to find existing record
                {'$set': record},  # Update with new data
                upsert=True  # Insert if not found
            )

        print(f"Successfully processed {len(records)} records in MongoDB")
        return True
    except Exception as e:
        print(f"An error occurred while uploading to MongoDB: {e}")
        return False
    finally:
        if client is not None:
            client.close()

# Main execution: download the filings page, parse it, and upload the rows
url = "https://www.holdingschannel.com/13f/latest-filings/"
html_content = get_website_html(url)

if not html_content:
    print("Failed to retrieve HTML content. Cannot proceed with extraction.")
else:
    df = extract_mainrow_data(html_content)
    # Upload to MongoDB and report the outcome
    print(
        "Data successfully uploaded to MongoDB"
        if upload_to_mongodb(df)
        else "Failed to upload data to MongoDB"
    )

Successfully processed 100 records in MongoDB
Data successfully uploaded to MongoDB


In [8]:
import os
import json
from dotenv import load_dotenv
from pymongo import MongoClient
from googlesearch import search

# Load environment variables
load_dotenv()

# Get MongoDB URI from .env file
# NOTE(review): os.getenv returns None when the variable is unset, and unlike
# upload_to_mongodb this cell does not check for that before connecting —
# confirm what MongoClient does when it is handed None.
mongodb_uri = os.getenv('13F_MongoDB_URI')

# Connect to MongoDB and pick the database and the two collections used by
# this cell: firms_collection (source of firm names) and linkedin_collection
# (where the LinkedIn links are stored).
client = MongoClient(mongodb_uri)
db = client['13f_filings']
firms_collection = db['investment_firms']
linkedin_collection = db['full_dataset']

def search_company(company_name):
    """Collect LinkedIn profile links returned by a web search for a company.

    Runs ``search`` with the query ``"<company_name> site:linkedin.com"`` and
    keeps every result that contains ``https://www.linkedin.com/in``.

    Args:
        company_name: Name of the company to search for.

    Returns:
        A list of matching result URLs. If the search raises, the error is
        printed and whatever was collected up to that point is returned.
    """
    profile_marker = "https://www.linkedin.com/in"
    search_results = []

    try:
        hits = search(
            f"{company_name} site:linkedin.com",
            tld="co.in",
            num=5,
            stop=10,
            pause=2,
        )
        for hit in hits:
            if profile_marker not in hit:
                continue
            search_results.append(hit)
    except Exception as e:
        print(f"Error searching for {company_name}: {str(e)}")

    return search_results

def process_company(company_name):
    """Return the LinkedIn links found for ``company_name``.

    Thin wrapper that delegates to ``search_company``.
    """
    return search_company(company_name)

# Firm names to look up. The commented-out line reads them from the
# investment_firms collection instead of using the fixed list below.
#firm_names = [doc['Firm Name'] for doc in firms_collection.find({}, {'Firm Name': 1, '_id': 0}) if 'Firm Name' in doc]

firm_names = [
    'Bridgewater Associates',
    'Renaissance Technologies',
    'AQR Capital Management',
    'Two Sigma Investments',
    'BlackRock',
]

# Accumulates one {"company name", "linkedin link"} entry per link found
all_results = []

for company_name in firm_names:
    # A company is treated as done once any record for it exists in the collection
    existing_records = linkedin_collection.count_documents({'company name': company_name})

    if existing_records > 0:
        print(f"Skipping {company_name} - already processed")
    else:
        linkedin_links = process_company(company_name)
        all_results.extend(
            {"company name": company_name, "linkedin link": link}
            for link in linkedin_links
        )
        print(f"Processed {company_name}")

# Show everything gathered in this run as JSON
print(json.dumps(all_results, indent=2))

# Store the new entries, if there are any
if not all_results:
    print("No new companies to process")
else:
    linkedin_collection.insert_many(all_results)
    print("All new companies processed and uploaded to MongoDB")

# Release the MongoDB connection
client.close()

Processed Bridgewater Associates
Processed Renaissance Technologies
Processed AQR Capital Management
Processed Two Sigma Investments
Processed BlackRock
[
  {
    "company name": "Bridgewater Associates",
    "linkedin link": "https://www.linkedin.com/in/raydalio"
  },
  {
    "company name": "Bridgewater Associates",
    "linkedin link": "https://www.linkedin.com/in/karen-karniol-tambour-34239ba"
  },
  {
    "company name": "Bridgewater Associates",
    "linkedin link": "https://www.linkedin.com/in/nir-bar-dea-7562236a"
  },
  {
    "company name": "Renaissance Technologies",
    "linkedin link": "https://www.linkedin.com/in/daniel-long-a942241a2"
  },
  {
    "company name": "Renaissance Technologies",
    "linkedin link": "https://www.linkedin.com/in/john-f-08825a6"
  },
  {
    "company name": "AQR Capital Management",
    "linkedin link": "https://www.linkedin.com/in/mraposa"
  },
  {
    "company name": "AQR Capital Management",
    "linkedin link": "https://www.linkedin.com/in/

In [11]:
import os
import json
import requests
from dotenv import load_dotenv
from pymongo import MongoClient

# Load environment variables
load_dotenv()

# Get MongoDB URI from .env file
# NOTE(review): os.getenv returns None when the variable is unset and this
# cell does not check for that before connecting — confirm what MongoClient
# does when it is handed None.
mongodb_uri = os.getenv('13F_MongoDB_URI')

# Connect to MongoDB and select the collection holding the LinkedIn links.
# The only client.close() call in this cell is commented out, so the
# connection stays open after the cell runs.
client = MongoClient(mongodb_uri)
db = client['13f_filings']
full_dataset_collection = db['full_dataset']

def process_linkedin_urls(linkedin_urls):
    """Look up people on Apollo's bulk-match endpoint by LinkedIn profile URL.

    The API key is read from the ``APOLLO_API_KEY`` environment variable so
    that no credential is stored in the notebook itself.

    Args:
        linkedin_urls: LinkedIn profile URLs to match in a single request.

    Returns:
        The ``matches`` list from the JSON response when the server answers
        with HTTP 200. An empty list when there is nothing to look up, the
        API key is not configured, the request fails, or any other status is
        returned (a message is printed for the error cases).
    """
    linkedin_urls = list(linkedin_urls)
    if not linkedin_urls:
        # Nothing to match; do not spend an API call on an empty batch.
        return []

    api_key = os.getenv('APOLLO_API_KEY')
    if not api_key:
        print("Apollo API key not found: set APOLLO_API_KEY in the environment or .env file")
        return []

    endpoint = "https://api.apollo.io/api/v1/people/bulk_match"
    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
        'X-Api-Key': api_key
    }

    data = {
        "reveal_personal_emails": True,
        "reveal_phone_number": True,
        # NOTE(review): placeholder value carried over unchanged — replace it
        # with a real webhook address if the API requires one.
        "webhook_url": "https://your_webhook_site",
        "details": [{"linkedin_url": profile_url} for profile_url in linkedin_urls]
    }

    try:
        # The timeout keeps an unresponsive server from hanging the cell.
        response = requests.post(endpoint, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    if response.status_code == 200:
        return response.json().get('matches', [])

    print(f"Error: {response.status_code} - {response.text}")
    return []

# Fetch all documents from the full_dataset collection
documents = list(full_dataset_collection.find())

# Extract LinkedIn URLs
linkedin_urls = [doc['linkedin link'] for doc in documents if 'linkedin link' in doc]

# Index the documents by LinkedIn URL once so each match is resolved with a
# dictionary lookup instead of rescanning the whole list. setdefault keeps the
# first document for a URL, which is what a front-to-back scan would find.
documents_by_url = {}
for doc in documents:
    documents_by_url.setdefault(doc.get('linkedin link'), doc)

# Process LinkedIn URLs in small batches, one bulk_match request per batch.
# NOTE(review): confirm the endpoint's per-request limit before raising this.
batch_size = 5
updated_documents = []

for i in range(0, len(linkedin_urls), batch_size):
    batch = linkedin_urls[i:i+batch_size]
    matches = process_linkedin_urls(batch)

    for match in matches:
        linkedin_url = match.get('linkedin_url')
        if linkedin_url:
            # Find the corresponding document
            doc = documents_by_url.get(linkedin_url)
            if doc:
                # Update the document with new information. `or` is used
                # instead of a .get() default because a match can carry a key
                # whose value is None, which a default would not replace.
                doc['email'] = match.get('email') or 'N/A'
                doc['first name'] = match.get('first_name') or 'N/A'
                doc['last name'] = match.get('last_name') or 'N/A'
                doc['title'] = match.get('title') or 'N/A'
                updated_documents.append(doc)

# Print the updated documents as JSON. default=str renders values the json
# module cannot encode itself (such as the `_id` MongoDB assigns to each
# document) as strings instead of raising TypeError.
print(json.dumps(updated_documents, indent=2, default=str))

# Update the documents in MongoDB
#for doc in updated_documents:
#    full_dataset_collection.update_one(
#        {'_id': doc['_id']},
#        {'$set': {
#            'email': doc['email'],
#            'first name': doc['first name'],
#            'last name': doc['last name'],
#            'title': doc['title']
#        }}
#    )
#
#print(f"Updated {len(updated_documents)} documents in MongoDB")
#
## Close the MongoDB connection
#client.close()

[]


In [18]:
import os
import json
import requests
from dotenv import load_dotenv
from pymongo import MongoClient

# Load environment variables
load_dotenv()

# Get MongoDB URI from .env file
# NOTE(review): os.getenv returns None when the variable is unset and this
# cell does not check for that before connecting — confirm what MongoClient
# does when it is handed None.
mongodb_uri = os.getenv('13F_MongoDB_URI')

# Connect to MongoDB and select the collection holding the LinkedIn links.
# This cell never calls client.close().
client = MongoClient(mongodb_uri)
db = client['13f_filings']
full_dataset_collection = db['full_dataset']

def process_linkedin_urls(linkedin_urls):
    """Look up people on Apollo's bulk-match endpoint by LinkedIn profile URL.

    The API key is read from the ``APOLLO_API_KEY`` environment variable so
    that no credential is stored in the notebook itself.

    Args:
        linkedin_urls: LinkedIn profile URLs to match in a single request.

    Returns:
        The ``matches`` list from the JSON response when the server answers
        with HTTP 200. An empty list when there is nothing to look up, the
        API key is not configured, the request fails, or any other status is
        returned (a message is printed for the error cases).
    """
    linkedin_urls = list(linkedin_urls)
    if not linkedin_urls:
        # Nothing to match; do not spend an API call on an empty batch.
        return []

    api_key = os.getenv('APOLLO_API_KEY')
    if not api_key:
        print("Apollo API key not found: set APOLLO_API_KEY in the environment or .env file")
        return []

    endpoint = "https://api.apollo.io/api/v1/people/bulk_match"
    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
        'X-Api-Key': api_key
    }

    data = {
        "reveal_personal_emails": True,
        "reveal_phone_number": False,
        # NOTE(review): placeholder value carried over unchanged — replace it
        # with a real webhook address if the API requires one.
        "webhook_url": "https://your_webhook_site",
        "details": [{"linkedin_url": profile_url} for profile_url in linkedin_urls]
    }

    try:
        # The timeout keeps an unresponsive server from hanging the cell.
        response = requests.post(endpoint, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    if response.status_code == 200:
        return response.json().get('matches', [])

    print(f"Error: {response.status_code} - {response.text}")
    return []

# Fetch all documents from the full_dataset collection
documents = list(full_dataset_collection.find())

# Extract LinkedIn URLs (documents without a 'linkedin link' key are skipped)
linkedin_urls = [doc['linkedin link'] for doc in documents if 'linkedin link' in doc]

# Process LinkedIn URLs in batches of 10
batch_size = 10
# Reset to an empty list; nothing in this cell appends to it.
updated_documents = []

# Only prints each batch — process_linkedin_urls is not called here.
for i in range(0, len(linkedin_urls), batch_size):
    batch = linkedin_urls[i:i+batch_size]
    print(batch)

['https://www.linkedin.com/in/raydalio', 'https://www.linkedin.com/in/karen-karniol-tambour-34239ba', 'https://www.linkedin.com/in/nir-bar-dea-7562236a', 'https://www.linkedin.com/in/daniel-long-a942241a2', 'https://www.linkedin.com/in/john-f-08825a6', 'https://www.linkedin.com/in/mraposa', 'https://www.linkedin.com/in/david-lohrey-45035115', 'https://www.linkedin.com/in/david-kabiller-7a193513', 'https://www.linkedin.com/in/david-m-siegel', 'https://www.linkedin.com/in/zach-cohen-a2060613']
['https://www.linkedin.com/in/tonyberkman', 'https://www.linkedin.com/in/tim-reynolds-6a76bb6', 'https://www.linkedin.com/in/guillaume-lefebvre-6166715', 'https://www.linkedin.com/in/laurencefink', 'https://www.linkedin.com/in/rick-rieder-b64336249', 'https://www.linkedin.com/in/andrew-ang-a9a65a89']


In [22]:
def test_process_linkedin_urls():
    """Run ``process_linkedin_urls`` on three sample profiles and print a trace.

    Builds in-memory stand-ins for MongoDB documents, sends their LinkedIn
    URLs to ``process_linkedin_urls``, and copies email, first name, last name
    and title from each returned match onto the document with the same
    LinkedIn URL, printing every step along the way. Nothing is returned and
    no assertions are made.
    """
    # Mock documents (simulating what you might get from MongoDB)
    documents = [
        {"_id": "1", "name": "Satya Nadella", "linkedin link": "https://www.linkedin.com/in/satyanadella/"},
        {"_id": "2", "name": "Bill Gates", "linkedin link": "https://www.linkedin.com/in/williamhgates/"},
        {"_id": "3", "name": "Jeff Weiner", "linkedin link": "https://www.linkedin.com/in/jeffweiner08/"},
    ]

    # URLs to send to the lookup
    linkedin_urls = [entry['linkedin link'] for entry in documents]

    print(f"Testing with {len(linkedin_urls)} LinkedIn URLs:")
    for profile_url in linkedin_urls:
        print(f"  {profile_url}")

    matches = process_linkedin_urls(linkedin_urls)

    print("\nMatches returned by process_linkedin_urls:")
    for found in matches:
        print(f"  {found}")

    updated_documents = []
    for found in matches:
        matched_url = found.get('linkedin_url')
        print(f"\nProcessing match for LinkedIn URL: {matched_url}")

        if not matched_url:
            print("  No LinkedIn URL in this match")
            continue

        # First document whose link equals the URL reported by the match
        target = next(
            (entry for entry in documents if entry.get('linkedin link') == matched_url),
            None,
        )
        print(f"  Corresponding document found: {target is not None}")

        if not target:
            print("  No corresponding document found")
            continue

        # Copy the fields of interest onto the document
        target.update({
            'email': found.get('email', 'N/A'),
            'first name': found.get('first_name', 'N/A'),
            'last name': found.get('last_name', 'N/A'),
            'title': found.get('title', 'N/A'),
        })
        updated_documents.append(target)
        print("  Document updated and added to updated_documents")

    # (label, document key) pairs, in the order they are printed
    report_fields = (
        ("ID", '_id'),
        ("Name", 'name'),
        ("LinkedIn", 'linkedin link'),
        ("Email", 'email'),
        ("First Name", 'first name'),
        ("Last Name", 'last name'),
        ("Title", 'title'),
    )

    print("\nUpdated Documents:")
    for target in updated_documents:
        for label, key in report_fields:
            print(f"{label}: {target.get(key, 'N/A')}")
        print("---")

    print(f"\nTotal documents updated: {len(updated_documents)}")

if __name__ == "__main__":
    test_process_linkedin_urls()

Testing with 3 LinkedIn URLs:
  https://www.linkedin.com/in/satyanadella/
  https://www.linkedin.com/in/williamhgates/
  https://www.linkedin.com/in/jeffweiner08/

Matches returned by process_linkedin_urls:
  {'id': '662718987c14770007f04f63', 'first_name': 'Satya', 'last_name': 'N', 'name': 'Satya N', 'linkedin_url': None, 'title': 'Chairman and CEO', 'email_status': 'unavailable', 'photo_url': 'https://static.licdn.com/aero-v1/sc/h/9c8pery4andzj6ohjkjp54ma2', 'twitter_url': None, 'github_url': None, 'facebook_url': None, 'extrapolated_email_confidence': None, 'headline': 'Chairman and CEO at Microsoft', 'email': None, 'organization_id': '62337760d02af100a5ca2468', 'employment_history': [{'_id': '66e205c020efe30001b5e9b0', 'created_at': None, 'current': True, 'degree': None, 'description': None, 'emails': None, 'end_date': None, 'grade_level': None, 'kind': None, 'major': None, 'organization_id': '62337760d02af100a5ca2468', 'organization_name': 'Microsoft', 'raw_address': None, 'start

In [26]:
import json

def test_process_linkedin_urls():
    """Run ``process_linkedin_urls`` on three sample profiles and dump the results as JSON.

    Builds in-memory stand-ins for MongoDB documents, sends their LinkedIn
    URLs to ``process_linkedin_urls``, prints each returned match as indented
    JSON, and merges every field of a match into the document with the same
    LinkedIn URL. Nothing is returned and no assertions are made.
    """
    # Mock documents (simulating what you might get from MongoDB)
    documents = [
        {"_id": "1", "name": "Satya Nadella", "linkedin link": "https://www.linkedin.com/in/satyanadella/"},
        {"_id": "2", "name": "Bill Gates", "linkedin link": "https://www.linkedin.com/in/williamhgates/"},
        {"_id": "3", "name": "Jeff Weiner", "linkedin link": "https://www.linkedin.com/in/jeffweiner08/"},
    ]

    # URLs to send to the lookup
    linkedin_urls = [entry['linkedin link'] for entry in documents]

    print(f"Testing with {len(linkedin_urls)} LinkedIn URLs:")
    for profile_url in linkedin_urls:
        print(f"  {profile_url}")

    matches = process_linkedin_urls(linkedin_urls)

    print("\nFull JSON data for each match:")
    for position, found in enumerate(matches, 1):
        print(f"\nMatch {position}:")
        print(json.dumps(found, indent=2))

    updated_documents = []
    for found in matches:
        matched_url = found.get('linkedin_url')
        print(f"\nProcessing match for LinkedIn URL: {matched_url}")

        if not matched_url:
            print("  No LinkedIn URL in this match")
            continue

        # First document whose link equals the URL reported by the match
        target = next(
            (entry for entry in documents if entry.get('linkedin link') == matched_url),
            None,
        )
        print(f"  Corresponding document found: {target is not None}")

        if not target:
            print("  No corresponding document found")
            continue

        # Merge every field of the match into the document
        target.update(found)
        updated_documents.append(target)
        print("  Document updated and added to updated_documents")

    print("\nUpdated Documents:")
    for target in updated_documents:
        print(json.dumps(target, indent=2))
        print("---")

    print(f"\nTotal documents updated: {len(updated_documents)}")

if __name__ == "__main__":
    test_process_linkedin_urls()

Testing with 3 LinkedIn URLs:
  https://www.linkedin.com/in/satyanadella/
  https://www.linkedin.com/in/williamhgates/
  https://www.linkedin.com/in/jeffweiner08/
Error: 422 - {"error":"You have insufficient credits! \u003ca href='https://app.apollo.io/#/settings/plans/upgrade' aria-onclick='close_alert'\u003eUpgrade your plan\u003c/a\u003e to increase your number of export credits."}

Full JSON data for each match:

Updated Documents:

Total documents updated: 0


In [30]:
# Sample documents for an ad-hoc lookup. This rebinds the module-level name
# `documents`, replacing whatever an earlier cell stored under it.
documents = [
    {"_id": "1", "name": "Satya Nadella", "linkedin link": "https://www.linkedin.com/in/satyanadella/"},
    {"_id": "2", "name": "Bill Gates", "linkedin link": "https://www.linkedin.com/in/williamhgates/"},
    {"_id": "3", "name": "Jeff Weiner", "linkedin link": "https://www.linkedin.com/in/jeffweiner08/"},
]
# Extract LinkedIn URLs from documents
linkedin_urls = [doc['linkedin link'] for doc in documents]

# Echo the URLs, then send them all in one call and print the raw matches
print(f"Testing with {len(linkedin_urls)} LinkedIn URLs:")
for url in linkedin_urls:
    print(f"  {url}")
matches = process_linkedin_urls(linkedin_urls)
print(matches)

['https://www.linkedin.com/in/satyanadella/',
 'https://www.linkedin.com/in/williamhgates/',
 'https://www.linkedin.com/in/jeffweiner08/']

In [33]:
# Call the lookup directly with three literal profile URLs; the bare
# expression on the last line makes the notebook display the result.
matches = process_linkedin_urls(['https://www.linkedin.com/in/satyanadella/', 'https://www.linkedin.com/in/williamhgates/', 'https://www.linkedin.com/in/jeffweiner08/'])
matches

Error: 422 - {"error":"You have insufficient credits! \u003ca href='https://app.apollo.io/#/settings/plans/upgrade' aria-onclick='close_alert'\u003eUpgrade your plan\u003c/a\u003e to increase your number of export credits."}


[]

In [25]:
def process_linkedin_urls(linkedin_urls):
    """Look up people on Apollo's bulk-match endpoint by LinkedIn profile URL.

    The API key is read from the ``APOLLO_API_KEY`` environment variable so
    that no credential is stored in the notebook itself.

    Args:
        linkedin_urls: LinkedIn profile URLs to match in a single request.

    Returns:
        The ``matches`` list from the JSON response when the server answers
        with HTTP 200. An empty list when there is nothing to look up, the
        API key is not configured, the request fails, or any other status is
        returned (a message is printed for the error cases).
    """
    import os  # imported here because this cell has no import lines of its own

    linkedin_urls = list(linkedin_urls)
    if not linkedin_urls:
        # Nothing to match; do not spend an API call on an empty batch.
        return []

    api_key = os.getenv('APOLLO_API_KEY')
    if not api_key:
        print("Apollo API key not found: set APOLLO_API_KEY in the environment or .env file")
        return []

    endpoint = "https://api.apollo.io/api/v1/people/bulk_match"
    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
        'X-Api-Key': api_key
    }

    data = {
        "reveal_personal_emails": True,
        "reveal_phone_number": False,
        # NOTE(review): placeholder value carried over unchanged — replace it
        # with a real webhook address if the API requires one.
        "webhook_url": "https://your_webhook_site",
        "details": [{"linkedin_url": profile_url} for profile_url in linkedin_urls]
    }

    try:
        # The timeout keeps an unresponsive server from hanging the cell.
        response = requests.post(endpoint, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    if response.status_code == 200:
        return response.json().get('matches', [])

    print(f"Error: {response.status_code} - {response.text}")
    return []


def _value_or(source, key, default):
    """Return ``source[key]``, or ``default`` when the key is missing or None."""
    value = source.get(key)
    return default if value is None else value


def extract_required_fields(match):
    """Reduce a match record to the subset of fields kept on a document.

    A field that is missing from ``match`` or present with a ``None`` value
    is replaced by its default: ``'N/A'`` for scalar fields and ``[]`` for
    list fields. Other falsy values such as ``False`` are kept as they are.

    Args:
        match: Dictionary describing one matched person.

    Returns:
        A new dictionary with the selected person fields plus
        ``employment_history``, a list with one dictionary per job holding
        only ``current``, ``description``, ``organization_name`` and ``title``.
    """
    # A None employment history is treated like an absent one so the
    # comprehension never iterates over None.
    employment_history = [
        {
            'current': _value_or(job, 'current', 'N/A'),
            'description': _value_or(job, 'description', 'N/A'),
            'organization_name': _value_or(job, 'organization_name', 'N/A'),
            'title': _value_or(job, 'title', 'N/A')
        }
        for job in _value_or(match, 'employment_history', [])
    ]

    return {
        'first_name': _value_or(match, 'first_name', 'N/A'),
        'last_name': _value_or(match, 'last_name', 'N/A'),
        'name': _value_or(match, 'name', 'N/A'),
        'title': _value_or(match, 'title', 'N/A'),
        'headline': _value_or(match, 'headline', 'N/A'),
        'email': _value_or(match, 'email', 'N/A'),
        'employment_history': employment_history,
        'state': _value_or(match, 'state', 'N/A'),
        'city': _value_or(match, 'city', 'N/A'),
        'country': _value_or(match, 'country', 'N/A'),
        'languages': _value_or(match, 'languages', []),
        'keywords': _value_or(match, 'keywords', []),
        'industry': _value_or(match, 'industry', 'N/A'),
        'industries': _value_or(match, 'industries', []),
        'secondary_industries': _value_or(match, 'secondary_industries', []),
        'is_likely_to_engage': _value_or(match, 'is_likely_to_engage', 'N/A'),
        'departments': _value_or(match, 'departments', []),
        'subdepartments': _value_or(match, 'subdepartments', []),
        'seniority': _value_or(match, 'seniority', 'N/A'),
        'personal_emails': _value_or(match, 'personal_emails', [])
    }

def test_process_linkedin_urls():
    """Run ``process_linkedin_urls`` on three sample profiles and print the enriched documents.

    Builds in-memory stand-ins for MongoDB documents, sends their LinkedIn
    URLs to ``process_linkedin_urls``, and merges the output of
    ``extract_required_fields`` for each match into the document with the
    same LinkedIn URL. Nothing is returned and no assertions are made.
    """
    # Mock documents (simulating what you might get from MongoDB)
    documents = [
        {"_id": "1", "name": "Satya Nadella", "linkedin link": "https://www.linkedin.com/in/satyanadella/"},
        {"_id": "2", "name": "Bill Gates", "linkedin link": "https://www.linkedin.com/in/williamhgates/"},
        {"_id": "3", "name": "Jeff Weiner", "linkedin link": "https://www.linkedin.com/in/jeffweiner08/"},
    ]

    # URLs to send to the lookup
    linkedin_urls = [entry['linkedin link'] for entry in documents]

    print(f"Testing with {len(linkedin_urls)} LinkedIn URLs:")
    for profile_url in linkedin_urls:
        print(f"  {profile_url}")

    matches = process_linkedin_urls(linkedin_urls)

    updated_documents = []
    for found in matches:
        matched_url = found.get('linkedin_url')
        print(f"\nProcessing match for LinkedIn URL: {matched_url}")

        if not matched_url:
            print("  No LinkedIn URL in this match")
            continue

        # First document whose link equals the URL reported by the match
        target = next(
            (entry for entry in documents if entry.get('linkedin link') == matched_url),
            None,
        )
        print(f"  Corresponding document found: {target is not None}")

        if not target:
            print("  No corresponding document found")
            continue

        # Keep only the selected fields of the match and merge them in
        target.update(extract_required_fields(found))
        updated_documents.append(target)
        print("  Document updated and added to updated_documents")

    print("\nUpdated Documents:")
    for target in updated_documents:
        print(json.dumps(target, indent=2))
        print("---")

    print(f"\nTotal documents updated: {len(updated_documents)}")

if __name__ == "__main__":
    test_process_linkedin_urls()

Testing with 3 LinkedIn URLs:
  https://www.linkedin.com/in/satyanadella/
  https://www.linkedin.com/in/williamhgates/
  https://www.linkedin.com/in/jeffweiner08/
Error: 422 - {"error":"You have insufficient credits! \u003ca href='https://app.apollo.io/#/settings/plans/upgrade' aria-onclick='close_alert'\u003eUpgrade your plan\u003c/a\u003e to increase your number of export credits."}

Updated Documents:

Total documents updated: 0


In [20]:
# Display the current value of the module-level `updated_documents` list,
# which is assigned by earlier cells.
updated_documents

[]