In [None]:
import os
import json
from dotenv import load_dotenv
from pymongo import MongoClient
from googlesearch import search

# Load environment variables (python-dotenv reads them from a .env file)
load_dotenv()

# Get MongoDB URI from .env file
mongodb_uri = os.getenv('13F_MongoDB_URI')
if not mongodb_uri:
    # Fail fast: MongoClient(None) would silently fall back to
    # localhost:27017 and the script would read/write the wrong database.
    raise RuntimeError(
        "Environment variable '13F_MongoDB_URI' is not set; "
        "add it to the .env file or the process environment."
    )

# Connect to MongoDB
client = MongoClient(mongodb_uri)
db = client['13f_filings']
# Collection of investment firms (only referenced by commented-out code
# in the visible part of this file).
firms_collection = db['investment_firms']
# Collection that stores one document per (company name, LinkedIn link) pair.
linkedin_collection = db['full_dataset']

def search_company(company_name):
    """Search Google for a company and collect LinkedIn profile URLs.

    Args:
        company_name: Text used as the search query (converted with
            ``str`` if it is not already a string).

    Returns:
        A list of distinct result URLs that start with
        ``https://www.linkedin.com/in/``, in the order the search returned
        them. If the search raises, the error is printed and the links
        collected up to that point (possibly none) are returned.
    """
    # The trailing slash anchors the match to "/in/<profile>" paths, so that
    # other paths beginning with "/in" and URLs that merely embed a LinkedIn
    # link somewhere inside them are not collected.
    profile_prefix = "https://www.linkedin.com/in/"
    query = str(company_name)
    search_results = []
    seen = set()

    try:
        # Per the googlesearch docs: tld selects the Google domain, num is the
        # page size, stop caps the number of results, pause is the delay in
        # seconds between HTTP requests.
        for url in search(query, tld="co.in", num=5, stop=10, pause=2):
            # Skip repeats so the caller does not store the same link twice.
            if url.startswith(profile_prefix) and url not in seen:
                seen.add(url)
                search_results.append(url)
    except Exception as e:
        # Deliberately best-effort: a failed search for one company must not
        # abort the whole run, so report it and return what we have.
        print(f"Error searching for {company_name}: {e}")

    return search_results

def process_company(company_name):
    """Return the LinkedIn links found for ``company_name``.

    Thin wrapper: delegates to ``search_company`` and hands its result
    back unchanged.
    """
    return search_company(company_name)

# Extract firm names from the investment_firms collection
#firm_names = [doc['Firm Name'] for doc in firms_collection.find({}, {'Firm Name': 1, '_id': 0}) if 'Firm Name' in doc]

# Hard-coded list currently used in place of the database query above.
firm_names = ['Bridgewater Associates', 'Renaissance Technologies', 'AQR Capital Management', 'Two Sigma Investments', 'BlackRock']

# List to store all results: one dict per (company name, LinkedIn link) pair
all_results = []

try:
    # Process each company
    for company_name in firm_names:
        # Check if the company is already processed. limit=1 because only
        # existence matters, so the server can stop at the first match.
        existing_records = linkedin_collection.count_documents(
            {'company name': company_name}, limit=1
        )

        if existing_records > 0:
            print(f"Skipping {company_name} - already processed")
            continue

        linkedin_links = process_company(company_name)

        # Create individual entries for each LinkedIn link.
        # NOTE: a company that yields no links adds no entries, so it is not
        # recorded and will be searched again on the next run.
        all_results.extend(
            {"company name": company_name, "linkedin link": link}
            for link in linkedin_links
        )

        print(f"Processed {company_name}")

    # Print the results as JSON. This must happen before the insert below:
    # insert_many adds an ObjectId "_id" to each dict, which json.dumps
    # cannot serialize.
    print(json.dumps(all_results, indent=2))

    # Upload to MongoDB (insert_many rejects an empty list, hence the guard)
    if all_results:
        linkedin_collection.insert_many(all_results)
        print("All new companies processed and uploaded to MongoDB")
    else:
        print("No new companies to process")
finally:
    # Close the MongoDB connection even if a search or the insert failed
    client.close()