# Google Module

In [1]:
import json
import os
import re
from dotenv import load_dotenv
from pymongo import MongoClient
import anthropic
from googlesearch import search

# Module-level setup shared by every function in this cell: configuration,
# the MongoDB handle, and the Anthropic API client.

# Load environment variables (python-dotenv) so the os.getenv lookups below
# can see the values.
load_dotenv()

# Get MongoDB URI and Anthropic API key from .env file.
# os.getenv returns None when a variable is not set; no check is made here.
mongodb_uri = os.getenv('13F_MongoDB_URI')
Anthropic_Key = os.getenv("13F_Anthropic_Key")

# Connect to MongoDB and select the database/collection that main() reads
# the 'Firm Name' values from.
client = MongoClient(mongodb_uri)
db = client['13f_filings']
collection = db['investment_firms']

# Initialize Anthropic client used by extract_relevant_links().
anthropic_client = anthropic.Anthropic(api_key=Anthropic_Key)

def search_company(company_name):
    """Search Google for the company's LinkedIn pages and return the hits as JSON.

    The query is the company name followed by ``LLC site:linkedin.com``.
    Any exception raised while searching is printed and otherwise ignored,
    so the URLs collected before the failure (possibly none) are still
    returned.

    Args:
        company_name: Firm name to look up.

    Returns:
        A JSON string (indent=2) with the keys ``"company"`` and
        ``"results"``, the latter being the list of result URLs.
    """
    linkedin_query = f"{company_name} LLC site:linkedin.com"
    found_urls = []

    try:
        hits = search(linkedin_query, tld="co.in", num=10, stop=10, pause=2)
        for url in hits:
            found_urls.append(url)
    except Exception as err:
        # Best effort: report the failure and fall through with what we have.
        print(f"Error searching for {company_name}: {str(err)}")

    return json.dumps({"company": company_name, "results": found_urls}, indent=2)

def extract_relevant_links(search_results):
    """Ask the Anthropic model which of the search results are LinkedIn profiles.

    ``search_results`` is sent as the single user message, together with a
    system prompt that asks for the matching URLs inside ``<answer>`` tags.

    The prompt asks for one comma-separated list throughout. It previously
    asked for "one per line" in the answer template and "comma separated" in
    the notes, and referred to the no-result sentence by two different
    wordings.

    Args:
        search_results: Search results to classify; converted with ``str()``
            before being sent.

    Returns:
        The text of the first content block of the model's reply, still
        containing the ``<answer>`` tags.
    """
    system_prompt = """
    You will be provided with a list of LinkedIn URLs for a company. Your task is to identify and list the URLs that are specifically about people associated with the company.

    After your analysis, provide your response in the following format:

    <answer>
    [List the URLs of LinkedIn profiles here as a single comma-separated list. If there are no relevant profiles, state "No relevant LinkedIn profiles found."]
    </answer>

    Important notes:
    - Only include LinkedIn profile URLs.
    - If there are multiple relevant profiles, list all of them in a comma separated format.
    - If no relevant profiles are found, simply state "No relevant LinkedIn profiles found." in your answer.
    - Do not include any explanations or additional commentary in your answer, just the list of URLs or the "No relevant LinkedIn profiles found." statement.
    """

    message = anthropic_client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0,  # keep the extraction as repeatable as possible
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": str(search_results)
                    }
                ]
            }
        ]
    )
    return message.content[0].text

def remove_answer_tag(text):
    """Return *text* with every ``<answer>`` and ``</answer>`` tag deleted.

    Only the tags themselves are removed; everything between and around
    them, including whitespace, is left as is.
    """
    answer_tag = re.compile(r'</?answer>')
    return answer_tag.sub('', text)

def parse_comma_separated_string(input_string):
    """Split a delimited string into a list of trimmed, non-empty items.

    Items may be separated by commas, by line breaks, or by a mix of both.
    Line breaks are accepted because the model reply this parses may list
    its URLs one per line instead of comma-separated; splitting on commas
    alone would return such a reply as a single item.

    Args:
        input_string: Text to split. ``None`` and ``""`` yield ``[]``.

    Returns:
        The items in their original order, each stripped of surrounding
        whitespace, with empty items dropped.
    """
    if not input_string:
        return []
    pieces = (piece.strip() for piece in re.split(r'[,\r\n]+', input_string))
    return [piece for piece in pieces if piece]

def process_company(company_name):
    """Return the LinkedIn profile URLs found for one company.

    Pipeline: web search -> model-based filtering -> strip the ``<answer>``
    tags -> split the answer into individual items.

    Args:
        company_name: Firm name to look up.

    Returns:
        A list of URL strings; empty when nothing relevant was found.
    """
    search_results = search_company(company_name)
    raw_answer = extract_relevant_links(search_results)
    candidates = parse_comma_separated_string(remove_answer_tag(raw_answer))

    # When nothing matches, the prompt tells the model to answer with the
    # sentence "No relevant LinkedIn profiles found." instead of URLs. Keep
    # only entries that are actually URLs so that sentence is never stored
    # as if it were a link.
    return [
        item for item in candidates
        if item.lower().startswith(("http://", "https://"))
    ]

def main():
    """Look up LinkedIn links for every firm in the collection, one JSON file each.

    Reads the ``'Firm Name'`` field of every document in ``collection``,
    runs ``process_company`` on each name, and writes
    ``{firm name: [links]}`` to ``<name>_linkedin_links.json`` in the
    current working directory. Exceptions from processing a company still
    propagate; the MongoDB connection is closed either way.
    """
    try:
        # Extract firm names from the collection
        firm_names = [
            doc['Firm Name']
            for doc in collection.find({}, {'Firm Name': 1, '_id': 0})
            if 'Firm Name' in doc
        ]

        # Process each company and create JSON files
        for company_name in firm_names:
            linkedin_links = process_company(company_name)

            # Spaces become underscores as before. Path separators are also
            # replaced so a name such as "A/B Capital" cannot point the
            # output at a sub-directory or make open() fail.
            safe_name = re.sub(r'[\\/]', '_', company_name.replace(' ', '_'))
            filename = f"{safe_name}_linkedin_links.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump({company_name: linkedin_links}, f, indent=2)

            print(f"Processed {company_name} and saved results to {filename}")
    finally:
        # Close the MongoDB connection even when a company fails midway.
        client.close()

if __name__ == "__main__":
    main()

Processed Creekside Partners and saved results to Creekside_Partners_linkedin_links.json
Processed Gordian Capital Singapore Pte Ltd and saved results to Gordian_Capital_Singapore_Pte_Ltd_linkedin_links.json
Processed Brown Financial Advisors and saved results to Brown_Financial_Advisors_linkedin_links.json
Processed NCP Inc. and saved results to NCP_Inc._linkedin_links.json
Processed SW Investment Management LLC and saved results to SW_Investment_Management_LLC_linkedin_links.json
Processed Centennial Bank AR and saved results to Centennial_Bank_AR_linkedin_links.json
Processed Mendon Capital Advisors Corp and saved results to Mendon_Capital_Advisors_Corp_linkedin_links.json
Processed Pensioenfonds Rail & OV and saved results to Pensioenfonds_Rail_&_OV_linkedin_links.json
Processed Strategic Investment Solutions Inc. IL and saved results to Strategic_Investment_Solutions_Inc._IL_linkedin_links.json
Processed University of Texas Texas AM Investment Management Co. and saved results to U

# DuckDuckGo Module

In [2]:
import json
import os
import re
from dotenv import load_dotenv
from pymongo import MongoClient
from duckduckgo_search import DDGS
import anthropic

# Module-level setup shared by every function in this cell: configuration,
# the MongoDB handle, and the Anthropic API client.

# Load environment variables (python-dotenv) so the os.getenv lookups below
# can see the values.
load_dotenv()

# Get MongoDB URI and Anthropic API key from .env file.
# os.getenv returns None when a variable is not set; no check is made here.
mongodb_uri = os.getenv('13F_MongoDB_URI')
Anthropic_Key = os.getenv("13F_Anthropic_Key")

# Connect to MongoDB and select the database/collection that main() reads
# the 'Firm Name' values from.
client = MongoClient(mongodb_uri)
db = client['13f_filings']
collection = db['investment_firms']

# Initialize Anthropic client used by extract_relevant_links().
anthropic_client = anthropic.Anthropic(api_key=Anthropic_Key)

def search_company(company_name):
    """Search DuckDuckGo for the company's LinkedIn pages and return the hits as JSON.

    The query is the company name followed by ``site:linkedin.com``; at most
    five text results are requested. Exceptions raised by the search are not
    caught here.

    Args:
        company_name: Firm name to look up.

    Returns:
        A JSON string (indent=2) with the keys ``"company"`` and
        ``"results"``; each result carries ``"title"``, ``"url"`` and
        ``"body"`` taken from the hit's ``title``, ``href`` and ``body``.
    """
    search_query = f"{company_name} site:linkedin.com"

    with DDGS() as ddgs:
        hits = list(ddgs.text(search_query, max_results=5))

    payload = {
        "company": company_name,
        "results": [
            {"title": hit['title'], "url": hit['href'], "body": hit['body']}
            for hit in hits
        ],
    }
    return json.dumps(payload, indent=2)

def extract_relevant_links(search_results):
    """Ask the Anthropic model which of the search results are pages about people.

    ``search_results`` is sent as the single user message, together with a
    system prompt that asks for the matching URLs inside ``<answer>`` tags.

    The prompt asks for URLs in one comma-separated list throughout. It
    previously alternated between website "names" and "urls", and between
    "one per line" and "comma separated", and referred to the no-result
    sentence by two different wordings.

    Args:
        search_results: Search results to classify; converted with ``str()``
            before being sent.

    Returns:
        The text of the first content block of the model's reply, still
        containing the ``<answer>`` tags.
    """
    system_prompt = """
    You will be provided with information about various websites, including their names, links, and descriptions. Your task is to identify and list the URLs of websites that are specifically about people.

    Carefully analyze the provided website information. For each website, determine if its primary focus is on a person or people. This could include biographical websites, personal blogs, or sites dedicated to public figures.

    After your analysis, provide your response in the following format:

    <answer>
    [List the URLs of websites about people here as a single comma-separated list. If there are no such websites, state "No websites about people found."]
    </answer>

    Important notes:
    - Only include websites that are primarily about specific individuals or groups of people.
    - If there are multiple relevant websites, list all of them in a comma separated format
    - If no websites are about people, simply state "No websites about people found." in your answer.
    - Do not include any explanations or additional commentary in your answer, just the list of URLs or the "No websites about people found." statement.
    """

    message = anthropic_client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0,  # keep the extraction as repeatable as possible
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": str(search_results)
                    }
                ]
            }
        ]
    )
    return message.content[0].text

def remove_answer_tag(text):
    """Delete all ``<answer>`` / ``</answer>`` tags from *text*.

    The enclosed content and any surrounding whitespace are kept unchanged.
    """
    tag_pattern = r'</?answer>'
    without_tags = re.sub(pattern=tag_pattern, repl='', string=text)
    return without_tags

def parse_comma_separated_string(input_string):
    """Split a delimited string into a list of trimmed, non-empty items.

    Items may be separated by commas, by line breaks, or by a mix of both.
    Line breaks are accepted because the model reply this parses may list
    its URLs one per line instead of comma-separated; splitting on commas
    alone would return such a reply as a single item.

    Args:
        input_string: Text to split. ``None`` and ``""`` yield ``[]``.

    Returns:
        The items in their original order, each stripped of surrounding
        whitespace, with empty items dropped.
    """
    if not input_string:
        return []
    pieces = (piece.strip() for piece in re.split(r'[,\r\n]+', input_string))
    return [piece for piece in pieces if piece]

def process_company(company_name):
    """Return the people-related URLs found for one company.

    Pipeline: web search -> model-based filtering -> strip the ``<answer>``
    tags -> split the answer into individual items.

    Args:
        company_name: Firm name to look up.

    Returns:
        A list of URL strings; empty when nothing relevant was found.
    """
    search_results = search_company(company_name)
    raw_answer = extract_relevant_links(search_results)
    candidates = parse_comma_separated_string(remove_answer_tag(raw_answer))

    # When nothing matches, the prompt tells the model to answer with the
    # sentence "No websites about people found." instead of URLs. Keep only
    # entries that are actually URLs so that sentence is never stored as if
    # it were a link.
    return [
        item for item in candidates
        if item.lower().startswith(("http://", "https://"))
    ]

def main():
    """Look up LinkedIn links for every firm in the collection, one JSON file each.

    Reads the ``'Firm Name'`` field of every document in ``collection``,
    runs ``process_company`` on each name, and writes
    ``{firm name: [links]}`` to ``<name>_linkedin_links.json`` in the
    current working directory. Exceptions from processing a company (for
    example a search rate limit) still propagate; the MongoDB connection is
    closed either way.
    """
    try:
        # Extract firm names from the collection
        firm_names = [
            doc['Firm Name']
            for doc in collection.find({}, {'Firm Name': 1, '_id': 0})
            if 'Firm Name' in doc
        ]

        # Process each company and create JSON files
        for company_name in firm_names:
            linkedin_links = process_company(company_name)

            # Spaces become underscores as before. Path separators are also
            # replaced so a name such as "A/B Capital" cannot point the
            # output at a sub-directory or make open() fail.
            safe_name = re.sub(r'[\\/]', '_', company_name.replace(' ', '_'))
            filename = f"{safe_name}_linkedin_links.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump({company_name: linkedin_links}, f, indent=2)

            print(f"Processed {company_name} and saved results to {filename}")
    finally:
        # Close the MongoDB connection even when a company fails midway.
        client.close()

if __name__ == "__main__":
    main()

RatelimitException: https://duckduckgo.com/ 202 Ratelimit