# Importing necessary modules and libraries

In [1]:
%%capture
!pip install openai
!pip install python-dotenv
!pip install weaviate-client

In [2]:
from openai import OpenAI
import requests
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import weaviate
import weaviate.classes as wvc
from weaviate.util import generate_uuid5

# Initializing clients and defining some functions

In [4]:
# Load environment variables from .env file
load_dotenv()

True

In [4]:
# Initialize your OpenAI client with your API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [5]:
# Initialize your Weaviate client with your Cluster URL and API key
weaviate_client = weaviate.connect_to_wcs(
    cluster_url=os.getenv("WCS_DEMO_URL"),
    auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_DEMO_RO_KEY")),
    headers={
        "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]
    }
)

In [18]:
# Define a new schema with OpenAI vectorizer
collection = weaviate_client.collections.create(
    name="VentureCapital",
    description="Collection of Venture Capital firms data",
    vectorizer_config=[
        wvc.config.Configure.NamedVectors.text2vec_openai(
            name="vc_data_vector",
            source_properties=["vc_name", "contacts", "industries", "investment_rounds"],
            vectorize_collection_name=False
        ),
    ],
    properties=[
        wvc.config.Property(name="vc_name", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="contacts", data_type=wvc.config.DataType.TEXT_ARRAY),
        wvc.config.Property(name="industries", data_type=wvc.config.DataType.TEXT_ARRAY),
        wvc.config.Property(name="investment_rounds", data_type=wvc.config.DataType.TEXT_ARRAY),
    ],
)

In [19]:
headers = { 'user-agent': os.getenv("USER_AGENT")}

In [20]:
def scrape_texts_and_links(url):
    """
    Scrapes text content and links from the provided URL.

    Args:
        url (str): The URL of the webpage to scrape.

    Returns:
        tuple: A tuple containing two elements:
            - texts (str): All text content extracted from the webpage.
            - links (str): All links extracted from the webpage, along with their corresponding text.
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Ensure successful response

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all text from paragraphs and other text holders
        texts = soup.get_text(separator=' ', strip=True)

        # Extract all links and their text
        links = ' '.join(f"{urljoin(url, link['href'])} ({link.get_text(strip=True)})" for link in soup.find_all('a', href=True))

        return texts, links

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Handle HTTP errors
    except Exception as err:
        print(f"An error occurred: {err}")  # Handle other exceptions

    return None, ""  # Return None for texts and empty string for links if exceptions occur

In [25]:
def extract_and_store_vc_information(text):
  """
    Extracts information about a Venture Capital firm from the provided text and stores it in the Weaviate database.

    Args:
        text (str): The text containing information about the VC firm.

    Returns:
        None: Inserts the extracted information into the Weaviate database.

    """
  try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-0125",
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": """Extract the following information from the provided text
                    and structure your response in a JSON format with the specified keys. Ensure that you do
                    not make assumptions or include incorrect information. If a specific type of information is
                    completely missing from the text, please return only one 'no info' for that field.

- **VC Name**: Look for the name of the Venture Capital firm.
- **Contacts**: Search for any type of contact details available such as email
 addresses or phone numbers. Also, ensure to include URLs that link directly to social media
 profiles or pages from 'http://linkedin.com', 'http://facebook.com', 'http://instagram.com', 'http://twitter.com'.
 List all found info, separating them by commas. Also focus on including URLs that provide direct communication
 channels (e.g., 'contact us', 'connect with us'), but avoid links related to job opportunities, personal profiles, or relationships.])
- **Industries**: Identify and list the industries that the Venture Capital firm invests in.
If not directly mentioned, infer based on the context of the text.
- **Investment Rounds**: Extract only the types of investment rounds the firm participates in or leads,
 such as Seed, Series A, Series B, etc. Do not include the names of companies involved in these rounds.
You can fill the contacts session with links(including words such as contact us, connect us, reach us),
which will move to pages from where we can take additional information. Do not include links which will contain info about jobs, people or relationships.

Use the following JSON keys for the response:
{
  "vc_name": "***",
  "contacts": "***",
  "industries": "***",
  "investment_rounds": "***"
}

Please format your response as a JSON object with these keys, filling in the appropriate information or 'no info' as required.
"""},
                    {"role": "user", "content": text}
                ]
            )
            response_json = json.loads(response.json())["choices"][0]["message"]["content"]
            vc_name = json.loads(response_json)["vc_name"]
            contacts = json.loads(response_json)["contacts"]
            industries = json.loads(response_json)["industries"]
            investment_rounds = json.loads(response_json)["investment_rounds"]

            extracted_info = {
                "vc_name": vc_name,
                "contacts": [contacts] if isinstance(contacts, str) else contacts,
                "industries": [industries] if isinstance(industries, str) else industries,
                "investment_rounds": [investment_rounds] if isinstance(investment_rounds, str) else investment_rounds
            }

            # Insert data into Weaviate
            venture_capital = weaviate_client.collections.get("VentureCapital")
            venture_capital.data.insert(
                properties=extracted_info,
                uuid=generate_uuid5(extracted_info)
            )
            print(f"Data for {url} inserted successfully.")
  except Exception as e:
      print(f"Failed to process or insert data for {url}: {e}")


# VC Website Information Extraction and Storage

In [23]:
url_list = ['https://www.accel.com/', 'https://a16z.com/', 'https://greylock.com/', 'https://www.sequoiacap.com/',
            'https://www.indexventures.com/', 'https://www.kleinerperkins.com/', 'https://lsvp.com/', 'https://matrix.vc/', 'https://www.500.co/',
            'https://www.sparkcapital.com/', 'https://www.insightpartners.com/'
            ]

In [26]:
for url in url_list:
    texts, links = scrape_texts_and_links(url)
    text = texts + links
    if text is not None:
      extract_and_store_vc_information(text)
    else:
        print(f"Skipping {url} due to failed scraping.")

Data for https://www.accel.com/ inserted successfully.
Data for https://a16z.com/ inserted successfully.
Data for https://greylock.com/ inserted successfully.
Data for https://www.sequoiacap.com/ inserted successfully.
Data for https://www.indexventures.com/ inserted successfully.
Data for https://www.kleinerperkins.com/ inserted successfully.
Data for https://lsvp.com/ inserted successfully.
Data for https://matrix.vc/ inserted successfully.
Data for https://www.500.co/ inserted successfully.
Data for https://www.sparkcapital.com/ inserted successfully.
Data for https://www.insightpartners.com/ inserted successfully.


In [28]:
# objects' properties
collection = weaviate_client.collections.get("VentureCapital")
response = collection.query.fetch_objects()

for o in response.objects:
    print(o.properties)

{'contacts': ['https://www.500.co/get-in-touch, https://www.linkedin.com/company/500global, https://www.facebook.com/500Global, https://www.instagram.com/500Global, https://twitter.com/500GlobalVC'], 'industries': ['Technology'], 'vc_name': '500 Global', 'investment_rounds': ['Seed, Series A, Series B']}
{'industries': ['AI, Cybersecurity, Infrastructure, SaaS, Consumer, Marketplaces & Commerce, Fintech & Crypto'], 'contacts': ['https://greylock.com/contact-us/, https://www.linkedin.com/company/greylock-partners/, https://twitter.com/GreylockVC, https://www.youtube.com/channel/UCZ7x7yDBbEFCGztD8BYvRhA'], 'vc_name': 'Greylock Partners', 'investment_rounds': ['Pre-Seed, Seed, Series A']}
{'contacts': ['https://www.sparkcapital.com/contact (Contact Us), http://twitter.com/sparkcapital'], 'industries': ['No info'], 'vc_name': 'Spark Capital', 'investment_rounds': ['Seed, Series A, Series B']}
{'contacts': ['mailto:helpdesk@accel.com?subject=Website%20Error, http://www.accel.com/connect'], 

In [29]:
# Objects with corresponding ids, properties, vectors and metadata
response = collection.query.fetch_objects(
    include_vector=True
)

print(response)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('04302b66-2bd5-50ac-98a8-fb9f6c8292e3'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'contacts': ['https://www.500.co/get-in-touch, https://www.linkedin.com/company/500global, https://www.facebook.com/500Global, https://www.instagram.com/500Global, https://twitter.com/500GlobalVC'], 'industries': ['Technology'], 'vc_name': '500 Global', 'investment_rounds': ['Seed, Series A, Series B']}, references=None, vector={'vc_data_vector': [-0.005570428445935249, -0.014103850349783897, 0.0133400559425354, -0.024494081735610962, -0.015354892238974571, 0.006485664285719395, -0.03339623287320137, -0.014380396343767643, -0.0003683165705297142, -0.0146174356341362, -0.006011585239320993, 0.002721015829592943, -0.00010226443555438891, -0.02211051806807518, 0.014011668041348457, 0.007446990814059973, 0.011351558379828

# Note

When scraping the homepage of a VC website, we've currently focused on extracting the text content and links available. However, it's important to note that if certain crucial information for the company is missing from the homepage, we can explore additional relative links within the website to search for the missing information. The extent of this exploration can be adjusted based on the company's objectives and resources, ensuring cost-effective data extraction.