In [None]:
!pip uninstall -y serpapi
!pip install git+https://github.com/serpapi/google-search-results-python.git


In [None]:
pip install --upgrade google-search-results

In [None]:

from serpapi import GoogleSearch
import pandas as pd
import time

SERPAPI_API_KEY = "5b665f5522542bac7d40edf9f972fc36ce10c1377df1028d4775e51527918ca2"

QUERIES = [
    "top venture capital firms investing in artificial intelligence",
    "best VCs for AI startups",
    "AI-focused VC firms 2025",
    "early stage AI investors",
    "top VCs in machine learning",
    "AI venture capital funding",
    "AI investors list",
    "AI VC firms in US and Europe"
]

def collect_snippets_from_query(query, max_snippets=100):
    snippets = []
    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_API_KEY,
        "num": 10,
        "start": 0
    }

    while len(snippets) < max_snippets:
        search = GoogleSearch(params)
        results = search.get_dict()
        organic_results = results.get("organic_results", [])

        if not organic_results:
            break

        for result in organic_results:
            snippet = result.get("snippet") or result.get("description") or ""
            if snippet and snippet not in snippets:
                snippets.append(snippet)
            if len(snippets) >= max_snippets:
                break

        params["start"] += 10
        time.sleep(1)  \

    return snippets

all_snippets = []

for query in QUERIES:
    print(f"🔍 Searching for: {query}")
    snippets = collect_snippets_from_query(query, max_snippets=100)
    print(f"  → Collected {len(snippets)} snippets\n")
    all_snippets.extend(snippets)

unique_snippets = list(set(all_snippets))
df = pd.DataFrame(unique_snippets, columns=["snippet"])
df.to_csv("ai_vc_snippets.csv", index=False)

print(f"\n✅ Total unique snippets saved: {len(df)}")



In [1]:
import re
import time
import nltk
from nltk.corpus import stopwords

# Step 1: Download stopwords
nltk.download('stopwords')

# Step 2: Define filler words
filler_words = [
    r'\bVC\b',
    r'\bVenture Capital\b',
    r'\bVenture Capital Firms\b',
    r'\bAI\b',
    r'\bArtificial Intelligence\b',
    r'\bML\b',
    r'\bPE\b',
    r'\bPitchBook\b',
    r'\bInvestors?\b',
    r'\bInvesting\b',
    r'\bFund\b',
    r'\bFunds\b',
    r'\bCapital\b',
    r'\bInvestment\b',
    r'\bFirm\b',
    r'\bFirms\b',
    r'\bInvestments\b'
]
filler_pattern = re.compile('|'.join(filler_words), flags=re.IGNORECASE)
stop_words = set(stopwords.words('english'))

# Step 3: Cleaning function
def clean_text(text):
    text = filler_pattern.sub('', text)
    tokens = text.split()
    tokens = [t for t in tokens if t.lower() not in stop_words]
    cleaned = ' '.join(tokens)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

a = pd.read_csv("ai_vc_snippets.csv")

def extract_investments_only(df):
    data = []

    for idx, row in df.iterrows():
        snippet = str(row['snippet'])
        doc = nlp(snippet)

        money_entities = [ent.text for ent in doc.ents if ent.label_ == "MONEY"]

        investment_info = ", ".join(money_entities) if money_entities else "No explicit amount"

        data.append({
            "investment_info": investment_info,
            "snippet": snippet
        })

    return pd.DataFrame(data)

df_investments = extract_investments_only(a)
df_investments


Unnamed: 0,investment_info,snippet
0,No explicit amount,Fresh current investor list powered by serious...
1,No explicit amount,"... companies “AI-washing,” and many taking on..."
2,No explicit amount,Anyone pitching VCs on an AI startup needs to ...
3,No explicit amount,Biggest VCs in Europe · 1. Global Founders Cap...
4,No explicit amount,New AI tools are being released to help ventur...
...,...,...
693,No explicit amount,Browse machine learning startup programs and V...
694,No explicit amount,This article explores the benefits of public w...
695,A record $40 billion,A record $40 billion AI deal lifted venture ca...
696,No explicit amount,Europe's AI revolution - between record VC fun...


In [3]:
def classify_investment_type(text):
    text_lower = text.lower()

    projection_keywords = [
        "projected", "forecast", "expected", "estimated", "potential",
        "may", "could", "anticipate", "target", "plan to", "aim to",
        "likely", "pipeline", "predict", "future", "goal"
    ]
    actual_keywords = [
        "invested", "raised", "funded", "closed", "announced", "completed",
        "disbursed", "totaled", "secured", "financed", "backed", "deal"
    ]

    if any(kw in text_lower for kw in projection_keywords):
        return "Projected/Estimated"
    elif any(kw in text_lower for kw in actual_keywords):
        return "Actual"
    else:
        return "Unknown"

df_investments['investment_type'] = df_investments['snippet'].apply(classify_investment_type)

df_investments[['investment_info', 'investment_type', 'snippet']]


Unnamed: 0,investment_info,investment_type,snippet
0,No explicit amount,Unknown,Fresh current investor list powered by serious...
1,No explicit amount,Projected/Estimated,"... companies “AI-washing,” and many taking on..."
2,No explicit amount,Unknown,Anyone pitching VCs on an AI startup needs to ...
3,No explicit amount,Unknown,Biggest VCs in Europe · 1. Global Founders Cap...
4,No explicit amount,Unknown,New AI tools are being released to help ventur...
...,...,...,...
693,No explicit amount,Unknown,Browse machine learning startup programs and V...
694,No explicit amount,Unknown,This article explores the benefits of public w...
695,A record $40 billion,Actual,A record $40 billion AI deal lifted venture ca...
696,No explicit amount,Unknown,Europe's AI revolution - between record VC fun...


In [4]:
from huggingface_hub import login

login(token="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")


In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def classify_investment_with_llm(snippet):
    prompt = f"""
You are an investment analyst. Determine if the following text snippet describes an investment that has already been made or funded (Actual), or if it refers to a future, planned, or expected investment (Projected).

Consider words like 'invested', 'funded', 'raised', 'secured', 'backed', 'closed', 'announced', 'committed' for Actual investments, and words like 'projected', 'planned', 'expected', 'forecasted', 'anticipated', 'estimated', 'targeted', 'may', 'could' for Projected investments.

Reply with only one word: Actual or Projected.

Text:
\"\"\"{snippet}\"\"\"
"""
    output = pipe(prompt, max_new_tokens=32, do_sample=False)[0]['generated_text'].strip().lower()
    if "actual" in output:
        return "Actual"
    elif "projected" in output:
        return "Projected"
    else:
        return "Unknown"

df_investments['investment_type_llm'] = df_investments['snippet'].apply(classify_investment_with_llm)

print(df_investments[['snippet', 'investment_type_llm']].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


                                             snippet investment_type_llm
0  Fresh current investor list powered by serious...             Unknown
1  ... companies “AI-washing,” and many taking on...             Unknown
2  Anyone pitching VCs on an AI startup needs to ...             Unknown
3  Biggest VCs in Europe · 1. Global Founders Cap...             Unknown
4  New AI tools are being released to help ventur...             Unknown


In [6]:
df_investments

Unnamed: 0,investment_info,snippet,investment_type,investment_type_llm
0,No explicit amount,Fresh current investor list powered by serious...,Unknown,Unknown
1,No explicit amount,"... companies “AI-washing,” and many taking on...",Projected/Estimated,Unknown
2,No explicit amount,Anyone pitching VCs on an AI startup needs to ...,Unknown,Unknown
3,No explicit amount,Biggest VCs in Europe · 1. Global Founders Cap...,Unknown,Unknown
4,No explicit amount,New AI tools are being released to help ventur...,Unknown,Unknown
...,...,...,...,...
693,No explicit amount,Browse machine learning startup programs and V...,Unknown,Unknown
694,No explicit amount,This article explores the benefits of public w...,Unknown,Unknown
695,A record $40 billion,A record $40 billion AI deal lifted venture ca...,Actual,Unknown
696,No explicit amount,Europe's AI revolution - between record VC fun...,Unknown,Unknown


In [16]:
def combine_classifications(rule_label, llm_label):
    if rule_label == llm_label:
        return rule_label
    elif llm_label == "Actual":
        return "Actual"
    elif rule_label == "Actual":
        return "Actual"
    else:
        return llm_label if llm_label != "Unknown" else rule_label


In [17]:
df_investments['final_type'] = df_investments.apply(
    lambda row: combine_classifications(row['investment_type'], row['investment_type_llm']),
    axis=1
)


In [18]:
def is_actual_investment(amount_str):
    exclude_phrases = [
        "no explicit amount",
        "expected",
        "projected",
        "plan to invest",
        "forecast",
        "estimated",
        "potential",
        "up to",
        "could be",
        "might",
        "about",
        "around",
        "record",
        "total",
        "some",
        "more than",
        "billions of dollars",
        "unknown",
        "none",
        "0"
    ]

    amount_lower = amount_str.lower()
    for phrase in exclude_phrases:
        if phrase in amount_lower:
            return False

    money_pattern = re.compile(r'(\$|usd|€|£)?\s?[\d,.]+(billion|million|m|bn|k)?', re.IGNORECASE)
    if money_pattern.search(amount_str):
        return True

    return False

df_actual_investments = df_investments[df_investments['investment_info'].apply(is_actual_investment)]



In [19]:
df_actual_investments

Unnamed: 0,investment_info,snippet,investment_type,investment_type_llm,final_type
10,$6 billion,Some of the largest generative AI startups hav...,Actual,Unknown,Actual
62,$176 million,"Founded in 2017 by Dr. Andrew Ng, AI Fund is b...",Actual,Unknown,Actual
63,"$26 billion, $5.7 billion",Global venture funding totaled $26 billion in ...,Actual,Unknown,Actual
81,just $1bn,European startups got just $1bn of the €22bn t...,Actual,Unknown,Actual
96,"$126.3 billion, $73.1 billion","In Q1 2025, venture capital firms invested a t...",Actual,Unknown,Actual
98,$69.7 billion,"Between February and May of this year, VCs pou...",Projected/Estimated,Unknown,Projected/Estimated
111,$55.6 billion,US venture capital funding surged to $55.6 bil...,Unknown,Unknown,Unknown
127,USD 75 billion,The global annual value of VC investments in A...,Unknown,Unknown,Unknown
163,18B,"OpenAI, Anthropic, and Inflection A collective...",Unknown,Unknown,Unknown
175,"$175 million, $2 billion",Founders Fund led a $175 million investment in...,Unknown,Unknown,Unknown


In [10]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m2.4/3.1 MB[0m [31m70.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [28]:
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")

vc_whitelist = [
    "Sequoia: Sequoia Capital", "Sequoia",
    "a16z: Andreessen Horowitz", "a16z", "Andreessen",
    "Accel Partners:Accel", "Accel Partners",
    "Benchmark: Benchmark Capital", "Benchmark",
    "Greylock Partners: Greylock", "Greylock Partners",
    "Kleiner Perkins", "Kleiner",
    "Index Ventures", "Index",
    "Lightspeed: Lightspeed Venture Partners", "Lightspeed",
    "Bessemer: Bessemer Venture Partners", "Bessemer",
    "General Catalyst: General Catalyst", "Catalyst",
    "Insight Partners", "Insight",
    "IVP", "Institutional Venture Partners",
    "Battery Ventures", "Battery",
    "Menlo Ventures", "Menlo",
    "Radical Ventures", "Radical",
    "AIX Ventures",
    "Data Collective", "DCVC",
    "Zetta: Zetta Venture Partners", "Zetta",
    "Element AI",
    "Khosla Ventures", "Khosla",
    "Sierra Ventures: Sierra Ventures", "Sierra",
    "Obvious Ventures",
    "SoftBank", "SoftBank Group",
    "GV", "Google Ventures",
    "Google",
    "Intel Capital",
    "Amazon Alexa Fund",
    "Salesforce Ventures",
    "Microsoft M12", "M12",
    "Samsung Next",
    "First Round Capital", "First Round",
    "Initialized Capital", "Initialized",
    "Union Square Ventures", "USV",
    "8VC",
    "Social Capital",
    "Balderton Capital",
    "Canaan Partners", "Canaan",
    "Upfront Ventures", "Upfront",
    "Felicis Ventures", "Felicis",
    "Wing VC",
    "Lux Capital",
    "Village Global",
    "Susa Ventures",
    "Pear VC",
    "GV",
    "General Atlantic",
    "Coatue Management", "Coatue",
    "Tiger Global Management", "Tiger Global",
    "Lightspeed India",
]


matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(vc) for vc in vc_whitelist]
matcher.add("VC_FIRM", patterns)

def extract_vc_mentions(text):
    doc = nlp(text)
    matches = matcher(doc)
    matched_vcs = {doc[start:end].text for _, start, end in matches}
    return list(matched_vcs) if matched_vcs else None


df_actual_investments['vc_firm_matches'] = df_actual_investments['snippet'].apply(extract_vc_mentions)

df_matched = df_actual_investments[df_actual_investments['vc_firm_matches'].notna()]

print(df_matched[['snippet', 'investment_info', 'vc_firm_matches']].head())


                                               snippet  \
62   Founded in 2017 by Dr. Andrew Ng, AI Fund is b...   
413  Elon Musk's xAI raised its second monster fund...   
427  Enlitic, Deep learning for medical imaging, $2...   
431  Their focus on AI: Andreessen Horowitz has 100...   
438  AIX Ventures focuses on pre-seed and seed-stag...   

              investment_info        vc_firm_matches  
62               $176 million              [Sequoia]  
413                $6 billion  [Andreessen, Sequoia]  
427                $2M, $2.6M      [Data Collective]  
431                       7.2           [Andreessen]  
438  $1 million to $3 million         [AIX Ventures]  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_actual_investments['vc_firm_matches'] = df_actual_investments['snippet'].apply(extract_vc_mentions)


In [29]:
df_matched

Unnamed: 0,investment_info,snippet,investment_type,investment_type_llm,final_type,vc_firm_matches
62,$176 million,"Founded in 2017 by Dr. Andrew Ng, AI Fund is b...",Actual,Unknown,Actual,[Sequoia]
413,$6 billion,Elon Musk's xAI raised its second monster fund...,Actual,Unknown,Actual,"[Andreessen, Sequoia]"
427,"$2M, $2.6M","Enlitic, Deep learning for medical imaging, $2...",Unknown,Unknown,Unknown,[Data Collective]
431,7.2,Their focus on AI: Andreessen Horowitz has 100...,Actual,Unknown,Actual,[Andreessen]
438,$1 million to $3 million,AIX Ventures focuses on pre-seed and seed-stag...,Unknown,Unknown,Unknown,[AIX Ventures]
509,$97.9 billion,AI and Big Data Venture Capital Funding. The i...,Projected/Estimated,Unknown,Projected/Estimated,[Sierra]
604,1,"Companies: AI startup, backed by top VCs · Zet...",Actual,Unknown,Actual,[Zetta]
651,4.5B,5. Artificial Intelligence & Deep Tech · Found...,Unknown,Unknown,Unknown,[Lux Capital]


In [30]:
filtered_df = df_matched[df_matched['final_type'] == 'Actual'].drop(columns=['investment_type', 'investment_type_llm'])


In [31]:
filtered_df

Unnamed: 0,investment_info,snippet,final_type,vc_firm_matches
62,$176 million,"Founded in 2017 by Dr. Andrew Ng, AI Fund is b...",Actual,[Sequoia]
413,$6 billion,Elon Musk's xAI raised its second monster fund...,Actual,"[Andreessen, Sequoia]"
431,7.2,Their focus on AI: Andreessen Horowitz has 100...,Actual,[Andreessen]
604,1,"Companies: AI startup, backed by top VCs · Zet...",Actual,[Zetta]


In [37]:
def split_vc_firms(cell):
    if isinstance(cell, list):
        # Already a list, just return as is
        return cell
    elif isinstance(cell, str):
        # Remove brackets and split by comma, strip spaces
        cleaned = cell.strip('[]')
        return [x.strip() for x in cleaned.split(',') if x.strip()]
    else:
        # Unexpected type — return empty list or as is
        return []
filtered_df['vc_firm_matches'] = filtered_df['vc_firm_matches'].apply(split_vc_firms)
filtered_df = filtered_df.explode('vc_firm_matches').reset_index(drop=True)


In [43]:
def parse_investment_amount(text):
    text = text.lower().replace('$', '').strip()
    multiplier = 1

    if 'billion' in text:
        multiplier = 1_000
        text = text.replace('billion', '').strip()
    elif 'million' in text:
        multiplier = 1
        text = text.replace('million', '').strip()
    elif 'k' in text:
        multiplier = 0.001
        text = text.replace('k', '').strip()

    try:
        value = float(text)
        return value * multiplier
    except ValueError:
        return 0

filtered_df['investment_million'] = filtered_df['investment_info'].apply(parse_investment_amount)

top_1_percent_count = max(1, int(len(filtered_df) * 0.01))

top_1_percent_df = filtered_df.sort_values(by='investment_million', ascending=False).head(top_1_percent_count)

print(top_1_percent_df)

  investment_info                                            snippet  \
1      $6 billion  Elon Musk's xAI raised its second monster fund...   

  final_type vc_firm_matches  investment_million  
1     Actual      Andreessen              6000.0  
