In [None]:
! pip install google-generativeai



In [None]:
! pip install nest-asyncio



In [None]:
! pip install pydantic



In [None]:
import json

# Load the raw investor records. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('/content/Combined file.xlsx - Sheet1.json', 'r', encoding='utf-8') as file:
    investors = json.load(file)

print(len(investors)) # To check total number of Investors

2572


**Founder-Investor Matching AI Model (Approach)**

1. The investors data has been loaded.
2. The helper functions "filter_investors" and "relevance_score" pre-filter the investors, which reduces the number of API calls and avoids "API rate limit exceeded" errors.
3. From the pre-filtered investors, the top 5 by relevance score are selected and passed to the Gemini API to get a compatibility score.
4. Once the score is received, the investor details are stored along with the compatibility score and other relevant information in a structured format.

**Code that executes for one founder profile**

Output file is saved as "investor_matches.json" which has top 5 matching investors with the explanations.

In [None]:
import google.generativeai as genai
import pandas as pd
import re
import time
from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional
import os 

# Set up Gemini API
# The key is read from the environment so it never appears in the notebook.
# os.environ raises KeyError if GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


# Pydantic model for structured response
class CompatibilityResult(BaseModel):
    """Structured compatibility assessment parsed from the model's JSON reply.

    Field names mirror the keys requested in the prompt built by
    get_match_score. `score` must be an integer between 0 and 100 inclusive.
    """
    score: int = Field(..., ge=0, le=100)
    industry_alignment: str
    stage_alignment: str
    funding_alignment: str
    # The Optional fields default to None. Without an explicit default,
    # Pydantic V2 treats Optional[...] as "required but nullable" and rejects
    # a reply that simply omits the key.
    traction: Optional[str] = None
    business_model: str
    issues: Optional[List[str]] = None
    recommendation: Optional[str] = None

# Upper bound on the retry counter used by get_match_score_with_backoff.
MAX_RETRIES = 5
# Load investor dataset
# This is the same file the earlier cell reads with json.load; here it is
# loaded as a DataFrame because filter_investors walks it with .iterrows().
data_file = '/content/Combined file.xlsx - Sheet1.json'
investor_data = pd.read_json(data_file)

# Sample founder profile
# Example founder used for the single-founder run below.
founder_profile = {
    # Tested with `in` against investor['Industry'] by filter_investors and relevance_score.
    "Industry": "Information Technology & Services",
    # Tested with `in` against investor['Stage'].
    "Stage": "Seed",
    # Parsed by parse_funding_range, which expects "$<number><K|M|B>".
    "Funding_Required": "$1M",
    # Traction and Business_Model are only interpolated into the Gemini prompt.
    "Traction": "10,000 active users, $50K monthly revenue",
    "Business_Model": "Subscription-based SaaS platform"
}

# Helper function to convert funding ranges to numerical values
def parse_funding_range(funding):
    """Convert the first "$<number><K|M|B>" amount in a string to a float.

    Args:
        funding: Text such as "$1M" or "$250K-$2M". Only the first amount
            found is converted. Non-string input (e.g. None or a float NaN
            standing in for a missing value) is accepted and yields None.

    Returns:
        The amount as a float in plain dollars, or None when no amount is
        found or the numeric part is malformed (e.g. "$1.2.3M").
    """
    # Missing values are not strings; re would raise TypeError on them.
    if not isinstance(funding, str):
        return None

    found = re.search(r'\$([\d.]+)([KMB])', funding)
    if found is None:
        return None

    digits, unit = found.groups()
    try:
        value = float(digits)
    except ValueError:
        # The character class [\d.]+ also admits strings like "1.2.3" or ".".
        return None

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    return value * multipliers[unit]

# Step 1: Pre-filter based on industry, stage, and funding range
def filter_investors(founder, investors):
    """Return the investors whose industry, stage and cheque range fit the founder.

    Args:
        founder: Mapping with 'Industry', 'Stage' and 'Funding_Required' keys.
        investors: pandas DataFrame with 'Industry', 'Stage' and
            'Cheque_range' columns; rows are visited with ``iterrows()``.

    Returns:
        A list of dicts (``row.to_dict()``), one per matching investor, in
        DataFrame order.

    Notes:
        * Investors without a usable (non-empty string) cheque range are skipped.
        * The funding check is applied only when both the founder's amount and
          the investor's two-sided range can be parsed; otherwise it is waived
          rather than raising on a None comparison.
        * Missing Industry/Stage cells (None/NaN) count as "no match".
    """
    def _contains(container, wanted):
        # None/NaN cells do not support `in`; treat them as a non-match.
        try:
            return wanted in container
        except TypeError:
            return False

    funding_required = parse_funding_range(founder['Funding_Required'])
    wanted_industry = founder['Industry']
    wanted_stage = founder['Stage']

    filtered = []
    for _, investor in investors.iterrows():
        cheque_range = investor['Cheque_range']
        # NaN is truthy, so check the type as well as emptiness.
        if not isinstance(cheque_range, str) or not cheque_range:
            continue

        min_funding, max_funding = None, None
        bounds = re.findall(r'\$([\d.]+)([KMB])', cheque_range)
        if len(bounds) == 2:
            min_funding = parse_funding_range(f"${bounds[0][0]}{bounds[0][1]}")
            max_funding = parse_funding_range(f"${bounds[1][0]}{bounds[1][1]}")

        # Filter based on industry, stage, and funding range
        funding_ok = (
            funding_required is None
            or min_funding is None
            or max_funding is None
            or min_funding <= funding_required <= max_funding
        )
        if (_contains(investor['Industry'], wanted_industry) and
                _contains(investor['Stage'], wanted_stage) and
                funding_ok):
            filtered.append(investor.to_dict())

    return filtered

# Step 2: Simple relevance scoring (for pre-ranking)
def relevance_score(founder, investor):
    """Compute a 0-100 heuristic score used to pre-rank investors.

    Scoring: +40 if the founder's industry is found in the investor's
    industry, +30 if the founder's stage is found in the investor's stage,
    +30 if the founder's required funding lies inside the investor's
    two-sided cheque range.

    Args:
        founder: Mapping with 'Industry', 'Stage' and 'Funding_Required' keys.
        investor: Mapping with 'Industry', 'Stage' and 'Cheque_range' keys.

    Returns:
        The integer score. Missing or unparsable values contribute 0 instead
        of raising.
    """
    def _contains(container, wanted):
        # None/NaN cells do not support `in`; treat them as a non-match.
        try:
            return wanted in container
        except TypeError:
            return False

    score = 0
    if _contains(investor['Industry'], founder['Industry']):
        score += 40
    if _contains(investor['Stage'], founder['Stage']):
        score += 30

    funding_required = parse_funding_range(founder['Funding_Required'])
    cheque_range = investor['Cheque_range']
    # Only compare when both sides are usable: NaN cheque ranges are truthy
    # and an unparsable founder amount is None.
    if funding_required is not None and isinstance(cheque_range, str):
        bounds = re.findall(r'\$([\d.]+)([KMB])', cheque_range)
        if len(bounds) == 2:
            min_funding = parse_funding_range(f"${bounds[0][0]}{bounds[0][1]}")
            max_funding = parse_funding_range(f"${bounds[1][0]}{bounds[1][1]}")
            if (min_funding is not None and max_funding is not None
                    and min_funding <= funding_required <= max_funding):
                score += 30
    return score

# Step 3: Get match score using Gemini API
def get_match_score(founder, investor):
    """Ask Gemini for a compatibility assessment of one founder/investor pair.

    Args:
        founder: Mapping with 'Industry', 'Stage', 'Funding_Required',
            'Traction' and 'Business_Model' keys.
        investor: Mapping with 'Name', 'Industry', 'Stage', 'Cheque_range'
            and 'Overview' keys.

    Returns:
        A CompatibilityResult when the reply contains a JSON object that
        validates, otherwise None (empty reply, no JSON object found, or a
        validation/parsing error, which is printed).

    Exceptions raised by the API call itself are not caught here.
    """
    prompt = f"""
    Given the following founder profile and investor preferences, calculate a compatibility score (0-100) and provide a structured explanation in JSON format:

    Founder Profile:
    Industry: {founder['Industry']}
    Stage: {founder['Stage']}
    Funding Required: {founder['Funding_Required']}
    Traction: {founder['Traction']}
    Business Model: {founder['Business_Model']}

    Investor Profile:
    Name: {investor['Name']}
    Industry: {investor['Industry']}
    Stage: {investor['Stage']}
    Cheque Range: {investor['Cheque_range']}
    Overview: {investor['Overview']}

    Respond in the following JSON format:
    {{
        "score": <int>,
        "industry_alignment": "<string>",
        "stage_alignment": "<string>",
        "funding_alignment": "<string>",
        "traction": "<string>",
        "business_model": "<string>",
        "issues": ["<string>", "<string>"],
        "recommendation": "<string>"
    }}
    """

    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content(prompt)

    if response and response.text:
        try:
            # Greedy match from the first "{" to the last "}" so that any text
            # around the JSON object is dropped.
            match = re.search(r'\{.*\}', response.text, re.DOTALL)
            if match:
                structured_response = match.group(0)
                print("Raw Response:\n", structured_response)  # Debugging
                # model_validate_json is the Pydantic V2 replacement for the
                # deprecated parse_raw.
                return CompatibilityResult.model_validate_json(structured_response)
        except ValidationError as e:
            print(f"Validation Error: {e}")
        except Exception as e:
            print(f"Error parsing JSON: {e}")

    return None

# Function with retry and backoff logic
def get_match_score_with_backoff(founder_profile, investor):
    """Call get_match_score with exponential backoff, up to MAX_RETRIES attempts.

    Every failed attempt counts against the budget, whether get_match_score
    raised or returned None, so the loop always terminates. Between attempts
    the function sleeps 2**attempt seconds plus up to one second of jitter.

    Args:
        founder_profile: Founder mapping forwarded to get_match_score.
        investor: Investor mapping forwarded to get_match_score.

    Returns:
        The first truthy result returned by get_match_score.

    Raises:
        Exception: when all MAX_RETRIES attempts fail.
    """
    # Local import: this cell's import list does not include `random`.
    import random

    for attempt in range(MAX_RETRIES):
        try:
            result = get_match_score(founder_profile, investor)
            if result:
                return result
            failure = "No valid response received"
        except Exception as e:
            failure = f"Request failed: {e}"

        if attempt < MAX_RETRIES - 1:
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"{failure}. Retrying in {wait_time:.2f} seconds...")
            time.sleep(wait_time)
        else:
            # No point sleeping when there is no attempt left.
            print(f"{failure}.")
    raise Exception("Max retries reached. Could not get a response.")

# Filter and Pre-rank Investors
# Keep only investors that pass the industry / stage / cheque-range pre-filter.
filtered_investors = filter_investors(founder_profile, investor_data)

# Further filter to top 5 by relevance score
# Every survivor already matched on industry and stage, so the score mainly
# separates investors whose cheque range could be parsed and contains the
# requested amount from those whose range could not be parsed.
filtered_investors = sorted(filtered_investors, key=lambda inv: relevance_score(founder_profile, inv), reverse=True)[:5]
print(len(filtered_investors))


# Step 5: Batch API Calls to Avoid Rate Limit
# With at most 5 investors and BATCH_SIZE = 5 there is at most one batch, so
# the 2-second pause below runs at most once.
results = []
BATCH_SIZE = 5

for i in range(0, len(filtered_investors), BATCH_SIZE):
    batch = filtered_investors[i:i + BATCH_SIZE]
    for investor in batch:
        # Either returns a truthy result or raises once the retry budget is
        # exhausted; a raise here aborts the whole cell.
        result = get_match_score_with_backoff(founder_profile, investor)
        if result:
            # Flatten the structured result into one record per investor.
            results.append({
                'Name': investor['Name'],
                'Website': investor['Website'],
                'Score': result.score,
                'Industry Alignment': result.industry_alignment,
                'Stage Alignment': result.stage_alignment,
                'Funding Alignment': result.funding_alignment,
                'Traction': result.traction,
                'Business Model': result.business_model,
                'Issues': result.issues,
                'Recommendation': result.recommendation
            })
    time.sleep(2)  # Add delay between batches to avoid rate limiting

# Rank by match score
ranked_results = sorted(results, key=lambda x: x['Score'], reverse=True)  # highest Gemini score first (at most 5 entries)

# Display ranked results
df = pd.DataFrame(ranked_results)
print("\n=== Ranked Investor Matches ===")
print(df)

# Save results to JSON
# orient='records' writes a list with one object per investor.
df.to_json('investor_matches.json', orient='records', indent=4)
print("\nResults saved to 'investor_matches.json'")

5
Raw Response:
 {
  "score": 75,
  "industry_alignment": "Excellent: Both founder and investor focus on Information Technology & Services.",
  "stage_alignment": "Good: Investor invests in Seed stage, aligning with the founder's stage. ",
  "funding_alignment": "Good: Founder's funding request of $1M falls within the investor's cheque range of $250K-$2M.",
  "traction": "Moderate: 10,000 active users and $50K monthly revenue demonstrate some traction, but may not be sufficient to impress investors focused on 'deep tech' innovations.",
  "business_model": "Moderate: Subscription-based SaaS is a common and understood model, but the investor's focus on solving 'the world's biggest challenges' may require further demonstration of the platform's impact.",
  "issues": [
    "The founder's description needs to highlight how their SaaS platform addresses a significant global challenge to better resonate with 01 Ventures' focus on 'deep tech' innovations.",
    "While the funding aligns, secur

<ipython-input-13-f6de5e08ed35>:137: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  compatibility_result = CompatibilityResult.parse_raw(structured_response)


Raw Response:
 {
  "score": 75,
  "industry_alignment": "Excellent. Both founder and investor focus on Information Technology & Services.",
  "stage_alignment": "Good. Investor invests in Seed stage, aligning with the founder's stage.",
  "funding_alignment": "Good. Funding request ($1M) falls within investor's cheque range ($500K - $5M).",
  "traction": "Moderate.  10,000 active users and $50K MRR is decent traction for seed stage, but might not be considered exceptional by all investors.",
  "business_model": "Moderate. Subscription-based SaaS is a common and well-understood model, but not explicitly mentioned as a preference by the investor.",
  "issues": [
    "Geographic Location: The investor profile specifies a focus on Israeli and Israeli-related entrepreneurs.  The founder's location is not specified, impacting compatibility.",
    "Technology Focus: The investor shows a strong preference for deep technology, digital health, fintech, insurance, computer vision, and artificial 

**Code that executes for multiple founder profiles loaded from a JSON file**

For each founder, the output is saved as "founder_matches/{founder_name}_matches.json", which contains up to 5 matching investors with explanations.

In [None]:
import json
import os
import random
import re
import time
from typing import List, Optional

import google.generativeai as genai
import pandas as pd
from pydantic import BaseModel, Field, ValidationError

# Configure Gemini API
# The key is read from the environment so it never appears in the notebook.
# os.environ raises KeyError if GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Upper bound on the number of attempts made by get_match_score_with_backoff.
MAX_RETRIES = 5

# Pydantic model for structured response
class CompatibilityResult(BaseModel):
    """Structured compatibility assessment parsed from the model's JSON reply.

    Field names mirror the keys requested in the prompt built by
    get_match_score. `score` must be an integer between 0 and 100 inclusive.
    """
    score: int = Field(..., ge=0, le=100)
    industry_alignment: str
    stage_alignment: str
    funding_alignment: str
    # The Optional fields default to None. Without an explicit default,
    # Pydantic V2 treats Optional[...] as "required but nullable" and rejects
    # a reply that simply omits the key.
    traction: Optional[str] = None
    business_model: str
    issues: Optional[List[str]] = None
    recommendation: Optional[str] = None
# Helper function to convert funding ranges to numerical values
def parse_funding_range(funding):
    """Convert the first "$<number><K|M|B>" amount in a string to a float.

    Args:
        funding: Text such as "$1M" or "$250K-$2M". Only the first amount
            found is converted. Non-string input (e.g. None or a float NaN
            standing in for a missing value) is accepted and yields None.

    Returns:
        The amount as a float in plain dollars, or None when no amount is
        found or the numeric part is malformed (e.g. "$1.2.3M").
    """
    # Missing values are not strings; re would raise TypeError on them.
    if not isinstance(funding, str):
        return None

    found = re.search(r'\$([\d.]+)([KMB])', funding)
    if found is None:
        return None

    digits, unit = found.groups()
    try:
        value = float(digits)
    except ValueError:
        # The character class [\d.]+ also admits strings like "1.2.3" or ".".
        return None

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    return value * multipliers[unit]

# Step 1: Pre-filter based on industry, stage, and funding range
def filter_investors(founder, investors):
    """Return the investors whose industry, stage and cheque range fit the founder.

    Args:
        founder: Mapping with 'Industry', 'Stage' and 'Funding_Required' keys.
        investors: pandas DataFrame with 'Industry', 'Stage' and
            'Cheque_range' columns; rows are visited with ``iterrows()``.

    Returns:
        A list of dicts (``row.to_dict()``), one per matching investor, in
        DataFrame order.

    Notes:
        * Investors without a usable (non-empty string) cheque range are skipped.
        * The funding check is applied only when both the founder's amount and
          the investor's two-sided range can be parsed; otherwise it is waived
          rather than raising on a None comparison.
        * Missing Industry/Stage cells (None/NaN) count as "no match".
    """
    def _contains(container, wanted):
        # None/NaN cells do not support `in`; treat them as a non-match.
        try:
            return wanted in container
        except TypeError:
            return False

    funding_required = parse_funding_range(founder['Funding_Required'])
    wanted_industry = founder['Industry']
    wanted_stage = founder['Stage']

    filtered = []
    for _, investor in investors.iterrows():
        cheque_range = investor['Cheque_range']
        # NaN is truthy, so check the type as well as emptiness.
        if not isinstance(cheque_range, str) or not cheque_range:
            continue

        min_funding, max_funding = None, None
        bounds = re.findall(r'\$([\d.]+)([KMB])', cheque_range)
        if len(bounds) == 2:
            min_funding = parse_funding_range(f"${bounds[0][0]}{bounds[0][1]}")
            max_funding = parse_funding_range(f"${bounds[1][0]}{bounds[1][1]}")

        # Filter based on industry, stage, and funding range
        funding_ok = (
            funding_required is None
            or min_funding is None
            or max_funding is None
            or min_funding <= funding_required <= max_funding
        )
        if (_contains(investor['Industry'], wanted_industry) and
                _contains(investor['Stage'], wanted_stage) and
                funding_ok):
            filtered.append(investor.to_dict())

    return filtered


# Step 2: Simple relevance scoring (for pre-ranking)
def relevance_score(founder, investor):
    """Compute a 0-100 heuristic score used to pre-rank investors.

    Scoring: +40 if the founder's industry is found in the investor's
    industry, +30 if the founder's stage is found in the investor's stage,
    +30 if the founder's required funding lies inside the investor's
    two-sided cheque range.

    Args:
        founder: Mapping with 'Industry', 'Stage' and 'Funding_Required' keys.
        investor: Mapping with 'Industry', 'Stage' and 'Cheque_range' keys.

    Returns:
        The integer score. Missing or unparsable values contribute 0 instead
        of raising.
    """
    def _contains(container, wanted):
        # None/NaN cells do not support `in`; treat them as a non-match.
        try:
            return wanted in container
        except TypeError:
            return False

    score = 0
    if _contains(investor['Industry'], founder['Industry']):
        score += 40
    if _contains(investor['Stage'], founder['Stage']):
        score += 30

    funding_required = parse_funding_range(founder['Funding_Required'])
    cheque_range = investor['Cheque_range']
    # Only compare when both sides are usable: NaN cheque ranges are truthy
    # and an unparsable founder amount is None.
    if funding_required is not None and isinstance(cheque_range, str):
        bounds = re.findall(r'\$([\d.]+)([KMB])', cheque_range)
        if len(bounds) == 2:
            min_funding = parse_funding_range(f"${bounds[0][0]}{bounds[0][1]}")
            max_funding = parse_funding_range(f"${bounds[1][0]}{bounds[1][1]}")
            if (min_funding is not None and max_funding is not None
                    and min_funding <= funding_required <= max_funding):
                score += 30
    return score

# Function to generate compatibility score
def get_match_score(founder, investor):
    """Ask Gemini for a compatibility assessment of one founder/investor pair.

    Args:
        founder: Mapping with 'Industry', 'Stage', 'Funding_Required',
            'Traction' and 'Business_Model' keys.
        investor: Mapping with 'Name', 'Industry', 'Stage', 'Cheque_range'
            and 'Overview' keys.

    Returns:
        A CompatibilityResult when the reply contains a JSON object that
        validates, otherwise None (empty reply, no JSON object found, or a
        validation/parsing error, which is printed).

    Exceptions raised by the API call itself are not caught here.
    """
    prompt = f"""
    Given the following founder profile and investor preferences, calculate a compatibility score (0-100) and provide a structured explanation in JSON format:

    Founder Profile:
    Industry: {founder['Industry']}
    Stage: {founder['Stage']}
    Funding Required: {founder['Funding_Required']}
    Traction: {founder['Traction']}
    Business Model: {founder['Business_Model']}

    Investor Profile:
    Name: {investor['Name']}
    Industry: {investor['Industry']}
    Stage: {investor['Stage']}
    Cheque Range: {investor['Cheque_range']}
    Overview: {investor['Overview']}

    Respond in the following JSON format:
    {{
        "score": <int>,
        "industry_alignment": "<string>",
        "stage_alignment": "<string>",
        "funding_alignment": "<string>",
        "traction": "<string>",
        "business_model": "<string>",
        "issues": ["<string>", "<string>"],
        "recommendation": "<string>"
    }}
    """

    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content(prompt)

    if response and response.text:
        try:
            # Greedy match from the first "{" to the last "}" so that any text
            # around the JSON object is dropped.
            match = re.search(r'\{.*\}', response.text, re.DOTALL)
            if match:
                structured_response = match.group(0)
                # model_validate_json is the Pydantic V2 replacement for the
                # deprecated parse_raw.
                return CompatibilityResult.model_validate_json(structured_response)
        except ValidationError as e:
            print(f"Validation Error: {e}")
        except Exception as e:
            print(f"Error parsing JSON: {e}")

    return None

# Backoff logic for handling API retries
def get_match_score_with_backoff(founder, investor):
    """Call get_match_score with exponential backoff, up to MAX_RETRIES attempts.

    Every failed attempt counts against the budget, whether get_match_score
    raised or returned None, so the loop always terminates. Between attempts
    the function sleeps 2**attempt seconds plus up to one second of jitter.

    Args:
        founder: Founder mapping forwarded to get_match_score.
        investor: Investor mapping forwarded to get_match_score.

    Returns:
        The first truthy result returned by get_match_score.

    Raises:
        Exception: when all MAX_RETRIES attempts fail.
    """
    for attempt in range(MAX_RETRIES):
        try:
            result = get_match_score(founder, investor)
            if result:
                return result
            failure = "No valid response received"
        except Exception as e:
            failure = f"Request failed: {e}"

        if attempt < MAX_RETRIES - 1:
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"{failure}. Retrying in {wait_time:.2f} seconds...")
            time.sleep(wait_time)
        else:
            # No point sleeping when there is no attempt left.
            print(f"{failure}.")
    raise Exception("Max retries reached. Could not get a response.")

# Function to process all founders and investors
# Directory to store results
RESULTS_DIR = 'founder_matches'
os.makedirs(RESULTS_DIR, exist_ok=True)

# Investor dataset exported from the combined spreadsheet
INVESTORS_FILE = '/content/Combined file.xlsx - Sheet1.json'


def process_founders(founders, investors, top_n=5):
    """Score the best pre-ranked investors for each founder and save them to JSON.

    For every founder: pre-filter `investors`, keep the `top_n` highest
    relevance scores, ask Gemini for a compatibility score for each, then
    write the results (highest score first) to
    "<RESULTS_DIR>/<founder name>_matches.json".

    Args:
        founders: Iterable of founder mappings; each needs a 'Name' key plus
            the keys read by filter_investors and get_match_score.
        investors: pandas DataFrame of investors, passed to filter_investors.
        top_n: Number of pre-ranked investors sent to the API per founder.

    Raises:
        Exception: propagated from get_match_score_with_backoff when an
            investor cannot be scored within the retry budget.
    """
    for founder in founders:
        results = []

        print(f"\nProcessing founder: {founder['Name']}")
        print(founder)
        # Use the dataset that was passed in, not a notebook-level global.
        candidates = filter_investors(founder, investors)

        # Keep only the top_n candidates by relevance score to limit API calls.
        candidates = sorted(candidates, key=lambda inv: relevance_score(founder, inv), reverse=True)[:top_n]
        print(len(candidates))

        for investor in candidates:
            result = get_match_score_with_backoff(founder, investor)
            if result:
                results.append({
                    'Name': investor['Name'],
                    'Website': investor.get('Website', ''),
                    'Score': result.score,
                    'Industry Alignment': result.industry_alignment,
                    'Stage Alignment': result.stage_alignment,
                    'Funding Alignment': result.funding_alignment,
                    'Traction': result.traction,
                    'Business Model': result.business_model,
                    'Issues': result.issues,
                    'Recommendation': result.recommendation
                })

        # Rank by score (highest first)
        ranked_results = sorted(results, key=lambda x: x['Score'], reverse=True)

        # Output to JSON file (unique for each founder). Any character that is
        # not a word character, dot or hyphen is replaced so that a name
        # cannot introduce path separators.
        founder_name_cleaned = founder['Name'].replace(' ', '_').lower()
        founder_name_cleaned = re.sub(r'[^\w.-]', '_', founder_name_cleaned)
        filename = f"{RESULTS_DIR}/{founder_name_cleaned}_matches.json"
        with open(filename, 'w') as file:
            json.dump(ranked_results, file, indent=4)

        print(f"Results saved for {founder['Name']} in '{filename}'")


def load_json(file_path):
    """Read a JSON file and return its parsed contents."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Load founders from file
founders = load_json('founders.json')

# Raw investor records as parsed JSON
investors = load_json(INVESTORS_FILE)
# DataFrame view of the same file; filter_investors iterates it with .iterrows()
investor_data = pd.read_json(INVESTORS_FILE)

# Run the process against the full investor dataset
process_founders(founders, investor_data)



Processing founder: Alice Smith
{'Name': 'Alice Smith', 'Industry': 'Information Technology & Services', 'Stage': 'Seed', 'Funding_Required': '$1M', 'Traction': '10,000 active users, $50K MRR', 'Business_Model': 'SaaS'}
5


<ipython-input-17-76dbf945cfa7>:124: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  compatibility_result = CompatibilityResult.parse_raw(structured_response)


Results saved for Alice Smith in 'founder_matches/alice_smith_matches.json'

Processing founder: Bob Johnson
{'Name': 'Bob Johnson', 'Industry': 'Consumer, Future of Work', 'Stage': 'Series A', 'Funding_Required': '$2M', 'Traction': '5,000 active users, $100K ARR', 'Business_Model': 'Marketplace'}
0
Results saved for Bob Johnson in 'founder_matches/bob_johnson_matches.json'
