In [1]:
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Embedding model used for the investor collection: the `BAAI/bge-small-en` checkpoint.
embed_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en",
)

In [3]:
import json
# Load the investor profiles. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the file is decoded the same way on every platform
# instead of depending on the locale's default codec.
with open("investor_profiles.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The file wraps the list of profiles under a top-level "investors" key.
investors = data["investors"]

In [7]:
# Load the startup record that will be matched against the investors.
# The encoding is pinned to UTF-8 so decoding does not depend on the locale.
with open("EcoTech_recommendations_augmented.json", "r", encoding="utf-8") as f:
    startup = json.load(f)

# Display the record so the available keys are visible to the reader.
startup

{'Problem': 'Inefficient urban waste collection causes $380M annual cost impact; pilot cities show 60% potential improvement in recycling rates.',
 'Solution': 'AI-powered routing algorithm + IoT smart bins reduces collection trips by 40% vs competitors; proprietary technology validated in 3 municipalities.',
 'Market': 'TAM/SAM/SOM clearly defined ($3.8B/$1.5B/$500M); growth 15%; competitors include WasteRobotics, RecycleBot with clear differentiation.',
 'Product': 'MVP pilots completed; IoT sensors uptime 99%, mobile app retention 95%, routing algorithm validated; moat: data network effects.',
 'Team': 'Founders have 30 yrs combined experience; relevant municipal scaling & tech expertise; advisory board selected.',
 'Business_Model': 'Subscription pricing tiers clarified: Basic $2K, Premium $5K, Enterprise $10K; revenue split: municipalities pay monthly fees for premium features.'}

### Create ChromaDB

In [8]:
# Vector store holding the investor profiles; it is given `embed_model` as its
# embedding function.
db = Chroma(
    embedding_function=embed_model,
    collection_name="investors",
)

### Add investors

In [12]:
# Index every investor in a single batch.
# The records are assembled first so `db.add_texts` (and therefore the embedding
# model) is invoked once for the whole collection rather than once per investor.
records_by_id = {}
for inv in investors:
    # The embedded text is the investment thesis followed by the industry tags.
    text_to_embed = inv["thesis_text"] + " " + " ".join(inv["industry_tags"])

    # List-valued fields are flattened to ", "-joined strings; split them on ", "
    # again before doing exact membership tests on them.
    metadata = {
        "id": inv["id"],
        "name": inv["name"],
        "stage_focus": ", ".join(inv["stage_focus"]),
        "ticket_min_usd": inv["ticket_min_usd"],
        "ticket_max_usd": inv["ticket_max_usd"],
        "preferred_geographies": ", ".join(inv["preferred_geographies"]),
        "industry_tags": ", ".join(inv["industry_tags"]),
        "thesis_text": inv["thesis_text"],
        "contact_email": inv["contact"]["email"],
    }

    # Keyed by id so a repeated id keeps only its last record, matching what the
    # previous one-call-per-investor loop produced (presumably via upsert — confirm).
    records_by_id[inv["id"]] = (text_to_embed, metadata)

# Skip the call entirely when there is nothing to add, as the per-item loop did.
if records_by_id:
    texts, metadatas = zip(*records_by_id.values())
    db.add_texts(list(texts), metadatas=list(metadatas), ids=list(records_by_id))

### Query Construction for Startup

In [None]:
# Build the free-text query that is embedded and compared with investor theses.
#
# The startup record displayed earlier in this notebook uses "Problem" / "Solution"
# keys and has no "industry_tags", "stage" or "funding_ask_usd", so indexing the
# snake_case names directly raises KeyError. Accept either spelling for the two
# core fields and treat the remaining ones as optional enrichments.
problem_text = startup.get("problem_statement") or startup.get("Problem", "")
solution_text = startup.get("solution_description") or startup.get("Solution", "")

query_parts = [
    problem_text,
    solution_text,
    " ".join(startup.get("industry_tags", [])),
]
if "stage" in startup:
    query_parts.append(f"Stage: {startup['stage']}")
if "funding_ask_usd" in startup:
    query_parts.append(f"Funding ask: ${startup['funding_ask_usd']}")

# Drop empty pieces so missing fields do not leave stray separators.
query_text = " ".join(part for part in query_parts if part)

# An empty query would make the similarity search meaningless; fail loudly instead.
if not query_text:
    raise KeyError("startup record has no problem/solution text to build a query from")


### Similarity Search with Score

In [None]:
# Fetch the two investor documents closest to the startup query, with scores.
# The method name was missing (`db.(...)` is a SyntaxError); the (document, score)
# unpacking below corresponds to `similarity_search_with_score`.
# NOTE: per the LangChain Chroma docs this score is a distance, so a LOWER value
# means a closer match.
results = db.similarity_search_with_score(query_text, k=2)

for match, score in results:
    print(f"{match.metadata['name']} — Score: {score:.4f}")


### Structured filters

In [None]:
# Keep only the matches whose ticket range contains the funding ask and whose
# stage focus lists the startup's stage.
#
# `stage_focus` was stored as a ", "-joined string when the investors were added,
# so it is split back into individual stages. Testing `stage in "<joined string>"`
# is a substring match and would let "Seed" match an investor listing "Pre-Seed".
#
# NOTE(review): the startup record displayed earlier in this notebook has no
# "stage" or "funding_ask_usd" keys — confirm the input file provides them.
filtered = [
    (doc, score)
    for doc, score in results
    if doc.metadata["ticket_min_usd"] <= startup["funding_ask_usd"] <= doc.metadata["ticket_max_usd"]
    and startup["stage"] in doc.metadata["stage_focus"].split(", ")
]


#### Apply Structured Filter & Rescore:

In [None]:
# Filter the search results on stage, geography and ticket size, then rescore and
# rank the survivors (best match first).
#
# Fixes relative to the previous version of this cell:
#   * it read from `startup_profile`, which is never defined in this notebook;
#     the loaded record is `startup`, and the funding field used by the other
#     cells is "funding_ask_usd" (not "funding_need");
#   * stage/geography metadata are ", "-joined strings, so they are split before
#     the membership test to avoid substring matches ("Seed" in "Pre-Seed");
#   * the vector-store score is treated as a distance (lower = closer), so it is
#     mapped to a similarity before boosting; ranking the raw distance in
#     descending order would put the worst match first.
#
# NOTE(review): assumes `results` comes from `similarity_search_with_score`
# (distance scores) and that the startup record carries "stage", "geography" and
# "funding_ask_usd" — the record displayed earlier in this notebook has none of
# these keys; confirm against the real input.
filtered = []
for doc, distance in results:
    meta = doc.metadata
    stage_ok = startup["stage"] in meta["stage_focus"].split(", ")
    geography_ok = startup["geography"] in meta["preferred_geographies"].split(", ")
    ticket_ok = meta["ticket_min_usd"] <= startup["funding_ask_usd"] <= meta["ticket_max_usd"]

    if stage_ok and geography_ok and ticket_ok:
        # Map a non-negative distance into (0, 1], where higher means more similar.
        similarity = 1.0 / (1.0 + distance)
        final_score = 0.7 * similarity + 0.3  # example boosting
        filtered.append((meta, final_score))

top_investors = sorted(filtered, key=lambda x: x[1], reverse=True)
