# Setting up environment

In [24]:
!mkdir -p app data report

In [25]:
!pip install openai beautifulsoup4 pandas streamlit scraper



# Scrape crunchbase sample

In [26]:
# smart_lead_refiner/app/scraper.py

import requests
import pandas as pd
import os
import json

# Load mock data from JSON file
MOCK_FILE = "data/mock_company_data.json"

def load_mock_data():
    with open(MOCK_FILE, "r") as file:
        return json.load(file)

# Clearbit or mock enrichment function
def enrich_company(domain, api_key):
    if os.environ.get("USE_MOCK_DATA") == "1":
        mock_db = load_mock_data()
        return mock_db.get(domain, None)

    url = f"https://company.clearbit.com/v2/companies/find?domain={domain}"
    headers = {"Authorization": f"Bearer {api_key}"}
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        return None

# Wrapper to build DataFrame from list of domains
def scrape_via_clearbit(domains, api_key="mock"):
    data = []
    for domain in domains:
        company = enrich_company(domain, api_key)
        if company:
            data.append({
                "Company Name": company.get("name"),
                "Website": company.get("domain"),
                "Industry": company.get("category", {}).get("industry", "N/A"),
                "Team Size": company.get("metrics", {}).get("employees", "N/A"),
                "Funding": company.get("metrics", {}).get("raised", "N/A"),
                "Location": company.get("location", "N/A")
            })
    return pd.DataFrame(data)

# Example usage:
if __name__ == "__main__":
    os.environ["USE_MOCK_DATA"] = "1"
    domains = ["openai.com", "notion.so", "zapier.com", "loom.com", "figma.com"]
    df = scrape_via_clearbit(domains)
    print(df)

  Company Name     Website               Industry Team Size Funding  \
0       OpenAI  openai.com            AI Research     >1000   $11B+   
1       Notion   notion.so  Productivity Software   200-500   $350M   
2       Zapier  zapier.com        Automation SaaS  500-1000   $1.3B   
3         Loom    loom.com        Video Messaging   100-200   $203M   
4        Figma   figma.com   Design Collaboration  500-1000   $332M   

            Location  
0  San Francisco, CA  
1  San Francisco, CA  
2             Remote  
3  San Francisco, CA  
4  San Francisco, CA  


# Lead scoring

In [27]:
def score_lead(row):
    score = 0

    # Funding weight
    try:
        if isinstance(row["Funding"], str) and "$" in row["Funding"]:
            amount = float(row["Funding"].replace("$", "").replace("B", "e9").replace("M", "e6").replace("+", ""))
            score += amount / 1e8
        elif isinstance(row["Funding"], (int, float)):
            score += row["Funding"] / 1e8
    except:
        pass

    # Team size weight
    try:
        if isinstance(row["Team Size"], str):
            size = int(str(row["Team Size"]).split("-")[0].replace(">", ""))
            if size >= 1000:
                score += 10
            elif size >= 500:
                score += 8
            elif size >= 200:
                score += 6
        elif isinstance(row["Team Size"], (int, float)):
            size = row["Team Size"]
            if size >= 1000:
                score += 10
            elif size >= 500:
                score += 8
            elif size >= 200:
                score += 6
    except:
        pass

    # Industry relevance
    if isinstance(row["Industry"], str) and any(keyword in row["Industry"].lower() for keyword in ["ai", "automation", "software"]):
        score += 5

    return round(score, 2)

if __name__ == "__main__":
    from app.scraper import scrape_via_clearbit
    api_key = "sk_your_clearbit_key_here"
    df = scrape_via_clearbit(["openai.com", "notion.so"], api_key)
    df["Score"] = df.apply(score_lead, axis=1)
    print(df.sort_values("Score", ascending=False))

ModuleNotFoundError: No module named 'app.scraper'

#static email generator

In [None]:
def generate_email(company_name, industry, pain_point, contact_name="there"):
    email_template = f"""
    Hi {contact_name},

    I came across {company_name} and was really impressed by your work in the {industry} space.

    I've worked with a number of growing companies facing similar challenges around {pain_point}. Based on what I've seen, I believe there's real potential to optimize operations and accelerate growth with a bit of smart tooling.

    Would love to share a few ideas — totally no pressure. Let me know if a quick 15-minute chat next week works for you.

    Best,
    Barneel Ray
    AI & Automation Enthusiast | Robotics Engineer
    """
    return email_template.strip()

# Example:
if __name__ == "__main__":
    print(generate_email("Notion", "Productivity Software", "managing cross-functional remote teams", "Akshay"))


# Pipeline execution

In [None]:
from app.scraper import scrape_via_clearbit
from app.lead_scorer import score_lead
from app.email_generator import generate_email
import pandas as pd

# Step 1: Provide your API Key and domains
api_key = "sk_your_clearbit_key_here"
domains = ["openai.com", "notion.so", "zapier.com"]

# Step 2: Enrich data
df = scrape_via_clearbit(domains, api_key)

# Step 3: Score leads
df["Score"] = df.apply(score_lead, axis=1)

# Step 4: Add pain points + generate emails
pain_point_map = {
    "AI Research": "scaling AI model deployment and compliance",
    "Productivity Software": "managing cross-functional remote teams",
    "Automation SaaS": "workflow fragmentation and integration bottlenecks"
}

def get_pain_point(industry):
    return pain_point_map.get(industry, "growth challenges in tech sectors")

df["Pain Point"] = df["Industry"].apply(get_pain_point)
df["Email"] = df.apply(lambda row: generate_email(row["Company Name"], row["Industry"], row["Pain Point"]), axis=1)

# Step 5: Sort + Save
sorted_df = df.sort_values("Score", ascending=False)
sorted_df.to_csv("data/leads_scored.csv", index=False)

print("\n--- Pipeline Complete: Top Leads ---\n")
print(sorted_df[["Company Name", "Score"]])