<a href="https://colab.research.google.com/github/Chetansahney/projects/blob/main/part(1)Assign.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import datetime

# ==========================================
# PHASE 1: THE SAMPLE DATASET (Crawler Output)
# ==========================================
# This simulates the JSON data your crawler would return after
# scraping LinkedIn, PubMed, and Crunchbase.
raw_crawler_data = [
    {
        "id": "L001",
        "name": "Dr. Sarah Chen",
        "title": "Director of Toxicology",
        "company": "Hepatox Bio",
        "email": "s.chen@hepatox.bio",
        "linkedin_url": "linkedin.com/in/schen-tox",
        "hq_location": "Cambridge, MA",
        "person_location": "Austin, TX",  # Remote worker!
        "funding_status": "Series B",     # +20 points
        "tech_stack": "In-vitro liver models", # +15 points
        "recent_publications": [
            "Mechanisms of Drug-Induced Liver Injury in 3D Spheroids (2024)"
        ] # +40 points
    },
    {
        "id": "L002",
        "name": "Michael Ross",
        "title": "Junior Research Associate",
        "company": "EduLab Inc",
        "email": "mike@edulab.org",
        "linkedin_url": "linkedin.com/in/mross-lab",
        "hq_location": "Kansas City, MO",
        "person_location": "Kansas City, MO",
        "funding_status": "Bootstrapped",
        "tech_stack": "Standard 2D cell culture",
        "recent_publications": []
    },
    {
        "id": "L003",
        "name": "Dr. Elena Vlasic",
        "title": "VP of Preclinical Safety",
        "company": "Novartis",
        "email": "elena.v@novartis.com",
        "linkedin_url": "linkedin.com/in/evlasic",
        "hq_location": "Basel, Switzerland", # Hub! +10 points
        "person_location": "Basel, Switzerland",
        "funding_status": "Public (IPO)",      # High intent
        "tech_stack": "Open to NAMs",        # +15 points
        "recent_publications": [
            "Safety assessment strategies for new biological entities"
        ] # Relevant, but maybe not 'Liver Injury' specific? Let's see the logic.
    },
    {
        "id": "L004",
        "name": "James Foster",
        "title": "Head of Hepatic Safety",
        "company": "LiverChip Tech",
        "email": "j.foster@liverchip.com",
        "linkedin_url": "linkedin.com/in/jfoster-safety",
        "hq_location": "London, UK", # Hub!
        "person_location": "London, UK",
        "funding_status": "Series A", # +20 points
        "tech_stack": "Organ-on-chip", # +15 points
        "recent_publications": [
            "Predicting DILI using organ-on-chip technologies"
        ] # +40 points!
    },
    {
        "id": "L005",
        "name": "Linda Wu",
        "title": "Software Engineer",
        "company": "BioTech Data",
        "email": "linda@biotechdata.io",
        "linkedin_url": "linkedin.com/in/lindawu",
        "hq_location": "San Francisco, CA",
        "person_location": "San Francisco, CA",
        "funding_status": "Series C",
        "tech_stack": "Python, SQL",
        "recent_publications": []
    }
]

# ==========================================
# PHASE 2: THE SCORING ENGINE (The Logic)
# ==========================================

class LeadScoringEngine:
    def __init__(self):
        # Configuration of Weights
        self.weights = {
            'role_fit': 30,
            'company_intent': 20,
            'tech_fit': 15,
            'location_hub': 10,
            'scientific_intent': 40
        }

        self.hubs = ['cambridge', 'boston', 'bay area', 'basel', 'london', 'oxford', 'san francisco']
        self.target_titles = ['toxicology', 'safety', 'hepatic', '3d', 'preclinical', 'head', 'director', 'vp']
        self.tech_keywords = ['in-vitro', 'vitro', 'nams', 'organ-on-chip', '3d']

    def calculate_score(self, lead):
        score = 0
        signals_hit = []

        # 1. Role Fit (+30)
        title_lower = lead['title'].lower()
        if any(t in title_lower for t in self.target_titles):
            score += self.weights['role_fit']
            signals_hit.append("Role Fit")

        # 2. Company Intent / Funding (+20)
        funding = lead['funding_status'].lower()
        if any(x in funding for x in ['series a', 'series b', 'ipo', 'public']):
            score += self.weights['company_intent']
            signals_hit.append("Funding")

        # 3. Technographic (+15)
        tech = lead['tech_stack'].lower()
        if any(x in tech for x in self.tech_keywords):
            score += self.weights['tech_fit']
            signals_hit.append("Tech Match")

        # 4. Location Hub (+10)
        hq = lead['hq_location'].lower()
        if any(h in hq for h in self.hubs):
            score += self.weights['location_hub']
            signals_hit.append("Hub Location")

        # 5. Scientific Intent (+40) - The "Golden" Signal
        # Checking for 'Liver' and ('Injury' or 'Toxicity') in recent papers
        has_paper = False
        for paper in lead['recent_publications']:
            p_lower = paper.lower()
            if 'liver' in p_lower or 'hepatic' in p_lower:
                if 'injury' in p_lower or 'toxicity' in p_lower or 'dili' in p_lower:
                    has_paper = True
                    break

        if has_paper:
            score += self.weights['scientific_intent']
            signals_hit.append("Scientific Paper")

        # Cap score at 100
        final_score = min(score, 100)
        return final_score, signals_hit

    def determine_action(self, score, is_remote):
        if score >= 90:
            action = "Call Now / High Priority"
            if is_remote:
                action = "Schedule Zoom (High Priority)"
        elif score >= 50:
            action = "Add to Email Sequence"
        else:
            action = "Nurture / Ignore"
        return action

    def process_leads(self, leads_data):
        processed = []

        for lead in leads_data:
            # Calculate Score
            score, signals = self.calculate_score(lead)

            # Determine Remote Status
            is_remote = lead['hq_location'] != lead['person_location']

            # Create the Location Split String
            if is_remote:
                loc_split = f"REMOTE: {lead['person_location']} (HQ: {lead['hq_location']})"
            else:
                loc_split = lead['hq_location']

            # Determine Action
            action = self.determine_action(score, is_remote)

            processed.append({
                "Rank": 0, # Placeholder
                "Probability": score,
                "Name": lead['name'],
                "Title": lead['title'],
                "Company": lead['company'],
                "Location Split": loc_split,
                "Email": lead['email'],
                "Signals Hit": ", ".join(signals), # Good for debugging
                "Action": action
            })

        # Create DataFrame and Sort
        df = pd.DataFrame(processed)
        df = df.sort_values(by="Probability", ascending=False)

        # Add Rank
        df['Rank'] = range(1, len(df) + 1)

        # Reorder columns
        cols = ['Rank', 'Probability', 'Name', 'Title', 'Company', 'Location Split', 'Email', 'Action', 'Signals Hit']
        return df[cols]

# ==========================================
# PHASE 3: EXECUTION
# ==========================================

if __name__ == "__main__":
    # Initialize Engine
    engine = LeadScoringEngine()

    # Run the Engine on the Crawler Data
    dashboard = engine.process_leads(raw_crawler_data)

    # Display Options for pretty printing
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.colheader_justify', 'left')

    print("\n--- LEAD GENERATION DASHBOARD OUTPUT ---\n")
    # Using to_markdown for a nice table look in the console
    print(dashboard.to_markdown(index=False))

    # Optional: Save to CSV
    # dashboard.to_csv("ranked_leads.csv", index=False)


--- LEAD GENERATION DASHBOARD OUTPUT ---

|   Rank |   Probability | Name             | Title                     | Company        | Location Split                         | Email                  | Action                        | Signals Hit                                                   |
|-------:|--------------:|:-----------------|:--------------------------|:---------------|:---------------------------------------|:-----------------------|:------------------------------|:--------------------------------------------------------------|
|      1 |           100 | Dr. Sarah Chen   | Director of Toxicology    | Hepatox Bio    | REMOTE: Austin, TX (HQ: Cambridge, MA) | s.chen@hepatox.bio     | Schedule Zoom (High Priority) | Role Fit, Funding, Tech Match, Hub Location, Scientific Paper |
|      2 |            75 | Dr. Elena Vlasic | VP of Preclinical Safety  | Novartis       | Basel, Switzerland                     | elena.v@novartis.com   | Add to Email Sequence         | Role Fit,

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# ======================================================
# STEP 1: GENERATE "HISTORICAL" TRAINING DATA
# ======================================================
# In a real company, this comes from your CRM (Salesforce/HubSpot).
# We will simulate 1,000 past leads to "teach" the model.

def generate_historical_data(n_rows=1000):
    np.random.seed(42) # Consistent results

    titles = ['Director of Toxicology', 'VP Safety', 'Senior Scientist', 'Intern', 'Student',
              'Head of Liver Safety', 'Research Assistant', 'CEO', 'CFO', 'Lab Tech']
    fundings = ['Series A', 'Series B', 'Seed', 'Public', 'Bootstrapped', 'None']
    locs = ['Cambridge, MA', 'San Francisco, CA', 'Basel', 'Kansas', 'Texas', 'Remote', 'London']
    techs = ['Standard Assays', 'In-vitro models', 'Organ-on-chip', '2D Culture', 'None', 'NAMs']

    data = []

    for _ in range(n_rows):
        # 1. Randomly pick attributes
        title = np.random.choice(titles)
        funding = np.random.choice(fundings)
        loc = np.random.choice(locs)
        tech = np.random.choice(techs)
        has_paper = np.random.choice([0, 1], p=[0.7, 0.3]) # 30% have papers

        # 2. Assign "Hidden" True Probability (The Ground Truth we want the ML to learn)
        # This simulates real life: Directors & Series B companies are more likely to buy.
        prob = 0.05 # Base rate

        if 'Director' in title or 'VP' in title or 'Head' in title: prob += 0.30
        if 'Intern' in title or 'Student' in title: prob -= 0.05
        if funding in ['Series A', 'Series B', 'Public']: prob += 0.20
        if 'chip' in tech or 'In-vitro' in tech: prob += 0.15
        if has_paper: prob += 0.25

        # 3. Determine if they actually converted (0 or 1) based on that probability
        converted = 1 if np.random.rand() < prob else 0

        data.append({
            'Title': title,
            'Funding': funding,
            'Location': loc,
            'Tech_Stack': tech,
            'Has_Recent_Paper': has_paper, # 1 or 0
            'Converted': converted # TARGET VARIABLE
        })

    return pd.DataFrame(data)

# Generate the training dataset
df_history = generate_historical_data()
print(f"--- TRAINING DATA GENERATED: {len(df_history)} rows ---")
print(df_history.head(3).to_markdown(index=False))
print("-" * 60)

# ======================================================
# STEP 2: BUILD THE ML PIPELINE
# ======================================================

# A. Define Features
text_features = 'Title'
categorical_features = ['Funding', 'Location', 'Tech_Stack']
# 'Has_Recent_Paper' is already numeric (0/1), so we pass it through

# B. Create Preprocessing Transformers
# 1. Titles are text: We count words ("Director", "Toxicology")
title_transformer = CountVectorizer(stop_words='english')

# 2. Funding/Location are categories: We turn them into vectors
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# C. Combine into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('title', title_transformer, 'Title'),
        ('cat', cat_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep 'Has_Recent_Paper' as is
)

# D. The Full Pipeline: Preprocessor -> Logistic Regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# ======================================================
# STEP 3: TRAIN THE MODEL
# ======================================================
X = df_history.drop('Converted', axis=1)
y = df_history['Converted']

# Train on the 1,000 historical rows
model_pipeline.fit(X, y)
print("✅ MODEL TRAINED successfully on historical data.")

# Show what the model learned (Coefficients)
# This explains *why* it scores people high
feature_names = (model_pipeline.named_steps['preprocessor']
                 .transformers_[0][1].get_feature_names_out().tolist() +
                 model_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out().tolist() +
                 ['Has_Recent_Paper'])

coeffs = model_pipeline.named_steps['classifier'].coef_[0]
importance = pd.DataFrame({'Feature': feature_names, 'Weight': coeffs})
print("\n--- WHAT THE MODEL LEARNED (Top Positive Signals) ---")
print(importance.sort_values(by='Weight', ascending=False).head(5).to_markdown(index=False))
print("-" * 60)

# ======================================================
# STEP 4: PREDICT ON NEW LEADS (The Crawler Output)
# ======================================================

# These are the "Live" leads your crawler just found
new_leads = pd.DataFrame([
    {
        'Name': 'Dr. Sarah Chen',
        'Title': 'Director of Toxicology',
        'Funding': 'Series B',
        'Location': 'Cambridge, MA',
        'Tech_Stack': 'In-vitro models',
        'Has_Recent_Paper': 1
    },
    {
        'Name': 'Mike Ross',
        'Title': 'Summer Intern',
        'Funding': 'None',
        'Location': 'Kansas',
        'Tech_Stack': 'None',
        'Has_Recent_Paper': 0
    },
    {
        'Name': 'James Foster',
        'Title': 'Head of Hepatic Safety',
        'Funding': 'Series A',
        'Location': 'London',
        'Tech_Stack': 'Organ-on-chip',
        'Has_Recent_Paper': 1
    },
    {
        'Name': 'Linda Wu',
        'Title': 'Software Engineer',
        'Funding': 'Public',
        'Location': 'San Francisco, CA',
        'Tech_Stack': 'Java',
        'Has_Recent_Paper': 0
    }
])

# Predict Probabilities
# predict_proba returns [Prob_0, Prob_1]. We want Prob_1 (Conversion).
probs = model_pipeline.predict_proba(new_leads)[:, 1]

# Add to DataFrame and Scale to 0-100 for the Dashboard
new_leads['Score'] = (probs * 100).round(1)

# Assign Actions based on ML Score
def get_action(score):
    if score > 80: return "Call Now"
    if score > 50: return "Email Sequence"
    return "Ignore"

new_leads['Action'] = new_leads['Score'].apply(get_action)

# Sort and Rank
final_dashboard = new_leads.sort_values(by='Score', ascending=False)
final_dashboard.insert(0, 'Rank', range(1, 1 + len(final_dashboard)))

print("\n--- FINAL PREDICTIVE DASHBOARD ---")
print(final_dashboard[['Rank', 'Score', 'Name', 'Title', 'Action']].to_markdown(index=False))

--- TRAINING DATA GENERATED: 1000 rows ---
| Title              | Funding   | Location   | Tech_Stack    |   Has_Recent_Paper |   Converted |
|:-------------------|:----------|:-----------|:--------------|-------------------:|------------:|
| Research Assistant | Public    | Texas      | Organ-on-chip |                  1 |           1 |
| Lab Tech           | Seed      | London     | Organ-on-chip |                  0 |           0 |
| CEO                | Seed      | Remote     | None          |                  1 |           0 |
------------------------------------------------------------
✅ MODEL TRAINED successfully on historical data.

--- WHAT THE MODEL LEARNED (Top Positive Signals) ---
| Feature          |   Weight |
|:-----------------|---------:|
| Has_Recent_Paper | 1.27546  |
| safety           | 0.769753 |
| Funding_Series A | 0.736151 |
| director         | 0.708975 |
| toxicology       | 0.708975 |
------------------------------------------------------------

--- FINAL P