In [1]:
# ==========================================
# DYNAMIC KNAPSACK TEAM FORMATION
# ==========================================
# This script implements an "Iterative Dynamic Knapsack" approach.
# Unlike standard Knapsack (static values), this updates item values
# dynamically to represent "Marginal Utility" (new skills covered),
# ensuring efficient coverage without redundancy.

import pandas as pd
import numpy as np
import ast
import random
import datetime
import os
import nlp_techniques
import M1
import math

[nltk_data] Downloading package wordnet to /Users/tej/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tej/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# ==========================================
# 1. SETUP & DATA LOADING
# ==========================================
print("Start time:\t", datetime.datetime.now())

# Configuration
INPUT_DIR = "../data/v1_input_files/"
OUT_DIR = "../data/v1_output_teaming/teaming_1698proposals_316researchers/"
os.makedirs(OUT_DIR, exist_ok=True)

# Import list of researchers
og_researchers = pd.read_csv(INPUT_DIR + 'v1_researchers.csv')

# Import proposals
proposal_info = pd.read_csv(INPUT_DIR + 'v1_proposal_links_title_synopsis.csv')
proposal_info.sort_values(["nsf_proposal_links_v1"], ascending=[False], inplace=True)
proposal_info.pop("Unnamed: 0")
proposal_info.reset_index(drop=True, inplace=True)

# Remove error case
error_values = ['https://www.nsf.gov/funding/pgm_summ.jsp?pims_id=505073'] 
proposal_info = proposal_info[proposal_info.nsf_proposal_links_v1.isin(error_values) == False]
proposal_info.reset_index(drop=True, inplace=True)

print(f"Loaded {len(og_researchers)} researchers and {len(proposal_info)} proposals.")

Start time:	 2026-01-28 19:20:48.108072
Loaded 202 researchers and 434 proposals.


In [3]:
# ==========================================
# 2. PROCESS RESEARCHER SKILLS 
# ==========================================
# Standard NLP Preprocessing (The "Eyes" of the system)
print("Processing Researcher Skills...")

m1_researcher_skills = {}

for i in range(len(og_researchers["research"])):
    researcher = og_researchers["names"][i]
    interests = og_researchers["research"][i]
    
    if pd.isna(interests):
        interests = "['research', 'general', 'computer', 'science', 'engineering']"
    
    try:
        interests_list = ast.literal_eval(interests)
        if isinstance(interests_list, list) and len(interests_list) > 0:
            interests_list = interests_list[0].split(", ")
        else:
            interests_list = []
    except:
        interests_list = []

    processed_interests = [nlp_techniques.preprocess(x) for x in interests_list if x]
    
    n_grams = []
    for term in processed_interests:
        n_grams.extend(nlp_techniques.generate_N_grams(term, ngram=2))
        
    final_skills = set(processed_interests + n_grams)
    final_skills.discard('')
    m1_researcher_skills[researcher] = final_skills

# Create Global Skill Pool
m1_all_researcher_skills = []
for r in m1_researcher_skills:
    for s in m1_researcher_skills[r]:
        if s not in m1_all_researcher_skills:
            m1_all_researcher_skills.append(s)

Processing Researcher Skills...


In [4]:
# ==========================================
# 3. PROCESS PROPOSAL SKILLS
# ==========================================
print("Processing Proposal Skills...")

m1_proposal_skills = {}

for i in range(len(proposal_info["nsf_proposal_links_v1"])):
    p_link = proposal_info["nsf_proposal_links_v1"][i]
    title = str(proposal_info["title"][i]) if pd.notna(proposal_info["title"][i]) else "general"
    synopsis = str(proposal_info["synopsis"][i]) if pd.notna(proposal_info["synopsis"][i]) else ""
    
    title = nlp_techniques.preprocess(title)
    synopsis = nlp_techniques.preprocess(synopsis)
    
    keywords = title.split(" ") + synopsis.split(" ")
    title_n = nlp_techniques.generate_N_grams(title, ngram=2)
    synopsis_n = nlp_techniques.generate_N_grams(synopsis, ngram=2)
    
    all_keywords = set(keywords + title_n + synopsis_n)
    
    # Filter: Keep only skills that exist in researcher pool
    valid_skills = {k for k in all_keywords if k in m1_all_researcher_skills}
    
    if not valid_skills:
        valid_skills = {"general"}
        
    m1_proposal_skills[p_link] = valid_skills

Processing Proposal Skills...


In [5]:
# ==========================================
# 3.5 PRE-CALCULATE SKILL WEIGHTS (IDF)
# ==========================================
print("Calculating Skill Rarity (Supply & Demand)...")

skill_counts = {}
total_researchers = len(og_researchers)

# 1. Count global frequency of every skill
for r in m1_researcher_skills:
    skills = m1_researcher_skills[r]
    for s in skills:
        skill_counts[s] = skill_counts.get(s, 0) + 1

# 2. Calculate Inverse Document Frequency (IDF) Weight
# Formula: Weight = log( Total / (Freq + 1) ) + 1
skill_weights = {}
for s, count in skill_counts.items():
    # Rare skills -> High Weight | Common skills -> Low Weight
    weight = math.log(total_researchers / (count + 1)) + 1
    skill_weights[s] = weight

# Fallback for completely new/unseen skills
DEFAULT_WEIGHT = 1.0

print(f"Weight Index Built. Examples:")
print(f" - Common (High Freq): {list(skill_weights.values())[-1]:.2f}")
print(f" - Rare (Low Freq): {list(skill_weights.values())[0]:.2f}")

Calculating Skill Rarity (Supply & Demand)...
Weight Index Built. Examples:
 - Common (High Freq): 5.62
 - Rare (Low Freq): 5.62


In [13]:


def solve_dynamic_knapsack(target_r, p_link, pseudo_skills_map, capacity_limit=8, gain_threshold=0.0, stochastic=False):
    """
    Principled Information-Theoretic Knapsack:
    - Adaptive Friction: Cost is based on the proposal's local skill weights.
    - Diminishing Returns: Naturally encourages coverage and robustness.
    """
    current_team = [target_r]
    req_skills = m1_proposal_skills.get(p_link, set())
    if not req_skills: return [target_r]

    # --- UPDATED GENERIC AUTO-COST (The "Local Market" Fix) ---
    # We calculate the average importance (weight) of a single skill in THIS proposal.
    # --- THE PRECISION TUNING ---
    if req_skills:
        avg_skill_weight = sum([skill_weights.get(s, 1.0) for s in req_skills]) / len(req_skills)
        
        # Lowering from 0.2 to 0.05. 
        # This makes 'seats' cheaper, encouraging the algorithm to fill more skill gaps.
        cost_constant = avg_skill_weight * 0.05 
    else:
        cost_constant = 0.01
    # ---------------------------------------------------------

    coverage_counts = {skill: 0 for skill in req_skills}
    
    # Initialize with target researcher's knowledge
    target_skills = set(pseudo_skills_map.get(target_r, set())).intersection(req_skills)
    for skill in target_skills:
        coverage_counts[skill] += 1

    while len(current_team) < capacity_limit:
        candidates = []
        available_pool = [r for r in og_researchers["names"] if r not in current_team]
        
        for cand in available_pool:
            cand_skills = set(pseudo_skills_map.get(cand, set())).intersection(req_skills)
            total_cand_utility = 0
            
            for skill in cand_skills:
                n = coverage_counts[skill]
                weight = skill_weights.get(skill, 1.0)
                # Mathematical Principle: Diminishing Marginal Returns (1.0 -> 0.5 -> 0.25)
                total_cand_utility += weight * (math.pow(0.5, n))
            
            # Net Gain = Information Gain - Adaptive Complexity Cost
            marginal_utility = total_cand_utility - cost_constant

            if marginal_utility > 0:
                candidates.append((cand, marginal_utility))

        if not candidates: break
        
        candidates.sort(key=lambda x: x[1], reverse=True)
        
        if not stochastic:
            best_cand, max_utility = candidates[0]
        else:
            # Stochastic selection from top 3 for team variety
            top_pool = candidates[:3] 
            best_cand, max_utility = random.choice(top_pool)

        if max_utility > gain_threshold:
            current_team.append(best_cand)
            new_skills = set(pseudo_skills_map.get(best_cand, set())).intersection(req_skills)
            for s in new_skills:
                coverage_counts[s] += 1
        else:
            break

    return current_team

In [14]:
# ==========================================
# TEST RUN: GENERATE TEAMS FOR FIRST 5 PROPOSALS
# ==========================================
print("Running TEST RUN for first 5 proposals...")

m1_teaming = {} 
m1_goodness_scores = {}
num_test_proposals = 50  # <--- Change this to test more or less

# Ensure initialization
if 'm1_pseudo_researcher_skills' not in locals():
    m1_pseudo_researcher_skills = {}

string_matching_threshold = 0.3
print("Num req skills for proposal:", len(m1_proposal_skills[p_link]))
print("Sample req skills:", list(m1_proposal_skills[p_link])[:10])


# Only loop through the first few links
for i in range(min(num_test_proposals, len(proposal_info["nsf_proposal_links_v1"]))):
    p_link = proposal_info["nsf_proposal_links_v1"][i]
    m1_teaming[p_link] = []
    m1_goodness_scores[p_link] = []

    print(f"\n--- Testing Proposal {i+1}: {p_link} ---")

    # Calculate Pseudo Skills
    if p_link not in m1_pseudo_researcher_skills:
        _, pseudo_skills = M1.string_matching_ranking(
            m1_researcher_skills, 
            m1_proposal_skills[p_link], 
            {}, 
            matching_threshold=string_matching_threshold
        )
        m1_pseudo_researcher_skills[p_link] = pseudo_skills

    current_map = m1_pseudo_researcher_skills[p_link]

    # Just pick 1 or 2 target researchers to keep the test fast
    for j in range(min(2, len(og_researchers["names"]))): 
        target_r = og_researchers["names"][j]
        
        # 1. Deterministic Best
        best_team = solve_dynamic_knapsack(target_r, p_link, current_map, stochastic=False)
        current_teams_for_r = [best_team]
        
        # 2. Stochastic Variations
        attempts = 0
        while len(current_teams_for_r) < 10 and attempts < 40:
            alt_team = solve_dynamic_knapsack(target_r, p_link, current_map, stochastic=True)
            if sorted(alt_team) not in [sorted(t) for t in current_teams_for_r]:
                current_teams_for_r.append(alt_team)
            attempts += 1
            
        while len(current_teams_for_r) < 10:
            current_teams_for_r.append(best_team)

        # 3. Store and Judge
        scores = [M1.apply_ultra_metric(m1_proposal_skills[p_link], t, current_map) for t in current_teams_for_r]
        
        # PRINT SUMMARY FOR THIS RESEARCHER
        print(f"Target: {target_r} | Best Team Size: {len(best_team)} | Best Score: {max(scores):.4f}")

print("\nTest Run Complete.")

Running TEST RUN for first 5 proposals...
Num req skills for proposal: 6
Sample req skills: ['mm', 'statistic', 'national science', 'development', 'science foundation', 'production']

--- Testing Proposal 1: https://www.nsf.gov/pubs/2021/nsf21598/nsf21598.htm ---
Target: Agostinelli, Forest | Best Team Size: 6 | Best Score: 0.6500
Target: Ahmad, Iftikhar | Best Team Size: 6 | Best Score: 0.6500

--- Testing Proposal 2: https://www.nsf.gov/pubs/2021/nsf21527/nsf21527.htm ---
Target: Agostinelli, Forest | Best Team Size: 6 | Best Score: 0.6500
Target: Ahmad, Iftikhar | Best Team Size: 6 | Best Score: 0.6500

--- Testing Proposal 3: https://www.nsf.gov/pubs/2020/nsf20609/nsf20609.htm ---
Target: Agostinelli, Forest | Best Team Size: 6 | Best Score: 0.6500
Target: Ahmad, Iftikhar | Best Team Size: 6 | Best Score: 0.6500

--- Testing Proposal 4: https://www.nsf.gov/pubs/2020/nsf20604/nsf20604.htm ---
Target: Agostinelli, Forest | Best Team Size: 6 | Best Score: 0.6500
Target: Ahmad, Iftikha

In [12]:

# ==========================================
# 5. GENERATE DIVERSE TEAMS (Principled Optimization)
# ==========================================
print("Running Principled Dynamic Knapsack with Diversity...")

m1_teaming = {} 
m1_goodness_scores = {}

# Ensure initialization
if 'm1_pseudo_researcher_skills' not in locals():
    m1_pseudo_researcher_skills = {}

# Use the semantic threshold consistent with M3
string_matching_threshold = 0.3

for i in range(len(proposal_info["nsf_proposal_links_v1"])):
    p_link = proposal_info["nsf_proposal_links_v1"][i]
    m1_teaming[p_link] = []
    m1_goodness_scores[p_link] = []

    if i % 50 == 0:
        print(f"Processing proposal {i} / {len(proposal_info)}...")

    # Calculate Pseudo Skills (Semantic Vision)
    if p_link not in m1_pseudo_researcher_skills:
        _, pseudo_skills = M1.string_matching_ranking(
            m1_researcher_skills, 
            m1_proposal_skills[p_link], 
            {}, 
            matching_threshold=string_matching_threshold
        )
        m1_pseudo_researcher_skills[p_link] = pseudo_skills

    current_map = m1_pseudo_researcher_skills[p_link]

    for j in range(len(og_researchers["names"])):
        target_r = og_researchers["names"][j]
        
        # 1. Deterministic Best (The 'Anchor' Team)
        # This uses the pure mathematical optimum
        best_team = solve_dynamic_knapsack(target_r, p_link, current_map, stochastic=False)
        current_teams_for_r = [best_team]
        
        # 2. Stochastic Variations (Diversity)
        # Finds 9 other teams that are near-optimal
        attempts = 0
        while len(current_teams_for_r) < 10 and attempts < 40:
            alt_team = solve_dynamic_knapsack(target_r, p_link, current_map, stochastic=True)
            
            # Canonical check: Ensure unique membership, regardless of order
            if sorted(alt_team) not in [sorted(t) for t in current_teams_for_r]:
                current_teams_for_r.append(alt_team)
            attempts += 1
            
        # Fallback if diversity pool is small
        while len(current_teams_for_r) < 10:
            current_teams_for_r.append(best_team)

        # 3. Store Results and Compute Judge Metrics
        m1_teaming[p_link].append([target_r, current_teams_for_r])
        
        # Calculate Goodness for each variation using the ULTRA metric as the 'Judge'
        scores = [M1.apply_ultra_metric(m1_proposal_skills[p_link], t, current_map) for t in current_teams_for_r]
        m1_goodness_scores[p_link].append([target_r, scores])

print("Generation Complete.")

Running Principled Dynamic Knapsack with Diversity...
Processing proposal 0 / 434...
Processing proposal 50 / 434...
Processing proposal 100 / 434...
Processing proposal 150 / 434...
Processing proposal 200 / 434...
Processing proposal 250 / 434...
Processing proposal 300 / 434...
Processing proposal 350 / 434...
Processing proposal 400 / 434...
Generation Complete.


In [13]:

# ==========================================
# 6. EXPORT FINAL CSV
# ==========================================
print("Exporting Final Dynamic Knapsack Data...")

csv_uc1_m1_teaming = []

for p_link in m1_teaming:
    # Get metadata
    row = proposal_info[proposal_info["nsf_proposal_links_v1"] == p_link].iloc[0]
    title = row["title"]
    try:
        year = p_link.split("/")[4]
        prop_id = p_link.split("/")[5]
    except:
        year = "N/A"
        prop_id = "ID"
        
    csv_hyperlink_text = f"{title} ({year})"

    for j in range(len(m1_teaming[p_link])):
        r_name = m1_teaming[p_link][j][0]
        teams = m1_teaming[p_link][j][1]
        scores = m1_goodness_scores[p_link][j][1] 

        # Sort by Score
        zipped = sorted(zip(scores, teams), reverse=True, key=lambda x: x[0])
        sorted_scores = [round(z[0], 4) for z in zipped]
        sorted_teams = [z[1] for z in zipped]

        csv_uc1_m1_teaming.append([
            prop_id, year, p_link, csv_hyperlink_text,
            m1_proposal_skills[p_link],
            r_name,
            sorted_teams,
            sorted_scores
        ])

df_final = pd.DataFrame(
    csv_uc1_m1_teaming, 
    columns=['proposal_id', 'year', 'proposal_link', 'title', "skills", "researcher_name", "team", "goodness"]
)

output_file = OUT_DIR + 'teaming_uc1_dynamic_knapsack_v3_6_Cap_05_cost.csv'
df_final.to_csv(output_file, encoding='utf-8', index=False)

print("End time:\t", datetime.datetime.now())
print(f"Success! File saved to: {output_file}")
df_final.head()


Exporting Final Dynamic Knapsack Data...
End time:	 2026-01-26 13:53:47.622439
Success! File saved to: ../data/v1_output_teaming/teaming_1698proposals_316researchers/teaming_uc1_dynamic_knapsack_v3_6_Cap_05_cost.csv


Unnamed: 0,proposal_id,year,proposal_link,title,skills,researcher_name,team,goodness
0,nsf21598,2021,https://www.nsf.gov/pubs/2021/nsf21598/nsf2159...,Advanced Technological Education (2021),"{development, economic development, material, ...","Agostinelli, Forest","[[Agostinelli, Forest, Ahmad, Iftikhar, Alexee...","[0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.6..."
1,nsf21598,2021,https://www.nsf.gov/pubs/2021/nsf21598/nsf2159...,Advanced Technological Education (2021),"{development, economic development, material, ...","Ahmad, Iftikhar","[[Ahmad, Iftikhar, Alexeev, Oleg S., Ali, Moha...","[0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.6..."
2,nsf21598,2021,https://www.nsf.gov/pubs/2021/nsf21598/nsf2159...,Advanced Technological Education (2021),"{development, economic development, material, ...","Alexeev, Oleg S.","[[Alexeev, Oleg S., Ahmad, Iftikhar, Ali, Moha...","[0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.6..."
3,nsf21598,2021,https://www.nsf.gov/pubs/2021/nsf21598/nsf2159...,Advanced Technological Education (2021),"{development, economic development, material, ...","Ali, Mohammod","[[Ali, Mohammod, Ahmad, Iftikhar, Alexeev, Ole...","[0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.6..."
4,nsf21598,2021,https://www.nsf.gov/pubs/2021/nsf21598/nsf2159...,Advanced Technological Education (2021),"{development, economic development, material, ...","Ammal, Salai C.","[[Ammal, Salai C., Ahmad, Iftikhar, Alexeev, O...","[0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.6..."


In [35]:
# CORRECTED Verification Code Snippet
p_link = "https://www.nsf.gov/pubs/2013/nsf13543/nsf13543.htm"
req_skills = m1_proposal_skills[p_link]
# Use the pseudo-map for THIS specific proposal
pseudo_map = m1_pseudo_researcher_skills[p_link] 

team_members = ['Chen, Fanglin (Frank)']
team_skills = set()

for member in team_members:
    # Use pseudo_map (Soft Match) instead of m1_researcher_skills (Hard Match)
    # Cast to set to avoid the AttributeError
    team_skills.update(set(pseudo_map.get(member, [])))

# Check the overlap
covered = req_skills.intersection(team_skills)
missing = req_skills - team_skills

print(f"Required Skills ({len(req_skills)}): {req_skills}")
print(f"Covered Skills ({len(covered)}): {covered}")
print(f"Coverage Percentage: {len(covered)/len(req_skills) * 100:.2f}%")

Required Skills (15): {'national institute', 'learning', 'support system', 'clinical practice', 'nih', 'area', 'foundation nsf', 'modeling', 'national science', 'development', 'decision support', 'health', 'machine learning', 'science foundation', 'process modeling'}
Covered Skills (14): {'national institute', 'learning', 'support system', 'clinical practice', 'area', 'foundation nsf', 'modeling', 'national science', 'development', 'decision support', 'health', 'machine learning', 'science foundation', 'process modeling'}
Coverage Percentage: 93.33%


In [36]:
# --- Quick Diagnostic Check ---
p_link_test = "https://www.nsf.gov/pubs/2020/nsf20564/nsf20564.htm"

test_skills = m1_proposal_skills.get(p_link_test)
print(f"Type of skills object: {type(test_skills)}")
print(f"Skills found: {test_skills}")

if test_skills:
    print(f"Number of skills: {len(test_skills)}")
else:
    print("CRITICAL: No skills found for this proposal link.")

# Check overall health
empty_proposals = [link for link, skills in m1_proposal_skills.items() if not skills]
print(f"\nProposals with ZERO skills: {len(empty_proposals)} out of {len(m1_proposal_skills)}")

Type of skills object: <class 'set'>
Skills found: {'synthesis'}
Number of skills: 1

Proposals with ZERO skills: 0 out of 434


In [37]:
import ast

def check_team_diversity(teams_data):
    # 1. Handle string vs list
    if isinstance(teams_data, str):
        try:
            all_teams = ast.literal_eval(teams_data)
        except:
            print("Error: Could not parse string.")
            return
    else:
        all_teams = teams_data

    # 2. Flatten if nested (handles [[team1, team2]])
    if len(all_teams) > 0 and not isinstance(all_teams[0], str):
        # If it's the [target, [teams]] structure, grab the inner list
        if isinstance(all_teams[0], list) and len(all_teams[0]) > 1:
            all_teams = all_teams[0][1]
    
    # 3. Count Unique Combinations
    # We sort each team so that [A, B] is seen as the same as [B, A]
    unique_combinations = set(tuple(sorted(t)) for t in all_teams)
    
    # 4. Count Unique Individuals involved across all variations
    all_members = set(member for team in all_teams for member in team)
    
    print(f"--- Diversity Report ---")
    print(f"Total Variations Generated: {len(all_teams)}")
    print(f"Mathematically Unique Teams: {len(unique_combinations)}")
    print(f"Total Unique Researchers involved: {len(all_members)}")
    
    # 5. Show a sample of the first two teams to see the difference
    if len(all_teams) > 1:
        print(f"\nTeam 1 Sample: {all_teams[0]}")
        print(f"Team 2 Sample: {all_teams[1]}")

# Run it again
check_team_diversity(df_final['team'].iloc[0])

--- Diversity Report ---
Total Variations Generated: 15
Mathematically Unique Teams: 13
Total Unique Researchers involved: 13

Team 1 Sample: A
Team 2 Sample: h


In [38]:
import ast

def normalize_teams(teams_data):
    # Parse if string
    if isinstance(teams_data, str):
        try:
            obj = ast.literal_eval(teams_data)
        except Exception as e:
            raise ValueError(f"Could not parse teams string: {e}")
    else:
        obj = teams_data

    # Now normalize to: List[List[str]]
    # Case 1: already list of teams -> [[name1, name2], [name1, name3], ...]
    if isinstance(obj, list) and obj and isinstance(obj[0], list) and obj[0] and isinstance(obj[0][0], str):
        # obj looks like list-of-teams
        return obj

    # Case 2: sometimes it's wrapped like [target, [teams]] or [[target, [teams]]]
    # We'll peel wrappers until we find list-of-teams
    cur = obj
    for _ in range(5):  # prevent infinite loops
        if isinstance(cur, list) and len(cur) == 2 and isinstance(cur[0], str) and isinstance(cur[1], list):
            # [target, teams]
            cur = cur[1]
        elif isinstance(cur, list) and len(cur) == 1 and isinstance(cur[0], list):
            # [[...]] wrapper
            cur = cur[0]
        else:
            break

        # After peeling once, check if we reached list-of-teams
        if isinstance(cur, list) and cur and isinstance(cur[0], list) and cur[0] and isinstance(cur[0][0], str):
            return cur

    # Case 3: a single team like ['A','B',...] -> wrap it
    if isinstance(cur, list) and cur and isinstance(cur[0], str):
        return [cur]

    raise ValueError(f"Unrecognized team structure after parsing/peeling: {type(obj)} | sample={str(obj)[:200]}")

def check_team_diversity(teams_data):
    all_teams = normalize_teams(teams_data)

    # Unique combinations ignoring order
    unique_combinations = set(tuple(sorted(team)) for team in all_teams)

    # Unique members
    all_members = {m for team in all_teams for m in team}

    print(f"--- Diversity Report ---")
    print(f"Total Variations Generated: {len(all_teams)}")
    print(f"Mathematically Unique Teams: {len(unique_combinations)}")
    print(f"Total Unique Researchers involved: {len(all_members)}")

    if len(all_teams) > 1:
        print(f"\nTeam 1 Sample ({len(all_teams[0])}): {all_teams[0]}")
        print(f"Team 2 Sample ({len(all_teams[1])}): {all_teams[1]}")

# Run
check_team_diversity(df_final['team'].iloc[0])


--- Diversity Report ---
Total Variations Generated: 10
Mathematically Unique Teams: 10
Total Unique Researchers involved: 7

Team 1 Sample (5): ['Agostinelli, Forest', 'Ahmad, Iftikhar', 'Alexeev, Oleg S.', 'Ali, Mohammod', 'Bayoumi, Abdel-Moez E. ']
Team 2 Sample (5): ['Agostinelli, Forest', 'Ahmad, Iftikhar', 'Ali, Mohammod', 'Berge, Nicole D.', 'Blanchette, James  Otto']


In [39]:
def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / max(len(a | b), 1)

teams = normalize_teams(df_final['team'].iloc[0])
pairwise = [jaccard(teams[i], teams[j]) for i in range(len(teams)) for j in range(i+1, len(teams))]
print("Avg pairwise overlap:", round(sum(pairwise)/len(pairwise), 3))
print("Min/Max overlap:", round(min(pairwise), 3), round(max(pairwise), 3))


Avg pairwise overlap: 0.577
Min/Max overlap: 0.429 0.667


In [40]:
from collections import Counter
counts = Counter()
for t in teams:
    counts.update(set(t))
always = [r for r,c in counts.items() if c == len(teams)]
print("Always present:", always)


Always present: ['Agostinelli, Forest']


In [41]:
import ast
import itertools
import pandas as pd
import numpy as np

def analyze_team_composition(teams_data):
    # 1. Parsing logic
    if isinstance(teams_data, str):
        all_teams = ast.literal_eval(teams_data)
    else:
        all_teams = teams_data
    
    # Flatten structure if nested
    if len(all_teams) > 0 and not isinstance(all_teams[0], str):
        if isinstance(all_teams[0], list) and len(all_teams[0]) > 1:
            all_teams = all_teams[0][1]

    # Limit to first 10 for analysis
    all_teams = all_teams[:10]
    
    # ---------------------------------------------------------
    # PART 1: PRINT ACTUAL TEAMS
    # ---------------------------------------------------------
    print("--- FIRST 3 RECOMMENDED TEAMS ---")
    for i, team in enumerate(all_teams[:3]):
        print(f"Team {i+1}: {sorted(team)}")
    print("\n" + "="*50 + "\n")

    # ---------------------------------------------------------
    # PART 2: PAIRWISE JACCARD DIVERSITY
    # ---------------------------------------------------------
    # Jaccard = (Intersection) / (Union)
    jaccard_scores = []
    for t1, t2 in itertools.combinations(all_teams, 2):
        s1, s2 = set(t1), set(t2)
        score = len(s1.intersection(s2)) / len(s1.union(s2))
        jaccard_scores.append(score)
    
    avg_jaccard = np.mean(jaccard_scores)
    diversity_index = 1 - avg_jaccard # 0 = identical, 1 = completely different

    # ---------------------------------------------------------
    # PART 3: CORE VS OPTIONAL MEMBERS
    # ---------------------------------------------------------
    member_counts = pd.Series([m for t in all_teams for m in t]).value_counts()
    num_teams = len(all_teams)
    
    # Core members appear in > 80% of recommendations
    core_members = member_counts[member_counts >= (num_teams * 0.8)].index.tolist()
    # Optional members appear in < 50% (helping you pivot/explore)
    optional_members = member_counts[member_counts <= (num_teams * 0.5)].index.tolist()

    print("--- DIVERSITY METRICS ---")
    print(f"A) Avg Pairwise Overlap (Jaccard): {avg_jaccard:.2f}")
    print(f"B) Diversity Index (1 - Jaccard): {diversity_index:.2f} (High is better for exploration)")
    
    print("\n--- MEMBER ROLE ANALYSIS ---")
    print(f"Core Members (The Backbone): {len(core_members)}")
    for m in core_members: print(f"  - [CORE] {m}")
    
    print(f"\nOptional/Niche Members (The Substitutes): {len(optional_members)}")
    print(f"Total Unique Researchers involved: {len(member_counts)}")

# Execute on your data
analyze_team_composition(df_final['team'].iloc[0])

--- FIRST 3 RECOMMENDED TEAMS ---
Team 1: ['A']
Team 2: ['h']
Team 3: ['m']


--- DIVERSITY METRICS ---
A) Avg Pairwise Overlap (Jaccard): 0.00
B) Diversity Index (1 - Jaccard): 1.00 (High is better for exploration)

--- MEMBER ROLE ANALYSIS ---
Core Members (The Backbone): 0

Optional/Niche Members (The Substitutes): 10
Total Unique Researchers involved: 10


In [42]:
import ast
import pandas as pd
from itertools import combinations

# ---------- Helpers you already have ----------
def normalize_teams(teams_data):
    if isinstance(teams_data, str):
        try:
            obj = ast.literal_eval(teams_data)
        except Exception as e:
            raise ValueError(f"Could not parse teams string: {e}")
    else:
        obj = teams_data

    # Case 1: list-of-teams
    if isinstance(obj, list) and obj and isinstance(obj[0], list) and obj[0] and isinstance(obj[0][0], str):
        return obj

    # Case 2: peel wrappers like [target, [teams]] or [[target, [teams]]]
    cur = obj
    for _ in range(5):
        if isinstance(cur, list) and len(cur) == 2 and isinstance(cur[0], str) and isinstance(cur[1], list):
            cur = cur[1]
        elif isinstance(cur, list) and len(cur) == 1 and isinstance(cur[0], list):
            cur = cur[0]
        else:
            break

        if isinstance(cur, list) and cur and isinstance(cur[0], list) and cur[0] and isinstance(cur[0][0], str):
            return cur

    # Case 3: single team
    if isinstance(cur, list) and cur and isinstance(cur[0], str):
        return [cur]

    raise ValueError(f"Unrecognized team structure: {type(obj)} | sample={str(obj)[:200]}")

def parse_goodness(goodness_data):
    """
    goodness column can be:
    - list[float]
    - string like "[0.7, 0.7, ...]"
    Returns list[float].
    """
    if isinstance(goodness_data, list):
        return [float(x) for x in goodness_data]
    if isinstance(goodness_data, str):
        try:
            vals = ast.literal_eval(goodness_data)
            return [float(x) for x in vals]
        except Exception:
            # sometimes a single float stored as string
            try:
                return [float(goodness_data)]
            except Exception as e:
                raise ValueError(f"Could not parse goodness: {goodness_data} | err={e}")
    # fallback
    return [float(goodness_data)]

def jaccard(team_a, team_b):
    a, b = set(team_a), set(team_b)
    return len(a & b) / max(len(a | b), 1)

def avg_pairwise_jaccard(teams):
    if len(teams) < 2:
        return 1.0
    vals = [jaccard(a, b) for a, b in combinations(teams, 2)]
    return sum(vals) / len(vals)

# ---------- Row-level stats (per proposal+researcher row) ----------
def compute_row_stats(row):
    teams = normalize_teams(row["team"])
    goodness_list = parse_goodness(row["goodness"])

    # In case lengths mismatch (shouldn't, but safe):
    if len(goodness_list) != len(teams):
        # if goodness is a single value repeated, expand it
        if len(goodness_list) == 1:
            goodness_list = goodness_list * len(teams)
        else:
            # truncate to min length
            m = min(len(goodness_list), len(teams))
            goodness_list = goodness_list[:m]
            teams = teams[:m]

    team_sizes = [len(t) for t in teams]

    return {
        "proposal_id": row.get("proposal_id", None),
        "proposal_link": row.get("proposal_link", None),
        "year": row.get("year", None),
        "researcher_name": row.get("researcher_name", None),
        "num_variations": len(teams),
        "unique_teams": len({tuple(sorted(t)) for t in teams}),
        "unique_researchers_in_variations": len({m for t in teams for m in t}),
        "avg_team_size": sum(team_sizes) / max(len(team_sizes), 1),
        "avg_goodness": sum(goodness_list) / max(len(goodness_list), 1),
        "avg_pairwise_jaccard": avg_pairwise_jaccard(teams),
    }

# ---------- Proposal-level summary (aggregates across targets within a proposal) ----------
def summarize_by_proposal(df, n_proposals=5, proposal_col="proposal_id"):
    # pick first N proposals in the dataframe order
    chosen = df[proposal_col].dropna().unique()[:n_proposals]
    df_sub = df[df[proposal_col].isin(chosen)].copy()

    # compute per-row stats
    row_stats = []
    for _, r in df_sub.iterrows():
        try:
            row_stats.append(compute_row_stats(r))
        except Exception as e:
            # keep going but report what failed
            row_stats.append({
                "proposal_id": r.get("proposal_id", None),
                "proposal_link": r.get("proposal_link", None),
                "year": r.get("year", None),
                "researcher_name": r.get("researcher_name", None),
                "error": str(e)[:200],
            })

    stats_df = pd.DataFrame(row_stats)

    # drop rows with errors for aggregation (but you can inspect them)
    ok = stats_df[~stats_df.columns.isin(["error"])] if "error" in stats_df.columns else stats_df
    if "error" in stats_df.columns:
        stats_ok = stats_df[stats_df["error"].isna()] if stats_df["error"].isna().any() else stats_df[stats_df["error"].isna()]
    else:
        stats_ok = stats_df

    # aggregate across targets for each proposal
    agg = (stats_ok
           .groupby(["proposal_id", "proposal_link", "year"], dropna=False)
           .agg(
               targets=("researcher_name", "nunique"),
               avg_team_size=("avg_team_size", "mean"),
               avg_goodness=("avg_goodness", "mean"),
               avg_jaccard=("avg_pairwise_jaccard", "mean"),
               avg_unique_teams=("unique_teams", "mean"),
               avg_unique_researchers=("unique_researchers_in_variations", "mean"),
           )
           .reset_index()
          )

    # nice rounding for display
    for c in ["avg_team_size", "avg_goodness", "avg_jaccard", "avg_unique_teams", "avg_unique_researchers"]:
        if c in agg.columns:
            agg[c] = agg[c].astype(float).round(3)

    return agg, stats_df

# ---------- Run for 3â€“5 proposals ----------
proposal_summary, per_row_debug = summarize_by_proposal(df_final, n_proposals=5)

print("=== Proposal-level Summary (first 5 proposals) ===")
display(proposal_summary)

print("\nIf anything failed to parse, inspect rows with errors:")
if "error" in per_row_debug.columns:
    display(per_row_debug[per_row_debug["error"].notna()].head(10))


=== Proposal-level Summary (first 5 proposals) ===


Unnamed: 0,proposal_id,proposal_link,year,targets,avg_team_size,avg_goodness,avg_jaccard,avg_unique_teams,avg_unique_researchers
0,nsf20591,https://www.nsf.gov/pubs/2020/nsf20591/nsf2059...,2020,202,5.0,0.625,0.572,9.931,7.0
1,nsf20604,https://www.nsf.gov/pubs/2020/nsf20604/nsf2060...,2020,202,5.0,0.625,0.571,9.95,7.0
2,nsf20609,https://www.nsf.gov/pubs/2020/nsf20609/nsf2060...,2020,202,5.0,0.625,0.573,9.926,7.0
3,nsf21527,https://www.nsf.gov/pubs/2021/nsf21527/nsf2152...,2021,202,5.0,0.625,0.572,9.926,7.0
4,nsf21598,https://www.nsf.gov/pubs/2021/nsf21598/nsf2159...,2021,202,5.0,0.625,0.573,9.921,7.0



If anything failed to parse, inspect rows with errors:
