# 1. Simple example

In [1]:
from desync_search import DesyncClient

# Run one search against a single page using a default-constructed client.
client = DesyncClient()
result = client.search("https://example.com")

# --- Results display ---
# Each row is (label, value); the values are read from the returned result object.
for label, value in (
    ("URL:", result.url),
    ("Number of internal links:", len(result.internal_links)),
    ("Number of external links:", len(result.external_links)),
    ("Text content length:", len(result.text_content)),
):
    print(label, value)

URL: 
Number of internal links: 0
Number of external links: 1
Text content length: 189


# 2. Pulling the YC companies sitemap

In [2]:
from desync_search import DesyncClient
from desync_search import extract_links_from_sitemap

client = DesyncClient()

# Only result.text_content is consumed below, so the search is issued with
# scrape_full_html=False.
target_url = "https://www.ycombinator.com/companies/sitemap.xml"
result = client.search(url=target_url, scrape_full_html=False)

# Every link listed in the sitemap text; later cells treat these as the
# company profile URLs to fetch.
yc_company_profile_links = extract_links_from_sitemap(result.text_content)
print("total links in sitemap:", len(yc_company_profile_links))

total links in sitemap: 5235


# 3. Pulling all YC company profiles

In [3]:
import math

def chunk_list(lst, max_size):
    """
    Splits `lst` into consecutive sublists of near-equal length, none longer
    than `max_size`.

    The minimum number of chunks that satisfies `max_size` is used, and the
    chunk lengths differ by at most one element (the longer chunks come first).

    Args:
        lst (list): The list to be split.
        max_size (int): The maximum size for each sublist. Must be >= 1.

    Returns:
        list: A list of sublists. When `lst` already fits within `max_size`
        (including the empty list) the result is `[lst]`, i.e. it wraps the
        original list object itself rather than a copy.

    Raises:
        ValueError: If `max_size` is smaller than 1.
    """
    # A zero max_size used to fail with ZeroDivisionError and a negative one
    # silently produced an empty result (dropping every element); reject both.
    if max_size < 1:
        raise ValueError(f"max_size must be at least 1, got {max_size!r}")

    n = len(lst)
    if n <= max_size:
        return [lst]

    # Smallest number of chunks that keeps every chunk within max_size.
    number_of_chunks = math.ceil(n / max_size)
    # Every chunk gets `chunk_size_floor` items; the first `remainder` chunks
    # get one extra so the sizes stay as even as possible.
    chunk_size_floor, remainder = divmod(n, number_of_chunks)

    chunks = []
    start = 0
    for i in range(number_of_chunks):
        current_chunk_size = chunk_size_floor + (1 if i < remainder else 0)
        chunks.append(lst[start:start + current_chunk_size])
        start += current_chunk_size
    return chunks

def reprocess_entries(yc_profile_pagedata_list, reprocess_times=2, chunk_size=2000):
    """
    Re-fetches pages whose `text_content` is empty and overwrites them in place.

    Each run collects the URLs of all currently-empty pages, sends them through
    `client.simple_bulk_search` in chunks, and replaces the matching entries of
    `yc_profile_pagedata_list` with the returned results (matched by
    `result.url`). Stops early once no empty pages remain.

    Relies on the notebook-level `client` and `chunk_list` being defined.

    Args:
        yc_profile_pagedata_list (list): Page results exposing `.url` and
            `.text_content`. Mutated in place.
        reprocess_times (int): Maximum number of reprocessing runs (default 2).
        chunk_size (int): Maximum number of URLs per bulk request (default 2000).

    Returns:
        None
    """
    for run in range(reprocess_times):
        # Map each empty page's URL to *every* position that holds it, so that
        # duplicated URLs are all refreshed instead of only the last one.
        empty_indices = {}
        for idx, page in enumerate(yc_profile_pagedata_list):
            if len(page.text_content) == 0:
                empty_indices.setdefault(page.url, []).append(idx)

        # If no empty pages remain, exit early.
        if not empty_indices:
            print(f"No empty pages remain after {run} reprocessing runs.")
            break

        empty_page_count = sum(len(positions) for positions in empty_indices.values())
        print(f"\nReprocessing run {run+1}/{reprocess_times}: {empty_page_count} empty pages")

        # Request each distinct URL once (dict keys keep first-seen order).
        empty_links = list(empty_indices)
        chunked_empty_links = chunk_list(empty_links, chunk_size)
        print(f"Total empty chunks to process: {len(chunked_empty_links)}")

        # Process each chunk.
        for idx, chunk in enumerate(chunked_empty_links):
            print(f"Reprocessing empty chunk {idx+1}/{len(chunked_empty_links)} with {len(chunk)} links")
            new_results = client.simple_bulk_search(target_list=chunk)
            # Overwrite every entry that was recorded for the returned URL;
            # results whose URL was not requested as empty are ignored.
            for result in new_results:
                for index in empty_indices.get(result.url, ()):
                    yc_profile_pagedata_list[index] = result

# --- Part 1: Initial Bulk Processing ---
# yc_company_profile_links is expected to hold the profile URLs to fetch.
yc_profile_pagedata_list = []

chunked_links = chunk_list(yc_company_profile_links, 2000)
total_chunks = len(chunked_links)
print(f"Total chunks to process: {total_chunks}")

# Fetch chunk by chunk, accumulating the returned page objects.
for chunk_number, chunk in enumerate(chunked_links, start=1):
    print(f"Processing chunk {chunk_number}/{total_chunks} with {len(chunk)} links")
    results = client.simple_bulk_search(target_list=chunk)
    yc_profile_pagedata_list.extend(results)
    print(f"Retrieved {len(yc_profile_pagedata_list)} pages so far.")

# Report empty vs. non-empty pages (a page is "empty" when its text_content
# has length zero).
empties = sum(len(page.text_content) == 0 for page in yc_profile_pagedata_list)
nonempties = len(yc_profile_pagedata_list) - empties
print(f"Empties: {empties}")
print(f"Non-empties: {nonempties}")

# --- Part 2: Reprocess Empty Pages (Multiple Runs) ---
# The list is updated in place by reprocess_entries.
reprocess_entries(yc_profile_pagedata_list)

# --- Final Reporting ---
empties = sum(len(page.text_content) == 0 for page in yc_profile_pagedata_list)
nonempties = len(yc_profile_pagedata_list) - empties
print(f"\nAfter reprocessing, Empties: {empties}")
print(f"After reprocessing, Non-empties: {nonempties}")


Total chunks to process: 3
Processing chunk 1/3 with 1745 links
Retrieved 1745 pages so far.
Processing chunk 2/3 with 1745 links
Retrieved 3480 pages so far.
Processing chunk 3/3 with 1745 links
Retrieved 5225 pages so far.
Empties: 3462
Non-empties: 1763

Reprocessing run 1/2: 3462 empty pages
Total empty chunks to process: 2
Reprocessing empty chunk 1/2 with 1731 links
Reprocessing empty chunk 2/2 with 1731 links

Reprocessing run 2/2: 2264 empty pages
Total empty chunks to process: 2
Reprocessing empty chunk 1/2 with 1132 links
Reprocessing empty chunk 2/2 with 1132 links

After reprocessing, Empties: 1139
After reprocessing, Non-empties: 4086


# 4. Extracting profile info

In [1]:
import re
import sqlite3
import json


def extract_yc_profile_info(text):
    """
    Parse the text content of a YC company profile into a dictionary.

    The text is reduced to its non-blank lines (each stripped of surrounding
    whitespace) and the following entries are extracted:

      - "Founded", "Team Size", "Status", "Location", "Group Partner":
        for every line starting with the label plus a colon, the value is the
        *next* line ("" if the label is the last line). A later occurrence of
        a label overwrites an earlier one. Labels that never occur are absent
        from the result.
      - "Company Name": the line right before the first line starting with
        "Founded:", or "Not Found" if there is no such line or it is the
        first line.
      - "Tags": located via the first occurrence of the three consecutive
        lines "›", "Companies", "›". The two lines right after that marker
        are skipped; from the third line on, lines are collected until a line
        that is exactly "Company". Only lines that are unchanged by
        str.upper() (i.e. contain no lowercase letters) are kept. Empty list
        when the marker is missing.
      - "URL": the first http(s) URL found on the lines strictly between the
        first "Jobs" line that precedes "Founded:" and the "Founded:" line
        itself, or "Not Found".

    Args:
        text (str): Raw text content of a profile page.

    Returns:
        dict: The extracted data as described above.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    rows = [row for row in stripped_lines if row]
    row_count = len(rows)
    record = {}

    # Labelled fields: the value lives on the line following the label.
    field_labels = ("Founded:", "Team Size:", "Status:", "Location:", "Group Partner:")
    for position, row in enumerate(rows):
        for label in field_labels:
            if not row.startswith(label):
                continue
            value_position = position + 1
            record[label.rstrip(':')] = rows[value_position] if value_position < row_count else ""

    # Company name: the line just above the first "Founded:" line.
    founded_at = None
    for position, row in enumerate(rows):
        if row.startswith("Founded:"):
            founded_at = position
            break
    if founded_at is None or founded_at <= 0:
        record["Company Name"] = "Not Found"
    else:
        record["Company Name"] = rows[founded_at - 1]

    # Tags: anchored on the first "›" / "Companies" / "›" marker.
    collected_tags = []
    for position in range(row_count - 2):
        if rows[position:position + 3] != ["›", "Companies", "›"]:
            continue
        # position + 2 is the closing "›"; scanning starts three lines later.
        for candidate in rows[position + 5:]:
            if candidate == "Company":
                break
            if candidate.upper() == candidate:
                collected_tags.append(candidate)
        break  # Only the first marker is considered.
    record["Tags"] = collected_tags

    # URL: search the lines between the first "Jobs" line and "Founded:".
    site_url = "Not Found"
    if founded_at is not None:
        before_founded = rows[:founded_at]
        if "Jobs" in before_founded:
            jobs_at = before_founded.index("Jobs")
            for row in before_founded[jobs_at + 1:]:
                hit = re.search(r'https?://\S+', row)
                if hit:
                    site_url = hit.group(0)
                    break
    record["URL"] = site_url

    return record

# --------------------------------------------------
# Turn every fetched YC profile page into a dictionary of extracted fields.
# Each record also carries "source_url" (the page's URL) for traceability.
# --------------------------------------------------
processed_profiles = [
    {**extract_yc_profile_info(page.text_content), "source_url": page.url}
    for page in yc_profile_pagedata_list
]

# --------------------------------------------------
# Store the processed profiles in a SQLite database.
# The database file is created (if it does not exist) at `db_path`.
# NOTE: rows are appended on every run, so re-running this cell inserts the
# same profiles again.
# --------------------------------------------------
# sqlite3.connect() needs a database *file*. The previous value
# ("/home/vlad/vlad/") named a directory and was tied to one machine, so a
# file path relative to the working directory is used instead.
db_path = "yc_profiles.sqlite3"
conn = sqlite3.connect(db_path)
try:
    # `with conn` commits if the block succeeds and rolls back if it raises,
    # so a failed insert never leaves a half-written batch behind.
    with conn:
        cur = conn.cursor()

        # Create a table for storing profile information if it doesn't already exist.
        cur.execute('''
        CREATE TABLE IF NOT EXISTS profiles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company_name TEXT,
            founded TEXT,
            team_size TEXT,
            status TEXT,
            location TEXT,
            group_partner TEXT,
            tags TEXT,
            url TEXT,
            source_url TEXT
        )
        ''')

        # One tuple per profile, in the column order of the INSERT below.
        # Missing fields default to ""; the tag list is stored as a JSON string.
        profile_rows = [
            (
                profile.get("Company Name", ""),
                profile.get("Founded", ""),
                profile.get("Team Size", ""),
                profile.get("Status", ""),
                profile.get("Location", ""),
                profile.get("Group Partner", ""),
                json.dumps(profile.get("Tags", [])),
                profile.get("URL", ""),
                profile.get("source_url", ""),
            )
            for profile in processed_profiles
        ]

        cur.executemany('''
        INSERT INTO profiles (company_name, founded, team_size, status, location, group_partner, tags, url, source_url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', profile_rows)
finally:
    # Always release the connection, even when the insert fails.
    conn.close()


NameError: name 'yc_profile_pagedata_list' is not defined

# 5. Tags Analysis

In [None]:
import collections

# ----- Process and Print All Tags -----
import json  # needed only for the fallback below; imported once, not per profile


def _print_counts(heading, counts):
    """Print `heading`, then every `value: count` pair from most to least common."""
    print(heading)
    for value, count in counts.most_common():
        print(f"{value}: {count}")


def _stripped_field_values(profiles, field):
    """Return the stripped, non-empty values of `field` across `profiles`."""
    stripped = (profile.get(field, "").strip() for profile in profiles)
    return [value for value in stripped if value]


all_tags = []
for profile in processed_profiles:
    tags = profile.get("Tags", [])
    if isinstance(tags, list):
        all_tags.extend(tags)
        continue
    # Tags may have been stored as a JSON string; decode on a best-effort
    # basis and skip values that cannot be decoded or iterated.
    try:
        all_tags.extend(json.loads(tags))
    except (TypeError, ValueError):
        continue

# Count the frequency of each tag.
tag_counts = collections.Counter(all_tags)
_print_counts("All Tags (most common to least common):", tag_counts)


# ----- Process and Print Group Partner Entries -----
# The extraction step stores the value under "Group Partner" (without the colon).
group_partners = _stripped_field_values(processed_profiles, "Group Partner")
group_partner_counts = collections.Counter(group_partners)
_print_counts("\nGroup Partner (most common to least common):", group_partner_counts)

# ----- Process and Print Status Entries -----
status_entries = _stripped_field_values(processed_profiles, "Status")
status_counts = collections.Counter(status_entries)
_print_counts("\nStatus (most common to least common):", status_counts)

# 6. Cohort Analysis

In [None]:
import re
import json
import collections
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

# --- Helper functions ---

def is_cohort(tag):
    """
    Tell whether `tag` is a YC cohort tag.

    Cohort tags are one uppercase letter followed by exactly two digits
    (e.g., "W18" or "S19"), plus the special case "IK12".

    Returns:
        bool: True for a cohort tag, False otherwise.
    """
    return tag == "IK12" or re.match(r'^[A-Z]\d\d$', tag) is not None

def cohort_sort_key(cohort):
    """
    Build the sort key for a cohort tag.

    The key is (two-digit year, letter rank): cohorts order by year first and
    then by letter, with W ranked 0, S ranked 1 and any other letter ranked 3.
    The special tag "IK12" maps to (12, 2). Anything that is not a recognised
    cohort tag gets (999, 999).

    Returns:
        tuple: A pair of ints usable as a `sorted` key.
    """
    if cohort == "IK12":
        return (12, 2)

    parsed = re.match(r'^([A-Z])(\d\d)$', cohort)
    if parsed is None:
        return (999, 999)  # Unexpected format.

    letter, two_digit_year = parsed.groups()
    letter_rank = {"W": 0, "S": 1}.get(letter, 3)
    return (int(two_digit_year), letter_rank)

# --- Build data by cohort ---

# cohort_data maps each cohort tag to:
#   - "count": how many profiles carry that cohort tag
#   - "tag_counter": Counter of the non-cohort tags seen on those profiles
cohort_data = defaultdict(lambda: {"count": 0, "tag_counter": Counter()})

for profile in processed_profiles:
    # Tags are normally a list; anything else is treated as a JSON string and
    # falls back to "no tags" when it cannot be decoded.
    tags = profile.get("Tags", [])
    if not isinstance(tags, list):
        try:
            tags = json.loads(tags)
        except Exception:
            tags = []

    # Split the profile's tags into cohort tags and (unique) non-cohort tags.
    profile_cohorts = []
    non_cohort_tags = set()
    for tag in tags:
        if is_cohort(tag):
            profile_cohorts.append(tag)
        else:
            non_cohort_tags.add(tag)

    if not profile_cohorts:
        continue  # Profiles without a cohort tag are left out.

    # A profile counts once towards every cohort tag it carries.
    for cohort in profile_cohorts:
        cohort_entry = cohort_data[cohort]
        cohort_entry["count"] += 1
        cohort_entry["tag_counter"].update(non_cohort_tags)

# --- Sort cohorts with cohort_sort_key ---
sorted_cohorts = sorted(cohort_data.keys(), key=cohort_sort_key)
x_indices = list(range(len(sorted_cohorts)))  # x positions for plotting
cohort_labels = sorted_cohorts

# --- Build a tag frequency table per cohort ---
# Every non-cohort tag that occurs in at least one cohort.
all_non_cohort_tags = set()
for cohort_entry in cohort_data.values():
    all_non_cohort_tags.update(cohort_entry["tag_counter"].keys())

# tag_fractions maps each tag to a list with one entry per sorted cohort:
# the fraction of that cohort's companies that hold the tag.
tag_fractions = {}
for tag in all_non_cohort_tags:
    tag_fractions[tag] = [
        (cohort_data[cohort]["tag_counter"][tag] / cohort_data[cohort]["count"])
        if cohort_data[cohort]["count"] > 0 else 0
        for cohort in sorted_cohorts
    ]

# --- Select a subset of tags for clarity ---
# Keep only tags held by at least 20% of the companies in some cohort.
selected_tags = [tag for tag, fracs in tag_fractions.items() if max(fracs) >= 0.20]

# --- Plot the graph ---
# One line per selected tag: x = cohort position, y = fraction of companies.
fig, ax = plt.subplots(figsize=(12, 8))
for tag in selected_tags:
    ax.plot(x_indices, tag_fractions[tag], marker='o', label=tag)

ax.set_xticks(x_indices)
ax.set_xticklabels(cohort_labels, rotation=45)
ax.set_xlabel("YC Cohort")
ax.set_ylabel("Fraction of Companies with Tag")
ax.set_title("Historical Frequency of Tags Across YC Cohorts")
# The legend is anchored just outside the right edge of the axes.
ax.legend(title="Tag", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
plt.show()


# 7. Clustering Analysis with Embeddings

In [None]:
import spacy
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import collections
import re

# Cohort-tag predicate used below to separate cohort tags from topical tags.
def is_cohort(tag):
    """
    Report whether `tag` names a YC cohort.

    A cohort tag is either the special value "IK12" or a single uppercase
    letter followed by two digits (e.g., "W18" or "S19").

    Returns:
        bool: True if `tag` is a cohort tag, otherwise False.
    """
    if tag == "IK12":
        return True
    return bool(re.match(r'^[A-Z]\d\d$', tag))

# Load spaCy model (make sure you've installed a model with vectors, e.g. en_core_web_md)
nlp = spacy.load('en_core_web_md')

# ---- Step 1: Compute Company Embeddings Based on Their Tags ----
company_embeddings = []
company_labels = []      # for identification (e.g., company name)
company_tags_list = []   # non-cohort tags for each company

# The same tag text appears on many companies, so each distinct tag is run
# through the spaCy pipeline only once. The cache maps tag text to its vector,
# or to None when spaCy reports no vector for it.
tag_vector_cache = {}

for idx, profile in enumerate(processed_profiles):
    # Skip profiles without a valid company name before doing any NLP work.
    company_name = profile.get("Company Name", f"Company {idx}")
    if company_name == "Not Found":
        continue

    tags = profile.get("Tags", [])
    # Ensure we have a list of tags
    if not isinstance(tags, list):
        try:
            tags = json.loads(tags)
        except Exception:
            tags = []

    # All tags (cohort or not) contribute to the embedding.
    tag_vectors = []
    for tag in tags:
        if tag not in tag_vector_cache:
            doc = nlp(tag)
            tag_vector_cache[tag] = doc.vector if doc.has_vector else None
        vector = tag_vector_cache[tag]
        if vector is not None:
            tag_vectors.append(vector)

    # Companies with no usable tag vector are left out of the analysis.
    if not tag_vectors:
        continue

    # The company embedding is the mean of its tag vectors.
    company_embeddings.append(np.mean(tag_vectors, axis=0))
    company_labels.append(company_name)
    # For the per-cluster summaries only the non-cohort tags are kept.
    company_tags_list.append([tag for tag in tags if not is_cohort(tag)])

company_embeddings = np.array(company_embeddings)

# ---- Step 2: Clustering Analysis (e.g., K-Means) ----
num_clusters = 5  # adjust this as needed

# K-Means needs at least `num_clusters` samples and the silhouette score needs
# more samples than clusters; fail early with a clear message instead of an
# obscure error from deeper inside scikit-learn.
if len(company_embeddings) <= num_clusters:
    raise ValueError(
        f"Need more than {num_clusters} embedded companies to cluster, "
        f"got {len(company_embeddings)}."
    )

kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(company_embeddings)

# Compute the silhouette score to gauge clustering quality.
score = silhouette_score(company_embeddings, clusters)
print("Silhouette Score:", score)

# ---- Step 3: Dimensionality Reduction for Visualization (Optional) ----
# random_state pins the projection when PCA picks a randomized solver, so the
# plot is reproducible across runs (same seed as K-Means above).
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(company_embeddings)

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=clusters, cmap='viridis', alpha=0.7)
ax.set_title("K-Means Clustering of Companies Based on Tag Embeddings")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.legend(*scatter.legend_elements(), title="Clusters")
plt.show()

# ---- Step 4: Further Analysis by Cluster ----

# (A) Representative companies: the (up to) three members of each cluster
# whose embeddings lie closest, by Euclidean distance, to the cluster centroid.
cluster_representatives = {}
for cluster_label in range(num_clusters):
    member_indices = np.where(clusters == cluster_label)[0]
    centroid = kmeans.cluster_centers_[cluster_label]
    offsets = company_embeddings[member_indices] - centroid
    closest_first = member_indices[np.argsort(np.linalg.norm(offsets, axis=1))]
    cluster_representatives[cluster_label] = [company_labels[i] for i in closest_first[:3]]

# (B) Non-cohort tag frequencies per cluster, one Counter per cluster label.
cluster_tag_summary = {
    cluster_label: collections.Counter() for cluster_label in range(num_clusters)
}
# clusters and company_tags_list are aligned: entry i of each describes company i.
for cluster_label, company_tags in zip(clusters, company_tags_list):
    cluster_tag_summary[cluster_label].update(company_tags)

# ---- Print out the analysis results ----
for cluster_label in range(num_clusters):
    print(f"\nCluster {cluster_label}:")
    print("  Representative Companies:")
    for comp in cluster_representatives[cluster_label]:
        print("   -", comp)
    print("  Top Tags:")
    for tag, count in cluster_tag_summary[cluster_label].most_common(5):
        print(f"   - {tag}: {count}")
