In [None]:
import json
import time
import os
import requests
from statistics import mean
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

# Constants
# Account the crawl starts from; load_state() seeds the first level with it.
START_USER = "CatalinPlesu"
# Deepest level processed: main() stops once depth exceeds it and crawl_user() skips deeper users.
MAX_DEPTH = 5
# Upper bound on visited users, compared against the persisted pages_visited counter in main().
MAX_PAGES = 1000
# Non-whitelisted users whose follower/following list is longer than this are truncated...
MAX_FOLLOWERS_FOLLOWING = 25
# ...to this many entries (see crawl_follow_list()).
FAIR_LIMIT = 10
# Users exempt from the truncation above.
WHITELIST = [START_USER, "Ernest96"]
# Seconds slept after each page load and after each successful image download.
CRAWL_DELAY = 1
# Defaults for estimate_remaining_time(): assumed pages per user and seconds per page.
ESTIMATE_PAGES_PER_USER = 3
ESTIMATE_DELAY_PER_PAGE = 2

# Crawled user records (save_data()/load_data()).
DATA_FILE = "github_users.json"
# Resumable crawl position (save_state()/load_state()).
STATE_FILE = "crawler_state.json"
# Directory download_image() writes avatars into.
IMG_FOLDER = "img"

# Setup
# A single headless Firefox instance is created when this cell runs and is shared,
# via the module-level name `driver`, by every crawling function; main() quits it.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# Make sure the avatar directory exists before any download is attempted.
os.makedirs(IMG_FOLDER, exist_ok=True)

# Persistence
def save_data(data):
    """Persist the crawled-user dataset to ``DATA_FILE`` as indented JSON.

    The JSON is first written to a sibling ``.tmp`` file and then moved over
    ``DATA_FILE`` with ``os.replace``. If the crawl is interrupted, or ``data``
    turns out not to be serializable, the previously saved file is left intact
    instead of being truncated.

    Args:
        data: JSON-serializable object (the crawler passes ``{"users": {...}}``).

    Raises:
        TypeError / ValueError: propagated from ``json.dump``.
        OSError: propagated from the file operations.
    """
    tmp_path = f"{DATA_FILE}.tmp"
    try:
        # utf-8 matches the encoding the graph-export cell uses to read this file.
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, DATA_FILE)
    finally:
        # Only present if the write or the replace failed; never leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def load_data():
    """Return the dataset stored in ``DATA_FILE``.

    When the file does not exist yet, an empty dataset ``{"users": {}}`` is
    returned so a first run can start from scratch.
    """
    try:
        handle = open(DATA_FILE, "r")
    except FileNotFoundError:
        return {"users": {}}
    with handle:
        return json.load(handle)

def save_state(state):
    """Persist the crawler's resume state to ``STATE_FILE`` as indented JSON.

    The state is rewritten after every processed user, so the write goes to a
    sibling ``.tmp`` file that is then moved into place with ``os.replace``.
    An interruption mid-write therefore cannot leave a truncated state file
    that would make the crawl impossible to resume.

    Args:
        state: JSON-serializable dict describing the crawl position.

    Raises:
        TypeError / ValueError: propagated from ``json.dump``.
        OSError: propagated from the file operations.
    """
    tmp_path = f"{STATE_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2)
        os.replace(tmp_path, STATE_FILE)
    finally:
        # Only present if the write or the replace failed; never leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def load_state():
    """Return the saved crawl position from ``STATE_FILE``.

    If no state file exists, a fresh state is returned: depth 1, nothing
    visited, and ``START_USER`` as the only entry of the current level.
    """
    try:
        handle = open(STATE_FILE, "r")
    except FileNotFoundError:
        fresh_state = dict(
            depth=1,
            pages_visited=0,
            current_level=[START_USER],
            next_level=[],
            durations=[],
            crawled_users=[],
        )
        return fresh_state
    with handle:
        return json.load(handle)

# Image Download
def download_image(url, username):
    """Fetch an avatar and store it as ``<IMG_FOLDER>/<username>.jpg``.

    Returns the stored path prefixed with ``./`` on success. Returns ``None``
    when ``url`` is empty or when anything goes wrong while downloading or
    writing (the failure is printed, not raised). The crawl delay is applied
    only after a successful download.
    """
    if url:
        image_path = os.path.join(IMG_FOLDER, f"{username}.jpg")
        try:
            reply = requests.get(url, timeout=10)
            reply.raise_for_status()
            with open(image_path, "wb") as sink:
                sink.write(reply.content)
            time.sleep(CRAWL_DELAY)
        except Exception as e:
            print(f"Failed to download image for {username}: {e}")
        else:
            return f"./{image_path}"
    return None

# GitHub Crawling Logic
def get_user_profile(username):
    """Load a user's GitHub profile page and extract display name and avatar.

    Args:
        username: GitHub login whose profile page is opened in the shared driver.

    Returns:
        ``(display_name, local_image_path)``. Each element is ``None`` when the
        corresponding page element cannot be read; ``local_image_path`` is
        whatever ``download_image`` returns for the avatar URL.

    Lookup failures are treated as "not available" on a best-effort basis.
    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still stop a long-running crawl.
    """
    url = f"https://github.com/{username}"
    driver.get(url)
    time.sleep(CRAWL_DELAY)
    try:
        display_name = driver.find_element(By.CSS_SELECTOR, ".vcard-fullname").text.strip()
    except Exception:
        # Profile has no full-name element (or the lookup failed).
        display_name = None
    try:
        image_url = driver.find_element(By.CSS_SELECTOR, "img.avatar-user").get_attribute("src")
    except Exception:
        # No avatar element found; download_image() returns None for an empty URL.
        image_url = None
    local_image_path = download_image(image_url, username)
    return display_name, local_image_path

def get_usernames_from_list_page():
    """Collect the usernames shown on the list page currently loaded in ``driver``.

    Returns the stripped text of every element matched by the list-entry CSS
    selector, in page order (an empty list when nothing matches).
    """
    selector = "div.d-table div:nth-child(2) > a:nth-child(1) > span:nth-child(2)"
    names = []
    for node in driver.find_elements(By.CSS_SELECTOR, selector):
        names.append(node.text.strip())
    return names

def crawl_follow_list(username, tab, depth):
    """Collect the usernames on a user's ``followers`` or ``following`` tab.

    Pages are fetched one after another until a page yields no usernames, the
    "next page" link is missing or disabled, or (for non-whitelisted users)
    enough entries have been gathered to know the list will be truncated.

    Args:
        username: GitHub login whose list is crawled.
        tab: value of the ``tab`` query parameter (``"followers"`` or ``"following"``).
        depth: crawl depth of ``username``; accepted for interface symmetry, not used here.

    Returns:
        List of usernames in page order. For users outside ``WHITELIST`` whose
        list exceeds ``MAX_FOLLOWERS_FOLLOWING`` only the first ``FAIR_LIMIT``
        entries are returned.

    Note:
        Because pagination stops early for capped users, the count printed in
        the "Limiting ..." message is the number of entries fetched before
        stopping, not necessarily the full size of the list.
    """
    capped = username not in WHITELIST
    users = []
    page = 1
    while True:
        url = f"https://github.com/{username}?tab={tab}&page={page}"
        driver.get(url)
        time.sleep(CRAWL_DELAY)
        page_users = get_usernames_from_list_page()
        if not page_users:
            break
        users.extend(page_users)
        # Once truncation is certain and the first FAIR_LIMIT entries are already
        # in hand, further pages cannot change the result: stop fetching them.
        if capped and len(users) > MAX_FOLLOWERS_FOLLOWING and len(users) >= FAIR_LIMIT:
            break
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "a.next_page")
            if 'disabled' in next_btn.get_attribute("class"):
                break
        except Exception:
            # No usable "next page" link: treat as the last page. Catching only
            # Exception keeps Ctrl-C / SystemExit able to stop the crawl.
            break
        page += 1
    if capped and len(users) > MAX_FOLLOWERS_FOLLOWING:
        print(f"Limiting {tab} for {username}: {len(users)} -> {FAIR_LIMIT} (not in whitelist)")
        return users[:FAIR_LIMIT]
    return users

def crawl_user(username, data, depth):
    """Crawl one user and record the result in ``data["users"]``.

    Nothing is done (beyond printing a notice) when ``depth`` is beyond
    ``MAX_DEPTH`` or the user already has a record. Otherwise the profile and
    both follow lists are fetched, the record is stored under the username and
    the whole dataset is written out with ``save_data``.
    """
    if depth > MAX_DEPTH:
        print(f"Max depth {MAX_DEPTH} reached, skipping user {username}")
        return

    known_users = data["users"]
    if username in known_users:
        print(f"User {username} already crawled.")
        return

    display_name, image = get_user_profile(username)
    # Followers are fetched before following, as the generator is consumed in order.
    followers, following = (
        crawl_follow_list(username, tab, depth) for tab in ("followers", "following")
    )

    record = {"username": username, "displayName": display_name, "image": image}
    record["followers"] = followers
    record["following"] = following
    record["depth"] = depth
    known_users[username] = record

    if username in WHITELIST:
        whitelist_status = " (whitelisted)"
    else:
        whitelist_status = ""
    print(f"Crawled {username} at depth {depth}{whitelist_status}: followers={len(followers)}, following={len(following)}")
    save_data(data)

# Estimation Helper
def estimate_remaining_time(users_remaining, per_user_pages=ESTIMATE_PAGES_PER_USER, delay_per_page=ESTIMATE_DELAY_PER_PAGE):
    """Rough time estimate: users left x pages per user x delay per page."""
    pages_left = users_remaining * per_user_pages
    return pages_left * delay_per_page

# Main Crawl Loop
def main():
    """Run (or resume) the level-by-level crawl starting from the saved state.

    Users are taken from ``current_level`` one at a time; each newly crawled
    user's followers and following are queued in ``next_level``. When the
    current level is exhausted the next level takes its place and ``depth`` is
    incremented. The loop ends when there is nothing left to crawl, when
    ``MAX_PAGES`` users have been visited, or when ``depth`` exceeds
    ``MAX_DEPTH``. State is saved after every processed queue entry so the
    crawl can be resumed.

    The shared ``driver`` is always quit on the way out, including when the
    crawl is interrupted or fails, so no browser process is left behind.
    """
    data = load_data()
    state = load_state()

    depth = state["depth"]
    pages_visited = state["pages_visited"]
    current_level = state["current_level"]
    next_level = state["next_level"]
    durations = state["durations"]
    crawled = set(state.get("crawled_users", []))

    try:
        while current_level and pages_visited < MAX_PAGES and depth <= MAX_DEPTH:
            user = current_level.pop(0)
            visited = user not in crawled

            if visited:
                start_time = time.time()
                crawl_user(user, data, depth)
                durations.append(time.time() - start_time)

                crawled.add(user)
                pages_visited += 1

                user_data = data["users"].get(user, {})
                if isinstance(user_data, dict):
                    neighbors = user_data.get("followers", []) + user_data.get("following", [])
                    for u in neighbors:
                        if u not in crawled and u not in current_level and u not in next_level:
                            next_level.append(u)
                else:
                    print(f"Invalid user data format for {user}, skipping.")

            # The level switch must also run when the entry just popped was
            # skipped; otherwise a skipped last entry would empty current_level
            # and end the crawl while next_level still holds users.
            if not current_level:
                current_level, next_level = next_level, []
                depth += 1

            # Save crawler state with crawled users
            save_state({
                "depth": depth,
                "pages_visited": pages_visited,
                "current_level": current_level,
                "next_level": next_level,
                "durations": durations,
                "crawled_users": list(crawled)
            })

            # Progress report every 10 pages (only right after an actual visit,
            # so skipped entries do not repeat the same report).
            if visited and pages_visited % 10 == 0 and durations:
                avg_time = mean(durations)
                est_current = len(current_level) * avg_time
                est_next = len(next_level) * avg_time
                est_manual = estimate_remaining_time(len(current_level) + len(next_level))
                print("############################################################################")
                print(f"Pages visited: {pages_visited}")
                print(f"Current depth: {depth}")
                print(f"Average duration: {avg_time:.2f}s/user")
                print(f"Users remaining in current_level queue: {len(current_level)} (~{est_current:.1f}s)")
                print(f"Users remaining in next_level queue: {len(next_level)} (~{est_next:.1f}s)")
                print(f"Manual estimate: ~{est_manual:.1f}s")
                print("############################################################################")
    finally:
        # Release the browser even if the crawl raised or was interrupted.
        driver.quit()

    print(f"Crawling complete. Total pages visited: {pages_visited}, Final depth reached: {depth-1}")

# Start the crawl only when this code runs as the main module
# (which is the case when the notebook cell is executed).
if __name__ == "__main__":
    main()

Limiting followers for dmtrKovalenko: 50 -> 10 (not in whitelist)
Crawled dmtrKovalenko at depth 3: followers=10, following=25
############################################################################
Pages visited: 130
Current depth: 3
Average duration: 7.49s/user
Users remaining in current_level queue: 388 (~2905.0s)
Users remaining in next_level queue: 989 (~7404.8s)
Manual estimate: ~8262.0s
############################################################################
Limiting followers for win001: 35 -> 10 (not in whitelist)
Crawled win001 at depth 3: followers=10, following=12
Limiting followers for alan-turing-institute: 50 -> 10 (not in whitelist)
Crawled alan-turing-institute at depth 3: followers=10, following=0
Limiting followers for jmooring: 50 -> 10 (not in whitelist)
Crawled jmooring at depth 3: followers=10, following=0
Limiting followers for YaroslavPodorvanov: 50 -> 10 (not in whitelist)
Limiting following for YaroslavPodorvanov: 50 -> 10 (not in whitelist)
Crawled 

In [None]:
import json

DATA_FILE = "github_users.json"
OUTPUT_FILE = "graph-data.json"


def build_graph(users, default_depth=3):
    """Turn the crawler's user records into node and link lists for the graph view.

    Args:
        users: mapping of username -> record as stored under ``"users"`` in
            ``DATA_FILE`` (keys read here: ``displayName``, ``image``,
            ``followers``, ``depth``).
        default_depth: depth assumed for records without a ``depth`` value, and
            the base depth when ``users`` is empty.

    Returns:
        ``{"nodes": [...], "links": [...]}``. Crawled users come first, in input
        order. Followers that were never crawled themselves are appended
        afterwards, one level below the deepest crawled user. Each link points
        from a follower to the user being followed.
    """
    nodes = []
    usernames_seen = set()
    max_depth = default_depth
    have_depth = False

    # Crawled users carry their own depth and profile details.
    for username, info in users.items():
        depth = info.get("depth", default_depth)
        nodes.append({
            "id": username,
            # The crawler stores None when no display name / avatar was found,
            # so a plain .get() default would never apply; fall back explicitly.
            "name": info.get("displayName") or username,
            "img": info.get("image") or "",
            "url": f"https://github.com/{username}",
            "depth": depth
        })
        usernames_seen.add(username)
        max_depth = depth if not have_depth else max(max_depth, depth)
        have_depth = True

    # Followers without a record of their own are placed one level deeper.
    follower_depth = max_depth + 1
    links = []
    for username, info in users.items():
        for follower in info.get("followers") or []:
            if follower not in usernames_seen:
                nodes.append({
                    "id": follower,
                    "name": follower,
                    "img": f"./src/{follower}.jpg",
                    "url": f"https://github.com/{follower}",
                    "depth": follower_depth
                })
                usernames_seen.add(follower)
            links.append({
                "source": follower,
                "target": username
            })

    return {"nodes": nodes, "links": links}


# File I/O runs only when executed as the main module (true for a notebook
# cell), so build_graph() can be imported and tested without touching files.
if __name__ == "__main__":
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    users = raw_data["users"]
    graph = build_graph(users)
    nodes, links = graph["nodes"], graph["links"]

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(graph, f, indent=2)

    print(f"Graph data written to {OUTPUT_FILE}")

In [16]:
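# Serve this directory over HTTP so the HTML page can fetch graph-data.json
# (browsers block loading it directly from a file:// URL).
# Run in a shell, from the notebook's folder (Python 3.x):
#   python -m http.server 8000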
