In [None]:
import json
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

# Root of the crawl: main() seeds the first level with this user.
START_USER = "CatalinPlesu"
# Deepest level that is crawled; the start user is depth 1 (see main / crawl_user).
MAX_DEPTH = 5
# Upper bound on crawled users per run (main counts one "page" per crawled user).
MAX_PAGES = 1000
# For any depth other than 1, a followers/following list longer than this is
# discarded entirely (crawl_follow_list returns an empty list).
MAX_FOLLOWERS_FOLLOWING = 25
# JSON file the crawl state is written to by save_data and read by load_data.
DATA_FILE = "github_users.json"
# Directory where download_image stores avatars as <username>.jpg.
IMG_FOLDER = "img"

# Headless Firefox session shared by every page-fetching function in this cell.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

# Make sure the avatar directory exists before any download is attempted.
os.makedirs(IMG_FOLDER, exist_ok=True)

def save_data(data, path=None):
    """Persist the crawl state to disk as indented JSON.

    The payload is first written to a temporary sibling file and then moved
    over the destination with ``os.replace``. This function is called after
    every crawled user, so an interrupt or a serialization error must not
    leave a truncated, unparseable state file behind.

    Args:
        data: JSON-serializable crawl state.
        path: Destination file; defaults to the module-level ``DATA_FILE``.

    Raises:
        TypeError / OSError: propagated from ``json.dump`` or the file system;
            the previous contents of the destination are left intact.
    """
    target = DATA_FILE if path is None else path
    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=2)
        # Single rename step: readers see either the old or the new file.
        os.replace(tmp_path, target)
    except BaseException:
        # Do not leave a half-written temp file lying around.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def load_data(path=None):
    """Load previously saved crawl state, or start with an empty one.

    Args:
        path: File to read; defaults to the module-level ``DATA_FILE``.

    Returns:
        A dict that always contains a ``"users"`` mapping. A missing file
        yields ``{"users": {}}``; a stored object without the key gets an
        empty mapping added so callers can index ``data["users"]`` safely.

    Raises:
        ValueError: if the file parses but its top level is not a JSON object.
        json.JSONDecodeError: if the file is not valid JSON (deliberately not
            swallowed, so a damaged state file is never silently overwritten).
    """
    source = DATA_FILE if path is None else path
    try:
        with open(source, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {"users": {}}

    if not isinstance(data, dict):
        raise ValueError(f"{source} does not contain a JSON object")
    data.setdefault("users", {})
    return data

def download_image(url, username):
    """Fetch an avatar and store it as ``<IMG_FOLDER>/<username>.jpg``.

    Args:
        url: Address of the image; a falsy value means "nothing to download".
        username: Used for the file name and in the failure message.

    Returns:
        The saved file's path prefixed with ``./``, or ``None`` when ``url`` is
        falsy or when the request or the write fails (the failure is printed).
    """
    if not url:
        return None

    image_path = os.path.join(IMG_FOLDER, f"{username}.jpg")
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        with open(image_path, "wb") as out:
            out.write(reply.content)
    except Exception as err:
        # Best effort: a missing avatar must not abort the crawl.
        print(f"Failed to download image for {username}: {err}")
        return None
    return f"./{image_path}"

def get_user_profile(username):
    """Open a user's GitHub profile page and collect display name and avatar.

    Args:
        username: GitHub login whose profile page is loaded.

    Returns:
        ``(display_name, local_image_path)``. ``display_name`` is ``None`` when
        the name element cannot be read; ``local_image_path`` is whatever
        ``download_image`` returns for the avatar URL (``None`` if the avatar
        element cannot be read or the download fails).
    """
    url = f"https://github.com/{username}"
    driver.get(url)
    # Fixed pause after navigation before querying the page.
    time.sleep(1)

    # ``except Exception`` instead of a bare ``except``: a missing element is
    # still tolerated, but KeyboardInterrupt / SystemExit are no longer
    # swallowed, so the crawl can actually be interrupted.
    try:
        display_name = driver.find_element(By.CSS_SELECTOR, ".vcard-fullname").text.strip()
    except Exception:
        display_name = None
    try:
        image_url = driver.find_element(By.CSS_SELECTOR, "img.avatar-user").get_attribute("src")
    except Exception:
        image_url = None

    local_image_path = download_image(image_url, username)
    return display_name, local_image_path

def get_usernames_from_list_page():
    """Return the stripped text of every username element on the loaded page.

    Operates on whatever page the shared ``driver`` currently has open; the
    result is an empty list when the selector matches nothing.
    """
    selector = "div.d-table div:nth-child(2) > a:nth-child(1) > span:nth-child(2)"
    names = []
    for span in driver.find_elements(By.CSS_SELECTOR, selector):
        names.append(span.text.strip())
    return names

def crawl_follow_list(username, tab, depth):
    """Collect the usernames on a user's followers/following tab, page by page.

    Args:
        username: GitHub login whose list is read.
        tab: Value of the ``tab`` query parameter (called with "followers" and
            "following" by ``crawl_user``).
        depth: Crawl depth of ``username``. At depth 1 the whole list is
            collected; at any other depth the crawl is abandoned as soon as
            the running total would exceed ``MAX_FOLLOWERS_FOLLOWING``.

    Returns:
        The collected usernames, or ``[]`` when the list was abandoned for
        being too long (nothing partial is returned in that case).
    """
    users = []
    page = 1
    while True:
        url = f"https://github.com/{username}?tab={tab}&page={page}"
        driver.get(url)
        # Fixed pause after navigation before querying the page.
        time.sleep(1)
        page_users = get_usernames_from_list_page()
        if not page_users:
            break

        if depth != 1 and (len(users) + len(page_users)) > MAX_FOLLOWERS_FOLLOWING:
            print(f"Aborting {tab} crawl for {username} at page {page} due to length > {MAX_FOLLOWERS_FOLLOWING} and depth != 1")
            return []

        users.extend(page_users)

        # Stop when there is no usable "next" link. ``except Exception`` (not a
        # bare ``except``) keeps this best-effort while letting
        # KeyboardInterrupt / SystemExit propagate.
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "a.next_page")
            if 'disabled' in next_btn.get_attribute("class"):
                break
        except Exception:
            break
        page += 1
    return users

def crawl_user(username, data, depth):
    """Crawl one user and record the result in ``data["users"]``.

    Does nothing (apart from printing a notice) when ``depth`` exceeds
    ``MAX_DEPTH`` or when the user is already present in ``data``. Otherwise
    the profile, followers and following are fetched in that order, stored
    under the username, reported, and the whole state is saved to disk.

    Args:
        username: GitHub login to crawl.
        data: Crawl state dict with a ``"users"`` mapping; mutated in place.
        depth: Level of this user in the crawl (stored with the record).
    """
    if depth > MAX_DEPTH:
        print(f"Max depth {MAX_DEPTH} reached, skipping user {username}")
        return

    known_users = data["users"]
    if username in known_users:
        print(f"User {username} already crawled.")
        return

    display_name, image = get_user_profile(username)
    follower_names = crawl_follow_list(username, "followers", depth)
    following_names = crawl_follow_list(username, "following", depth)

    record = {
        "username": username,
        "displayName": display_name,
        "image": image,
        "followers": follower_names,
        "following": following_names,
        "depth": depth,
    }
    known_users[username] = record

    follower_count = len(follower_names)
    following_count = len(following_names)
    print(f"Crawled {username} at depth {depth}: followers={follower_count}, following={following_count}")
    # Persist after every user so progress survives an interrupted run.
    save_data(data)

def main():
    """Breadth-first crawl of the follow graph starting at ``START_USER``.

    Users are processed level by level (the start user is depth 1) until the
    frontier is empty, ``MAX_DEPTH`` levels have been processed, or
    ``MAX_PAGES`` users have been crawled in this run.

    State loaded from disk is honoured on resume: users already present in the
    data file are not crawled again, but their stored followers/following are
    still expanded so the traversal continues past them. (Previously, skipping
    an already-crawled user could leave the current level empty without
    promoting the next level, ending the run immediately.)

    The shared browser is always shut down, even if crawling raises.
    """
    data = load_data()
    max_depth = MAX_DEPTH
    max_pages = MAX_PAGES
    pages_visited = 0  # number of users crawled in this run

    crawled = set(data["users"].keys())
    current_level = [START_USER]
    # Every user ever placed in a level during this run; a set keeps the
    # duplicate check O(1) instead of scanning the level lists.
    queued = {START_USER}
    depth = 1
    deepest_processed = 0

    try:
        while current_level and pages_visited < max_pages and depth <= max_depth:
            next_level = []
            for position, user in enumerate(current_level):
                if pages_visited >= max_pages:
                    break
                deepest_processed = depth

                newly_crawled = user not in crawled
                if newly_crawled:
                    crawl_user(user, data, depth)
                    crawled.add(user)
                    pages_visited += 1

                # Expand neighbours from the stored record, whether it was
                # written just now or by an earlier run.
                user_data = data["users"].get(user, {})
                neighbors = user_data.get("followers", []) + user_data.get("following", [])
                for u in neighbors:
                    if u not in queued:
                        queued.add(u)
                        next_level.append(u)

                if newly_crawled and pages_visited % 10 == 0:
                    print(f"Pages visited: {pages_visited}")
                    print(f"Current depth: {depth}")
                    print(f"Users remaining in current_level queue: {len(current_level) - position - 1}")
                    print(f"Users remaining in next_level queue: {len(next_level)}")

            current_level = next_level
            depth += 1
    finally:
        # Release the Firefox process even on errors or a kernel interrupt.
        driver.quit()

    print(f"Crawling complete. Total pages visited: {pages_visited}, Final depth reached: {deepest_processed}")

# Start the crawl when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()

Crawled CatalinPlesu at depth 1: followers=28, following=50
Aborting followers crawl for mi6paulino at page 1 due to length > 25 and depth != 1
Aborting following crawl for mi6paulino at page 1 due to length > 25 and depth != 1
Crawled mi6paulino at depth 2: followers=0, following=0
Crawled DenisMunjiu at depth 2: followers=1, following=1
Crawled BeginnerDuelist at depth 2: followers=6, following=11
Crawled c-harea at depth 2: followers=4, following=4
Crawled UzunPaula at depth 2: followers=6, following=21
Aborting following crawl for sergiuprt at page 1 due to length > 25 and depth != 1
Crawled sergiuprt at depth 2: followers=19, following=0
Aborting followers crawl for Raduc4 at page 1 due to length > 25 and depth != 1
Aborting following crawl for Raduc4 at page 1 due to length > 25 and depth != 1
Crawled Raduc4 at depth 2: followers=0, following=0
Aborting following crawl for AnnHR at page 1 due to length > 25 and depth != 1
Crawled AnnHR at depth 2: followers=5, following=0
Crawled

In [17]:
import json
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By

# Root of the crawl: main() seeds the first level with this user.
START_USER = "CatalinPlesu"
# Deepest level that is crawled; the start user is depth 1 (see main / crawl_user).
MAX_DEPTH = 3
# Upper bound on crawled users per run (main counts one "page" per crawled user).
MAX_PAGES = 1000
# For any depth other than 1, a followers/following list longer than this is
# discarded entirely (crawl_follow_list returns an empty list).
MAX_FOLLOWERS_FOLLOWING = 25
# JSON file the crawl state is written to by save_data and read by load_data.
DATA_FILE = "github_users.json"
# Directory where download_image stores avatars as <username>.jpg.
IMG_FOLDER = "img"

# Headless Firefox session shared by every page-fetching function in this cell.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

# Make sure the avatar directory exists before any download is attempted.
os.makedirs(IMG_FOLDER, exist_ok=True)

def save_data(data, path=None):
    """Persist the crawl state to disk as indented JSON.

    The payload is first written to a temporary sibling file and then moved
    over the destination with ``os.replace``. This function is called after
    every crawled user, so an interrupt or a serialization error must not
    leave a truncated, unparseable state file behind.

    Args:
        data: JSON-serializable crawl state.
        path: Destination file; defaults to the module-level ``DATA_FILE``.

    Raises:
        TypeError / OSError: propagated from ``json.dump`` or the file system;
            the previous contents of the destination are left intact.
    """
    target = DATA_FILE if path is None else path
    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=2)
        # Single rename step: readers see either the old or the new file.
        os.replace(tmp_path, target)
    except BaseException:
        # Do not leave a half-written temp file lying around.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def load_data(path=None):
    """Load previously saved crawl state, or start with an empty one.

    Args:
        path: File to read; defaults to the module-level ``DATA_FILE``.

    Returns:
        A dict that always contains a ``"users"`` mapping. A missing file
        yields ``{"users": {}}``; a stored object without the key gets an
        empty mapping added so callers can index ``data["users"]`` safely.

    Raises:
        ValueError: if the file parses but its top level is not a JSON object.
        json.JSONDecodeError: if the file is not valid JSON (deliberately not
            swallowed, so a damaged state file is never silently overwritten).
    """
    source = DATA_FILE if path is None else path
    try:
        with open(source, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {"users": {}}

    if not isinstance(data, dict):
        raise ValueError(f"{source} does not contain a JSON object")
    data.setdefault("users", {})
    return data

def download_image(url, username):
    """Fetch an avatar and store it as ``<IMG_FOLDER>/<username>.jpg``.

    Args:
        url: Address of the image; a falsy value means "nothing to download".
        username: Used for the file name and in the failure message.

    Returns:
        The saved file's path prefixed with ``./``, or ``None`` when ``url`` is
        falsy or when the request or the write fails (the failure is printed).
    """
    if not url:
        return None

    image_path = os.path.join(IMG_FOLDER, f"{username}.jpg")
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        with open(image_path, "wb") as out:
            out.write(reply.content)
    except Exception as err:
        # Best effort: a missing avatar must not abort the crawl.
        print(f"Failed to download image for {username}: {err}")
        return None
    return f"./{image_path}"

def get_user_profile(username):
    """Open a user's GitHub profile page and collect display name and avatar.

    Args:
        username: GitHub login whose profile page is loaded.

    Returns:
        ``(display_name, local_image_path)``. ``display_name`` is ``None`` when
        the name element cannot be read; ``local_image_path`` is whatever
        ``download_image`` returns for the avatar URL (``None`` if the avatar
        element cannot be read or the download fails).
    """
    url = f"https://github.com/{username}"
    driver.get(url)
    # Fixed pause after navigation before querying the page.
    time.sleep(1)

    # ``except Exception`` instead of a bare ``except``: a missing element is
    # still tolerated, but KeyboardInterrupt / SystemExit are no longer
    # swallowed, so the crawl can actually be interrupted.
    try:
        display_name = driver.find_element(By.CSS_SELECTOR, ".vcard-fullname").text.strip()
    except Exception:
        display_name = None
    try:
        image_url = driver.find_element(By.CSS_SELECTOR, "img.avatar-user").get_attribute("src")
    except Exception:
        image_url = None

    local_image_path = download_image(image_url, username)
    return display_name, local_image_path

def get_usernames_from_list_page():
    """Return the stripped text of every username element on the loaded page.

    Operates on whatever page the shared ``driver`` currently has open; the
    result is an empty list when the selector matches nothing.
    """
    selector = "div.d-table div:nth-child(2) > a:nth-child(1) > span:nth-child(2)"
    names = []
    for span in driver.find_elements(By.CSS_SELECTOR, selector):
        names.append(span.text.strip())
    return names

def crawl_follow_list(username, tab, depth):
    """Collect the usernames on a user's followers/following tab, page by page.

    Args:
        username: GitHub login whose list is read.
        tab: Value of the ``tab`` query parameter (called with "followers" and
            "following" by ``crawl_user``).
        depth: Crawl depth of ``username``. At depth 1 the whole list is
            collected; at any other depth the crawl is abandoned as soon as
            the running total would exceed ``MAX_FOLLOWERS_FOLLOWING``.

    Returns:
        The collected usernames, or ``[]`` when the list was abandoned for
        being too long (nothing partial is returned in that case).
    """
    users = []
    page = 1
    while True:
        url = f"https://github.com/{username}?tab={tab}&page={page}"
        driver.get(url)
        # Fixed pause after navigation before querying the page.
        time.sleep(1)
        page_users = get_usernames_from_list_page()
        if not page_users:
            break

        if depth != 1 and (len(users) + len(page_users)) > MAX_FOLLOWERS_FOLLOWING:
            print(f"Aborting {tab} crawl for {username} at page {page} due to length > {MAX_FOLLOWERS_FOLLOWING} and depth != 1")
            return []

        users.extend(page_users)

        # Stop when there is no usable "next" link. ``except Exception`` (not a
        # bare ``except``) keeps this best-effort while letting
        # KeyboardInterrupt / SystemExit propagate.
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "a.next_page")
            if 'disabled' in next_btn.get_attribute("class"):
                break
        except Exception:
            break
        page += 1
    return users

def crawl_user(username, data, depth):
    """Crawl one user and record the result in ``data["users"]``.

    Does nothing (apart from printing a notice) when ``depth`` exceeds
    ``MAX_DEPTH`` or when the user is already present in ``data``. Otherwise
    the profile, followers and following are fetched in that order, stored
    under the username, reported, and the whole state is saved to disk.

    Args:
        username: GitHub login to crawl.
        data: Crawl state dict with a ``"users"`` mapping; mutated in place.
        depth: Level of this user in the crawl (stored with the record).
    """
    if depth > MAX_DEPTH:
        print(f"Max depth {MAX_DEPTH} reached, skipping user {username}")
        return

    known_users = data["users"]
    if username in known_users:
        print(f"User {username} already crawled.")
        return

    display_name, image = get_user_profile(username)
    follower_names = crawl_follow_list(username, "followers", depth)
    following_names = crawl_follow_list(username, "following", depth)

    record = {
        "username": username,
        "displayName": display_name,
        "image": image,
        "followers": follower_names,
        "following": following_names,
        "depth": depth,
    }
    known_users[username] = record

    follower_count = len(follower_names)
    following_count = len(following_names)
    print(f"Crawled {username} at depth {depth}: followers={follower_count}, following={following_count}")
    # Persist after every user so progress survives an interrupted run.
    save_data(data)

def main():
    """Breadth-first crawl of the follow graph starting at ``START_USER``.

    Users are processed level by level (the start user is depth 1) until the
    frontier is empty, ``MAX_DEPTH`` levels have been processed, or
    ``MAX_PAGES`` users have been crawled in this run.

    State loaded from disk is honoured on resume: users already present in the
    data file are not crawled again, but their stored followers/following are
    still expanded so the traversal continues past them. (Previously, skipping
    an already-crawled user could leave the current level empty without
    promoting the next level, ending the run immediately.)

    The shared browser is always shut down, even if crawling raises.
    """
    data = load_data()
    max_depth = MAX_DEPTH
    max_pages = MAX_PAGES
    pages_visited = 0  # number of users crawled in this run

    crawled = set(data["users"].keys())
    current_level = [START_USER]
    # Every user ever placed in a level during this run; a set keeps the
    # duplicate check O(1) instead of scanning the level lists.
    queued = {START_USER}
    depth = 1
    deepest_processed = 0

    try:
        while current_level and pages_visited < max_pages and depth <= max_depth:
            next_level = []
            for position, user in enumerate(current_level):
                if pages_visited >= max_pages:
                    break
                deepest_processed = depth

                newly_crawled = user not in crawled
                if newly_crawled:
                    crawl_user(user, data, depth)
                    crawled.add(user)
                    pages_visited += 1

                # Expand neighbours from the stored record, whether it was
                # written just now or by an earlier run.
                user_data = data["users"].get(user, {})
                neighbors = user_data.get("followers", []) + user_data.get("following", [])
                for u in neighbors:
                    if u not in queued:
                        queued.add(u)
                        next_level.append(u)

                if newly_crawled and pages_visited % 10 == 0:
                    print(f"Pages visited: {pages_visited}")
                    print(f"Current depth: {depth}")
                    print(f"Users remaining in current_level queue: {len(current_level) - position - 1}")
                    print(f"Users remaining in next_level queue: {len(next_level)}")

            current_level = next_level
            depth += 1
    finally:
        # Release the Firefox process even on errors or a kernel interrupt.
        driver.quit()

    print(f"Crawling complete. Total pages visited: {pages_visited}, Final depth reached: {deepest_processed}")

# Start the crawl when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()

Graph data written to graph-data.json


In [16]:
# Serve the files over HTTP so that the HTML page is able to load the JSON.
# With Python 3.x, run the following command from a shell:
#   python -m http.server 8000
