In [25]:
import json
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import random
import logging

# Configuration
START_USER = "CatalinPlesu"
MAX_DEPTH = 5
MAX_PAGES = 1000
MAX_FOLLOWERS_FOLLOWING = 25
WHITELIST = [START_USER, "Ernest96"]
FAIR_LIMIT = 10
DATA_FILE = "github_users.json"
IMG_FOLDER = "img"

# Timeout settings
PAGE_LOAD_TIMEOUT = 30  # seconds
ELEMENT_WAIT_TIMEOUT = 15  # seconds
REQUEST_TIMEOUT = 20  # seconds for image downloads

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def setup_driver():
    """Setup Firefox driver with timeout configurations"""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    # Create driver
    driver = webdriver.Firefox(options=options)
    
    # Set timeouts
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    driver.implicitly_wait(ELEMENT_WAIT_TIMEOUT)
    
    return driver

def safe_get_page(driver, url, max_retries=3):
    """Safely load a page with retries and timeout handling"""
    for attempt in range(max_retries):
        try:
            logger.info(f"Loading {url} (attempt {attempt + 1}/{max_retries})")
            driver.get(url)
            
            # Wait for page to be ready
            WebDriverWait(driver, ELEMENT_WAIT_TIMEOUT).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
            
            # Add random delay to avoid being too aggressive
            time.sleep(random.uniform(1, 3))
            return True
            
        except TimeoutException:
            logger.warning(f"Timeout loading {url} on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                time.sleep(random.uniform(3, 8))  # Wait before retry
                continue
                
        except WebDriverException as e:
            logger.error(f"WebDriver error loading {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(random.uniform(3, 8))
                continue
                
        except Exception as e:
            logger.error(f"Unexpected error loading {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(random.uniform(3, 8))
                continue
    
    logger.error(f"Failed to load {url} after {max_retries} attempts")
    return False

def save_data(data):
    """Save data with error handling"""
    try:
        with open(DATA_FILE, "w") as f:
            json.dump(data, f, indent=2)
        logger.info(f"Data saved. Total users: {len(data['users'])}")
    except Exception as e:
        logger.error(f"Failed to save data: {e}")

def load_data():
    """Load existing data"""
    try:
        with open(DATA_FILE, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"users": {}}

def download_image(url, username):
    """Download image with timeout handling"""
    if not url:
        return None
        
    image_path = os.path.join(IMG_FOLDER, f"{username}.jpg")
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()
        
        with open(image_path, "wb") as f:
            f.write(response.content)
        return f"./{image_path}"
        
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout downloading image for {username}")
        return None
    except Exception as e:
        logger.warning(f"Failed to download image for {username}: {e}")
        return None

def get_user_profile(driver, username):
    """Get user profile with timeout handling"""
    url = f"https://github.com/{username}"
    
    if not safe_get_page(driver, url):
        return None, None
    
    try:
        # Check if user exists
        if "Page not found" in driver.page_source or "404" in driver.title:
            logger.warning(f"User {username} not found")
            return None, None
        
        # Get display name with timeout
        display_name = None
        try:
            wait = WebDriverWait(driver, ELEMENT_WAIT_TIMEOUT)
            display_name_elem = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".vcard-fullname"))
            )
            display_name = display_name_elem.text.strip()
        except TimeoutException:
            logger.debug(f"No display name found for {username}")
        
        # Get profile image
        image_url = None
        try:
            img_elem = driver.find_element(By.CSS_SELECTOR, "img.avatar-user")
            image_url = img_elem.get_attribute("src")
        except Exception:
            logger.debug(f"No profile image found for {username}")

        local_image_path = download_image(image_url, username)
        return display_name, local_image_path
        
    except Exception as e:
        logger.error(f"Error getting profile for {username}: {e}")
        return None, None

def get_usernames_from_list_page(driver):
    """Extract usernames from current page"""
    try:
        wait = WebDriverWait(driver, ELEMENT_WAIT_TIMEOUT)
        elements = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "div.d-table div:nth-child(2) > a:nth-child(1) > span:nth-child(2)")
            )
        )
        return [el.text.strip() for el in elements if el.text.strip()]
    except TimeoutException:
        logger.debug("No usernames found on current page")
        return []

def crawl_follow_list(driver, username, tab, depth):
    """Crawl followers or following list with better error handling"""
    users = []
    page = 1
    max_pages_per_user = 10
    consecutive_failures = 0
    max_consecutive_failures = 3
    
    while page <= max_pages_per_user and consecutive_failures < max_consecutive_failures:
        url = f"https://github.com/{username}?tab={tab}&page={page}"
        
        if not safe_get_page(driver, url):
            consecutive_failures += 1
            logger.warning(f"Failed to load page {page} for {username}'s {tab}")
            continue
        
        try:
            page_users = get_usernames_from_list_page(driver)
            if not page_users:
                logger.debug(f"No users found on page {page} for {username}'s {tab}")
                break

            users.extend(page_users)
            consecutive_failures = 0  # Reset failure counter
            logger.debug(f"Found {len(page_users)} users on page {page} of {username}'s {tab}")

            # Check for next page
            try:
                next_btn = driver.find_element(By.CSS_SELECTOR, "a.next_page")
                if 'disabled' in next_btn.get_attribute("class"):
                    break
            except Exception:
                break
                
            page += 1
            
        except Exception as e:
            consecutive_failures += 1
            logger.warning(f"Error processing page {page} for {username}'s {tab}: {e}")
    
    # Apply fair limit
    if username not in WHITELIST and len(users) > MAX_FOLLOWERS_FOLLOWING:
        logger.info(f"Limiting {tab} for {username}: {len(users)} -> {FAIR_LIMIT} (not whitelisted)")
        return users[:FAIR_LIMIT]
    
    return users

def crawl_user(driver, username, data, depth):
    """Crawl a single user with comprehensive error handling"""
    if depth > MAX_DEPTH:
        logger.info(f"Max depth {MAX_DEPTH} reached, skipping user {username}")
        return False
        
    if username in data["users"]:
        logger.debug(f"User {username} already crawled")
        return True

    try:
        logger.info(f"Crawling {username} at depth {depth}")
        
        display_name, image = get_user_profile(driver, username)
        if display_name is None and image is None:
            logger.warning(f"Could not get profile for {username}")
            return False
            
        followers = crawl_follow_list(driver, username, "followers", depth)
        following = crawl_follow_list(driver, username, "following", depth)

        data["users"][username] = {
            "username": username,
            "displayName": display_name,
            "image": image,
            "followers": followers,
            "following": following,
            "depth": depth,
            "crawled_at": time.time()
        }
        
        whitelist_status = " (whitelisted)" if username in WHITELIST else ""
        logger.info(f"Successfully crawled {username} at depth {depth}{whitelist_status}: "
                   f"followers={len(followers)}, following={len(following)}")
        
        save_data(data)
        return True
        
    except Exception as e:
        logger.error(f"Failed to crawl {username}: {e}")
        return False

def main():
    """Main function with better resource management"""
    driver = None
    try:
        # Setup
        driver = setup_driver()
        os.makedirs(IMG_FOLDER, exist_ok=True)
        
        data = load_data()
        pages_visited = 0
        failed_users = set()

        current_level = [START_USER]
        next_level = []
        depth = 1
        crawled = set(data["users"].keys())

        logger.info(f"Starting crawl from {START_USER}")
        logger.info(f"Max depth: {MAX_DEPTH}, Max pages: {MAX_PAGES}")
        logger.info(f"Timeouts - Page load: {PAGE_LOAD_TIMEOUT}s, Element wait: {ELEMENT_WAIT_TIMEOUT}s")

        while current_level and pages_visited < MAX_PAGES and depth <= MAX_DEPTH:
            user = current_level.pop(0)
            
            if user in crawled or user in failed_users:
                continue

            success = crawl_user(driver, user, data, depth)
            
            if success:
                crawled.add(user)
                pages_visited += 1

                # Add neighbors to next level
                if depth < MAX_DEPTH:
                    user_data = data["users"].get(user, {})
                    neighbors = user_data.get("followers", []) + user_data.get("following", [])
                    for neighbor in neighbors:
                        if (neighbor not in crawled and 
                            neighbor not in failed_users and
                            neighbor not in current_level and 
                            neighbor not in next_level):
                            next_level.append(neighbor)
            else:
                failed_users.add(user)
                logger.warning(f"Added {user} to failed users list")

            # Move to next depth level
            if not current_level and next_level:
                current_level, next_level = next_level, []
                depth += 1
                logger.info(f"Moving to depth {depth}, {len(current_level)} users queued")

            # Progress reporting
            if pages_visited % 5 == 0:  # More frequent reporting
                logger.info(f"Progress - Pages: {pages_visited}, Depth: {depth}, "
                          f"Current queue: {len(current_level)}, Next queue: {len(next_level)}, "
                          f"Failed: {len(failed_users)}")

        logger.info(f"Crawling complete! Pages visited: {pages_visited}, "
                   f"Final depth: {depth-1}, Failed users: {len(failed_users)}")
        
    except KeyboardInterrupt:
        logger.info("Crawling interrupted by user")
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
    finally:
        if driver:
            try:
                driver.quit()
                logger.info("Browser closed")
            except Exception as e:
                logger.error(f"Error closing browser: {e}")

if __name__ == "__main__":
    main()

2025-05-28 10:08:31,524 - INFO - Starting crawl from CatalinPlesu
2025-05-28 10:08:31,527 - INFO - Max depth: 5, Max pages: 1000
2025-05-28 10:08:31,529 - INFO - Timeouts - Page load: 30s, Element wait: 15s
2025-05-28 10:08:31,531 - INFO - Crawling CatalinPlesu at depth 1
2025-05-28 10:08:31,532 - INFO - Loading https://github.com/CatalinPlesu (attempt 1/3)
2025-05-28 10:09:04,658 - INFO - Loading https://github.com/CatalinPlesu (attempt 2/3)
2025-05-28 10:09:23,791 - INFO - Crawling interrupted by user
2025-05-28 10:09:23,805 - INFO - Browser closed


In [24]:
import json

DATA_FILE = "github_users.json"
OUTPUT_FILE = "graph-data.json"

with open(DATA_FILE, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

users = raw_data["users"]

nodes = []
usernames_seen = set()
username_to_depth = {}

# Add main users with depth
for username, info in users.items():
    depth = info.get("depth", 3)
    nodes.append({
        "id": username,
        "name": info.get("displayName", username),
        "img": info.get("image", ""),
        "url": f"https://github.com/{username}",
        "depth": depth
    })
    usernames_seen.add(username)
    username_to_depth[username] = depth

# Determine current max depth
max_depth = max(username_to_depth.values(), default=3)

# Add followers if not already included
for info in users.values():
    for follower in info.get("followers", []):
        if follower not in usernames_seen:
            follower_depth = max_depth + 1
            nodes.append({
                "id": follower,
                "name": follower,
                "img": f"./src/{follower}.jpg",
                "url": f"https://github.com/{follower}",
                "depth": follower_depth
            })
            usernames_seen.add(follower)
            username_to_depth[follower] = follower_depth

# Create edges
links = []
for username, info in users.items():
    for follower in info.get("followers", []):
        links.append({
            "source": follower,
            "target": username
        })

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump({"nodes": nodes, "links": links}, f, indent=2)

print(f"Graph data written to {OUTPUT_FILE}")

FileNotFoundError: [Errno 2] No such file or directory: 'github_users.json'

In [16]:
# Serve the file for the html to be able to load the json
# For Python 3.x
#python -m http.server 8000
