In [6]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import time

# Scrape the first page of Google's public GitHub repositories and store
# each repository's name, language, and star count in a local SQLite table.

# URL for Google's repositories
URL = "https://github.com/orgs/google/repositories"

# Seconds to wait on GitHub before giving up. Without a timeout,
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Schema for the results table. All three columns are stored as text; the
# star count keeps GitHub's abbreviated form (e.g. "2.1k").
CREATE_TABLE_SQL = """
CREATE TABLE repositories (
    name TEXT,
    language TEXT,
    stars TEXT
)
"""

# --- 1. Setup Database ---
conn = sqlite3.connect("google_repos.db")
try:
    cur = conn.cursor()

    # Reset table to ensure clean data for this run (optional)
    cur.execute("DROP TABLE IF EXISTS repositories")
    cur.execute(CREATE_TABLE_SQL)
    conn.commit()

    print(f"Connecting to {URL}...")

    # --- 2. Scrape Page ---
    # We use a header to look like a real browser (Chrome)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # A network failure or timeout raises here; the `finally` below still
    # closes the database connection before the error propagates.
    res = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
    print(f"Page Status Code: {res.status_code}") # Should be 200

    if res.status_code != 200:
        print("Error: Failed to retrieve page. GitHub might be blocking the request.")
    else:
        soup = BeautifulSoup(res.text, "html.parser")

        # STRATEGY 1: Look for list items with class 'public' (Standard GitHub view)
        repo_list = soup.select("li.public")

        # STRATEGY 2: If that failed, fall back to every <li> that encloses
        # an <h3><a> title link.
        if not repo_list:
            print("Standard list not found. Trying backup strategy...")
            # Track parents by identity so an <li> holding several title
            # links is only recorded once (avoids duplicate rows). No
            # filtering on the link target is performed.
            seen_parents = set()
            repo_list = []
            for title_link in soup.select("h3 a"):
                parent_li = title_link.find_parent("li")
                if parent_li is None or id(parent_li) in seen_parents:
                    continue
                seen_parents.add(id(parent_li))
                repo_list.append(parent_li)

        print(f"Found {len(repo_list)} repositories.")

        # --- 3. Extract & Save Data ---
        for item in repo_list:
            # Best-effort per item: one malformed entry is reported and
            # skipped rather than aborting the whole run.
            try:
                # Name: Usually in an <h3> tag with an <a> inside
                name_tag = item.select_one("h3 a")
                name = name_tag.get_text(strip=True) if name_tag else "Unknown"

                # Language: Look for the specific programmingLanguage itemprop
                # NOTE(review): a recorded run of this script printed "None"
                # for every repository, so this selector probably does not
                # match the markup GitHub currently serves -- confirm against
                # the live HTML.
                lang_tag = item.select_one("[itemprop='programmingLanguage']")
                language = lang_tag.get_text(strip=True) if lang_tag else "None"

                # Stars: Look for the link that points to /stargazers
                star_tag = item.select_one("a[href$='/stargazers']")
                stars = star_tag.get_text(strip=True) if star_tag else "0"

                # Drop thousands separators (e.g. "1,234" -> "1234");
                # abbreviated counts such as "1.5k" are stored unchanged.
                stars = stars.replace(",", "").strip()

                print(f"Saving: {name} | {language} | {stars}")

                cur.execute("INSERT INTO repositories VALUES (?, ?, ?)", (name, language, stars))
                # Commit per row so already-saved rows survive an interrupted run.
                conn.commit()

                time.sleep(1) # Requirement: sleep 1 second

            except Exception as e:
                print(f"Skipping an item due to error: {e}")

    # --- 4. Verify Data ---
    # Runs even when the download failed, in which case the table is empty.
    print("\n--- Verifying Database Content ---")
    cur.execute("SELECT * FROM repositories")
    rows = cur.fetchall()

    if not rows:
        print("WARNING: Database is still empty!")
    else:
        for row in rows:
            print(row)
finally:
    # Always release the database file, including on request or SQL errors.
    conn.close()

print("Done.")

Connecting to https://github.com/orgs/google/repositories...
Page Status Code: 200
Standard list not found. Trying backup strategy...
Found 30 repositories.
Saving: zerocopy | None | 2.1k
Saving: closure-templates | None | 671
Saving: skia-buildbot | None | 158
Saving: perfetto | None | 5k
Saving: netkat | None | 27
Saving: site-kit-wp | None | 1.3k
Saving: skia | None | 10k
Saving: tunix | None | 1.9k
Saving: qwix | None | 66
Saving: cameratrapai | None | 387
Saving: XNNPACK | None | 2.2k
Saving: cel-java | None | 227
Saving: dawn | None | 780
Saving: meridian | None | 1.2k
Saving: selinux-policy-languages | None | 14
Saving: adk-samples | None | 6.5k
Saving: orbax | None | 455
Saving: gvisor | None | 17k
Saving: osv-scanner | None | 8.1k
Saving: jetpack-camera-app | None | 276
Saving: crubit | None | 924
Saving: earthengine-api | None | 3.1k
Saving: osv.dev | None | 2.4k
Saving: inverting-proxy | None | 262
Saving: closure-compiler | None | 7.6k
Saving: error-prone | None | 7.1k
Savi