In [4]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import time

# Page that lists the repositories of the "google" GitHub organisation.
URL = "https://github.com/orgs/google/repositories"

# --- 1. Setup Database ---
# Open (or create) the SQLite file and grab a cursor for the statements below.
conn = sqlite3.connect("google_repos.db")
cur = conn.cursor()

# Rebuild the table from scratch on every run: drop any previous copy first,
# then create it again with three text columns.
for statement in (
    "DROP TABLE IF EXISTS repositories",
    """
CREATE TABLE repositories (
    name TEXT,
    language TEXT,
    stars TEXT
)
""",
):
    cur.execute(statement)
conn.commit()

print(f"Connecting to {URL}...")

# --- 2. Scrape Page ---
# Send a desktop-Chrome User-Agent so the request looks like it comes from a
# regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Language names that are accepted when one of them appears as its own piece
# of text next to the language dot. Built once instead of once per repository.
possible_langs = frozenset({
    "Java", "Python", "C++", "Go", "Rust", "TypeScript", "JavaScript",
    "C", "HTML", "Kotlin", "Dart",
})

# Stays empty when the request fails or returns a non-200 status, so the
# extraction loop below simply has nothing to do and the script still reaches
# the verification/close step.
repo_list = []

try:
    # The timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(URL, headers=headers, timeout=30)
except requests.RequestException as e:
    print(f"Error: Failed to retrieve page. Request did not complete: {e}")
else:
    print(f"Page Status Code: {res.status_code}") # Should be 200

    if res.status_code != 200:
        print("Error: Failed to retrieve page. GitHub might be blocking the request.")
    else:
        soup = BeautifulSoup(res.text, "html.parser")

        # STRATEGY 1: Look for list items with class 'public'
        repo_list = soup.select("li.public")

        # STRATEGY 2: If that found nothing, take every link inside an <h3>
        # and use its enclosing <li> as the repository entry.
        if not repo_list:
            print("Standard list not found. Trying backup strategy...")
            repo_list = []
            seen_items = set()
            for title_link in soup.select("h3 a"):
                parent_li = title_link.find_parent("li")
                # Skip links that are not inside an <li>, and do not add the
                # same <li> twice when it holds more than one matching link.
                if parent_li is not None and id(parent_li) not in seen_items:
                    seen_items.add(id(parent_li))
                    repo_list.append(parent_li)

        print(f"Found {len(repo_list)} repositories.")

# --- 3. Extract & Save Data ---
for item in repo_list:
    # Best-effort per repository: a failure on one entry is reported and the
    # loop moves on to the next one.
    try:
        # Name: text of the link inside the entry's <h3>
        name_tag = item.select_one("h3 a")
        name = name_tag.get_text(strip=True) if name_tag else "Unknown"

        # Language: locate the colored dot (span.repo-language-color) and read
        # the text pieces of its parent element.
        lang_dot = item.select_one("span.repo-language-color")

        if lang_dot:
            lang_parts = list(lang_dot.parent.stripped_strings)
            # Same string that get_text(strip=True) would produce.
            language = "".join(lang_parts)
            # Match whole text pieces rather than substrings, so that
            # "JavaScript" is not reported as "Java" and "CSS" is not
            # reported as "C". If no piece is a known language, keep the
            # scraped text unchanged.
            found_lang = next(
                (part for part in lang_parts if part in possible_langs),
                language,
            )
        else:
            language = "Unknown"
            found_lang = language

        # Stars: Look for the link that points to /stargazers
        star_tag = item.select_one("a[href$='/stargazers']")
        stars = star_tag.get_text(strip=True) if star_tag else "0"
        stars = stars.replace(",", "").strip()

        print(f"Saving: {name} | {found_lang} | {stars}")

        cur.execute("INSERT INTO repositories VALUES (?, ?, ?)", (name, found_lang, stars))

    except Exception as e:
        print(f"Skipping item: {e}")

# One commit for the whole batch. No per-row delay is needed because the loop
# only reads the already-downloaded page and writes to the local database.
conn.commit()

# --- 4. Verify Data ---
# Read back everything that was stored and echo it, then release the
# database connection.
print("\n--- Verifying Database Content ---")
rows = cur.execute("SELECT * FROM repositories").fetchall()

if rows:
    # One stored tuple per output line.
    for row in rows:
        print(row)
else:
    print("WARNING: Database is still empty!")

conn.close()
print("Done.")

Connecting to https://github.com/orgs/google/repositories...
Page Status Code: 200
Standard list not found. Trying backup strategy...
Found 30 repositories.
Saving: dawn | Unknown | 780
Saving: adk-python | Unknown | 16k
Saving: XNNPACK | Unknown | 2.2k
Saving: zerocopy | Unknown | 2.1k
Saving: closure-templates | Unknown | 671
Saving: perfetto | Unknown | 5k
Saving: skia-buildbot | Unknown | 158
Saving: netkat | Unknown | 27
Saving: site-kit-wp | Unknown | 1.3k
Saving: skia | Unknown | 10k
Saving: tunix | Unknown | 1.9k
Saving: qwix | Unknown | 66
Saving: cameratrapai | Unknown | 387
Saving: cel-java | Unknown | 227
Saving: meridian | Unknown | 1.2k
Saving: selinux-policy-languages | Unknown | 14
Saving: adk-samples | Unknown | 6.5k
Saving: orbax | Unknown | 455
Saving: gvisor | Unknown | 17k
Saving: osv-scanner | Unknown | 8.1k
Saving: jetpack-camera-app | Unknown | 276
Saving: crubit | Unknown | 924
Saving: earthengine-api | Unknown | 3.1k
Saving: osv.dev | Unknown | 2.4k
Saving: in