In [1]:
# Install necessary packages
%pip install requests --quiet
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Function to get the page content for a given URL
def get_page_content(url, timeout=30):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole scraping loop.

    Returns:
        The response body parsed with BeautifulSoup's ``html.parser``.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url}")
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
# Function to parse star count (handling k as thousand)
def parse_star_count(star_string):
    """Convert a displayed star count such as ``"50.3k"`` into an integer.

    Surrounding whitespace and thousands separators (``"1,234"``) are ignored.
    A trailing ``k``/``K`` multiplies the number by one thousand and a trailing
    ``m``/``M`` by one million.

    Args:
        star_string: Text of the star counter as shown on the page.

    Returns:
        int: The star count as a whole number.

    Raises:
        ValueError: If the text is empty or is not a recognisable number.
    """
    multipliers = {'k': 1_000, 'm': 1_000_000}
    text = star_string.strip().replace(',', '')
    if not text:
        # Indexing text[-1] on an empty string would raise a confusing IndexError.
        raise ValueError("Cannot parse an empty star count")
    suffix = text[-1].lower()
    if suffix in multipliers:
        # round() rather than int(): float products can land just below the
        # intended whole number, and int() would truncate them one too low.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(text)

In [4]:
# Function to get repo info from repo tag and star tag
def get_repo_info(repo_tag, star_tag):
    """Extract owner, repository name, star count and URL for one repository.

    Args:
        repo_tag: Element whose first ``<a>`` holds the owner name and whose
            second ``<a>`` holds the repository name and its relative link.
        star_tag: Element whose text is the displayed star count.

    Returns:
        tuple: ``(username, repo_name, stars, repo_url)`` where ``stars`` is
        the value returned by ``parse_star_count``.
    """
    anchors = repo_tag.find_all('a')
    owner = anchors[0].text.strip()
    repo_anchor = anchors[1]
    name = repo_anchor.text.strip()
    # The href is site-relative, so prefix the GitHub host.
    url = 'https://github.com' + repo_anchor['href']
    star_total = parse_star_count(star_tag.text.strip())
    return owner, name, star_total, url

In [5]:
# Function to get repositories from a topic page
def get_topic_repositories(topic_soup):
    """Collect the repositories listed on a parsed topic page into a DataFrame.

    Repository headings (``<h3>``) and star counters (``<span>``) are located
    by their CSS classes and paired up by position.

    Args:
        topic_soup: Parsed topic page (an object exposing ``find_all``).

    Returns:
        pandas.DataFrame: One row per repository with the columns
        ``username``, ``repo_name``, ``stars`` and ``repo_url``.

    Raises:
        IndexError: If fewer star counters than repository headings are found.
    """
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    headings = topic_soup.find_all('h3', {'class': heading_class})
    counters = topic_soup.find_all('span', {'class': 'Counter js-social-count'})

    # Each record is the (username, repo_name, stars, repo_url) tuple for one repo.
    records = [
        get_repo_info(heading, counters[idx])
        for idx, heading in enumerate(headings)
    ]

    column_names = ('username', 'repo_name', 'stars', 'repo_url')
    table = {
        column: [record[position] for record in records]
        for position, column in enumerate(column_names)
    }
    return pd.DataFrame(table)

In [6]:
# Function to scrape repositories for a given topic and save to CSV
def scrape_topic_repositories(topic_url, file_path):
    """Scrape one topic page and save its repositories as a CSV file.

    If ``file_path`` already exists nothing is downloaded and the file is left
    untouched, so an interrupted run can be resumed cheaply.

    Args:
        topic_url: URL of the topic page to scrape.
        file_path: Destination of the CSV file.

    Returns:
        None.
    """
    if os.path.exists(file_path):
        print(f"The file {file_path} already exists. Skipping...")
        return
    topic_soup = get_page_content(topic_url)
    topic_repos_df = get_topic_repositories(topic_soup)
    # ``index`` is documented as a bool; pass False explicitly instead of
    # relying on None being falsy to leave the row index out of the file.
    topic_repos_df.to_csv(file_path, index=False)

In [7]:
# Functions to scrape topics data (title, description, URL)
def get_topic_titles(doc):
    """Return the title of every topic listed on the parsed topics page.

    Titles are stripped of surrounding whitespace, matching
    ``get_topic_descriptions``; they are later interpolated into file names
    and progress messages, where stray whitespace or newlines would be harmful.

    Args:
        doc: Parsed topics page (an object exposing ``find_all``).

    Returns:
        list[str]: Topic titles in page order.
    """
    title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    title_tags = doc.find_all('p', {'class': title_class})
    return [tag.text.strip() for tag in title_tags]

def get_topic_descriptions(doc):
    """Return the description of every topic listed on the parsed topics page.

    Args:
        doc: Parsed topics page (an object exposing ``find_all``).

    Returns:
        list[str]: Descriptions in page order, stripped of surrounding
        whitespace.
    """
    descriptions = []
    for paragraph in doc.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'}):
        descriptions.append(paragraph.text.strip())
    return descriptions

def get_topic_urls(doc):
    """Return the absolute URL of every topic listed on the parsed topics page.

    Args:
        doc: Parsed topics page (an object exposing ``find_all``).

    Returns:
        list[str]: One URL per topic link, formed by prefixing the link's
        ``href`` with ``https://github.com``.
    """
    anchors = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
    urls = []
    for anchor in anchors:
        urls.append('https://github.com' + anchor['href'])
    return urls

In [8]:
# Scraping all topics and repositories
def scrape_all_topics_and_repos():
    """Scrape the GitHub topics index and save each topic's repositories.

    The topics page is fetched once; every topic's title, description and URL
    are extracted, and one CSV per topic is written to the ``data`` directory
    through ``scrape_topic_repositories`` (which skips files that already
    exist).

    Raises:
        Exception: Propagated from ``get_page_content`` when a page cannot be
            loaded.
        ValueError: Raised by pandas if the numbers of titles, descriptions
            and URLs found on the page differ.
    """
    topics_url = 'https://github.com/topics'
    # get_page_content performs the request and the status check itself, so a
    # separate requests.get here would only download the page a second time.
    topics_page_soup = get_page_content(topics_url)

    topics_df = pd.DataFrame({
        'title': get_topic_titles(topics_page_soup),
        'description': get_topic_descriptions(topics_page_soup),
        'url': get_topic_urls(topics_page_soup),
    })

    os.makedirs('data', exist_ok=True)

    for title, url in zip(topics_df['title'], topics_df['url']):
        print(f'Scraping top repositories for "{title}"...')
        # A path separator inside a title would point outside the data
        # directory (or into a missing sub-directory); neutralise it.
        safe_title = title.replace('/', '_').replace('\\', '_')
        scrape_topic_repositories(url, f'data/{safe_title}.csv')

In [9]:
# Run the scraping process for all topics and repositories.
# Topics whose CSV file already exists are skipped, so their repository pages
# are not downloaded again when this cell is re-run.
scrape_all_topics_and_repos()

# Read back the CSV written for one topic (for example, "Android"); as the
# last expression in the cell, the resulting DataFrame is displayed below.
pd.read_csv("data/Android.csv")

Scraping top repositories for "3D"...
Scraping top repositories for "Ajax"...
Scraping top repositories for "Algorithm"...
Scraping top repositories for "Amp"...
Scraping top repositories for "Android"...
Scraping top repositories for "Angular"...
Scraping top repositories for "Ansible"...
Scraping top repositories for "API"...
Scraping top repositories for "Arduino"...
Scraping top repositories for "ASP.NET"...
Scraping top repositories for "Awesome Lists"...
Scraping top repositories for "Amazon Web Services"...
Scraping top repositories for "Azure"...
Scraping top repositories for "Babel"...
Scraping top repositories for "Bash"...
Scraping top repositories for "Bitcoin"...
Scraping top repositories for "Bootstrap"...
Scraping top repositories for "Bot"...
Scraping top repositories for "C"...
Scraping top repositories for "Chrome"...
Scraping top repositories for "Chrome extension"...
Scraping top repositories for "Command-line interface"...
Scraping top repositories for "Clojure"...

Unnamed: 0,username,repo_name,stars,repo_url
0,flutter,flutter,165000,https://github.com/flutter/flutter
1,facebook,react-native,118000,https://github.com/facebook/react-native
2,justjavac,free-programming-books-zh_CN,111000,https://github.com/justjavac/free-programming-...
3,Genymobile,scrcpy,109000,https://github.com/Genymobile/scrcpy
4,Hack-with-Github,Awesome-Hacking,83000,https://github.com/Hack-with-Github/Awesome-Ha...
5,Solido,awesome-flutter,53100,https://github.com/Solido/awesome-flutter
6,tldr-pages,tldr,50500,https://github.com/tldr-pages/tldr
7,google,material-design-icons,50400,https://github.com/google/material-design-icons
8,wasabeef,awesome-android-ui,50300,https://github.com/wasabeef/awesome-android-ui
9,square,okhttp,45700,https://github.com/square/okhttp


In [10]:
# Fetch and save an image from a URL
image_url = "https://raw.githubusercontent.com/github/explore/80688e429a7d4ef2fca1e82350fe8e3517d3494d/topics/nativescript/nativescript.png"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
image_response = requests.get(image_url, timeout=30)
if image_response.status_code == 200:
    # Binary mode: the response body is raw PNG bytes.
    with open('nativescript_image.png', 'wb') as image_file:
        image_file.write(image_response.content)
else:
    # Make a failed download visible instead of silently writing nothing.
    print(f"Failed to download {image_url} (status {image_response.status_code})")