In [20]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd

# Root URL of the GitHub site; the scraping helpers prepend it to the href
# values read from scraped tags to build absolute links.
base_url = "https://github.com"

In [21]:
def get_topic_title(doc):
    """Collect the topic titles from a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all(name, attrs)`` (a BeautifulSoup
            document in this notebook).

    Returns:
        list[str]: Text of every ``<p>`` tag carrying the title CSS classes,
        with surrounding whitespace removed.
    """
    selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    topic_title_tags = doc.find_all('p', {'class' : selection_class})

    # Strip like get_topic_des does: the title is later used to build a CSV
    # filename, so stray newlines/indentation from the markup must not leak in.
    return [tag.text.strip() for tag in topic_title_tags]

def get_topic_des(doc):
    """Collect the topic descriptions from a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all(name, attrs)``.

    Returns:
        list[str]: Whitespace-stripped text of every ``<p>`` tag carrying the
        description CSS classes, in document order.
    """
    description_paragraphs = doc.find_all('p', {'class' : 'f5 color-fg-muted mb-0 mt-1'})
    return [paragraph.text.strip() for paragraph in description_paragraphs]

def get_topic_url(doc):
    """Build the absolute URL of every topic linked from a topics listing page.

    Args:
        doc: Parsed page exposing ``find_all(name, attrs)``.

    Returns:
        list[str]: ``base_url`` joined with the ``href`` of each matching
        ``<a>`` tag, in document order.
    """
    anchors = doc.find_all('a', {'class' : 'no-underline flex-1 d-flex flex-column'})
    return [base_url + anchor["href"] for anchor in anchors]


def scrape_topics(timeout=30):
    """Download the GitHub topics listing and return it as a DataFrame.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            cell indefinitely.

    Returns:
        pandas.DataFrame: One row per topic with the columns ``title``,
        ``description`` and ``url``.

    Raises:
        Exception: If the response status code is not 200.
    """
    topic_url = "https://github.com/topics"
    res = requests.get(topic_url, timeout=timeout)
    if res.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))

    doc = BeautifulSoup(res.text, 'html.parser')

    topic_dict = {
        'title' : get_topic_title(doc),
        'description' : get_topic_des(doc),
        'url' : get_topic_url(doc)
    }
    return pd.DataFrame(topic_dict)

In [22]:
def get_topic_page(topic_url, timeout=30):
    """Download a topic page and parse it with BeautifulSoup.

    Args:
        topic_url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            cell indefinitely.

    Returns:
        BeautifulSoup: The page parsed with the ``html.parser`` backend.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Download the page
    res = requests.get(topic_url, timeout=timeout)

    # Anything other than a plain 200 is treated as a failed download
    if res.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))

    # Parse using beautifulsoup
    return BeautifulSoup(res.text, 'html.parser')

def parse_star_count(stars_str):
    """Convert a displayed star count such as ``"1.5k"`` into an integer.

    Surrounding whitespace and thousands separators (``,``) are ignored, and
    the ``k`` (thousand) and ``m`` (million) suffixes are accepted in either
    case.

    Args:
        stars_str: The textual star count.

    Returns:
        int: The numeric star count.

    Raises:
        ValueError: If the string is empty after stripping, or is not a
            recognisable number.
    """
    cleaned = stars_str.strip().replace(',', '').lower()
    if not cleaned:
        # Indexing cleaned[-1] on an empty string would raise an opaque
        # IndexError; report the real problem instead.
        raise ValueError('Empty star count: {!r}'.format(stars_str))

    multipliers = {'k': 1000, 'm': 1000000}
    suffix = cleaned[-1]
    if suffix in multipliers:
        # round() rather than plain truncation, so binary floating-point error
        # in the product cannot drop the result by one.
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)

def get_repo_info(h_tag,star_tag):
    """Extract owner, repository name, URL and star count for one repository.

    Args:
        h_tag: Heading tag whose first ``<a>`` holds the owner name and whose
            second ``<a>`` holds the repository name and its href.
        star_tag: Tag whose text is the displayed star count.

    Returns:
        tuple: ``(username, repo_name, repo_url, star)`` where ``repo_url`` is
        ``base_url`` joined with the repository href and ``star`` is the value
        returned by ``parse_star_count``.
    """
    links = h_tag.find_all('a')
    owner_link, repo_link = links[0], links[1]
    return (
        owner_link.text.strip(),
        repo_link.text.strip(),
        base_url + repo_link["href"],
        parse_star_count(star_tag.text),
    )

def get_topic_repos(topic_doc):
    """Tabulate the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page exposing ``find_all(name, attrs)``.

    Returns:
        pandas.DataFrame: One row per repository heading found, with the
        columns ``username``, ``repo_name``, ``stars`` and ``repo_url``.
    """
    # Each heading holds the owner/repo links; star counters are matched to
    # headings purely by their position in the document.
    headings = topic_doc.find_all('h3', {'class' : 'f3 color-fg-muted text-normal lh-condensed'})
    star_spans = topic_doc.find_all('span',{'id' : 'repo-stars-counter-star'})

    usernames, repo_names, star_counts, repo_urls = [], [], [], []
    for position, heading in enumerate(headings):
        username, repo_name, repo_url, stars = get_repo_info(heading, star_spans[position])
        usernames.append(username)
        repo_names.append(repo_name)
        star_counts.append(stars)
        repo_urls.append(repo_url)

    return pd.DataFrame({
        'username' : usernames,
        'repo_name' : repo_names,
        'stars' : star_counts,
        'repo_url' : repo_urls,
    })

def scrape_topic(topic_url, topic_name):
    """Scrape one topic page and save its repositories to ``<topic_name>.csv``.

    The file is written relative to the current working directory. If it
    already exists the topic is skipped, so nothing is downloaded again.

    Args:
        topic_url: Absolute URL of the topic page.
        topic_name: Topic title; used verbatim as the CSV file stem.

    Returns:
        None
    """
    fname = topic_name + '.csv'
    if os.path.exists(fname):
        print("The file {} already exists. Skipping..".format(fname))
        return

    topic_df = get_topic_repos(get_topic_page(topic_url))
    # to_csv documents ``index`` as a bool; pass False explicitly instead of
    # relying on None happening to be falsy.
    topic_df.to_csv(fname, index=False)

In [23]:
def scrape_topic_repos():
    """Scrape the topics listing, then scrape and save each topic's repositories.

    Prints a progress line per topic and delegates the download and CSV
    writing of each topic to ``scrape_topic``.

    Returns:
        None
    """
    print("Scraping list of topics")
    topics_df = scrape_topics()
    # Only two columns are needed, so iterate over them directly rather than
    # materialising a Series per row with iterrows().
    for topic_url, topic_title in zip(topics_df['url'], topics_df['title']):
        print('Scraping top repos for "{}"'.format(topic_title))
        scrape_topic(topic_url, topic_title)

In [24]:
# Run the full scrape: one "<topic title>.csv" is written per topic, and
# topics whose CSV file already exists are skipped.
scrape_topic_repos()

Scraping list of topics
Scraping top reops for "3D"
Scraping top reops for "Ajax"
Scraping top reops for "Algorithm"
Scraping top reops for "Amp"
Scraping top reops for "Android"
Scraping top reops for "Angular"
Scraping top reops for "Ansible"
Scraping top reops for "API"
Scraping top reops for "Arduino"
Scraping top reops for "ASP.NET"
Scraping top reops for "Awesome Lists"
Scraping top reops for "Amazon Web Services"
Scraping top reops for "Azure"
Scraping top reops for "Babel"
Scraping top reops for "Bash"
Scraping top reops for "Bitcoin"
Scraping top reops for "Bootstrap"
Scraping top reops for "Bot"
Scraping top reops for "C"
Scraping top reops for "Chrome"
Scraping top reops for "Chrome extension"
Scraping top reops for "Command-line interface"
Scraping top reops for "Clojure"
Scraping top reops for "Code quality"
Scraping top reops for "Code review"
Scraping top reops for "Compiler"
Scraping top reops for "Continuous integration"
Scraping top reops for "C++"
Scraping top reops 