In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [None]:
def download_web(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without a timeout a stalled connection would block
            the whole scrape indefinitely.

    Returns:
        A BeautifulSoup object built from the response body using the
        'html.parser' backend. The body is parsed and returned even when the
        status code is not 200; in that case a failure message is printed
        first.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Best effort: report the failure but still return whatever the
        # server sent, so one bad page does not abort a multi-page scrape.
        print("Failed to load page {}".format(url))

    return BeautifulSoup(response.text, 'html.parser')

In [None]:
def scrape_topics(url):
    """Collect the title, description and link of each topic on a page.

    Args:
        url: Address of the topics listing page to download.

    Returns:
        A 3-tuple of lists ``(titles, descriptions, urls)``. Titles are the
        raw text of the matching <p> tags, descriptions are stripped of
        surrounding whitespace, and urls are each matching anchor's href
        prefixed with 'https://github.com'.
    """
    page = download_web(url)

    # CSS class strings identifying the tags that hold each piece of data.
    title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    desc_class = "f5 color-fg-muted mb-0 mt-1"
    link_class = "no-underline flex-1 d-flex flex-column"

    title_tags = page.find_all('p', {'class': title_class})
    desc_tags = page.find_all('p', {'class': desc_class})
    link_tags = page.find_all('a', {'class': link_class})

    titles = [p_tag.text for p_tag in title_tags]
    descriptions = [p_tag.text.strip() for p_tag in desc_tags]
    urls = ['https://github.com' + a_tag['href'] for a_tag in link_tags]

    return titles, descriptions, urls


In [None]:
def parse_star_count(stars_str):
    """Convert a star-count string such as ' 1,234 ' into an integer.

    Args:
        stars_str: Text holding a whole number, optionally surrounded by
            whitespace and optionally using commas as thousands separators.

    Returns:
        The count as an int.

    Raises:
        ValueError: If the cleaned text is not a valid integer literal.
    """
    # Removing commas from a string that has none is a no-op, so no
    # membership check is needed before the replace.
    digits = stars_str.strip().replace(',', '')
    return int(digits)

In [None]:
def get_repo_info(repo_tag, star_tag):
    """Extract owner, repository name, star count and URL for one repository.

    Args:
        repo_tag: Parsed tag whose first two <a> descendants are read; the
            first supplies the username text, the second the repository
            name text and its href.
        star_tag: Parsed tag whose 'title' attribute holds the star count
            as text.

    Returns:
        A tuple ``(username, repo_name, stars, repo_url)`` where ``stars``
        is an int and ``repo_url`` is the second anchor's href prefixed with
        'https://github.com'.
    """
    links = repo_tag.find_all('a')
    owner_link = links[0]
    owner = owner_link.text.strip()

    repo_link = links[1]
    name = repo_link.text.strip()
    link = "https://github.com" + repo_link['href']

    star_total = parse_star_count(star_tag['title'])
    return owner, name, star_total, link

In [None]:
def get_repo_details(topic_url):
    """Scrape the repositories listed on a single topic page.

    Args:
        topic_url: Address of the topic page to download.

    Returns:
        A pandas DataFrame with the columns 'username', 'repo_name',
        'stars' and 'repo_url', one row per repository heading that could
        be paired with a star counter.
    """
    topic_data = download_web(topic_url)

    # Parent <h3> tags that contain the owner and repository links.
    h3_selector = "f3 color-fg-muted text-normal lh-condensed"
    repo_tags = topic_data.find_all('h3', class_=h3_selector)

    # <span> tags that carry the star count in their title attribute.
    stars_selector = "Counter js-social-count"
    star_tags = topic_data.find_all('span', class_=stars_selector)

    columns = ['username', 'repo_name', 'stars', 'repo_url']

    # zip() stops at the shorter sequence, so a page with fewer star
    # counters than headings yields fewer rows instead of an IndexError.
    rows = [
        get_repo_info(repo_tag, star_tag)
        for repo_tag, star_tag in zip(repo_tags, star_tags)
    ]

    # get_repo_info returns its fields in the same order as `columns`.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def scrape_web(url):
    """Scrape the topics listing and the repositories of every topic.

    Args:
        url: Address of the topics listing page.

    Returns:
        A list of pandas DataFrames. Element 0 is the topics table with the
        columns 'title', 'description' and 'url'. Element i (for i >= 1)
        holds the repositories scraped from the topic in row i - 1 of that
        table.
    """
    titles, descriptions, topic_urls = scrape_topics(url)
    topic_df = pd.DataFrame({
        'title': titles,
        'description': descriptions,
        'url': topic_urls
    })
    all_df = [topic_df]

    # Keep every per-topic frame; previously each result was computed and
    # then discarded, so callers only ever received the topics table.
    for topic_url in topic_urls:
        all_df.append(get_repo_details(topic_url))

    return all_df

In [None]:
def main():
    """Scrape GitHub topics and write the results to CSV files.

    Creates the "scraped data" directory if it does not exist, writes the
    topics table to 'allTopics.csv' there, and writes one
    '<topic title>.csv' per additional DataFrame returned by scrape_web.
    """
    url = "https://github.com/topics"
    topic = scrape_web(url)

    out_dir = "scraped data"
    os.makedirs(out_dir, exist_ok=True)

    topic_titles = topic[0]['title']

    topic[0].to_csv(os.path.join(out_dir, "allTopics.csv"), index=False)

    # topic[1:] lines up with the rows of the topics table, so each frame
    # is paired with the title of the topic it was scraped from.
    for title, repos_df in zip(topic_titles, topic[1:]):
        # Titles come straight from page text: trim whitespace and replace
        # path separators so the title is always a single valid file name.
        file_stem = title.strip().replace('/', '_').replace('\\', '_')
        repos_df.to_csv(os.path.join(out_dir, file_stem + ".csv"), index=False)
# Entry point: start the scrape only when this code runs as the main module
# (this is also the case when the cell is executed in a notebook kernel).
if __name__=='__main__':
    main()
