# Scraping top Repositories for Github Topics from https://github.com/topics

### Import required libraries

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
# HTTP headers passed to every requests.get call in this file; the User-Agent
# imitates a desktop browser (presumably so GitHub serves the regular HTML
# page -- confirm).
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 11.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

### Parse the Beautiful Soup object to find the top GitHub topics, their descriptions, and the URL of each topic page

In [None]:
# Function to scrape topic titles
def get_topic_title(soup):
    """Collect the text of every topic-title paragraph in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup object
            when called from ``get_topics``).

    Returns:
        list: The ``.text`` of each matching ``<p>`` tag, in the order
        ``find_all`` yields them. The text is not stripped.
    """
    title_tags = soup.find_all(
        "p", class_="f3 lh-condensed mb-0 mt-1 Link--primary"
    )
    return [tag.text for tag in title_tags]


# Function to scrape description of each topic
def get_topic_desc(soup):
    """Collect the whitespace-stripped description of every topic in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup object
            when called from ``get_topics``).

    Returns:
        list: The stripped ``.text`` of each matching ``<p>`` tag, in the
        order ``find_all`` yields them.
    """
    desc_tags = soup.find_all("p", class_="f5 color-fg-muted mb-0 mt-1")
    return [tag.text.strip() for tag in desc_tags]


# Function to scrape the URL of each topic page
def get_topic_url(soup):
    """Build the absolute URL of every topic page linked from ``soup``.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup object
            when called from ``get_topics``).

    Returns:
        list: ``"https://github.com"`` prepended to the ``href`` of each
        matching element, in the order ``find_all`` yields them.
    """
    link_tags = soup.find_all(class_="no-underline flex-1 d-flex flex-column")
    # The hrefs are joined onto the site root by plain concatenation.
    return ["https://github.com" + tag["href"] for tag in link_tags]


# Function to combine scraped data, return data frame and convert it into csv file
def get_topics(timeout=30):
    """Scrape the GitHub topics page into a DataFrame and save it as CSV.

    Downloads https://github.com/topics, extracts each topic's title,
    description and page URL, writes the table to "Github Topics.csv" in
    the current working directory, and returns it.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        pandas.DataFrame: Columns "Topic Title", "Topic Desciption" and
        "Topic URL", one row per topic.

    Raises:
        Exception: If the page does not respond with HTTP 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    github_topics_url = "https://github.com/topics"
    data = requests.get(github_topics_url, headers=header, timeout=timeout)
    if data.status_code != 200:
        raise Exception(f"Failed to load page {github_topics_url}")
    soup = BeautifulSoup(data.text, "html.parser")
    # Column names are kept exactly as written (including the existing
    # spelling of "Desciption") because they become the CSV header.
    dict_topics = {
        "Topic Title": get_topic_title(soup),
        "Topic Desciption": get_topic_desc(soup),
        "Topic URL": get_topic_url(soup),
    }
    git_topics_df = pd.DataFrame(dict_topics)
    git_topics_df.to_csv("Github Topics.csv", index=False)
    return git_topics_df

In [None]:
# Build the topics table; get_topics() also writes it to "Github Topics.csv".
get_topics()

### Get Information about top repositories of each topic

In [None]:
# Function to convert stars from string to integer
def convert_star_to_int(repo_stars):
    """Convert scraped star counters into integers.

    Each element's ``.text`` is stripped and parsed. A trailing "k" means
    thousands ("1.2k" -> 1200); any other value is parsed as a plain
    integer ("523" -> 523).

    Args:
        repo_stars: Iterable of objects with a ``.text`` attribute holding
            the star count as displayed on the page.

    Returns:
        list[int]: One integer per input element, in input order.

    Raises:
        ValueError: If a counter's text is empty or not numeric.
    """
    star_list = []
    for star in repo_stars:
        text = star.text.strip()
        if text.endswith("k"):
            # round() rather than int() so float error (e.g. 1199.999...)
            # cannot truncate the count down by one.
            star_list.append(round(float(text[:-1]) * 1000))
        else:
            # Plain counts have no suffix, so the whole string is parsed;
            # slicing off the last character here would drop a digit.
            star_list.append(int(text))
    return star_list


# Function to get information of each repo
def get_repo_info(topic_repo, repo_stars):
    """Assemble per-repository columns from the scraped heading and star tags.

    Args:
        topic_repo: Iterable of heading elements; each must contain at
            least two ``<a>`` tags -- the first is read as the username,
            the second as the repository name and link.
        repo_stars: Star-counter elements, passed to ``convert_star_to_int``.

    Returns:
        dict: Lists under the keys "Username", "Repository Name",
        "Repository Stars" and "Repository URL", in that key order.
    """
    usernames = []
    repo_names = []
    repo_urls = []
    for heading in topic_repo:
        # Look the anchors up once per heading and reuse them for all columns.
        anchors = heading.find_all("a")
        owner_link, repo_link = anchors[0], anchors[1]
        usernames.append(owner_link.text.strip())
        repo_names.append(repo_link.text.strip())
        repo_urls.append("https://github.com" + repo_link["href"])
    return {
        "Username": usernames,
        "Repository Name": repo_names,
        "Repository Stars": convert_star_to_int(repo_stars),
        "Repository URL": repo_urls,
    }


# Function to combine scraped data, return data frame and convert it into csv file
def get_topics_repo(topic_url, path, timeout=30):
    """Scrape the repositories listed on one topic page and save them as CSV.

    Args:
        topic_url: URL of the topic page to download.
        path: Destination file path for the resulting CSV.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        pandas.DataFrame: The table built from ``get_repo_info`` (username,
        repository name, stars and URL columns), also written to ``path``.

    Raises:
        Exception: If the page does not respond with HTTP 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Download the page, bounded by the timeout.
    data_topic = requests.get(topic_url, headers=header, timeout=timeout)
    if data_topic.status_code != 200:
        raise Exception(f"Failed to load page {topic_url}")
    soup_topic = BeautifulSoup(data_topic.text, "html.parser")
    # Each matching <h3> holds the owner and repository links; the star
    # counters are collected by a separate query over the same page.
    topic_repo = soup_topic.find_all(
        "h3", class_="f3 color-fg-muted text-normal lh-condensed")
    repo_stars = soup_topic.find_all("span", class_="Counter js-social-count")
    repo_dict = get_repo_info(topic_repo, repo_stars)
    repo_df = pd.DataFrame(repo_dict)
    repo_df.to_csv(path, index=False)
    return repo_df
    

# Function to scrape repo data for each topic
def scrape_topics():
    """Scrape the repositories of every topic into ``Data/<topic title>.csv``.

    Fetches the topic list with ``get_topics``, creates the "Data"
    directory if needed, and calls ``get_topics_repo`` for each topic whose
    CSV file does not exist yet. Topics with an existing file are skipped,
    and a message is printed either way.
    """
    topics = get_topics()
    os.makedirs("Data", exist_ok=True)
    for title, url in zip(topics["Topic Title"], topics["Topic URL"]):
        csv_path = f"Data/{title}.csv"
        if not os.path.exists(csv_path):
            print(f"Scraping top repositories for {title}")
            get_topics_repo(url, csv_path)
            continue
        # The file is already on disk, so this topic is not downloaded again.
        print(f"{csv_path} already exists.\nSkipping")


# Run the full scrape: writes one CSV per topic under the "Data" directory.
scrape_topics()