
# Scraping the top repositories for various Topics on Github

In [1]:
#Importing required libraries


#To download the page
import requests

#To parse and extract information
from bs4 import BeautifulSoup

#To convert information to a pandas dataframe
import pandas as pd

In [53]:
#Page that we are scraping
primary_url='https://github.com/topics'
base_url='https://github.com'


#Function to download page
def get_page(topics_url):
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception('Failed to download page {}'.format(topics_url))

    #Creating the document
    doc = BeautifulSoup(response.text,'html.parser')
    return doc

In [4]:
#Function to get the titles of topics displayed on the page
def get_topic_titles(doc):
    topic_title_tags=doc.find_all('p', {'class':"f3 lh-condensed mb-0 mt-1 Link--primary"})
    topic_titles=[]
    for tag in topic_title_tags:
        topic_titles.append(tag.text)
    return topic_titles

In [18]:
#Function to get all descriptions of all topics
def get_topic_desc(doc):
    topic_desc_tags = doc.find_all('p',
                               {'class':'f5 color-fg-muted mb-0 mt-1'})
    desc=[]
    for tag in topic_desc_tags:
        desc.append(tag.text.strip())
    return desc

In [19]:
#Function to get all topic links
def get_topic_urls(doc):
    topic_link_tags = doc.find_all('a',{'class':'no-underline flex-grow-0'})
    arr=[]
    base_url='https://github.com'
    for tag in topic_link_tags:
        arr.append(base_url + tag['href'])
    return arr

In [23]:
def formulate_df(url):
    doc = get_page(url)
    topics_dict = {
        'Title':get_topic_titles(doc),
        'Description': get_topic_desc(doc),
        'Url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)

In [24]:
topics_data_frame = formulate_df(primary_url)
topics_data_frame.head()

Unnamed: 0,Title,Description,Url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android


Parsing individual topic pages to get info for the top 25 repositories

In [31]:
#Getting a list of all topic wise urls
topics_urls=[]
for i in topics_data_frame['Url']:
    topics_urls.append(i)
topics_urls[:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [38]:
#Converting star count from k notation to a number
def parse_star_count(stars_str):
    stars_str = stars_str.strip()
    if stars_str[-1] == 'k':
        return float(stars_str[:-1])*1000

In [39]:
def get_repo_info(h3_tag,star_tag):
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    url = base_url+a_tags[1]['href']
    star_count= parse_star_count(star_tag.text)
    return username, repo_name,url,star_count

In [49]:
def get_topic_repos(topic_url):
    response = requests.get(topic_url)
    if response.status_code != 200:
        raise Exception('Failed to load page')
    topic_doc = BeautifulSoup(response.text,'html.parser')
    repo_tag = topic_doc.find_all('h3',{'class':'f3 color-fg-muted text-normal lh-condensed'})
    star_tags = topic_doc.find_all('span',{'class':'Counter js-social-count'})
    topics_repos_dict={'username':[],'repo_name':[],'urls':[],'stars':[]}
    for i in range (len(repo_tag)):
        info = list(get_repo_info(repo_tag[i],star_tags[i]))
        topics_repos_dict['username'].append(info[0])
        topics_repos_dict['repo_name'].append(info[1])
        topics_repos_dict['urls'].append(info[2])
        topics_repos_dict['stars'].append(info[3])
    return pd.DataFrame(topics_repos_dict)

In [55]:
#Testing functions
get_topic_repos(topics_urls[2]).head()

Unnamed: 0,username,repo_name,urls,stars
0,jwasham,coding-interview-university,https://github.com/jwasham/coding-interview-un...,205000.0
1,CyC2018,CS-Notes,https://github.com/CyC2018/CS-Notes,145000.0
2,trekhleb,javascript-algorithms,https://github.com/trekhleb/javascript-algorithms,134000.0
3,TheAlgorithms,Python,https://github.com/TheAlgorithms/Python,128000.0
4,yangshun,tech-interview-handbook,https://github.com/yangshun/tech-interview-han...,65100.0


Converting all the information scraped to CSV files

In [56]:
#For the first page containing all topics
topics_data_frame.to_csv('GithubTopics.csv',index=None)

In [68]:
k=0
for i in topics_data_frame['Url']:
    df = get_topic_repos(i)
    title = topics_data_frame['Title'][k] + ".csv"
    k = k+1
    df.to_csv(title,index=None)

Exception: Failed to load page