### Use the request library to download web pages

In [9]:
!pip install requests --upgrade --quiet

In [11]:
import requests

In [12]:
topics_url = 'https://github.com/topics'

In [13]:
response = requests.get(topics_url)

In [16]:
response.status_code  

'''
informational responses (100-199)
Successful responses (200-299)
Redirects(300-399)
Client errors (400-499)
Server errors(500-599)
'''

200

In [18]:
len(response.text)

177080

In [19]:
page_contents = response.text
page_contents[0:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-1G4rYJktwRTQKn7fVfJUxH8RRZFUJlGo77xMZfBfIhZPx4BHVrzPE1VgnafttXI8G3y/PywH3uXyhNkSLp3+oA==" rel="stylesheet" href="https://github.githubassets.com/assets/light-d46e2b60992dc114d02a7edf55f254c4.css" /><link crossorigin="anonymous" media="all" integrity="sha512-hI5b2oqTE9njfjYrfuzXqA4bSGSNrE5OMc9IiFhZy+RDGg9Qn4Si1A97o0MlinlwFt3xAifvoLX0s7jH

In [20]:
with open('webpage.html', 'w') as f:
    f.write(page_contents)

### Use BeautifulSoup to parse and extract information

In [21]:
!pip install beautifulsoup4 --upgrade --quiet

In [22]:
from bs4 import BeautifulSoup

In [23]:
doc = BeautifulSoup(page_contents, 'html.parser')

In [33]:
selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"

topic_title_tags = doc.find_all('p', {'class':selection_class})

In [34]:
len(topic_title_tags)

30

In [35]:
topic_title_tags[0:10]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Angular</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ansible</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">API</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Arduino</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">ASP.NET</p>]

In [36]:
desc_selector = "f5 color-fg-muted mb-0 mt-1"

topic_desc_tags = doc.find_all('p', {'class': desc_selector})

In [37]:
len(topic_desc_tags)

30

In [38]:
topic_desc_tags[:10]

[<p class="f5 color-fg-muted mb-0 mt-1">
               3D modeling is the process of virtually developing the surface and structure of a 3D object.
             </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
               Ajax is a technique for creating interactive web applications.
             </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
               Algorithms are self-contained sequences that carry out a variety of tasks.
             </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
               Amp is a non-blocking concurrency framework for PHP.
             </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
               Android is an operating system built by Google designed for mobile devices.
             </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
               Angular is an open source web application platform.
             </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
               Ansible is a simple and powerful automation engine.
             </p>,
 <p class="

In [39]:
topic_title_tag0 = topic_desc_tags[0]

In [40]:
topic_title_tag0

<p class="f5 color-fg-muted mb-0 mt-1">
              3D modeling is the process of virtually developing the surface and structure of a 3D object.
            </p>

In [44]:
div_tag = topic_title_tag0.parent
div_tag

<div class="flex-auto">
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
<p class="f5 color-fg-muted mb-0 mt-1">
              3D modeling is the process of virtually developing the surface and structure of a 3D object.
            </p>
</div>

In [46]:
topic_link_tags = doc.find_all('a', {'class':"d-flex no-underline"})
len(topic_link_tags)

30

In [49]:
topic_link_tags[0]['href']

'/topics/3d'

In [95]:
base_url = "https://github.com"

topic0_url = base_url + topic_link_tags[0]['href']
print(topic0_url)

https://github.com/topics/3d


In [96]:
topic_titles = []

for tag in topic_title_tags:
    topic_titles.append(tag.text)

In [97]:
topic_titles

['3D',
 'Ajax',
 'Algorithm',
 'Amp',
 'Android',
 'Angular',
 'Ansible',
 'API',
 'Arduino',
 'ASP.NET',
 'Atom',
 'Awesome Lists',
 'Amazon Web Services',
 'Azure',
 'Babel',
 'Bash',
 'Bitcoin',
 'Bootstrap',
 'Bot',
 'C',
 'Chrome',
 'Chrome extension',
 'Command line interface',
 'Clojure',
 'Code quality',
 'Code review',
 'Compiler',
 'Continuous integration',
 'COVID-19',
 'C++']

In [98]:
topic_descriptions = []

for tag in topic_desc_tags:
    topic_descriptions.append(tag.text.strip()) #strip for removing any unnecessary space

In [99]:
topic_descriptions[:5]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency framework for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [100]:
topic_urls = []

for tag in topic_link_tags:
    topic_urls.append("https://github.com"+tag['href']) # base url: https://github.com

In [101]:
topic_urls[:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [102]:
import pandas as pd

In [103]:
topics_dict = {'title': topic_titles,
              'description': topic_descriptions,
              'URLs': topic_urls}

In [104]:
topics_df = pd.DataFrame(topics_dict)

In [105]:
topics_df.head()

Unnamed: 0,title,description,URLs
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android


### Creating a CSV file with the extracted information

In [106]:
topics_df.to_csv('topic.csv', index=None) # remove the index

### Getting information out of a topic page

In [107]:
topic_page_url = topic_urls[0]

In [108]:
topic_page_url

'https://github.com/topics/3d'

In [109]:
response = requests.get(topic_page_url)

In [110]:
response.status_code

200

In [111]:
len(response.text)

651360

In [112]:
topic_doc = BeautifulSoup(response.text, 'html.parser')

In [113]:
h3_selection_tag = "f3 color-fg-muted text-normal lh-condensed"

repo_tags = topic_doc.find_all('h3', {'class': "f3 color-fg-muted text-normal lh-condensed"})

In [114]:
len(repo_tags)

30

In [115]:
repo_tags[0]

<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="/mrdoob">
            mrdoob
</a>          /
          <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d897521

In [116]:
a_tags = repo_tags[0].find_all('a')

a_tags[0].text.strip()



'mrdoob'

In [117]:
a_tags[1].text.strip()

'three.js'

In [119]:
# base_url = "https://github.com"

a_tags[1]['href']

'/mrdoob/three.js'

In [120]:
repo_url = base_url + a_tags[1]['href']  # base_url = "https://github.com"
repo_url

'https://github.com/mrdoob/three.js'

In [121]:
star_tags = topic_doc.find_all('a', {'class': "social-count js-social-count"})

len(star_tags)

30

In [131]:
star_tags[0].text.strip()

'76.6k'

In [132]:
def parse_star_count(stars_str):
    stars_str = stars_str.strip()
    
    if stars_str[-1] == 'k':
        return float(stars_str[:-1]) * 1000
    
    return int(stars_str)

In [133]:
parse_star_count(star_tags[0].text.strip())

76600.0

In [134]:
def get_repo_info(h3_tag, stars_tag):
    # return all the required information about repository
    a_tags = h3_tag.find_all('a')
    
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(stars_tag.text.strip())
    
    return username, repo_name, stars, repo_url

In [137]:
get_repo_info(repo_tags[0], star_tags[0])

('mrdoob', 'three.js', 76600.0, 'https://github.com/mrdoob/three.js')

In [139]:
topic_repos_dict = {
                        'username': [],
                        'repo_name': [],
                        'stars':[],
                        'repo_url':[]
}


for i in range(len(repo_tags)):
    repo_info = get_repo_info(repo_tags[i], star_tags[i])
    topic_repos_dict['username'].append(repo_info[0])
    topic_repos_dict['repo_name'].append(repo_info[1])
    topic_repos_dict['stars'].append(repo_info[2])
    topic_repos_dict['repo_url'].append(repo_info[3])

In [142]:
topic_repos_df = pd.DataFrame(topic_repos_dict)

In [144]:
topic_repos_df.head()

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,76600.0,https://github.com/mrdoob/three.js
1,libgdx,libgdx,19400.0,https://github.com/libgdx/libgdx
2,pmndrs,react-three-fiber,15900.0,https://github.com/pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,15400.0,https://github.com/BabylonJS/Babylon.js
4,aframevr,aframe,13400.0,https://github.com/aframevr/aframe


# Final Code

In [192]:
import os

In [203]:
def get_topic_page(topic_url):
    
    # download the page
    response = requests.get(topic_url)
    
    #check successful response
    if response.status_code != 200:
        raise Exception("Failed to load page {}".format(topic_url))
    
    # Parse using beautifulsoup
    topic_doc = BeautifulSoup(response.text, 'html.parser')

    return topic_doc


def get_repo_info(h3_tag, star_tag):
    # returns all the required info about the repository
    
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tag.text.strip())
    return username, repo_name, stars, repo_url
    

def get_topic_repos(topic_doc):
    
    # Get the h3 tags containing repo title, repo URL and username
    h3_selection_tag = "f3 color-fg-muted text-normal lh-condensed"
    repo_tags = topic_doc.find_all('h3', {'class': h3_selection_tag})
    
    # get star tags
    star_tags = topic_doc.find_all('a', {'class': "social-count js-social-count"})
    
    #
    topic_repos_dict = {
                        'username': [],
                        'repo_name': [],
                        'stars':[],
                        'repo_url':[]
}
    
    # get repo info
    for i in range(len(repo_tags)):
        repo_info = get_repo_info(repo_tags[i], star_tags[i])
        topic_repos_dict['username'].append(repo_info[0])
        topic_repos_dict['repo_name'].append(repo_info[1])
        topic_repos_dict['stars'].append(repo_info[2])
        topic_repos_dict['repo_url'].append(repo_info[3])
        
    return pd.DataFrame(topic_repos_dict)

def scrape_topic(topic_url, path):
    
    if os.path.exists(path):
        print("The file {} already exists. skipping...".format(path))
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    topic_df.to_csv(path ,index= None)

In [155]:
url4 = topic_urls[4]
url4

'https://github.com/topics/android'

In [161]:
topic4_repos = get_topic_repos(get_topic_page(url4)) 

In [164]:
topic4_repos.head()

Unnamed: 0,username,repo_name,stars,repo_url
0,flutter,flutter,133000.0,https://github.com/flutter/flutter
1,justjavac,free-programming-books-zh_CN,84700.0,https://github.com/justjavac/free-programming-...
2,Genymobile,scrcpy,58300.0,https://github.com/Genymobile/scrcpy
3,Hack-with-Github,Awesome-Hacking,47900.0,https://github.com/Hack-with-Github/Awesome-Ha...
4,google,material-design-icons,44500.0,https://github.com/google/material-design-icons


Write a single function to:

1. Get the list of topics from the topics page
2. Get the list of top repositories from the individual topic page
3. For each topic, create a CSV of the top repos for the topic 

In [165]:
def get_topic_titles(doc):
    selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    topic_title_tags = doc.find_all('p', {'class':selection_class})
    
    topic_titles = []
    for tag in topic_title_tags:
        topic_titles.append(tag.text)
    
    return topic_titles

def get_topic_descs(doc):
    desc_selector = "f5 color-fg-muted mb-0 mt-1"
    topic_desc_tags = doc.find_all('p', {'class': desc_selector})
    
    topic_descriptions = []
    for tag in topic_desc_tags:
        topic_descriptions.append(tag.text.strip()) #strip for removing any unnecessary space
        
    return topic_descriptions


    

def scrape_topics():
    # download the page
    topics_url = "https://github.com/topics"
    response = requests.get(topics_url)
    
    #check successful response
    if response.status_code != 200:
        raise Exception("Failed to load page {}".format(topic_url))
    
    topics_dict = {
        'title': topic_titles,
        'desciption': topic_descriptions,
        'url': topic_urls
    }
    
    return pd.DataFrame(topics_dict)

    

In [167]:
scrape_topics().head()

Unnamed: 0,title,desciption,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android


In [201]:
def scrape_topics_repos():
    print('Scraping list of topics')
    topics_df = scrape_topics()
    
    # create a folder
    os.makedirs('data', exist_ok=True)
    
    for index, row in topics_df.iterrows():
        print("Scraping top repositories for '{}'.".format(row['title']))
        scrape_topic(row['url'], 'data/{}.csv'.format(row['title']))

In [211]:

scrape_topics_repos()

Scraping list of topics
Scraping top repositories for '3D'.
The file data/3D.csv already exists. skipping...
Scraping top repositories for 'Ajax'.
The file data/Ajax.csv already exists. skipping...
Scraping top repositories for 'Algorithm'.
The file data/Algorithm.csv already exists. skipping...
Scraping top repositories for 'Amp'.
The file data/Amp.csv already exists. skipping...
Scraping top repositories for 'Android'.
The file data/Android.csv already exists. skipping...
Scraping top repositories for 'Angular'.
The file data/Angular.csv already exists. skipping...
Scraping top repositories for 'Ansible'.
The file data/Ansible.csv already exists. skipping...
Scraping top repositories for 'API'.
The file data/API.csv already exists. skipping...
Scraping top repositories for 'Arduino'.
The file data/Arduino.csv already exists. skipping...
Scraping top repositories for 'ASP.NET'.
The file data/ASP.NET.csv already exists. skipping...
Scraping top repositories for 'Atom'.
The file data/At