# Web Scraping Using BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [25]:
base_url = 'https://github.com/topics'
# Read the page inside a context manager so the HTTP response (and its
# underlying connection) is closed as soon as the body has been decoded.
with urlopen(base_url) as page:
    html_code = page.read().decode('utf-8')

In [26]:
doc = BeautifulSoup(html_code, 'html.parser')

In [38]:
# print(doc)

In [35]:
p_tags = doc.find_all('p', class_="f3 lh-condensed mb-0 mt-1 Link--primary")

In [36]:
len(p_tags)

30

In [37]:
p_tags

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Angular</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ansible</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">API</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Arduino</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">ASP.NET</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Atom</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Awesome Lists</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amazon Web Services</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Azure</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Babel</p>,
 <p class="f3 lh-condensed m

In [39]:
topic_title_tags = p_tags

In [41]:
topic_desc_tags = doc.find_all('p', class_="f5 color-text-secondary mb-0 mt-1")

In [42]:
topic_desc_tags[:5]

[<p class="f5 color-text-secondary mb-0 mt-1">
               3D modeling is the process of virtually developing the surface and structure of a 3D object.
             </p>,
 <p class="f5 color-text-secondary mb-0 mt-1">
               Ajax is a technique for creating interactive web applications.
             </p>,
 <p class="f5 color-text-secondary mb-0 mt-1">
               Algorithms are self-contained sequences that carry out a variety of tasks.
             </p>,
 <p class="f5 color-text-secondary mb-0 mt-1">
               Amp is a non-blocking concurrency framework for PHP.
             </p>,
 <p class="f5 color-text-secondary mb-0 mt-1">
               Android is an operating system built by Google designed for mobile devices.
             </p>]

In [43]:
topic_link_tags = doc.find_all('a', class_="d-flex no-underline")
len(topic_link_tags)

30

In [45]:
topic_link_tags[0]['href']

'/topics/3d'

In [75]:
topic_url = 'https://github.com' + topic_link_tags[0]['href']
print(topic_url)

https://github.com/topics/3d


In [76]:
print(topic_title_tags[0])
print( )
print(topic_title_tags[0].text)

<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>

3D


In [59]:
# Text content of every topic-title tag, in the order the tags were found.
topic_title = [title_tag.text for title_tag in topic_title_tags]

print(topic_title)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [63]:
# Description text of each topic, with leading/trailing whitespace stripped.
topic_description = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

In [65]:
topic_description[:3]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.']

In [84]:
# Site root; each link tag's href is appended to it to form a full topic URL.
url = 'https://github.com'
topic_urls = [url + link_tag['href'] for link_tag in topic_link_tags]

In [85]:
topic_urls

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android',
 'https://github.com/topics/angular',
 'https://github.com/topics/ansible',
 'https://github.com/topics/api',
 'https://github.com/topics/arduino',
 'https://github.com/topics/aspnet',
 'https://github.com/topics/atom',
 'https://github.com/topics/awesome',
 'https://github.com/topics/aws',
 'https://github.com/topics/azure',
 'https://github.com/topics/babel',
 'https://github.com/topics/bash',
 'https://github.com/topics/bitcoin',
 'https://github.com/topics/bootstrap',
 'https://github.com/topics/bot',
 'https://github.com/topics/c',
 'https://github.com/topics/chrome',
 'https://github.com/topics/chrome-extension',
 'https://github.com/topics/cli',
 'https://github.com/topics/clojure',
 'https://github.com/topics/code-quality',
 'https://github.com/topics/code-review',
 'https://github.com/topics/compil

## Creating a dataframe from scraped data

In [86]:
import pandas as pd

In [88]:
dict = {
    'Title':topic_title,
    'Description':topic_description,
    'Url':topic_urls
}

In [89]:
topic_df = pd.DataFrame(dict)

In [90]:
topic_df

Unnamed: 0,Title,Description,Url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [91]:
topic_df.to_csv('Github topics.csv', index=False)

## Getting info out of a topic page

In [93]:
topic_page_url = topic_urls[0]
topic_page_url

'https://github.com/topics/3d'

In [94]:
response = requests.get(topic_page_url)

In [95]:
response.status_code

200

In [96]:
len(response.text)

624137

In [97]:
topic_doc = BeautifulSoup(response.text, 'html.parser')

In [98]:
repo_tags = topic_doc.find_all('h3', class_="f3 color-text-secondary text-normal lh-condensed")
len(repo_tags)

30

In [99]:
repo_tags[0]

<h3 class="f3 color-text-secondary text-normal lh-condensed">
<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="/mrdoob">
            mrdoob
</a>          /
          <a class="wb-break-word text-bold" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d

In [102]:
a_tags = repo_tags[0].find_all('a')
a_tags[0].text.strip()

'mrdoob'

In [104]:
a_tags[1].text.strip()

'three.js'

In [105]:
a_tags[1]['href']

'/mrdoob/three.js'

In [106]:
url

'https://github.com'

In [108]:
print(url + a_tags[1]['href'])

https://github.com/mrdoob/three.js


In [110]:
star_tags = topic_doc.find_all('a', class_="social-count float-none")

In [111]:
len(star_tags)

30

In [122]:
def parse_star_count(stars_str):
    """Convert a displayed star count such as '74k' or '1,234' to an int.

    Args:
        stars_str: Text of the star counter. Surrounding whitespace and
            comma thousands separators are ignored. An optional trailing
            'k' or 'm' suffix (either case) scales the number by 1,000 or
            1,000,000 respectively.

    Returns:
        The star count as an int.

    Raises:
        ValueError: If the text is empty or does not contain a number.
    """
    multipliers = {'k': 1000, 'm': 1000000}
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        # Indexing [-1] on an empty string would raise an opaque IndexError.
        raise ValueError('empty star count: {!r}'.format(stars_str))

    suffix = cleaned[-1].lower()
    if suffix in multipliers:
        # round() instead of plain truncation: a float product can land a
        # hair below the intended integer and int() alone would drop a unit.
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)

In [123]:
star_tags[0].text

'\n          74k\n'

In [124]:
parse_star_count(star_tags[0].text)

74000

In [128]:
def get_repo_info(h3_tag, star_tag, base_url='https://github.com'):
    """Extract owner, repository name, star count and URL for one repo card.

    Args:
        h3_tag: Heading tag; its first <a> is read as the owner link and its
            second <a> as the repository link.
        star_tag: Tag whose text is the displayed star count (e.g. '74k').
        base_url: Site root prepended to the repository link's href. The
            default means the function does not rely on a notebook-level
            variable having been defined by an earlier cell.

    Returns:
        Tuple of (username, repo_name, stars, repo_url).
    """
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    # The href is site-relative (e.g. '/mrdoob/three.js'), so prefix the root.
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tag.text.strip())
    return username, repo_name, stars, repo_url

In [129]:
get_repo_info(repo_tags[0], star_tags[0])

('mrdoob', 'three.js', 74000, 'https://github.com/mrdoob/three.js')

In [131]:
# One (username, repo name, stars, url) tuple per repository heading, pairing
# each heading with the star counter found at the same position.
repo_rows = [
    get_repo_info(repo_tags[idx], star_tags[idx])
    for idx in range(len(repo_tags))
]

# Transpose the row tuples into a column-oriented mapping.
topic_repos_dict = {
    'Username': [row[0] for row in repo_rows],
    'Repo Name': [row[1] for row in repo_rows],
    'Stars': [row[2] for row in repo_rows],
    'Repo Url': [row[3] for row in repo_rows]
}

In [132]:
topic_repos_dict

{'Username': ['mrdoob',
  'libgdx',
  'pmndrs',
  'BabylonJS',
  'aframevr',
  'ssloy',
  'lettier',
  'FreeCAD',
  'metafizzy',
  'CesiumGS',
  'timzhang642',
  'a1studmuffin',
  'isl-org',
  'spritejs',
  'tensorspace-team',
  'jagenjo',
  'YadiraF',
  'AaronJackson',
  'domlysz',
  'openscad',
  'ssloy',
  'mosra',
  'google',
  'blender',
  'gfxfundamentals',
  'cleardusk',
  'jasonlong',
  'rg3dengine',
  'antvis',
  'cnr-isti-vclab'],
 'Repo Name': ['three.js',
  'libgdx',
  'react-three-fiber',
  'Babylon.js',
  'aframe',
  'tinyrenderer',
  '3d-game-shaders-for-beginners',
  'FreeCAD',
  'zdog',
  'cesium',
  '3D-Machine-Learning',
  'SpaceshipGenerator',
  'Open3D',
  'spritejs',
  'tensorspace',
  'webglstudio.js',
  'PRNet',
  'vrn',
  'BlenderGIS',
  'openscad',
  'tinyraytracer',
  'magnum',
  'model-viewer',
  'blender',
  'webgl-fundamentals',
  '3DDFA',
  'isometric-contributions',
  'rg3d',
  'L7',
  'meshlab'],
 'Stars': [74000,
  18900,
  14900,
  14800,
  13000,
  1

In [134]:
topic_repos_df = pd.DataFrame(topic_repos_dict)
topic_repos_df

Unnamed: 0,Username,Repo Name,Stars,Repo Url
0,mrdoob,three.js,74000,https://github.com/mrdoob/three.js
1,libgdx,libgdx,18900,https://github.com/libgdx/libgdx
2,pmndrs,react-three-fiber,14900,https://github.com/pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,14800,https://github.com/BabylonJS/Babylon.js
4,aframevr,aframe,13000,https://github.com/aframevr/aframe
5,ssloy,tinyrenderer,11200,https://github.com/ssloy/tinyrenderer
6,lettier,3d-game-shaders-for-beginners,11000,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,9800,https://github.com/FreeCAD/FreeCAD
8,metafizzy,zdog,8700,https://github.com/metafizzy/zdog
9,CesiumGS,cesium,7400,https://github.com/CesiumGS/cesium


## Getting info out of every topic page

In [154]:
def get_topic_page(topic_urls, timeout=30):
    """Download one topic page and parse it into a BeautifulSoup document.

    Args:
        topic_urls: URL of the page to fetch. Despite the plural name this
            is a single URL; it is passed directly to requests.get.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed with the 'html.parser' backend.

    Raises:
        Exception: If the response status code is anything other than 200.
            Errors raised by requests.get itself (including timeouts)
            propagate unchanged.
    """
    response = requests.get(topic_urls, timeout=timeout)

    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_urls))

    topic_doc = BeautifulSoup(response.text, 'html.parser')
    return topic_doc


def get_repo_info(h3_tag, star_tag, base_url='https://github.com'):
    """Extract owner, repository name, star count and URL for one repo card.

    Args:
        h3_tag: Heading tag; its first <a> is read as the owner link and its
            second <a> as the repository link.
        star_tag: Tag whose text is the displayed star count (e.g. '74k').
        base_url: Site root prepended to the repository link's href. The
            default means the function does not rely on a notebook-level
            variable having been defined by an earlier cell.

    Returns:
        Tuple of (username, repo_name, stars, repo_url).
    """
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    # The href is site-relative (e.g. '/mrdoob/three.js'), so prefix the root.
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tag.text.strip())
    return username, repo_name, stars, repo_url


def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed page exposing find_all(name, class_=...).

    Returns:
        pandas.DataFrame with the columns 'Username', 'Repo Name', 'Stars'
        and 'Repo Url', one row per repository heading found on the page.
    """
    headings = topic_doc.find_all('h3', class_="f3 color-text-secondary text-normal lh-condensed")
    star_links = topic_doc.find_all('a', class_="social-count float-none")

    # Each heading is paired with the star link at the same position.
    rows = [
        get_repo_info(headings[pos], star_links[pos])
        for pos in range(len(headings))
    ]

    # Transpose the (username, repo name, stars, url) tuples into columns.
    column_names = ('Username', 'Repo Name', 'Stars', 'Repo Url')
    columns = {
        column: [row[pos] for row in rows]
        for pos, column in enumerate(column_names)
    }
    return pd.DataFrame(columns)

In [155]:
url4 = topic_urls[4]

In [156]:
topic4_doc = get_topic_page(url4)

In [157]:
topic4_repos = get_topic_repos(topic4_doc)

In [158]:
topic4_repos

Unnamed: 0,Username,Repo Name,Stars,Repo Url
0,flutter,flutter,129000,https://github.com/flutter/flutter
1,justjavac,free-programming-books-zh_CN,82700,https://github.com/justjavac/free-programming-...
2,Genymobile,scrcpy,53900,https://github.com/Genymobile/scrcpy
3,Hack-with-Github,Awesome-Hacking,45700,https://github.com/Hack-with-Github/Awesome-Ha...
4,google,material-design-icons,43800,https://github.com/google/material-design-icons
5,wasabeef,awesome-android-ui,41200,https://github.com/wasabeef/awesome-android-ui
6,square,okhttp,40700,https://github.com/square/okhttp
7,android,architecture-samples,39400,https://github.com/android/architecture-samples
8,square,retrofit,38700,https://github.com/square/retrofit
9,Solido,awesome-flutter,37300,https://github.com/Solido/awesome-flutter


In [161]:
topic_urls[6]

'https://github.com/topics/ansible'

In [160]:
get_topic_repos(get_topic_page(topic_urls[6]))

Unnamed: 0,Username,Repo Name,Stars,Repo Url
0,ansible,ansible,49800,https://github.com/ansible/ansible
1,StreisandEffect,streisand,22500,https://github.com/StreisandEffect/streisand
2,trailofbits,algo,21400,https://github.com/trailofbits/algo
3,bregman-arie,devops-exercises,13900,https://github.com/bregman-arie/devops-exercises
4,kubernetes-sigs,kubespray,11100,https://github.com/kubernetes-sigs/kubespray
5,ansible,awx,10100,https://github.com/ansible/awx
6,easzlab,kubeasz,7300,https://github.com/easzlab/kubeasz
7,geerlingguy,ansible-for-devops,4900,https://github.com/geerlingguy/ansible-for-devops
8,ansible-semaphore,semaphore,4300,https://github.com/ansible-semaphore/semaphore
9,rundeck,rundeck,4300,https://github.com/rundeck/rundeck


## How to Scale it