In [129]:
print("Welcome to the WebScrapping Project!")

Welcome to the WebScrapping Project!


# Project Outline

  - We're going to scrape https://github.com/topics, the page for browsing popular topics on GitHub.
  - We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
  - For each topic, we'll get the top 25 repositories in the topic from the topic page
  - For each repository, we'll grab the repo name, username, stars and repo URL
  - For each topic we'll create a CSV file in the following format:


# Result will look like this
- Repo Name,Username,Stars,Repo URL
- three.js,mrdoob,69700,https://github.com/mrdoob/three.js
- libgdx,libgdx,18300,https://github.com/libgdx/libgdx

# Use the requests library to download web pages


In [2]:
import requests

In [3]:
topics_url = 'https://github.com/topics'

In [4]:
response = requests.get(topics_url)

In [5]:
# Check whether the request succeeded by inspecting the HTTP status code
response.status_code

200

In [6]:
len(response.text)

137252

In [7]:
page_contents = response.text

In [8]:
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-+LlYCzoIINFSwdPG8+vqo6w8TBMZRoJRBASiCU48bzL3w5EvYcA2sb9hCNg0CTnuvvQrruwwAIfQVjZPw2dqGg==" rel="stylesheet" href="https://github.githubassets.com/assets/frameworks-f8b9580b3a0820d152c1d3c6f3ebeaa3.css" />\n  \n    <link crossorigin="anonymous" media="all" integrity="sha512-33HQeFC86zGNKWxAWytJAe2R+EaDTvW6lZG/7eM494AhegB7xjBuf7ASXrCzicENC

In [9]:
with open('webpage.html', 'w',encoding="utf-8") as f:
    f.write(page_contents)


# Use Beautiful Soup to parse the HTML and extract information


In [130]:
#importing library
from bs4 import BeautifulSoup

In [11]:
#parsing the html to beautiful lib
doc = BeautifulSoup(page_contents, 'html.parser')

In [12]:
doc 


<!DOCTYPE html>

<html data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="" href="https://github.githubassets.com" rel="preconnect"/>
<link href="https://avatars.githubusercontent.com" rel="preconnect"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-f8b9580b3a0820d152c1d3c6f3ebeaa3.css" integrity="sha512-+LlYCzoIINFSwdPG8+vqo6w8TBMZRoJRBASiCU48bzL3w5EvYcA2sb9hCNg0CTnuvvQrruwwAIfQVjZPw2dqGg==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/behaviors-df71d07850bceb318d296c405b2b4901.css" integrity="sha512-33HQeFC86zGNKWxAWytJAe2R

In [13]:
#To find the topic-name from the page 
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p', {'class': selection_class})

In [14]:
#first five topics 
topic_title_tags[0:5]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

In [18]:
len(topic_title_tags)

30

In [15]:
#To find the description of the topic
desc_selector = 'f5 color-text-secondary mb-0 mt-1'
topic_desc_tags = doc.find_all('p', {'class': desc_selector})

In [17]:
len(topic_desc_tags)

30

In [16]:
#Description of First five topics 
topic_desc_tags[0:5]

[<p class="f5 color-text-secondary mb-0 mt-1">
               3D modeling is the process of virtually developing the surface and structure of a 3D object.
             </p>, <p class="f5 color-text-secondary mb-0 mt-1">
               Ajax is a technique for creating interactive web applications.
             </p>, <p class="f5 color-text-secondary mb-0 mt-1">
               Algorithms are self-contained sequences that carry out a variety of tasks.
             </p>, <p class="f5 color-text-secondary mb-0 mt-1">
               Amp is a non-blocking concurrency framework for PHP.
             </p>, <p class="f5 color-text-secondary mb-0 mt-1">
               Android is an operating system built by Google designed for mobile devices.
             </p>]

In [19]:
#To find the link url of the topic
topic_link_tags = doc.find_all('a', {'class': 'd-flex no-underline'})

In [20]:
len(topic_link_tags)

30

In [25]:
topic0_url = "https://github.com" + topic_link_tags[0]['href']

In [27]:
print(topic0_url)

https://github.com/topics/3d


In [28]:
# Collect the text of every topic-title tag into a list
topic_titles = [title_tag.text for title_tag in topic_title_tags]

print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [29]:
# Collect every topic description, trimming the surrounding whitespace
topic_descs = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

topic_descs[:5]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency framework for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [31]:
# List of topic URLs
base_url = 'https://github.com'

# Build the list in a single expression: the previous version appended to a
# `topic_urls` list that is not created anywhere in the visible notebook, which
# raises NameError on a fresh kernel and duplicates entries when re-run.
topic_urls = [base_url + tag['href'] for tag in topic_link_tags]

topic_urls[0:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [32]:
#importing Library for Data frame
import pandas as pd

In [36]:
topics_dict = {
    'Title': topic_titles,
    'Description': topic_descs,
    'Url': topic_urls
}

In [37]:
#creating dataframe
topics_df = pd.DataFrame(topics_dict)

In [38]:
topics_df

Unnamed: 0,Title,Description,Url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


# Export data to a CSV file

In [39]:
topics_df.to_csv('topics.csv', index=None)

# Getting information out of a topic page


# We are going to scrape the contents of a topic page

In [40]:
topic_page_url = topic_urls[0]

In [42]:
print(topic_page_url)

https://github.com/topics/3d


In [52]:
#download the first topic url
response = requests.get(topic_page_url)

In [53]:
response.status_code

200

In [54]:
len(response.text)

619522

In [55]:
topic_doc = BeautifulSoup(response.text, 'html.parser')

In [65]:
# Find the repository heading tags on the first topic page
h3_selection_class = 'f3 color-text-secondary text-normal lh-condensed'
# Pass the variable defined on the line above; the name used before
# (`h1_selection_class`) is not defined anywhere in the visible notebook.
repo_tags = topic_doc.find_all('h3', {'class': h3_selection_class})

In [72]:
repo_tags

[<h3 class="f3 color-text-secondary text-normal lh-condensed">
 <a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="/mrdoob">
             mrdoob
 </a>          /
           <a class="text-bold" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d897521569

In [68]:
a_tags = repo_tags[0].find_all('a')

In [71]:
a_tags

[<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="/mrdoob">
             mrdoob
 </a>,
 <a class="text-bold" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data-view-component="true" href="/mrd

In [69]:
#extracting username
a_tags[0].text.strip()

'mrdoob'

In [70]:
#extracting reponame
a_tags[1].text.strip()

'three.js'

In [75]:
#creating the url of the repo
repo_url=base_url+a_tags[1]['href']

In [77]:
print(repo_url)

https://github.com/mrdoob/three.js


In [78]:
#To find the star count
star_tags = topic_doc.find_all('a', { 'class': 'social-count float-none'})

In [79]:
len(star_tags)

30

In [84]:
#extracting the star count only
star_tags[0].text.strip()

'73.6k'

In [85]:
#converting star count into integers
def parse_star_count(stars_str):
    """Convert a star-count label into an integer.

    Accepts plain integers ('512'), thousands separators ('1,234') and the
    abbreviated forms with a 'k' (thousand) or 'm' (million) suffix in either
    case ('73.6k', '1.5M'). Surrounding whitespace is ignored.

    Args:
        stars_str: Text of the star counter.

    Returns:
        The star count as an int.

    Raises:
        ValueError: If the text is empty or is not a recognisable number.
    """
    multipliers = {'k': 1000, 'm': 1000000}
    cleaned = stars_str.strip().replace(',', '').lower()
    if not cleaned:
        raise ValueError('Empty star count: {!r}'.format(stars_str))
    multiplier = multipliers.get(cleaned[-1])
    if multiplier is None:
        return int(cleaned)
    # round() rather than plain int(): the float product can land just below
    # the intended whole number, and truncation would then be off by one.
    return int(round(float(cleaned[:-1]) * multiplier))

In [86]:
parse_star_count(star_tags[0].text.strip())

73600

In [87]:
#creating the function to get the all info in oneline
def get_repo_info(h1_tag, star_tag):
    """Collect the details of one repository from its heading and star tags.

    Args:
        h1_tag: Heading tag of a repository card. Its first <a> holds the
            owner name and its second <a> holds the repository name and
            relative link. (The notebook passes <h3> tags here despite the
            parameter name.)
        star_tag: Tag whose text is the star count, e.g. '73.6k'.

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    links = h1_tag.find_all('a')
    owner_link, repo_link = links[0], links[1]
    username = owner_link.text.strip()
    repo_name = repo_link.text.strip()
    # hrefs are site-relative, so prefix them with the module-level base_url
    repo_url = base_url + repo_link['href']
    stars = parse_star_count(star_tag.text.strip())
    return username, repo_name, stars, repo_url

In [134]:
get_repo_info(repo_tags[5], star_tags[5])

('ssloy', 'tinyrenderer', 11100, 'https://github.com/ssloy/tinyrenderer')

In [94]:
# Column-oriented accumulator: one list per output column
topic_repos_dict = {
    'Username': [],
    'Repo_name': [],
    'Stars': [],
    'Repo_url': []
}

# get_repo_info returns (username, repo_name, stars, repo_url) in this order
column_order = ('Username', 'Repo_name', 'Stars', 'Repo_url')

for idx, repo_tag in enumerate(repo_tags):
    repo_info = get_repo_info(repo_tag, star_tags[idx])
    for column, value in zip(column_order, repo_info):
        topic_repos_dict[column].append(value)

In [95]:
topic_repos=pd.DataFrame(topic_repos_dict)

In [96]:
topic_repos

Unnamed: 0,Username,Repo_name,Stars,Repo_url
0,mrdoob,three.js,73600,https://github.com/mrdoob/three.js
1,libgdx,libgdx,18800,https://github.com/libgdx/libgdx
2,pmndrs,react-three-fiber,14700,https://github.com/pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,14700,https://github.com/BabylonJS/Babylon.js
4,aframevr,aframe,13000,https://github.com/aframevr/aframe
5,ssloy,tinyrenderer,11100,https://github.com/ssloy/tinyrenderer
6,lettier,3d-game-shaders-for-beginners,10900,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,9700,https://github.com/FreeCAD/FreeCAD
8,metafizzy,zdog,8600,https://github.com/metafizzy/zdog
9,CesiumGS,cesium,7400,https://github.com/CesiumGS/cesium


In [97]:
# Export the DataFrame to a CSV file
topic_repos.to_csv('topics_repo.csv', index=None)

In [146]:
#creating function to get the all info of any topic
import os
def get_topic_page(topic_url):
    """Download a topic page and return it as a parsed BeautifulSoup document.

    Args:
        topic_url: Full URL of the topic page to download.

    Returns:
        The page parsed with the 'html.parser' backend.

    Raises:
        Exception: If the HTTP status code of the response is not 200.
    """
    page = requests.get(topic_url)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    raise Exception('Failed to load page {}'.format(topic_url))

def get_repo_info(h1_tag, star_tag):
    """Return (username, repo_name, stars, repo_url) for one repository card.

    Note: this re-defines a function of the same name from an earlier cell.

    Args:
        h1_tag: Heading tag of the card; its first two <a> elements are read
            as the owner link and the repository link, in that order.
        star_tag: Tag whose text is the star count.

    Returns:
        A 4-tuple: owner name, repository name, star count as int, and the
        repository URL built from the module-level base_url.
    """
    anchors = h1_tag.find_all('a')
    owner = anchors[0].text.strip()
    name = anchors[1].text.strip()
    url = base_url + anchors[1]['href']
    return owner, name, parse_star_count(star_tag.text.strip()), url

def get_topic_repos(topic_doc):
    """Extract the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page exposing `find_all(name, attrs)`.

    Returns:
        A pandas DataFrame with the columns 'username', 'repo_name', 'stars'
        and 'repo_url', one row per repository heading found on the page.
    """
    # Get the h3 tags containing repo title, repo URL and username
    h3_selection_class = 'f3 color-text-secondary text-normal lh-condensed'
    # Use the local variable; the name referenced before (`h1_selection_class`)
    # is not defined in this function, so the lookup raised NameError unless a
    # global of that name happened to exist in the kernel.
    repo_tags = topic_doc.find_all('h3', {'class': h3_selection_class})
    # Get star tags
    star_tags = topic_doc.find_all('a', {'class': 'social-count float-none'})

    topic_repos_dict = {'username': [], 'repo_name': [], 'stars': [], 'repo_url': []}

    # Heading tags and star tags are paired by their position on the page
    for i in range(len(repo_tags)):
        username, repo_name, stars, repo_url = get_repo_info(repo_tags[i], star_tags[i])
        topic_repos_dict['username'].append(username)
        topic_repos_dict['repo_name'].append(repo_name)
        topic_repos_dict['stars'].append(stars)
        topic_repos_dict['repo_url'].append(repo_url)

    return pd.DataFrame(topic_repos_dict)

def scrape_topic(topic_url, path):
    """Scrape one topic page into a CSV file unless that file already exists.

    Args:
        topic_url: URL of the topic page to scrape.
        path: Destination CSV path. When it already exists, a message is
            printed and nothing is downloaded or written.
    """
    if not os.path.exists(path):
        topic_df = get_topic_repos(get_topic_page(topic_url))
        topic_df.to_csv(path, index=None)
        return
    print("The file {} already exists. Skipping...".format(path))

In [147]:
def get_topic_titles(doc):
    """Return the text of every topic-title <p> tag in the parsed page.

    Args:
        doc: Parsed topics page exposing `find_all(name, attrs)`.

    Returns:
        A list with the unmodified text of each matching tag, in page order.
    """
    title_tags = doc.find_all('p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})
    return [title_tag.text for title_tag in title_tags]

def get_topic_descs(doc):
    """Return the whitespace-trimmed text of every topic-description <p> tag.

    Args:
        doc: Parsed topics page exposing `find_all(name, attrs)`.

    Returns:
        A list of description strings, in page order.
    """
    desc_tags = doc.find_all('p', {'class': 'f5 color-text-secondary mb-0 mt-1'})
    return [desc_tag.text.strip() for desc_tag in desc_tags]

def get_topic_urls(doc):
    """Return the absolute URL of every topic link in the parsed page.

    Args:
        doc: Parsed topics page exposing `find_all(name, attrs)`.

    Returns:
        A list of URLs, each the tag's `href` prefixed with
        'https://github.com', in page order.
    """
    site_root = 'https://github.com'
    link_tags = doc.find_all('a', {'class': 'd-flex no-underline'})
    return [site_root + link_tag['href'] for link_tag in link_tags]
    

def scrape_topics():
    """Download the GitHub topics page and return its topics as a DataFrame.

    Returns:
        A pandas DataFrame with the columns 'title', 'description' and 'url'.

    Raises:
        Exception: If the HTTP status code of the response is not 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the URL that was requested; `topic_url` does not exist here
        raise Exception('Failed to load page {}'.format(topics_url))
    # Parse the page that was just downloaded. The previous version never
    # parsed `response` and read a global `doc` left over from an earlier cell.
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)


In [148]:
def scrape_topics_repos():
    """Scrape the topic list, then write one CSV per topic under 'data/'.

    Each topic is saved to 'data/<title>.csv' via scrape_topic; progress
    messages are printed along the way.
    """
    print('Scraping list of topics')
    topics_df = scrape_topics()

    os.makedirs('data', exist_ok=True)
    for _, topic in topics_df.iterrows():
        title = topic['title']
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(topic['url'], 'data/{}.csv'.format(title))

In [149]:
scrape_topics_repos()

Scraping list of topics
Scraping top repositories for "3D"
Scraping top repositories for "Ajax"
Scraping top repositories for "Algorithm"
Scraping top repositories for "Amp"
Scraping top repositories for "Android"
Scraping top repositories for "Angular"
Scraping top repositories for "Ansible"
Scraping top repositories for "API"
Scraping top repositories for "Arduino"
Scraping top repositories for "ASP.NET"
Scraping top repositories for "Atom"
Scraping top repositories for "Awesome Lists"
Scraping top repositories for "Amazon Web Services"
Scraping top repositories for "Azure"
Scraping top repositories for "Babel"
Scraping top repositories for "Bash"
Scraping top repositories for "Bitcoin"
Scraping top repositories for "Bootstrap"
Scraping top repositories for "Bot"
Scraping top repositories for "C"
Scraping top repositories for "Chrome"
Scraping top repositories for "Chrome extension"
Scraping top repositories for "Command line interface"
Scraping top repositories for "Clojure"
Scrapin

In [137]:
# Pick the URL of the 5th topic (index 4)
url_4=topic_urls[4]

In [138]:
url_4

'https://github.com/topics/android'

In [111]:
#getting the docs of 5th topic
topic4_doc=get_topic_page(url_4)

In [112]:
topic4_doc


<!DOCTYPE html>

<html data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="" href="https://github.githubassets.com" rel="preconnect"/>
<link href="https://avatars.githubusercontent.com" rel="preconnect"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-f8b9580b3a0820d152c1d3c6f3ebeaa3.css" integrity="sha512-+LlYCzoIINFSwdPG8+vqo6w8TBMZRoJRBASiCU48bzL3w5EvYcA2sb9hCNg0CTnuvvQrruwwAIfQVjZPw2dqGg==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/behaviors-df71d07850bceb318d296c405b2b4901.css" integrity="sha512-33HQeFC86zGNKWxAWytJAe2R

In [113]:
topic4_repo=get_topic_repos(topic4_doc)

In [114]:
topic4_repo

Unnamed: 0,username,repo_name,stars,repo_url
0,flutter,flutter,128000,https://github.com/flutter/flutter
1,justjavac,free-programming-books-zh_CN,82200,https://github.com/justjavac/free-programming-...
2,Genymobile,scrcpy,53300,https://github.com/Genymobile/scrcpy
3,Hack-with-Github,Awesome-Hacking,45500,https://github.com/Hack-with-Github/Awesome-Ha...
4,google,material-design-icons,43600,https://github.com/google/material-design-icons
5,wasabeef,awesome-android-ui,41100,https://github.com/wasabeef/awesome-android-ui
6,square,okhttp,40600,https://github.com/square/okhttp
7,android,architecture-samples,39300,https://github.com/android/architecture-samples
8,square,retrofit,38600,https://github.com/square/retrofit
9,Solido,awesome-flutter,37100,https://github.com/Solido/awesome-flutter


In [139]:
topic_urls[0]

'https://github.com/topics/3d'

In [141]:
get_topic_repos(get_topic_page(topic_urls[0])).to_csv('3D.csv', index=None)