In [3]:
# OBJECTIVE

# Scrape topics from GitHub
# Grab 25 repositories from each topic, including repository name, username, stars, and repository URL
# Create a CSV file with all scraped data

In [371]:
# Importing libraries


import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [23]:
# Defining what website to use


URL = 'https://github.com/topics'

# Download the topics index page.
page = requests.get(URL)

# Stop on a non-200 response so the cells below never parse an error page
# and silently produce empty lists.
if page.status_code != 200:
    raise Exception('failed to load page {}'.format(URL))

page_contents = page.text

# Parse the HTML so tags can be looked up by name and class.
doc = BeautifulSoup(page_contents, 'html.parser')

# print(page_contents[:1000])

In [40]:
# Finding tags for topic names


topic_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'

topic_title_tags = doc.find_all('p',{'class': topic_class})

topic_title_tags[:5]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

In [41]:
# Finding tags for descriptions


descr_class = 'f5 color-fg-muted mb-0 mt-1'

topic_descr_tags = doc.find_all('p',{'class': descr_class})

topic_descr_tags[:5]

[<p class="f5 color-fg-muted mb-0 mt-1">
           3D modeling is the process of virtually developing the surface and structure of a 3D object.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Amp is a non-blocking concurrency library for PHP.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Android is an operating system built by Google designed for mobile devices.
         </p>]

In [67]:
# finding link tags p1


topic_title_tag0 = topic_title_tags[0]

div_tag = topic_title_tag0.parent

div_tag

<a class="no-underline flex-1 d-flex flex-column" href="/topics/3d">
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
<p class="f5 color-fg-muted mb-0 mt-1">
          3D modeling is the process of virtually developing the surface and structure of a 3D object.
        </p>
</a>

In [71]:
# finding link tags p2


link_class = 'no-underline flex-1 d-flex flex-column' #also the a tag

topic_link_tags = doc.find_all('a', {'class': link_class})

topic_link_tags[:5]

[<a class="no-underline flex-1 d-flex flex-column" href="/topics/3d">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
 <p class="f5 color-fg-muted mb-0 mt-1">
           3D modeling is the process of virtually developing the surface and structure of a 3D object.
         </p>
 </a>,
 <a class="no-underline flex-1 d-flex flex-column" href="/topics/ajax">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>
 </a>,
 <a class="no-underline flex-1 d-flex flex-column" href="/topics/algorithm">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>
 </a>,
 <a class="no-underline flex-1 d-flex flex-column" href="/topics/amphp">
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>
 <p class="f

In [400]:
# Creating a topic titles list


# Text of every topic title tag, in the order the tags were found.
topic_titles = [title_tag.text for title_tag in topic_title_tags]

topic_titles[:5]

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android']

In [53]:
# Creating a topic descriptions list


# Description text of every topic, with the surrounding whitespace removed.
topic_descr = [descr_tag.text.strip() for descr_tag in topic_descr_tags]

topic_descr[:5]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [81]:
# Creating a topic links/URLs list


base_url = 'https://github.com'

# The href values are site-relative, so prefix each one with the site root.
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]
    
topic_urls[:5]    

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

In [83]:
# Creating data frame 


topics_dict = {
    'title': topic_titles,
    'description': topic_descr,
    'url': topic_urls
}

topics_df = pd.DataFrame(topics_dict)

topics_df

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [86]:
# Creating a CSV file


topics_df.to_csv('topic.csv', index=None)
#                              ^ to get rid of row number

In [94]:
# Moving onto scraping the contents in each specific topic







In [340]:
# Defining what website/URL to use


topic_page_url = topic_urls[2]

# Download the page in this cell: `topic_doc` has to be parsed from the
# response for `topic_page_url`, not from a `response` variable that only
# exists if some other cell happened to run first.
response = requests.get(topic_page_url)

topic_doc = BeautifulSoup(response.text, 'html.parser')

topic_page_url

'https://github.com/topics/algorithm'

In [331]:
response = requests.get(topic_page_url)

response.status_code

200

In [310]:
# Finding username and attached repository tags p1


repo_tags = topic_doc.find_all('h3',{'class': 'f3 color-fg-muted text-normal lh-condensed'})

a_tags = repo_tags[0].find_all('a')

a_tags

[<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":3771963,"originating_url":"https://github.com/topics/algorithm","user_id":null}}' data-hydro-click-hmac="806006bf6ecca87f1d4103212d684b4f8f9d8f897c7349a1cd7e37852661db87" data-turbo="false" data-view-component="true" href="/jwasham">
   
             jwasham
 
   
 </a>,
 <a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":60493101,"originating_url":"https://github.com/topics/algorithm","user_id":null}}' data-hydro-click-hmac="00d35a03113917ed4a7133838f34531a079c7e92

In [311]:
# Finding (username) and attached repository tags p2


a_tags[0].text.strip()

'jwasham'

In [312]:
# Finding username and attached (repository) tags p3


a_tags[1].text.strip()

'coding-interview-university'

In [313]:
# Creating URL for attached repository


repo_url = base_url + a_tags[1]['href']

repo_url

'https://github.com/jwasham/coding-interview-university'

In [314]:
# Finding star tags

star_class = 'repo-stars-counter-star'

star_tags = topic_doc.find_all(id= star_class)

# star_tags[:5]

star_tags[0].text

'219k'

In [315]:
# Changing star_tags from text to int


def parse_star_count(stars_str):
    """Convert a displayed star count such as '219k' into an integer.

    Surrounding whitespace and thousands separators (',') are ignored and
    the suffix is matched case-insensitively. A trailing 'k' multiplies the
    number by 1,000 and a trailing 'm' by 1,000,000; without a suffix the
    text is parsed as a plain integer.

    Args:
        stars_str: The star count text, e.g. '219k', '69.6k' or '123'.

    Returns:
        The star count as an int.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    cleaned = stars_str.strip().replace(',', '').lower()
    if not cleaned:
        raise ValueError('empty star count: {!r}'.format(stars_str))

    multipliers = {'k': 1000, 'm': 1000000}
    multiplier = multipliers.get(cleaned[-1])
    if multiplier is None:
        return int(cleaned)

    # Round instead of truncating: the float product can land a hair below
    # the intended whole number, and int() alone would then drop one star.
    return int(round(float(cleaned[:-1]) * multiplier))


parse_star_count(star_tags[0].text)

219000

In [316]:
# Creating function to return all required info about a repository


def get_repo_info(repo_tags, star_tags):
    """Extract the details of one repository from its tags.

    Despite the plural names, each argument is a single tag: `repo_tags`
    is searched for <a> tags (first = owner, second = repository) and
    `star_tags` supplies the star count text.

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    links = repo_tags.find_all('a')
    owner_link, repo_link = links[0], links[1]
    owner = owner_link.text.strip()
    name = repo_link.text.strip()
    # The repository href is site-relative; prefix it with the site root.
    link = base_url + repo_link['href']
    star_count = parse_star_count(star_tags.text)
    return owner, name, star_count, link

get_repo_info(repo_tags[0], star_tags[0])

('jwasham',
 'coding-interview-university',
 219000,
 'https://github.com/jwasham/coding-interview-university')

In [422]:
# Create data frame p1


# Column-oriented accumulator: one list per output column, in display order.
repo_columns = ('username', 'repo_name', 'stars', 'repo_url')
topic_repos_dict = {column: [] for column in repo_columns}

# Repository tags and star tags are paired by position.
for position, repo_tag in enumerate(repo_tags):
    repo_info = get_repo_info(repo_tag, star_tags[position])
    for column, value in zip(repo_columns, repo_info):
        topic_repos_dict[column].append(value)
    

In [318]:
# Create data frame p2



topic_repos_df = pd.DataFrame(topic_repos_dict)

topic_repos_df

Unnamed: 0,username,repo_name,stars,repo_url
0,jwasham,coding-interview-university,219000,https://github.com/jwasham/coding-interview-un...
1,CyC2018,CS-Notes,150000,https://github.com/CyC2018/CS-Notes
2,trekhleb,javascript-algorithms,140000,https://github.com/trekhleb/javascript-algorithms
3,TheAlgorithms,Python,136000,https://github.com/TheAlgorithms/Python
4,yangshun,tech-interview-handbook,69600,https://github.com/yangshun/tech-interview-han...
5,kdn251,interviews,57100,https://github.com/kdn251/interviews
6,azl397985856,leetcode,47800,https://github.com/azl397985856/leetcode
7,TheAlgorithms,Java,45900,https://github.com/TheAlgorithms/Java
8,algorithm-visualizer,algorithm-visualizer,36800,https://github.com/algorithm-visualizer/algori...
9,crossoverJie,JCSprout,26700,https://github.com/crossoverJie/JCSprout


In [346]:
# Creating function to grab topic repository


def get_topic_repos1(topic_urls):
    """Download one topic page and return its repositories as a DataFrame.

    Args:
        topic_urls: URL of a single topic page (one URL, despite the
            plural name).

    Returns:
        A DataFrame with the columns 'username', 'repo_name', 'stars' and
        'repo_url', one row per repository heading found on the page.

    Raises:
        Exception: If the page does not come back with HTTP status 200.
    """
    # Download page
    response = requests.get(topic_urls)

    # Check successful response. The message must format the parameter
    # itself; any other name is undefined here and would raise NameError
    # instead of reporting the failed URL.
    if response.status_code != 200:
        raise Exception('failed to load page {}'.format(topic_urls))

    # Parse using BeautifulSoup
    topic_doc = BeautifulSoup(response.content, 'html.parser')

    # Get repo tags (h3 tags) containing repo title, repo URL and username
    repo_tags = topic_doc.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})

    # Get star tags
    star_tags = topic_doc.find_all(id=star_class)

    # Topic repository dictionary: one list per output column
    topic_repos_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': []
    }

    # Get repo info; repository tags and star tags are paired by position
    for x in range(len(repo_tags)):
        repo_info = get_repo_info(repo_tags[x], star_tags[x])
        topic_repos_dict['username'].append(repo_info[0])
        topic_repos_dict['repo_name'].append(repo_info[1])
        topic_repos_dict['stars'].append(repo_info[2])
        topic_repos_dict['repo_url'].append(repo_info[3])

    return pd.DataFrame(topic_repos_dict)


In [389]:
# Testing function


get_topic_repos1(topic_urls[1])

Unnamed: 0,username,repo_name,stars,repo_url
0,metafizzy,infinite-scroll,7100,https://github.com/metafizzy/infinite-scroll
1,ljianshu,Blog,7100,https://github.com/ljianshu/Blog
2,developit,unfetch,5300,https://github.com/developit/unfetch
3,jquery-form,form,5100,https://github.com/jquery-form/form
4,olifolkerd,tabulator,4600,https://github.com/olifolkerd/tabulator
5,Studio-42,elFinder,4300,https://github.com/Studio-42/elFinder
6,ded,reqwest,2900,https://github.com/ded/reqwest
7,dwyl,learn-to-send-email-via-google-script-html-no-...,2800,https://github.com/dwyl/learn-to-send-email-vi...
8,elbywan,wretch,2400,https://github.com/elbywan/wretch
9,LeaVerou,bliss,2400,https://github.com/LeaVerou/bliss


In [420]:
# splitting function into multiple functions 


def get_topic_page(topic_urls):
    """Fetch a single topic page and return it as a parsed document.

    Args:
        topic_urls: URL of the topic page to download (one URL, despite
            the plural name).

    Returns:
        The BeautifulSoup document built from the response body.

    Raises:
        Exception: If the response status code is not 200.
    """
    page_response = requests.get(topic_urls)

    # Only a 200 response is parsed; anything else is reported to the caller.
    if page_response.status_code == 200:
        return BeautifulSoup(page_response.content, 'html.parser')

    raise Exception('failed to load page {}'.format(topic_urls))


def get_repo_info(repo_tags, star_tags):
    """Return all the required info about one repository.

    `repo_tags` is a single tag whose first <a> names the owner and whose
    second <a> names (and links to) the repository; `star_tags` is a single
    tag whose text is the star count.

    Returns:
        A tuple (username, repo_name, stars, repo_url).
    """
    anchors = repo_tags.find_all('a')
    owner_anchor = anchors[0]
    repo_anchor = anchors[1]

    owner = owner_anchor.text.strip()
    repository = repo_anchor.text.strip()
    # hrefs are site-relative, so the site root is prepended.
    repository_link = base_url + repo_anchor['href']
    star_total = parse_star_count(star_tags.text)

    return owner, repository, star_total, repository_link


def get_topic_repos(topic_doc):
    """Collect the repositories listed in a parsed topic page.

    Args:
        topic_doc: Parsed topic page, searched with `find_all`.

    Returns:
        A DataFrame with the columns 'username', 'repo_name', 'stars' and
        'repo_url', one row per repository heading found.

    Raises:
        IndexError: If the page has fewer star tags than repository tags,
            since the two lists are paired by position.
    """
    # h3 headings carry the username, repository name and repository URL.
    headings = topic_doc.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})

    # Star counters, looked up by id.
    counters = topic_doc.find_all(id=star_class)

    columns = ('username', 'repo_name', 'stars', 'repo_url')
    collected = {column: [] for column in columns}

    for position, heading in enumerate(headings):
        info = get_repo_info(heading, counters[position])
        for column, value in zip(columns, info):
            collected[column].append(value)

    return pd.DataFrame(collected)




In [354]:
# Testing multiple functions


url0 = topic_urls[0]

topic0_doc = get_topic_page(url0)

topic0_repos = get_topic_repos(topic0_doc)

topic0_repos

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,81700,https://github.com/mrdoob/three.js
1,libgdx,libgdx,20000,https://github.com/libgdx/libgdx
2,pmndrs,react-three-fiber,17800,https://github.com/pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,16700,https://github.com/BabylonJS/Babylon.js
4,aframevr,aframe,14100,https://github.com/aframevr/aframe
5,ssloy,tinyrenderer,13600,https://github.com/ssloy/tinyrenderer
6,lettier,3d-game-shaders-for-beginners,12800,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,11200,https://github.com/FreeCAD/FreeCAD
8,metafizzy,zdog,9100,https://github.com/metafizzy/zdog
9,CesiumGS,cesium,8600,https://github.com/CesiumGS/cesium


In [374]:
# Creating a function to scrape topics


def scrape_topics1():
    """Download the GitHub topics page and return its topics as a DataFrame.

    The function is self-contained: it parses the response it downloads
    itself and does not read `response` or `doc` from the notebook
    namespace.

    Returns:
        A DataFrame with the columns 'title', 'description' and 'url'.

    Raises:
        Exception: If the page does not come back with HTTP status 200.
    """
    topics_url = 'https://github.com/topics'

    # Keep the response so the status check and the parsing below both refer
    # to this request.
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception('failed to load page {}'.format(topics_url))

    doc = BeautifulSoup(response.text, 'html.parser')

    # Topic titles
    topic_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    topic_title_tags = doc.find_all('p', {'class': topic_class})

    # Topic descriptions
    descr_class = 'f5 color-fg-muted mb-0 mt-1'
    topic_descr_tags = doc.find_all('p', {'class': descr_class})

    # Topic links
    link_class = 'no-underline flex-1 d-flex flex-column'
    topic_link_tags = doc.find_all('a', {'class': link_class})

    topic_titles = [tag.text for tag in topic_title_tags]
    topic_descr = [tag.text.strip() for tag in topic_descr_tags]

    # hrefs are site-relative, so the site root is prepended.
    base_url = 'https://github.com'
    topic_urls = [base_url + tag['href'] for tag in topic_link_tags]

    topics_dict = {
        'title': topic_titles,
        'description': topic_descr,
        'url': topic_urls
    }
    return pd.DataFrame(topics_dict)


# Testing Function


scrape_topics1()

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [358]:
# Splitting scrape_topics1 into multiple functions


def get_topic_titles(doc):
    """Return the text of every topic title <p> tag found in `doc`."""
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [title_tag.text for title_tag in doc.find_all('p', {'class': title_class})]


def get_topic_descrs(doc):
    """Return the whitespace-stripped text of every topic description <p> tag in `doc`."""
    description_class = 'f5 color-fg-muted mb-0 mt-1'
    description_tags = doc.find_all('p', {'class': description_class})
    return [description_tag.text.strip() for description_tag in description_tags]


def get_topic_urls(doc):
    """Return the absolute URL of every topic link <a> tag found in `doc`."""
    site_root = 'https://github.com'
    anchor_class = 'no-underline flex-1 d-flex flex-column'
    anchors = doc.find_all('a', {'class': anchor_class})
    # Each href is site-relative, so the site root is prepended.
    return [site_root + anchor['href'] for anchor in anchors]
    

In [405]:
#create a function to scrape top topics


def scrape_topics():
    """Download the GitHub topics page and return its topics as a DataFrame.

    Returns:
        A DataFrame with the columns 'title', 'description' and 'url'.

    Raises:
        Exception: If the page does not come back with HTTP status 200.
    """
    topics_url = 'https://github.com/topics'

    # Keep the response and parse it here, so the function works from its own
    # request rather than from `response`/`doc` left in the notebook namespace.
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception('failed to load page {}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')

    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descrs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)


# Testing function
scrape_topics()

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [417]:
# Creating a function to scrape a topic


def scrape_topic(topic_url, topic_name):
    """Save the repositories of one topic to '<topic_name>.csv'.

    If that file already exists, a message is printed and nothing is
    downloaded or written.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_name: Topic title, used as the CSV file name stem.
    """
    csv_path = topic_name + '.csv'
    if not os.path.exists(csv_path):
        repos = get_topic_repos(get_topic_page(topic_url))
        repos.to_csv(csv_path, index=None)
        return
    print('The file {} already exists. Skipping...'.format(csv_path))
    
    
# Testing function


In [421]:
# Creating function to scrape topic repository


def scrape_topics_repos():
    """Scrape the list of topics, then scrape each topic's repositories.

    Progress is printed for the topic list and for every topic; the
    per-topic work is delegated to `scrape_topic`.
    """
    print('Scraping list of topics')
    topics = scrape_topics()

    # Walk the title and URL columns side by side, one topic at a time.
    for title, url in zip(topics['title'], topics['url']):
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(url, title)

        
# test

scrape_topics_repos()

Scraping list of topics
Scraping top repositories for "3D"
The file 3D.csv already exists. Skipping...
Scraping top repositories for "Ajax"
The file Ajax.csv already exists. Skipping...
Scraping top repositories for "Algorithm"
The file Algorithm.csv already exists. Skipping...
Scraping top repositories for "Amp"
The file Amp.csv already exists. Skipping...
Scraping top repositories for "Android"
The file Android.csv already exists. Skipping...
Scraping top repositories for "Angular"
The file Angular.csv already exists. Skipping...
Scraping top repositories for "Ansible"
The file Ansible.csv already exists. Skipping...
Scraping top repositories for "API"
The file API.csv already exists. Skipping...
Scraping top repositories for "Arduino"
The file Arduino.csv already exists. Skipping...
Scraping top repositories for "ASP.NET"
The file ASP.NET.csv already exists. Skipping...
Scraping top repositories for "Atom"
The file Atom.csv already exists. Skipping...
Scraping top repositories for "