# GitHub Topic Scraper
    
### Objective:
- Download the GitHub topics page from "https://www.github.com/topics".
- Parse the downloaded HTML content using Beautiful Soup.
- Extract the desired contents (topic titles, descriptions and URLs) from the soup object.
- Build a pandas DataFrame from the scraped data.
- Finally, save the DataFrame as .csv or .xlsx, according to preference.

### Importing the required libraries

In [21]:
from bs4 import BeautifulSoup
import requests,urllib
import html5lib
import pandas as pd
import os


### Since we are scraping the GitHub topics page, we assign its URL to a variable named `topics_url`

In [5]:
topics_url = 'https://www.github.com/topics'

**We send a GET request using the Python `requests` library and receive the HTML page as the response.**

In [6]:
response = requests.get(topics_url)

**We check the status code to make sure the page was retrieved successfully. Status code 200 means that the request was successful. To learn more about HTTP status codes, refer to the [MDN reference](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) on HTTP response status codes.**

In [7]:
response.status_code

200

**Getting the HTML content from the downloaded page.**

In [8]:
# response.text  # display all the text of the html page
len(response.text) # total number of characters in the web page

138904

In [9]:
pagecontent = response.text
print(pagecontent[:500])



<!DOCTYPE html>
<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark" >
  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">
  <link rel="preconnect" href="https://github.githubassets.com" cr


In [10]:
soup = BeautifulSoup(pagecontent,'html.parser')

In [11]:
soup
type(soup)


bs4.BeautifulSoup

In [None]:
#soup.prettify

In [12]:
# Select the <p> elements holding the topic titles, matched on their exact
# class string, then show each tag followed by its plain text.
p = soup.find_all('p', attrs={'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})
for title_tag in p:
    print(title_tag, title_tag.text, sep='\n')
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
3D
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>
Ajax
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>
Algorithm
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>
Amp
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>
Android
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Angular</p>
Angular
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ansible</p>
Ansible
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">API</p>
API
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Arduino</p>
Arduino
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">ASP.NET</p>
ASP.NET
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Atom</p>
Atom
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Awesome Lists</p>
Awesome Lists
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amazon Web Services</p>
Amazon Web Services
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Azure</p>
Azure
<p class

In [13]:
topic_link_tag = soup.find_all('a',{'class' : 'no-underline flex-grow-0'})
len(topic_link_tag)
print(topic_link_tag)

[<a class="no-underline flex-grow-0" href="/topics/3d">
<div class="color-bg-accent f4 color-fg-muted text-bold rounded mr-3 flex-shrink-0 text-center" style="width:64px; height:64px; line-height:64px;">
            #
          </div>
</a>, <a class="no-underline flex-grow-0" href="/topics/ajax">
<img alt="ajax" class="rounded mr-3" height="64" src="https://raw.githubusercontent.com/github/explore/8be26d91eb231fec0b8856359979ac09f27173fd/topics/ajax/ajax.png" width="64"/>
</a>, <a class="no-underline flex-grow-0" href="/topics/algorithm">
<div class="color-bg-accent f4 color-fg-muted text-bold rounded mr-3 flex-shrink-0 text-center" style="width:64px; height:64px; line-height:64px;">
            #
          </div>
</a>, <a class="no-underline flex-grow-0" href="/topics/amphp">
<img alt="amphp" class="rounded mr-3" height="64" src="https://raw.githubusercontent.com/github/explore/99fe59c0f4fb5d6545311440b4ce89a0d82b0804/topics/amphp/amphp.png" width="64"/>
</a>, <a class="no-underline f

In [14]:
topic0_url = "https://www.github.com"+topic_link_tag[0]['href']
topic0_url

'https://www.github.com/topics/3d'

In [15]:
# Text of every title tag collected in `p`, in page order.
topic_title = [title_tag.text for title_tag in p]
topic_title

['3D',
 'Ajax',
 'Algorithm',
 'Amp',
 'Android',
 'Angular',
 'Ansible',
 'API',
 'Arduino',
 'ASP.NET',
 'Atom',
 'Awesome Lists',
 'Amazon Web Services',
 'Azure',
 'Babel',
 'Bash',
 'Bitcoin',
 'Bootstrap',
 'Bot',
 'C',
 'Chrome',
 'Chrome extension',
 'Command line interface',
 'Clojure',
 'Code quality',
 'Code review',
 'Compiler',
 'Continuous integration',
 'COVID-19',
 'C++']

In [16]:
topic_desc_tag = soup.find_all('p', attrs={'class':'f5 color-fg-muted mb-0 mt-1'})

In [17]:
topic_desc_tag

[<p class="f5 color-fg-muted mb-0 mt-1">
           3D modeling is the process of virtually developing the surface and structure of a 3D object.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ajax is a technique for creating interactive web applications.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Algorithms are self-contained sequences that carry out a variety of tasks.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Amp is a non-blocking concurrency library for PHP.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Android is an operating system built by Google designed for mobile devices.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Angular is an open source web application platform.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           Ansible is a simple and powerful automation engine.
         </p>,
 <p class="f5 color-fg-muted mb-0 mt-1">
           An API (Applicati

In [18]:
# Whitespace-trimmed text of every description tag, in page order.
topic_descriptions = [desc_tag.text.strip() for desc_tag in topic_desc_tag]

print(topic_descriptions)

['3D modeling is the process of virtually developing the surface and structure of a 3D object.', 'Ajax is a technique for creating interactive web applications.', 'Algorithms are self-contained sequences that carry out a variety of tasks.', 'Amp is a non-blocking concurrency library for PHP.', 'Android is an operating system built by Google designed for mobile devices.', 'Angular is an open source web application platform.', 'Ansible is a simple and powerful automation engine.', 'An API (Application Programming Interface) is a collection of protocols and subroutines for building software.', 'Arduino is an open source hardware and software company and maker community.', 'ASP.NET is a web framework for building modern web apps and services.', 'Atom is a open source text editor built with web technologies.', 'An awesome list is a list of awesome things curated by the community.', 'Amazon Web Services provides on-demand cloud computing platforms on a subscription basis.', 'Azure is a cloud

In [19]:
# Prefix each topic link's relative href with the site root to get a full URL.
base = "https://www.github.com"
topic_urls = [base + link_tag['href'] for link_tag in topic_link_tag]
topic_urls

['https://www.github.com/topics/3d',
 'https://www.github.com/topics/ajax',
 'https://www.github.com/topics/algorithm',
 'https://www.github.com/topics/amphp',
 'https://www.github.com/topics/android',
 'https://www.github.com/topics/angular',
 'https://www.github.com/topics/ansible',
 'https://www.github.com/topics/api',
 'https://www.github.com/topics/arduino',
 'https://www.github.com/topics/aspnet',
 'https://www.github.com/topics/atom',
 'https://www.github.com/topics/awesome',
 'https://www.github.com/topics/aws',
 'https://www.github.com/topics/azure',
 'https://www.github.com/topics/babel',
 'https://www.github.com/topics/bash',
 'https://www.github.com/topics/bitcoin',
 'https://www.github.com/topics/bootstrap',
 'https://www.github.com/topics/bot',
 'https://www.github.com/topics/c',
 'https://www.github.com/topics/chrome',
 'https://www.github.com/topics/chrome-extension',
 'https://www.github.com/topics/cli',
 'https://www.github.com/topics/clojure',
 'https://www.github.co

In [22]:
# One column per scraped list; the three lists are expected to line up by position.
topic_dict = dict(
    Title=topic_title,
    Description=topic_descriptions,
    Url=topic_urls,
)
topic_df = pd.DataFrame(topic_dict)
topic_df

Unnamed: 0,Title,Description,Url
0,3D,3D modeling is the process of virtually develo...,https://www.github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://www.github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://www.github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://www.github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://www.github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://www.github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://www.github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://www.github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://www.github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://www.github.com/topics/aspnet


In [23]:
topic_df.to_csv('Github_Topics.csv')

## Getting information out of a topic page URL

In [None]:
url1 = topic_urls[0]
url1

In [None]:
response = requests.get(url1)

In [None]:
response.status_code

In [None]:
len(response.text)


In [None]:
soup2 = BeautifulSoup(response.text,'html.parser')
# soup2

In [None]:
repo_tags = soup2.find_all('h3',attrs={
    'class' : 'f3 color-fg-muted text-normal lh-condensed'})
len(repo_tags)

In [None]:
a_tag = repo_tags[0].find_all('a')
# a_tag[0].text.strip()
a_tag

In [None]:
a_tag[1].text.strip()

In [None]:
repo_url = a_tag[1]['href']
repo_link1 = base + repo_url
repo_link1

In [None]:
stars= soup2.find_all('span',attrs= {
    'class' : 'Counter js-social-count'
})
stars

In [None]:
star_repo1 = stars[0].text
star_repo1

In [None]:
def parse_star_count(star_str):
    """Convert a star-count label scraped from a page into an integer.

    Args:
        star_str: Text of the star counter, e.g. ``'845'``, ``'1,234'`` or
            ``'32.3k'``. Surrounding whitespace and thousands separators
            are ignored.

    Returns:
        The star count as an ``int``. A trailing ``k`` multiplies the
        number by 1000.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    star_str = star_str.strip().replace(',', '')
    if star_str.endswith('k'):
        # round() instead of bare int(): a float product such as
        # 32.3 * 1000 can land just below the exact value, and int()
        # alone would truncate it to one less than intended.
        return int(round(float(star_str[:-1]) * 1000))
    return int(star_str)

In [None]:
star_repo1 = stars[0].text
parse_star_count(star_repo1)

In [None]:
def get_repo_info(h3_tag,star_tag):
    """Extract the owner, name, URL and star count of one repository.

    Args:
        h3_tag: Heading element of a repository entry; its first ``<a>``
            supplies the username and its second the repository name and
            relative link.
        star_tag: Element whose text is the star-count label.

    Returns:
        A ``(username, reponame, repo_url, stars)`` tuple.
    """
    links = h3_tag.find_all('a')
    owner_link, repo_link = links[0], links[1]
    return (
        owner_link.text.strip(),
        repo_link.text.strip(),
        "https://www.github.com" + repo_link['href'],
        parse_star_count(star_tag.text.strip()),
    )

In [None]:
star_repo1

In [None]:
get_repo_info(repo_tags[0],stars[0])

In [None]:
# Column-oriented accumulator; key order matches the tuple returned by
# get_repo_info (username, repo name, repo URL, stars).
topic_repo_dict = {
    'Username':[],
    'Repo_Name':[],
    'Repo_Url':[],
    'Stars': []
}
print(repo_tags)
for i, h3_tag in enumerate(repo_tags):
    # The i-th star counter is paired with the i-th repository heading.
    repo_info = get_repo_info(h3_tag, stars[i])
    print(repo_info)
    for column, value in zip(topic_repo_dict, repo_info):
        topic_repo_dict[column].append(value)


In [None]:
topic_repo_dict

In [None]:
def get_topic_page(topic_url):
    """Download a topic page and return it parsed with ``html.parser``.

    Args:
        topic_url: URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response text.

    Raises:
        Exception: If the response status code is not 200.
    """
    page = requests.get(topic_url)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    raise Exception('Failed to load page {}'.format(topic_url))
    
def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed on one topic page.

    Args:
        topic_doc: Parsed topic page (as returned by ``get_topic_page``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``Username``,
        ``Repo_Name``, ``Repo_Url`` and ``Stars``, one row per repository.
    """
    repo_tags = topic_doc.find_all('h3', attrs={
        'class': 'f3 color-fg-muted text-normal lh-condensed'})
    # Read the star counters from the document passed in. The previous code
    # read them from the notebook-global `soup2`, so every topic reused the
    # stars of whichever page had been loaded there.
    stars_tags = topic_doc.find_all('span', attrs={
        'class': 'Counter js-social-count'})

    topic_repo_dicts = {
        'Username': [],
        'Repo_Name': [],
        'Repo_Url': [],
        'Stars': [],
    }
    # Headings and counters are paired by position; zip stops at the
    # shorter list if the page yields unequal numbers of each.
    for h3_tag, star_tag in zip(repo_tags, stars_tags):
        username, reponame, repo_url, star_count = get_repo_info(h3_tag, star_tag)
        topic_repo_dicts['Username'].append(username)
        topic_repo_dicts['Repo_Name'].append(reponame)
        topic_repo_dicts['Repo_Url'].append(repo_url)
        topic_repo_dicts['Stars'].append(star_count)

    return pd.DataFrame(topic_repo_dicts)


def get_repo_info(h3_tag,star_tag):
    """Return ``(username, reponame, repo_url, stars)`` for one repository.

    The first ``<a>`` inside ``h3_tag`` supplies the username, the second
    the repository name and its relative link; the text of ``star_tag`` is
    converted to a number by ``parse_star_count``.
    """
    anchors = h3_tag.find_all('a')
    user_anchor = anchors[0]
    repo_anchor = anchors[1]

    username = user_anchor.text.strip()
    reponame = repo_anchor.text.strip()
    repo_url = "https://www.github.com" + repo_anchor['href']
    star_label = star_tag.text.strip()

    return username, reponame, repo_url, parse_star_count(star_label)

def scrape_topic(topic_url,topic_name):
    """Scrape one topic page into ``<topic_name>.csv`` unless that file exists.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_name: Topic title; used as the CSV file name stem.

    Returns:
        None. When the file already exists a notice is printed and nothing
        is downloaded.
    """
    filename = topic_name+".csv"
    if not os.path.exists(filename):
        topic_page = get_topic_page(topic_url)
        get_topic_repos(topic_page).to_csv(filename, index=None)
        return
    print(f"File {filename} already exists. Skipping...")

In [None]:
topic_urls[4]


In [None]:
#
get_topic_repos(get_topic_page(topic_urls[6]))#.to_csv('ansible.csv0',index = None)



In [None]:
def get_topic_titles(soup):
    """Return the title text of every topic on the parsed topics page.

    Args:
        soup: Parsed topics page; must provide ``find_all``.

    Returns:
        A list with the ``.text`` of each ``<p>`` tag whose class attribute
        is ``'f3 lh-condensed mb-0 mt-1 Link--primary'``, in page order.
    """
    topic_title_tags = soup.find_all('p', attrs={
        'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'
    })
    return [tag.text for tag in topic_title_tags]

def get_topic_description(soup):
    """Return the whitespace-trimmed description of every topic on the page.

    Args:
        soup: Parsed topics page; must provide ``find_all``.

    Returns:
        A list with the stripped ``.text`` of each ``<p>`` tag whose class
        attribute is ``'f5 color-fg-muted mb-0 mt-1'``, in page order.
    """
    desc_tags = soup.find_all('p', attrs={'class':'f5 color-fg-muted mb-0 mt-1'})
    return [desc.text.strip() for desc in desc_tags]

def get_topic_urls(soup):
    """Return the absolute URL of every topic linked from the topics page.

    Args:
        soup: Parsed topics page; must provide ``find_all``.

    Returns:
        A list of URL strings, each the ``href`` of an ``<a>`` tag with
        class ``'no-underline flex-grow-0'`` prefixed with
        ``https://www.github.com``, in page order.
    """
    # Search the link tags themselves. The previous code searched the title
    # <p> tags and then looped over an undefined name.
    topic_link_tags = soup.find_all('a', attrs={'class': 'no-underline flex-grow-0'})
    base = "https://www.github.com"
    return [base + tag['href'] for tag in topic_link_tags]

def scrape_topics():
    """Download the GitHub topics page and tabulate the topics it lists.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``decscription``
        and ``url``, one row per topic.

    Raises:
        Exception: If the topics page does not return status code 200.
    """
    topics_url = "https://github.com/topics"
    # Keep the response: the status check and the parsing below must use
    # the page fetched here, not leftover notebook globals.
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topics_url))
    soup = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(soup),
        # NOTE(review): 'decscription' is misspelled but is kept because it
        # is the column name returned to callers; rename deliberately.
        'decscription': get_topic_description(soup),
        'url': get_topic_urls(soup),
    }
    return pd.DataFrame(topics_dict)


In [None]:
## Collectings

# topic_title_tags = soup.find_all('p',attrs={
#     'class':'f3 lh-condensed mb-0 mt-1 Link--primary'
#     })


# topic_desc_tag = soup.find_all('p', attrs={'class':'f5 color-fg-muted mb-0 mt-1'})

# topic_link_tags = soup.find_all('a',{'class' : 'no-underline flex-grow-0'})

# topic_title = []
# for tag in p:
#     topic_title.append(tag.text)
# topic_title

# topic_descriptions = []

# for tag in topic_desc_tag:
#     topic_descriptions.append(tag.text.strip())
    
# print(topic_descriptions)

# topic_urls = []
# base = "https://www.github.com"
# for tag in topic_link_tag:
#     topic_urls.append(base+tag['href'])
# topic_urls


In [None]:
scrape_topics()

In [None]:
def scrape_topics_repos():
    """Scrape the topic list, then scrape each listed topic in turn.

    Prints a progress line before the topic list is fetched and before
    each topic, and hands every topic's URL and title to ``scrape_topic``.
    """
    print("Scraping list topics from Github")
    topics_df = scrape_topics()
    # One plain dict per DataFrame row, keyed by column name.
    for row in topics_df.to_dict(orient='records'):
        print(f"Scraping top repositories for {row['title']}")
        scrape_topic(row['url'], row['title'])
        

In [None]:
scrape_topics_repos()