# Scrap repo stats using PyGitHub

### Author: Crystal Zang

This notebook utilized GitHub access tokens (PAT) to scrape GitHub repository statistics such as stargazers, watchers, forks, and topics. One PAT would scrape at a rate of 5000 repositories per hour. Utilizing 36 PATs we would scrape 10,288,063 repositories in about XXX hours at a rate of 15,514 repositories per hour.

#### Warnings
You should not commit any access topen to GitHub, which would result in access token being revoked.


In [27]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [28]:
# load packages 
import os
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from github import Github, RateLimitExceededException, BadCredentialsException, BadAttributeException, GithubException, UnknownObjectException, BadUserAgentException
import warnings
import datetime

import multiprocessing
#from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool, freeze_support

import concurrent.futures

warnings.simplefilter(action='ignore', category=FutureWarning)

In [29]:
#os.environ['db_user'] = ''
#os.environ['db_pwd'] = ''

# connect to the database, download data, limit to repos with at least 20,000 commits?
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE (commits BETWEEN '4500' AND '5000')'''
#raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE commits > 20000'''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)
print(raw_slug_data.head())
print(raw_slug_data.shape)
print(raw_slug_data.isna().sum())

#get rid of leading and ending space, save slugs to a list
raw_slugs = raw_slug_data["slug"].tolist()
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())  
print(len(slugs))
print(slugs[0], slugs[len(slugs)-1])

#PATs access token, saved as a dataframe
github_pats = '''SELECT * FROM gh_2007_2020.pats_update'''
github_pats = pd.read_sql_query(github_pats, con=connection)

#PATs access token, saved as a list
access_tokens = github_pats["token"]

#number of tokens available for use, a numeric value
num_token = '''SELECT COUNT(*) FROM gh_2007_2020.pats_update'''
num_token = pd.read_sql_query(num_token, con=connection)
num_token=num_token.iloc[0]['count']

connection.close()

                                 id        spdx                       slug  \
0  MDEwOlJlcG9zaXRvcnkzMjEzNzQwMg==     GPL-2.0     vlabatut/totalboumboum   
1  MDEwOlJlcG9zaXRvcnkxNDU5NDg5NDE=         MIT  Kingzyh/aspnetboilerplate   
2  MDEwOlJlcG9zaXRvcnkxOTQ5MzgwNjE=         MIT   NoahBeckerman/GetSquared   
3  MDEwOlJlcG9zaXRvcnk5MTA5NDkxNg==  Apache-2.0         jack9603301/thrift   
4  MDEwOlJlcG9zaXRvcnkxMjE5NjY1Mzc=         MIT      rsumner31/awesome-ios   

            createdat                          description primarylanguage  \
0 2015-03-13 07:05:57  An open source Java Bomberman clone            Java   
1 2018-08-24 06:01:38                                 None              C#   
2 2019-07-02 21:35:54                Green Squared Exploit           Shell   
3 2017-05-12 13:36:08                                 None             C++   
4 2018-02-18 16:01:36                                 None            HTML   

                                         branch  commits      

In [30]:
# index ranges from 0 to maximum number of PATs available
def get_access_token(github_pat_index):
    if github_pat_index < num_token:
       # print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
        return github_pats.token[github_pat_index]
    else:
        print("token exceed limit")

In [31]:
#practice slugs
slugs_example = ["moderndive/ModernDive_book", "DSPG-Young-Scholars-Program/dspg21oss", "rrrrrrrrr", "moderndive/ModernDive_book"]

In [32]:
len(access_tokens)

28

# Function to scrape one slug using specified pat index

In [33]:
#global github_pat_index
def pull_repo_stats(slug, github_pat_index=0):
    #github_pat_index = 1
   # if github_pat_index >= len(access_tokens):
   #     github_pat_index -= len(access_tokens)
   #     print("***Pat access token exceed limit, restart access token loop with #", github_pat_index)
    while True:
        try:
            if github_pat_index >= len(access_tokens):
                github_pat_index -= len(access_tokens)
                print(f"***Pat access token exceed limit, restart access token loop with #", github_pat_index)  
                
            access_token = get_access_token(github_pat_index)
            #print("Scrapping --", slug,". Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
            #if false, retry until true, max number of retry is 20 times
            g = Github(access_token, retry = 20, timeout = 15)
            repo = g.get_repo(slug)
            stars = repo.stargazers_count
            watchers = repo.subscribers_count
            forks = repo.forks_count
            topics = repo.get_topics()
        except RateLimitExceededException as e:
            print(e.status)
            print('WARNING: Rate limit exceeded --', slug, ", using access token #", github_pat_index)
            #time.sleep(300)
            github_pat_index+=1
            print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
            break
        except BadCredentialsException as e:
            print(e.status)
            print('WARNING: Bad credentials exception --', slug, ", using access token #", github_pat_index)
            github_pat_index+=1
            print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
            break
        except UnknownObjectException as e:
            print(e.status)
            print('WARNING: Unknown object exception --', slug)
            stars = None
            watchers = None
            forks =  None
            topics = None
            return slug, stars, watchers, forks, topics
            break
        except GithubException as e:
            print(e.status)
            print('General exception --', slug)
            break
        except r.exceptions.ConnectionError as e:
            print('Retries limit exceeded --', slug)
            print(str(e))
            time.sleep(10)
            continue
        except r.exceptions.Timeout as e:
            print('Time out exception --', slug)
            print(str(e))
            time.sleep(10)
            continue
        return slug, stars, watchers, forks, topics

In [34]:
# Example
result = pull_repo_stats(slug="rrrr")
#pull_repo_stats(slug= slugs_example)
print(result)
#print(result[0])
#print(result[1] ==None)

404
('rrrr', None, None, None, None)


## Multiprocessing approach: pool.apply_async for multiple parameters

In [39]:
# Approach 2. pool.apply_async for multiple parameters
if __name__ == '__main__':
    cores_available = multiprocessing.cpu_count() - 1
    print(f'There are {cores_available} CPUs available.')
    pool = multiprocessing.Pool(cores_available)

    # now we will feed in all of the remaining slugs 
    slug_log = []
    stars_log = []
    watchers_log = []
    forks_log = []
    topics_log = []

    start_time = datetime.datetime.now()
    print("Start scraping. Start time:", start_time)
    
    for slug in slugs_example:
        result = pool.apply_async(pull_repo_stats, args=(slugs_example, 3)).get()
        print("~~~~~~~~~~~~~~~result: ",result, "~~~~~~~~~~~~~~~~")
        if result is None:
            slug_log.append(None)
            stars_log.append(None)
            watchers_log.append(None)
            forks_log.append(None)
            topics_log.append(None)  
        else:
            slug_log.append(result[0])
            stars_log.append(result[1])
            watchers_log.append(result[2])
            forks_log.append(result[3])
            topics_log.append(result[4])


    final_log = pd.DataFrame({'slug': slug_log, "stars": stars_log, 'watchers': watchers_log, 'forks': forks_log, 'topics': topics_log}, columns=["slug", "stars", "watchers", "forks", "topics"])

    end_time = datetime.datetime.now()
    print("Finished scraping", len(final_log), "of", len(slugs_example), "records at", end_time)
    print("It took", end_time-start_time, "to run.")

There are 39 CPUs available.
Start scraping. Start time: 2021-06-23 11:26:43.117920


AssertionError: ['moderndive/ModernDive_book', 'DSPG-Young-Scholars-Program/dspg21oss', 'rrrrrrrrr', 'moderndive/ModernDive_book']

## Multiprocessing approach: pool.imap_unordered

In [37]:
cores_available = multiprocessing.cpu_count() - 1
print(f'There are {cores_available} cores available.')
pool = Pool(cores_available)

# now we will feed in all of the remaining slugs 
slug_log = []
stars_log = []
watchers_log = []
forks_log = []
topics_log = []

start_time = datetime.datetime.now()
print("Start scraping. Start time:", start_time)
for result in pool.imap_unordered(pull_repo_stats, slugs_example):
    print("~~~~~~~~~~~~~~~result: ",result, "~~~~~~~~~~~~~~~~")
    if result is None:
        slug_log.append(None)
        stars_log.append(None)
        watchers_log.append(None)
        forks_log.append(None)
        topics_log.append(None)  
    else:
        slug_log.append(result[0])
        stars_log.append(result[1])
        watchers_log.append(result[2])
        forks_log.append(result[3])
        topics_log.append(result[4])


final_log = pd.DataFrame({'slug': slug_log, "stars": stars_log, 'watchers': watchers_log, 'forks': forks_log, 'topics': topics_log}, columns=["slug", "stars", "watchers", "forks", "topics"])

end_time = datetime.datetime.now()
print("Finished scraping", len(final_log), "of", len(slugs), "records at", end_time)
print("It took", end_time-start_time, "to run.")


There are 39 cores available.
404
Start scraping. Start time: 2021-06-23 11:22:11.104201
~~~~~~~~~~~~~~~result:  ('rrrrrrrrr', None, None, None, None) ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~result:  ('DSPG-Young-Scholars-Program/dspg21oss', 0, 3, 0, []) ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~result:  ('moderndive/ModernDive_book', 548, 28, 295, ['moderndive', 'data-science', 'tidyverse', 'statistical-inference', 'r', 'ggplot2', 'infer', 'hypothesis-testing', 'confidence-intervals', 'regression', 'regression-models', 'data-visualization', 'data-wrangling', 'tidy', 'rstudio', 'rstats', 'dplyr', 'bootstrap-method', 'permutation-test']) ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~result:  ('moderndive/ModernDive_book', 548, 28, 295, ['moderndive', 'data-science', 'tidyverse', 'statistical-inference', 'r', 'ggplot2', 'infer', 'hypothesis-testing', 'confidence-intervals', 'regression', 'regression-models', 'data-visualization', 'data-wrangling', 'tidy', 'rstudio', 'rstats', 'dplyr', 'bootstrap-method', 'permutation-t

In [38]:
final_log

Unnamed: 0,slug,stars,watchers,forks,topics
0,rrrrrrrrr,,,,
1,DSPG-Young-Scholars-Program/dspg21oss,0.0,3.0,0.0,[]
2,moderndive/ModernDive_book,548.0,28.0,295.0,"[moderndive, data-science, tidyverse, statisti..."
3,moderndive/ModernDive_book,548.0,28.0,295.0,"[moderndive, data-science, tidyverse, statisti..."


In [309]:
#final_log.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/final_log_try.csv', index = False)   

In [317]:
# read in the file and check
print(final_log.head())
print(final_log)
print(final_log.isna().sum())
print(final_log.shape)

                                    slug  stars  watchers  forks  \
0                              rrrrrrrrr    NaN       NaN    NaN   
1  DSPG-Young-Scholars-Program/dspg21oss    0.0       3.0    0.0   
2             moderndive/ModernDive_book  548.0      28.0  295.0   
3             moderndive/ModernDive_book  548.0      28.0  295.0   

                                              topics  
0                                               None  
1                                                 []  
2  [moderndive, data-science, tidyverse, statisti...  
3  [moderndive, data-science, tidyverse, statisti...  
                                    slug  stars  watchers  forks  \
0                              rrrrrrrrr    NaN       NaN    NaN   
1  DSPG-Young-Scholars-Program/dspg21oss    0.0       3.0    0.0   
2             moderndive/ModernDive_book  548.0      28.0  295.0   
3             moderndive/ModernDive_book  548.0      28.0  295.0   

                                           