# Scrap repo stats using PyGitHub

### Author: Crystal Zang

This notebook utilized GitHub access tokens (PAT) to scrape GitHub repository statistics such as stargazers, watchers, forks, and topics. One PAT would scrape at a rate of 5000 repositories per hour. Utilizing 36 PATs we would scrape 10,288,063 repositories in about XXX hours at a rate of 15,514 repositories per hour.

#### Warnings
You should not commit any access topen to GitHub, which would result in access token being revoked.


In [19]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [20]:
# load packages 
import os
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from github import Github, RateLimitExceededException, BadCredentialsException, BadAttributeException, GithubException, UnknownObjectException, BadUserAgentException
import warnings
import datetime

import multiprocessing
#from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool, freeze_support

import concurrent.futures

warnings.simplefilter(action='ignore', category=FutureWarning)

In [21]:
#os.environ['db_user'] = ''
#os.environ['db_pwd'] = ''

# connect to the database, download data, limit to repos with at least 20,000 commits?
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE (commits BETWEEN '800' AND '900')'''
#raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE commits < 1000'''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)
print(raw_slug_data.head())
print(raw_slug_data.shape)
print(raw_slug_data.isna().sum())

#get rid of leading and ending space, save slugs to a list
raw_slugs = raw_slug_data["slug"].tolist()
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())  
print(len(slugs))
print(slugs[0], slugs[len(slugs)-1])

#PATs access token, saved as a dataframe
github_pats = '''SELECT * FROM gh_2007_2020.pats_update'''
github_pats = pd.read_sql_query(github_pats, con=connection)

#PATs access token, saved as a list
access_tokens = github_pats["token"]

#number of tokens available for use, a numeric value
num_token = '''SELECT COUNT(*) FROM gh_2007_2020.pats_update'''
num_token = pd.read_sql_query(num_token, con=connection)
num_token=num_token.iloc[0]['count']

connection.close()

                                 id        spdx  \
0  MDEwOlJlcG9zaXRvcnkyMTMyODQzNTc=     GPL-3.0   
1  MDEwOlJlcG9zaXRvcnkyODA0MjM4OTc=    AGPL-3.0   
2  MDEwOlJlcG9zaXRvcnkyODYxMzAxOTQ=  Apache-2.0   
3  MDEwOlJlcG9zaXRvcnk1NDYzMjIwNA==    LGPL-3.0   
4  MDEwOlJlcG9zaXRvcnkxODAzNjMxNTU=     GPL-3.0   

                              slug           createdat  \
0                ryuffhant/candyJM 2019-10-07 03:05:14   
1  onlyforuserbot/Personal-userbot 2020-07-17 12:54:10   
2                   awserv/Game-Pl 2020-08-08 22:43:52   
3            DGLABArquivos/roda-in 2016-03-24 10:00:23   
4                   veloren/legacy 2019-04-09 12:35:46   

                                         description primarylanguage  \
0              aplikasi CBT modifikasi dari candyCBT             PHP   
1                                               None          Python   
2                                               None      JavaScript   
3  RODA-in is a tool specially designed for produ...    

In [22]:
# index ranges from 0 to maximum number of PATs available
def get_access_token(github_pat_index):
    if github_pat_index < num_token:
       # print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
        return github_pats.token[github_pat_index]
    else:
        print("token exceed limit")

## Function to scrape one slug using specified pat index

In [23]:
#global github_pat_index
def pull_repo_stats(slug, github_pat_index):
    if github_pat_index >= len(access_tokens):
        github_pat_index -= len(access_tokens)
        print(f"***Pat access token exceed limit, restart access token loop with #", github_pat_index)  

    access_token = get_access_token(github_pat_index)
    #print("Scrapping --", slug,". Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
    #if false, retry until true, max number of retry is set to 3 times
    #g = Github(access_token, retry = 3, timeout = 15)
    g = Github(access_token)
    repo = g.get_repo(slug)
    stars = repo.stargazers_count
    watchers = repo.subscribers_count
    forks = repo.forks_count
    topics = repo.get_topics()

    results = (slug, stars, watchers, forks, topics)
    return results

In [24]:
def get_result(results):
    global slug_log
    global stars_log
    global watchers_log
    global forks_log
    global topics_log
    if results is None:
        slug_log.append(None)
        stars_log.append(None)
        watchers_log.append(None)
        forks_log.append(None)
        topics_log.append(None)  
    else:
        slug_log.append(results[0])
        stars_log.append(results[1])
        watchers_log.append(results[2])
        forks_log.append(results[3])
        topics_log.append(results[4])
       # print("appending results")
    
def get_stars(stars):
    global stars_log
    stars_log.append(stars)
    #print("appending results")

In [25]:
# Example
slugs_example = ["moderndive/ModernDive_book", "DSPG-Young-Scholars-Program/dspg21oss", 
                 "unknownrepo", "moderndive/ModernDive_book", "lsst/ip_diffim", "esrlabs/chipmunk", "paulmillr/chokidar"]

i = 0
print(slugs_example[i])
result = pull_repo_stats(slugs_example[i], 0)
print(result[0])

slug_log = []
stars_log = []
watchers_log = []
forks_log = []
topics_log = []

get_result(result)
print(slug_log)
#print(result[0])
#print(result[1] ==None)

moderndive/ModernDive_book
moderndive/ModernDive_book
['moderndive/ModernDive_book']


In [26]:
slugs_try = slugs[0:999]
#print(slugs_try)
print(len(slugs_try))
print(slugs_try[0], slugs_try[998])

999
ryuffhant/candyJM hbang/TypeStatus


## Multiprocessing approach: pool.apply_async for multiple parameters

In [13]:
# Approach 2. pool.apply_async for multiple parameters
github_pat_index = None
if __name__ == '__main__':
    pool = multiprocessing.Pool(10) # using 10 cores for scrapping, leave 6 cores to run other stuff
    
    slug_log = []
    stars_log = []
    watchers_log = []
    forks_log = []
    topics_log = []

    start_time = datetime.datetime.now()
    print("Start scraping. Start time:", start_time)
    #github_pat_index = 0
    #print("1. outer loop pat index", github_pat_index)
    
    for slug in slugs_try:
        github_pat_index = 0
        if github_pat_index >= len(access_tokens):
            github_pat_index -= len(access_tokens)
            print("***Pat access token exceed limit, restart access token loop with #", github_pat_index)
        while True:
            try:
                #github_pat_index = 3
               # print("2. innner loop pat index", github_pat_index)      
                pool.apply_async(pull_repo_stats, args=(slug,  github_pat_index),callback = get_result)        
                print("~~~~~~~~~~~~~~~slug: ",slug, "using access token #", github_pat_index, "~~~~~~~~~~~~~~~~")
                print(slug_log)
                break 
            except RateLimitExceededException as e:
                print(e.status)
                print('WARNING: Rate limit exceeded --', slug, ", using access token #", github_pat_index)
                #time.sleep(300)
                github_pat_index+=1
                print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
                break
            except BadCredentialsException as e:
                print(e.status)
                print('WARNING: Bad credentials exception --', slug, ", using access token #", github_pat_index)
                github_pat_index+=1
                print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
                break
            except UnknownObjectException as e:
                print(e.status)
                print('WARNING: Unknown object exception --', slug)
                break
            except GithubException as e:
                print(e.status)
                print('General exception --', slug)
                break
            except r.exceptions.ConnectionError as e:
                print('Retries limit exceeded --', slug)
                print(str(e))
                time.sleep(10)
                continue
            except r.exceptions.Timeout as e:
                print('Time out exception --', slug)
                print(str(e))
                time.sleep(10)
                continue
    pool.close()
    pool.join()
    
    print(stars_log)
    final_log = pd.DataFrame({'slug': slug_log, "stars": stars_log, 'watchers': watchers_log, 'forks': forks_log, 'topics': topics_log}, columns=["slug", "stars", "watchers", "forks", "topics"])

    end_time = datetime.datetime.now()
    print("Finished scraping", len(final_log), "of", len(slugs_try), "records at", end_time)
    print("It took", end_time-start_time, "to run.")

Start scraping. Start time: 2021-07-02 12:38:51.867026
~~~~~~~~~~~~~~~slug:  voxsim/badook using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  O330oei/Linxx using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  deivids84/MA-XML-8.0-CATALAN using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  Hoektronics/BotQueue using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  webhat/oplerno using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  proxyrequired/proxy using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  Nitrokey/libnitrokey using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  kuulasmaa/BALSAMIC-wgsbam using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  Clinical-Genomics/BALSAMIC using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  powerbuoy/sleek using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~~~~~~slug:  bbercovici/SBGAT using access token # 0 ~~~~~~~~~~~~~~~~
[]
~~~~~~~~~~

In [14]:
slug_log 

['powerbuoy/sleek',
 'O330oei/Linxx',
 'voxsim/badook',
 'Hoektronics/BotQueue',
 'deivids84/MA-XML-8.0-CATALAN',
 'Clinical-Genomics/BALSAMIC',
 'bbercovici/SBGAT',
 'Nitrokey/libnitrokey',
 'proxyrequired/proxy',
 'webhat/oplerno',
 'xuedayuer/https-github.com-TheAlgorithms-Python',
 'wbez/curiouscity',
 'legendzhouqiang/BRPC',
 'QUIQQER/order',
 'barry-jones/live-documenter',
 'l0k1/MiG-21bis',
 'indexdata/yazpp',
 'elt11bot/e.l.t-bot',
 'CredentialEngine/CredentialRegistry',
 'quattor/pan',
 'doctorxyz/oplrpctest',
 'google/gitiles',
 'IEEECS-VIT/RPL',
 'peterorum/functal',
 'Maxsteam/FF',
 'nrinaudo/kantan.csv',
 'spesmilo/electrum-server',
 'Nesvilab/FragPipe',
 'adib1380/antispam',
 'standard/standard-engine',
 'tylingsoft/dagre-d3-renderer',
 'jeantimex/javascript-problems-and-solutions',
 'victored/geary',
 'scipipe/scipipe',
 'kbingham/screen',
 'M0ses/kanku',
 'saurb/soundwalks.org',
 'pierre-vigier/Perl6-Math-Matrix',
 'xditya/TeleBot',
 'SaifRehman/ICP-Airways',
 'xgecko-u

In [15]:
forks_log

[21,
 0,
 0,
 42,
 3,
 11,
 1,
 28,
 0,
 5,
 0,
 12,
 10,
 0,
 3,
 8,
 1,
 0,
 6,
 18,
 0,
 163,
 2,
 0,
 0,
 27,
 393,
 13,
 0,
 39,
 3,
 70,
 0,
 64,
 0,
 4,
 0,
 5,
 453,
 22,
 0,
 99,
 3,
 0,
 0,
 12,
 1,
 0,
 0,
 0,
 37,
 91,
 4,
 0,
 0,
 20,
 5,
 50,
 12,
 4,
 0,
 1,
 477,
 0,
 0,
 2,
 0,
 19,
 36,
 5,
 3,
 28,
 6,
 4,
 0,
 6,
 0,
 0,
 1,
 27,
 3,
 2,
 0,
 91,
 7,
 0,
 26,
 1,
 0,
 12,
 0,
 22,
 32,
 91,
 21,
 0,
 4,
 0,
 12,
 10,
 0,
 14,
 7,
 0,
 25,
 142,
 0,
 0,
 0,
 0,
 5,
 48,
 0,
 2,
 0,
 0,
 16,
 0,
 101,
 104,
 102,
 1241,
 0,
 47,
 39,
 0,
 3,
 0,
 324,
 23,
 6,
 0,
 1,
 105,
 7,
 0,
 2907,
 45,
 0,
 0,
 0,
 0,
 101,
 6,
 0,
 4,
 22,
 0,
 0,
 23,
 15,
 31,
 1,
 0,
 3,
 48,
 0,
 0,
 6,
 110,
 6,
 1,
 0,
 327,
 0,
 2,
 6,
 6,
 11,
 23622,
 0,
 0,
 0,
 0,
 12,
 7,
 26,
 1,
 229,
 0,
 0,
 2,
 22,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 1,
 0,
 67,
 0,
 0,
 0,
 104,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 55,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 37,
 386,
 235,
 5,

In [16]:
topics_log

[['wordpress',
  'wordpress-theme',
  'composer',
  'npm',
  'webpack',
  'vue',
  'acf',
  'starter-theme'],
 [],
 [],
 [],
 [],
 ['genomics',
  'somatic-mutations',
  'snakemake-workflows',
  'variant-calling',
  'bioinformatics'],
 ['shape-models', 'gravity-model', 'uncertainty-quantification'],
 ['nitrokey',
  'nitrokey-stick-devices',
  'python',
  'hotp',
  'security',
  'password-vault',
  'otp',
  'c-plus-plus',
  'library',
  'encrypted-store',
  'cross-platform'],
 [],
 [],
 [],
 [],
 [],
 ['quiqqer', 'ecommerce', 'order', 'order-management', 'erp'],
 ['documentation-generator',
  'netcore',
  'net',
  'csharp',
  'documentation',
  'documentation-tool',
  'xml-comments',
  'livedocumenter'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['csv'],
 [],
 ['mass-spectrometry', 'proteomics', 'search-engine', 'pipeline', 'gui'],
 [],
 ['nodejs', 'javascript', 'eslint', 'style', 'standard', 'standard-engine'],
 ['dagre', 'dagre-d3'],
 ['javascript', 'algorithms', 'problem-solv

In [17]:
#print(final_log)
final_log

Unnamed: 0,slug,stars,watchers,forks,topics
0,powerbuoy/sleek,31,4,21,"[wordpress, wordpress-theme, composer, npm, we..."
1,O330oei/Linxx,0,1,0,[]
2,voxsim/badook,1,0,0,[]
3,Hoektronics/BotQueue,159,31,42,[]
4,deivids84/MA-XML-8.0-CATALAN,7,5,3,[]
...,...,...,...,...,...
519,ultragalactic/json-framework,1,1,0,[]
520,taw91/Assignment-2,0,1,0,[]
521,mergermarket/tg-eventstore,0,8,0,[]
522,nive/nive,7,4,2,[]


In [31]:
# read in the file and check
print(final_log.head())
print(final_log)
print(final_log.isna().sum())
print(final_log.shape)

                        slug  stars  watchers  forks topics
0              O330oei/Linxx      0         1      0     []
1              telenub/tgbot      0         1      0     []
2          lazykillu/userbot      0         1      0     []
3          Declan57/botkulah      0         1      0     []
4  sukhpreet1427/darkxusrbot      0         1      0     []
                               slug  stars  watchers  forks topics
0                     O330oei/Linxx      0         1      0     []
1                     telenub/tgbot      0         1      0     []
2                 lazykillu/userbot      0         1      0     []
3                 Declan57/botkulah      0         1      0     []
4         sukhpreet1427/darkxusrbot      0         1      0     []
..                              ...    ...       ...    ...    ...
94           kazuyouaoki/munish-dev      0         1      0     []
95                    YueLinHo/serf      0         0      0     []
96  FruitCrushSaga/Fruit-Crush-Saga  