# Scrap repo stats using PyGitHub

### Author: Crystal Zang

This notebook utilized GitHub access tokens (PAT) to scrape GitHub repository statistics such as stargazers, watchers, forks, and topics. One PAT would scrape at a rate of 5000 repositories per hour. Utilizing 36 PATs we would scrape 10,288,063 repositories in about XXX hours at a rate of 15,514 repositories per hour.

#### Warnings
You should not commit any access topen to GitHub, which would result in access token being revoked.


In [19]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [1]:
# load packages 
import os
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from github import Github, RateLimitExceededException, BadCredentialsException, BadAttributeException, GithubException, UnknownObjectException, BadUserAgentException
import warnings
import datetime

import multiprocessing
#from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool, freeze_support

import concurrent.futures

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
#os.environ['db_user'] = ''
#os.environ['db_pwd'] = ''

# connect to the database, download data, limit to repos with at least 20,000 commits?
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE (commits BETWEEN '700' AND '800')'''
#raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE commits < 1000'''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)

connection.close()


print(raw_slug_data.head())
print(raw_slug_data.shape)
print(raw_slug_data.isna().sum())

In [3]:
raw_slug_data = pd.read_csv('/home/zz3hs/git/dspg21oss/data/dspg21oss/crystal_to_scrape_0706.csv') #import csv
raw_slug_data

Unnamed: 0,slug
0,zvini/website
1,tiagoanatar/ninjagame
2,Liujingfang1/kprune
3,exing1984/moodlemoot
4,charterresources/moodle3.2
...,...
88774,tempio-della-grande-madre/grande-madre
88775,dirkdeyne/zk-spring-boot-starter
88776,spider-gazelle/bindata
88777,jenkinsci/osf-builder-suite-for-sfcc-deploy-pl...


In [4]:
#get rid of leading and ending space, save slugs to a list
raw_slugs = raw_slug_data["slug"].tolist()
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())  
print(len(slugs))
print(slugs[0], slugs[len(slugs)-1])

88779
zvini/website ptdave20/helpdesk


In [5]:
#os.environ['db_user'] = ''
#os.environ['db_pwd'] = ''

# connect to the database, download data, limit to repos with at least 20,000 commits?
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

#PATs access token, saved as a dataframe
github_pats = '''SELECT * FROM gh_2007_2020.pats_update'''
github_pats = pd.read_sql_query(github_pats, con=connection)

#PATs access token, saved as a list
access_tokens = github_pats["token"]

#number of tokens available for use, a numeric value
num_token = '''SELECT COUNT(*) FROM gh_2007_2020.pats_update'''
num_token = pd.read_sql_query(num_token, con=connection)
num_token=num_token.iloc[0]['count']

connection.close()

In [7]:
slugs_try = slugs[0:100]
print(slugs_try[0], slugs_try[99])

eevvnnxx/catuserbot YueLinHo/serf


In [6]:
# index ranges from 0 to maximum number of PATs available
def get_access_token(github_pat_index):
    if github_pat_index < num_token:
       # print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
        return github_pats.token[github_pat_index]
    else:
        print("token exceed limit")

In [7]:
#practice on 7 slugs, one invalid slug 
slugs_example = ["moderndive/ModernDive_book", "DSPG-Young-Scholars-Program/dspg21oss", 
                 "unknownrepo", "moderndive/ModernDive_book", "lsst/ip_diffim", "esrlabs/chipmunk", "paulmillr/chokidar"]

In [8]:
len(access_tokens)

32

# Not using any multiprocessing

In [9]:
def pull_repo_stats(github_pat_index, slugs):
    df_repo_stats = pd.DataFrame()
    for slug in slugs:
        if github_pat_index >= len(access_tokens):
            github_pat_index -= len(access_tokens)
            print("***Pat access token exceed limit, restart access token loop with #", github_pat_index)
        while github_pat_index < len(access_tokens):
            try:
                access_token = get_access_token(github_pat_index)
                #print("Scrapping --", slug,". Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
                #if false, retry until true, max number of retry is 20 times
                g = Github(access_token, retry = 20, timeout = 15)
                repo = g.get_repo(slug)
                df_repo_stats = df_repo_stats.append({
                    "slug": slug,
                    'stars': repo.stargazers_count,
                    'watchers': repo.subscribers_count,
                    'forks': repo.forks_count,
                    'topics': repo.get_topics()
                }, ignore_index = True)
            except RateLimitExceededException as e:
                print(e.status)
                print('Rate limit exceeded --', slug, ", using access token #", github_pat_index)
                print("Current time:", datetime.datetime.now())
                #time.sleep(300)
                github_pat_index+=1
                print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
                break
            except BadCredentialsException as e:
                print(e.status)
                print('Bad credentials exception --', slug, ", using access token #", github_pat_index)
                print("Current time:", datetime.datetime.now())
                github_pat_index+=1
                print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
                break
            except UnknownObjectException as e:
                print(e.status)
                print('Unknown object exception --', slug)
                break
            except GithubException as e:
                print(e.status)
                print('General exception --', slug)
                break
            except r.exceptions.ConnectionError as e:
                print('Retries limit exceeded --', slug)
                print(str(e))
                time.sleep(10)
                continue
            except r.exceptions.Timeout as e:
                print('Time out exception --', slug)
                print(str(e))
                time.sleep(10)
                continue
            break
    return df_repo_stats



In [None]:

slugs1 = slugs[0:20000]

start_time = datetime.datetime.now()
print("Start scraping:", start_time)
df_repo_stats = pull_repo_stats(0, slugs1)
end_time =  datetime.datetime.now()
print("Finished scraping", len(df_repo_stats), "of", len(slugs1), "records at", end_time)
print("It took", end_time-start_time, "to run.")

Start scraping: 2021-07-06 20:41:30.603796
404
Unknown object exception -- zvini/website
404
Unknown object exception -- tiagoanatar/ninjagame
404
Unknown object exception -- Liujingfang1/kprune
404
Unknown object exception -- exing1984/moodlemoot
404
Unknown object exception -- charterresources/moodle3.2
404
Unknown object exception -- CornwallCollege/Master-Moodle
404
Unknown object exception -- ltu-solent/moodle32test
404
Unknown object exception -- sarahjcotton/gitflowtesting
404
Unknown object exception -- Mets3D/blender
404
Unknown object exception -- josenorberto/moodle-docker-alpine
404
Unknown object exception -- gregorycv/moodle_quiz_extended
404
Unknown object exception -- kk37111754/homebrew-python3.6
404
Unknown object exception -- tjanas94/emacs
404
Unknown object exception -- saidganim/llvm_clone
404
Unknown object exception -- firemax13/android_kernel_sm6150_unified
404
Unknown object exception -- firemax13/a80kernel
404
Unknown object exception -- tonlib/TON-Compiler
4

In [13]:
# read in the file and check
df_repo_stats.head()
#print(df_repo_stats)
#print(df_repo_stats.isna().sum())
#print(df_repo_stats.shape)


Unnamed: 0,forks,slug,stars,topics,watchers
0,0.0,eevvnnxx/catuserbot,0.0,[],1.0
1,0.0,voxsim/badook,1.0,[],0.0
2,0.0,ProofPilot/PP-Frontend,0.0,[],6.0
3,2.0,TransComics/TransBubbles,1.0,[],4.0
4,88.0,Astrotomic/laravel-translatable,712.0,"[laravel, translation, eloquent, database, lan...",14.0


In [14]:
#df_repo_stats.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/repo_stats_bt_800_900.csv', index = False)   


# Function to scrape one slug using specified pat index

In [35]:
#global github_pat_index
def pull_repo_stats(slug, github_pat_index):
    #github_pat_index = 1
   # if github_pat_index >= len(access_tokens):
   #     github_pat_index -= len(access_tokens)
   #     print("***Pat access token exceed limit, restart access token loop with #", github_pat_index)
    while True:
        try:
            if github_pat_index >= len(access_tokens):
                github_pat_index -= len(access_tokens)
                print(f"***Pat access token exceed limit, restart access token loop with #", github_pat_index)  
                
            access_token = get_access_token(github_pat_index)
            #print("Scrapping --", slug,". Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
            #if false, retry until true, max number of retry is set to 3 times
            g = Github(access_token, retry = 3, timeout = 15)
            repo = g.get_repo(slug)
            stars = repo.stargazers_count
            watchers = repo.subscribers_count
            forks = repo.forks_count
            topics = repo.get_topics()
        except RateLimitExceededException as e:
          #  print(e.status)
          #  print('WARNING: Rate limit exceeded --', slug, ", using access token #", github_pat_index)
            #time.sleep(300)
            github_pat_index+=1
          #  print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
            break
        except BadCredentialsException as e:
           # print(e.status)
           # print('WARNING: Bad credentials exception --', slug, ", using access token #", github_pat_index)
            github_pat_index+=1
           # print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",slug)
            break
        except UnknownObjectException as e:
           # print(e.status)
           # print('WARNING: Unknown object exception --', slug)
            stars = None
            watchers = None
            forks =  None
            topics = None
            return slug, stars, watchers, forks, topics
            break
        except GithubException as e:
           # print(e.status)
           # print('General exception --', slug)
            break
        except r.exceptions.ConnectionError as e:
           # print('Retries limit exceeded --', slug)
           # print(str(e))
            time.sleep(10)
            continue
        except r.exceptions.Timeout as e:
           # print('Time out exception --', slug)
           # print(str(e))
            time.sleep(10)
            continue
            
        results = (slug, stars, watchers, forks, topics)
        #return slug, stars, watchers, forks, topics
        return results

In [36]:
# Example
i = 0
print(slugs_example[i])
result = pull_repo_stats(slugs_example[i], 0)
print(result[3])

#print(result[0])
#print(result[1] ==None)

moderndive/ModernDive_book
295


## Multiprocessing approach: pool.apply_async for multiple parameters

In [37]:
def get_result(results):
    global slug_log
    global stars_log
    global watchers_log
    global forks_log
    global topics_log
    if result is None:
        slug_log.append(None)
        stars_log.append(None)
        watchers_log.append(None)
        forks_log.append(None)
        topics_log.append(None)  
    else:
        slug_log.append(results[0])
        stars_log.append(results[1])
        watchers_log.append(results[2])
        forks_log.append(results[3])
        topics_log.append(results[4])
        #print("appending results")

In [38]:
def get_stars(stars):
    global stars_log
    stars_log.append(stars)
    print("appending results")

In [39]:
stars_log = []
#practice on 2 slugs, one invalid slug 
#slugs_example = ["moderndive/ModernDive_book", "unknownrepo"]

#practice on 7 slugs, one invalid slug 
slugs_example = ["moderndive/ModernDive_book", "DSPG-Young-Scholars-Program/dspg21oss", 
                 "unknownrepo", "moderndive/ModernDive_book", "lsst/ip_diffim", "esrlabs/chipmunk", "paulmillr/chokidar"]


slugs_100 = slugs[0:100]

In [40]:
multiprocessing.cpu_count()

40

In [None]:
# Approach 2. pool.apply_async for multiple parameters
if __name__ == '__main__':
    cores_available = multiprocessing.cpu_count() - 1
    print(f'There are {cores_available} CPUs available.')
    #pool = multiprocessing.Pool(cores_available)
    pool = multiprocessing.Pool(10)
    
    # now we will feed in all of the remaining slugs 
    slug_log = []
    stars_log = []
    watchers_log = []
    forks_log = []
    topics_log = []

    start_time = datetime.datetime.now()
    print("Start scraping. Start time:", start_time)
    
    for slug in slugs_try:        
        pat_i = 3
        pool.apply_async(pull_repo_stats, args=(slug,  pat_i),callback = get_result)        
        print("~~~~~~~~~~~~~~~slug: ",slug, "using access token #", pat_i, "~~~~~~~~~~~~~~~~")
    pool.close()
    pool.join()
    
    final_log = pd.DataFrame({'slug': slug_log, "stars": stars_log, 'watchers': watchers_log, 'forks': forks_log, 'topics': topics_log}, columns=["slug", "stars", "watchers", "forks", "topics"])

    end_time = datetime.datetime.now()
    print("Finished scraping", len(final_log), "of", len(slugs_try), "records at", end_time)
    print("It took", end_time-start_time, "to run.")

There are 39 CPUs available.
Start scraping. Start time: 2021-06-28 11:02:33.855801
~~~~~~~~~~~~~~~slug:  voxsim/badook using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  Denver-userbot/D-USERBOT using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  thoniraq/IQTHON using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  Baqrted/telethoncat using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  Denver-userbot/rushi using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  MrASquare/verge-wallet using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  ItzSjDude/Plus using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  Velocies/raptor-ads using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  nikolai5slo/jamvm using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  bhaii126272/userbot using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~slug:  nobody-repo/nObOdY-lOvEsYOU using access token # 3 ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~

Exception in thread Thread-15:
Traceback (most recent call last):
  File "/home/zz3hs/.conda/envs/crystal/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/zz3hs/.conda/envs/crystal/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zz3hs/.conda/envs/crystal/lib/python3.7/multiprocessing/pool.py", line 486, in _handle_results
    cache[job]._set(i, obj)
  File "/home/zz3hs/.conda/envs/crystal/lib/python3.7/multiprocessing/pool.py", line 662, in _set
    self._callback(self._value)
  File "<ipython-input-37-ca40cdba5350>", line 14, in get_result
    slug_log.append(results[0])
TypeError: 'NoneType' object is not subscriptable



In [115]:
#print(final_log)
final_log

Unnamed: 0,slug,stars,watchers,forks,topics
0,unknownrepo,,,,
1,moderndive/ModernDive_book,548.0,28.0,295.0,"[moderndive, data-science, tidyverse, statisti..."
2,moderndive/ModernDive_book,548.0,28.0,295.0,"[moderndive, data-science, tidyverse, statisti..."
3,DSPG-Young-Scholars-Program/dspg21oss,0.0,3.0,0.0,[]
4,lsst/ip_diffim,5.0,45.0,7.0,[]
5,esrlabs/chipmunk,285.0,11.0,23.0,"[logs-analysis, logviewer, search, logstash, l..."
6,paulmillr/chokidar,7990.0,85.0,511.0,"[watch-files, fsevents, watcher, filesystem]"


## Multiprocessing approach: pool.imap_unordered

In [37]:
cores_available = multiprocessing.cpu_count() - 1
print(f'There are {cores_available} cores available.')
pool = Pool(cores_available)

# now we will feed in all of the remaining slugs 
slug_log = []
stars_log = []
watchers_log = []
forks_log = []
topics_log = []

start_time = datetime.datetime.now()
print("Start scraping. Start time:", start_time)
for result in pool.imap_unordered(pull_repo_stats, slugs_example):
    print("~~~~~~~~~~~~~~~result: ",result, "~~~~~~~~~~~~~~~~")
    if result is None:
        slug_log.append(None)
        stars_log.append(None)
        watchers_log.append(None)
        forks_log.append(None)
        topics_log.append(None)  
    else:
        slug_log.append(result[0])
        stars_log.append(result[1])
        watchers_log.append(result[2])
        forks_log.append(result[3])
        topics_log.append(result[4])


final_log = pd.DataFrame({'slug': slug_log, "stars": stars_log, 'watchers': watchers_log, 'forks': forks_log, 'topics': topics_log}, columns=["slug", "stars", "watchers", "forks", "topics"])

end_time = datetime.datetime.now()
print("Finished scraping", len(final_log), "of", len(slugs), "records at", end_time)
print("It took", end_time-start_time, "to run.")


There are 39 cores available.
404
Start scraping. Start time: 2021-06-23 11:22:11.104201
~~~~~~~~~~~~~~~result:  ('rrrrrrrrr', None, None, None, None) ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~result:  ('DSPG-Young-Scholars-Program/dspg21oss', 0, 3, 0, []) ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~result:  ('moderndive/ModernDive_book', 548, 28, 295, ['moderndive', 'data-science', 'tidyverse', 'statistical-inference', 'r', 'ggplot2', 'infer', 'hypothesis-testing', 'confidence-intervals', 'regression', 'regression-models', 'data-visualization', 'data-wrangling', 'tidy', 'rstudio', 'rstats', 'dplyr', 'bootstrap-method', 'permutation-test']) ~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~result:  ('moderndive/ModernDive_book', 548, 28, 295, ['moderndive', 'data-science', 'tidyverse', 'statistical-inference', 'r', 'ggplot2', 'infer', 'hypothesis-testing', 'confidence-intervals', 'regression', 'regression-models', 'data-visualization', 'data-wrangling', 'tidy', 'rstudio', 'rstats', 'dplyr', 'bootstrap-method', 'permutation-t

In [38]:
final_log

Unnamed: 0,slug,stars,watchers,forks,topics
0,rrrrrrrrr,,,,
1,DSPG-Young-Scholars-Program/dspg21oss,0.0,3.0,0.0,[]
2,moderndive/ModernDive_book,548.0,28.0,295.0,"[moderndive, data-science, tidyverse, statisti..."
3,moderndive/ModernDive_book,548.0,28.0,295.0,"[moderndive, data-science, tidyverse, statisti..."


In [309]:
#final_log.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/final_log_try.csv', index = False)   

In [317]:
# read in the file and check
print(final_log.head())
print(final_log)
print(final_log.isna().sum())
print(final_log.shape)

                                    slug  stars  watchers  forks  \
0                              rrrrrrrrr    NaN       NaN    NaN   
1  DSPG-Young-Scholars-Program/dspg21oss    0.0       3.0    0.0   
2             moderndive/ModernDive_book  548.0      28.0  295.0   
3             moderndive/ModernDive_book  548.0      28.0  295.0   

                                              topics  
0                                               None  
1                                                 []  
2  [moderndive, data-science, tidyverse, statisti...  
3  [moderndive, data-science, tidyverse, statisti...  
                                    slug  stars  watchers  forks  \
0                              rrrrrrrrr    NaN       NaN    NaN   
1  DSPG-Young-Scholars-Program/dspg21oss    0.0       3.0    0.0   
2             moderndive/ModernDive_book  548.0      28.0  295.0   
3             moderndive/ModernDive_book  548.0      28.0  295.0   

                                           