# Scrap repo stats using PyGitHub

### Author: Crystal Zang

This notebook utilized GitHub access tokens (PAT) to scrape GitHub repository statistics such as stargazers, watchers, forks, and topics. One PAT would scrape at a rate of 5000 repositories per hour. Utilizing 36 PATs we would scrape 10,288,063 repositories in about XXX hours at a rate of 15,514 repositories per hour.

#### Warnings
You should not commit any access topen to GitHub, which would result in access token being revoked.


In [65]:
import os
os.environ['db_user']='zz3hs'
os.environ['db_pwd']='zz3hs'

In [66]:
# load packages 
import os
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from github import Github, RateLimitExceededException, BadCredentialsException, BadAttributeException, GithubException, UnknownObjectException, BadUserAgentException
import warnings
import datetime
warnings.simplefilter(action='ignore', category=FutureWarning)

In [67]:
# connect to the database, download data, limit to repos with at least 20,000 commits?
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE (commits BETWEEN '3000' AND '5000')'''
#raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE commits > 20000'''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)
print(raw_slug_data.head())
print(raw_slug_data.shape)
print(raw_slug_data.isna().sum())

#get rid of leading and ending space, save slugs to a list
raw_slugs = raw_slug_data["slug"].tolist()
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())
    
print(len(slugs))
#print(slugs)

                                 id        spdx                       slug  \
0  MDEwOlJlcG9zaXRvcnkzMjEzNzQwMg==     GPL-2.0     vlabatut/totalboumboum   
1  MDEwOlJlcG9zaXRvcnkxNDU5NDg5NDE=         MIT  Kingzyh/aspnetboilerplate   
2  MDEwOlJlcG9zaXRvcnkxOTQ5MzgwNjE=         MIT   NoahBeckerman/GetSquared   
3  MDEwOlJlcG9zaXRvcnk5MTA5NDkxNg==  Apache-2.0         jack9603301/thrift   
4  MDEwOlJlcG9zaXRvcnkxMjE5NjY1Mzc=         MIT      rsumner31/awesome-ios   

            createdat                          description primarylanguage  \
0 2015-03-13 07:05:57  An open source Java Bomberman clone            Java   
1 2018-08-24 06:01:38                                 None              C#   
2 2019-07-02 21:35:54                Green Squared Exploit           Shell   
3 2017-05-12 13:36:08                                 None             C++   
4 2018-02-18 16:01:36                                 None            HTML   

                                         branch  commits      

In [68]:
#PATs access token, saved as a dataframe
github_pats = '''SELECT * FROM gh_2007_2020.pats'''
github_pats = pd.read_sql_query(github_pats, con=connection)

#PATs access token, saved as a list
access_tokens = github_pats["token"]

#number of tokens available for use, a numeric value
num_token = '''SELECT COUNT(*) FROM gh_2007_2020.pats'''
num_token = pd.read_sql_query(num_token, con=connection)
num_token=num_token.iloc[0]['count']

In [69]:
# index ranges from 0 to maximum number of PATs available
def get_access_token(github_pat_index):
    if github_pat_index < num_token:
       # print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
        return github_pats.token[github_pat_index]
    else:
        print("token exceed limit")

In [72]:
#practice slugs
#slugs = ["moderndive/ModernDive_book", "DSPG-Young-Scholars-Program/dspg21oss", "DSPG-Young-Scholars-Program/dspg21RnD"]

In [73]:
len(access_tokens)

36

In [74]:
def pull_repo_stats(github_pat_index):
    df_repo_stats = pd.DataFrame()
    for slug in slugs:
        print("scrap:",slug)
        if github_pat_index >= len(access_tokens):
            github_pat_index -= len(access_tokens)
            print("***Pat access token exceed limit, restart access token loop with #", github_pat_index)
        while github_pat_index < len(access_tokens):
            if github_pat_index >= len(access_tokens):
                print("Warning: Pat access token exceed limit.")
                break
            try:
                access_token = get_access_token(github_pat_index)
                print("Scrapping --", slug,". Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
                #if false, retry until true, max number of retry is 20 times
                g = Github(access_token, retry = 20, timeout = 15)
                repo = g.get_repo(slug)
                df_repo_stats = df_repo_stats.append({
                    "slug": slug,
                    'stars': repo.stargazers_count,
                    'watchers': repo.subscribers_count,
                    'forks': repo.forks_count,
                    'topics': repo.get_topics()
                }, ignore_index = True)
            except RateLimitExceededException as e:
                print(e.status)
                print('Rate limit exceeded --', slug)
                print("Current time:", datetime.datetime.now())
                #time.sleep(300)
                github_pat_index+=1
                print("***Exit current access code, proceed with next aceess code.")
                break
            except BadCredentialsException as e:
                print(e.status)
                print('Bad credentials exception --', slug)
                break
            except UnknownObjectException as e:
                print(e.status)
                print('Unknown object exception --', slug)
                print("Current time:", datetime.datetime.now())
                break
            except GithubException as e:
                print(e.status)
                print('General exception --', slug)
                break
            except r.exceptions.ConnectionError as e:
                print('Retries limit exceeded --', slug)
                print(str(e))
                time.sleep(10)
                continue
            except r.exceptions.Timeout as e:
                print('Time out exception --', slug)
                print(str(e))
                time.sleep(10)
                continue
            break
    return df_repo_stats

In [75]:
print("Start time:", datetime.datetime.now())
df_repo_stats = pull_repo_stats(15)
print("End time:", datetime.datetime.now())

Start time: 2021-06-16 21:47:00.010254
scrap: vlabatut/totalboumboum
Scrapping -- vlabatut/totalboumboum . Extracting access token # 16 , total 36 tokens are available.
scrap: Kingzyh/aspnetboilerplate
Scrapping -- Kingzyh/aspnetboilerplate . Extracting access token # 16 , total 36 tokens are available.
scrap: NoahBeckerman/GetSquared
Scrapping -- NoahBeckerman/GetSquared . Extracting access token # 16 , total 36 tokens are available.
scrap: jack9603301/thrift
Scrapping -- jack9603301/thrift . Extracting access token # 16 , total 36 tokens are available.
scrap: rsumner31/awesome-ios
Scrapping -- rsumner31/awesome-ios . Extracting access token # 16 , total 36 tokens are available.
scrap: CurieBSP/zephyr
Scrapping -- CurieBSP/zephyr . Extracting access token # 16 , total 36 tokens are available.
scrap: obiba/onyx
Scrapping -- obiba/onyx . Extracting access token # 16 , total 36 tokens are available.
scrap: tizenorg/framework.connectivity.connman
Scrapping -- tizenorg/framework.connectivi

In [76]:
df_repo_stats.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/repo_stats_bt_3000_5000.csv', index = False)   

In [77]:
# read in the file and check
print(df_repo_stats.head())
print(df_repo_stats)
print(df_repo_stats.isna().sum())
print(df_repo_stats.shape)

   forks                       slug  stars topics  watchers
0    0.0     vlabatut/totalboumboum    4.0     []       2.0
1    0.0  Kingzyh/aspnetboilerplate    0.0     []       1.0
2    2.0   NoahBeckerman/GetSquared    1.0     []       0.0
3    0.0         jack9603301/thrift    0.0     []       1.0
4    0.0      rsumner31/awesome-ios    1.0     []       1.0
      forks                       slug  stars topics  watchers
0       0.0     vlabatut/totalboumboum    4.0     []       2.0
1       0.0  Kingzyh/aspnetboilerplate    0.0     []       1.0
2       2.0   NoahBeckerman/GetSquared    1.0     []       0.0
3       0.0         jack9603301/thrift    0.0     []       1.0
4       0.0      rsumner31/awesome-ios    1.0     []       1.0
...     ...                        ...    ...    ...       ...
9891    7.0                KDE/tellico   18.0     []       4.0
9892    0.0              koo5/manaplus    1.0     []       2.0
9893    1.0           Mabenati/Auspice    1.0     []       1.0
9894    0.