# scrap repo stats using PyGitHub

In [None]:
%reset

In [1]:
import os
os.environ['db_user']='zz3hs'
os.environ['db_pwd']='zz3hs'

In [2]:
# load packages 
import os
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from github import Github, RateLimitExceededException, BadCredentialsException, BadAttributeException, GithubException, UnknownObjectException, BadUserAgentException
import warnings
import time
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# connect to the database, download data, limit to top 5000 repos that have the highest number of commits
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked WHERE (commits BETWEEN '11000' AND '20000')'''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)
print(raw_slug_data.head())
print(raw_slug_data.shape)
print(raw_slug_data.isna().sum())

raw_slugs = raw_slug_data["slug"].tolist()
#slugs = [raw_slugs.strip(' ') for raw_slugs in slugs]
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())
    
print(len(slugs))
#print(slugs)

                                 id        spdx  \
0  MDEwOlJlcG9zaXRvcnkxODA1NjY0NzE=  Apache-2.0   
1  MDEwOlJlcG9zaXRvcnkxMzQyMzY5NTk=  Apache-2.0   
2  MDEwOlJlcG9zaXRvcnk1OTc5OTQxOQ==  Apache-2.0   
3  MDEwOlJlcG9zaXRvcnkzMDkxNTc1MDI=  Apache-2.0   
4  MDEwOlJlcG9zaXRvcnkxOTYwNTU3Nw==  Apache-2.0   

                            slug           createdat  \
0  kderme/spark-memory-simulator 2019-04-10 11:22:43   
1     zhangyineng/pentaho_kettle 2018-05-21 07:55:12   
2          zorba-processor/zorba 2016-05-27 02:56:03   
3                  jsoniq/jsoniq 2020-11-01 18:08:05   
4                   28msec/zorba 2014-05-09 09:13:04   

                       description primarylanguage  \
0                             None           Scala   
1                             None            Java   
2      Zorba - the NoSQL processor             C++   
3  JSONiq: The JSON Query Language             C++   
4  JSONiq & XQuery Query Processor             C++   

                               

In [18]:
#PATs access token, saved as a dataframe
github_pats = '''SELECT * FROM gh_2007_2020.pats'''
github_pats = pd.read_sql_query(github_pats, con=connection)

#PATs access token, saved as a list
access_tokens = github_pats["token"]

#number of tokens available for use, a numeric value
num_token = '''SELECT COUNT(*) FROM gh_2007_2020.pats'''
num_token = pd.read_sql_query(num_token, con=connection)
num_token=num_token.iloc[0]['count']


In [6]:
# index from 0 to max number of PAT available
def get_access_token(github_pat_index):
    if github_pat_index < num_token:
       # print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
        return github_pats.token[github_pat_index]
    else:
        print("token exceed limit")

In [9]:
#try to get a token
get_access_token(0)

'51f7b11ebd070ebb473d65a5a8f90cdf1c71a3a4'

In [12]:
#try to get a exceed limit
get_access_token(40)

token exceed limit


In [24]:
#practice slugs
#slugs = ["moderndive/ModernDive_book", "crystalzang/mpred", "rrrrrrrrr"]

In [87]:
def pull_repo_stats(github_pat_index):
    df_repo_stats = pd.DataFrame()
    access_token = get_access_token(github_pat_index)
    print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
    for slug in slugs:
        while True:
            try:
                #if false, retry until true, max number of retry is 20 times
                g = Github(access_token, retry = 20, timeout = 15)
                repo = g.get_repo(slug)
                df_repo_stats = df_repo_stats.append({
                    "slug": slug,
                    'stars': repo.stargazers_count,
                    'watchers': repo.subscribers_count,
                    'forks': repo.forks_count,
                    'topics': repo.get_topics()
                }, ignore_index = True)
            except RateLimitExceededException as e:
                print(e.status)
                print('Rate limit exceeded --', slug)
                #time.sleep(300)
                g = Github(get_access_token(github_pat_index+1))
                print("Proceed with the next token #",github_pat_index+2)
                repo = g.get_repo(slug)
                df_repo_stats = df_repo_stats.append({
                    "slug": slug,
                    'stars': repo.stargazers_count,
                    'watchers': repo.subscribers_count,
                    'forks': repo.forks_count,
                    'topics': repo.get_topics()
                }, ignore_index = True)
                continue
            except BadCredentialsException as e:
                print(e.status)
                print('Bad credentials exception --', slug)
                break
            except UnknownObjectException as e:
                print(e.status)
                print('Unknown object exception --', slug)
                break
            except GithubException as e:
                print(e.status)
                print('General exception --', slug)
                break
            except r.exceptions.ConnectionError as e:
                print('Retries limit exceeded --', slug)
                print(str(e))
                time.sleep(10)
                continue
            except r.exceptions.Timeout as e:
                print('Time out exception --', slug)
                print(str(e))
                time.sleep(10)
                continue
            break
        df_repo_stats.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/repo_stats_bt_11000_20000.csv', index = False)        

In [None]:
def pull_repo_stats():
    df_repo_stats = pd.DataFrame()
    for  in access_tokens:
        access_token = get_access_token(github_pat_index)
        print("Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
    for slug in slugs:
        while True:
            try:
                #if false, retry until true, max number of retry is 20 times
                g = Github(access_token, retry = 20, timeout = 15)
                repo = g.get_repo(slug)
                df_repo_stats = df_repo_stats.append({
                    "slug": slug,
                    'stars': repo.stargazers_count,
                    'watchers': repo.subscribers_count,
                    'forks': repo.forks_count,
                    'topics': repo.get_topics()
                }, ignore_index = True)
            except RateLimitExceededException as e:
                print(e.status)
                print('Rate limit exceeded --', slug)
                #time.sleep(300)
                g = Github(get_access_token(github_pat_index+1))
                print("Proceed with the next token #",github_pat_index+2)
                continue
            except BadCredentialsException as e:
                print(e.status)
                print('Bad credentials exception --', slug)
                break
            except UnknownObjectException as e:
                print(e.status)
                print('Unknown object exception --', slug)
                break
            except GithubException as e:
                print(e.status)
                print('General exception --', slug)
                break
            except r.exceptions.ConnectionError as e:
                print('Retries limit exceeded --', slug)
                print(str(e))
                time.sleep(10)
                continue
            except r.exceptions.Timeout as e:
                print('Time out exception --', slug)
                print(str(e))
                time.sleep(10)
                continue
            break
        df_repo_stats.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/repo_stats_bt_11000_20000.csv', index = False)        

In [88]:
pull_repo_stats(0)

Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark-memory-simulator
Proceed with the next token # 2
Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark-memory-simulator
Proceed with the next token # 2
Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark-memory-simulator
Proceed with the next token # 2
Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark-memory-simulator
Proceed with the next token # 2
Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark-memory-simulator
Proceed with the next token # 2
Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark-memory-simulator
Proceed with the next token # 2
Extracting access token # 1 , total 36 tokens are available.
403
Rate limit exceeded -- kderme/spark

KeyboardInterrupt: 

In [85]:
df_repo_stats = pd.read_csv(r'/home/zz3hs/git/dspg21oss/output/repo_stats.csv')

In [86]:
print(df_repo_stats.head())
print(df_repo_stats)

                               slug  stars topics  watchers
0                Rafael3086/spryker    NaN    NaN       NaN
1                    jplyley/PDF.js    NaN    NaN       NaN
2           jcoliver/OSUScratchpads    NaN    NaN       NaN
3                  simonp22/AnkhSVN    NaN    NaN       NaN
4  AlenTuzla/anycubici3megas3dtouch    NaN    NaN       NaN
                                  slug  stars topics  watchers
0                   Rafael3086/spryker    NaN    NaN       NaN
1                       jplyley/PDF.js    NaN    NaN       NaN
2              jcoliver/OSUScratchpads    NaN    NaN       NaN
3                     simonp22/AnkhSVN    NaN    NaN       NaN
4     AlenTuzla/anycubici3megas3dtouch    NaN    NaN       NaN
...                                ...    ...    ...       ...
5459                               NaN    0.0     []       1.0
5460                               NaN    0.0     []       0.0
5461                               NaN    0.0     []       1.0
5462      

In [88]:
df_repo_stats.isna().sum()

slug        2715
stars       2749
topics      2749
watchers    2749
dtype: int64

In [89]:
df_repo_stats.shape

(5464, 4)