## Set up the dependencies and configurations

### Assumption: you have downloaded stackql and it is on your system path
Follow [these instructions to install it](https://stackql.io/downloads).

### Append the project root to the system path so we can import other modules

In [1]:
import sys 
import settings
# Resolve the project root through the local `settings` module and make it
# importable. The membership check keeps this cell idempotent: re-running it
# no longer appends the same directory to sys.path a second time.
project_root = str(settings.get_project_root())
print(project_root)
if project_root not in sys.path:
    sys.path.append(project_root)

/Users/yunchengyang/Projects/Storyscore/storyscore-data


### Set up stackql

In [2]:
from pystackql import StackQL
import json 
import os
import pandas as pd

In [3]:
# The GitHub credential is read from the GITHUB_CREDS environment variable so
# that no secret is stored in the notebook. Set it before starting the kernel;
# the value is presumably "<username>:<token>" for basic auth — confirm against
# the StackQL GitHub provider documentation.
# NOTE(review): a credential was previously committed in this cell; treat it as
# compromised and revoke/rotate it.
if not os.environ.get('GITHUB_CREDS'):
    raise EnvironmentError(
        'GITHUB_CREDS is not set; export it before running this notebook'
    )
auth = { 
    "github": 
      { "type": "basic", "credentialsenvvar": "GITHUB_CREDS" }
}
stackql_path = project_root + '/.stackql'
iql = StackQL(auth=json.dumps(auth))

To make the script easy to reproduce in any environment, we will use pystackql to pull the GitHub provider from the registry.

In [4]:
## pull github registry
# List the registry, pick the first entry whose provider is 'github', and pull
# exactly the version that entry advertises.
registry_res = iql.execute('REGISTRY LIST;')
registry_list = json.loads(registry_res)
github_registry = [reg for reg in registry_list if reg["provider"] == 'github'][0]
github_version = github_registry['version']

print(github_version)

pull_provider_query = """
REGISTRY PULL github %s;
""" % github_version

print(pull_provider_query)

res = iql.execute(pull_provider_query)

print(res)


v0.3.2

REGISTRY PULL github v0.3.2;

[{"error": "github provider, version 'v0.3.2' successfully installed"}]


## Using Github provider
We will use the [Intel open source DAOS project](https://github.com/daos-stack/daos) as an example.

In [5]:
# GitHub organisation whose repositories are listed by list_repos() below.
org_name = 'stackql'

### Overview
First, let's look at what we can get from the GitHub provider.

In [57]:
def get_dataframe_from_query(query: str) -> pd.DataFrame :
    """Run a StackQL query and return its result as a DataFrame.

    Args:
        query: StackQL statement passed unchanged to ``iql.execute``.

    Returns:
        A DataFrame built from the JSON-decoded response.

    Raises:
        AttributeError: a decoded row contains an ``"error"`` key.
        TypeError: the decoded response is empty; ``args[0]`` is the raw response.
        json.JSONDecodeError: the response is not valid JSON.

    Every exception other than ``TypeError`` gets a diagnostic message (the
    raw response plus the error and response types) prepended to its ``args``
    before it is re-raised.
    """
    res = iql.execute(query)
    try:
        res_obj = json.loads(res)
        # Inspect the decoded rows, not the raw text: a substring test on the
        # raw response misfires whenever ordinary data contains the word
        # "error" (for example a review body saying "No errors found").
        rows = res_obj if isinstance(res_obj, list) else [res_obj]
        if any(isinstance(row, dict) and "error" in row for row in rows):
            raise AttributeError()
        if not res_obj:
            # Empty result: callers rely on TypeError to mean "no rows".
            raise TypeError(res)
        return pd.DataFrame.from_dict(res_obj)
    except Exception as error:
        if not isinstance(error, TypeError):
            error.args = ('StackQL execute error with error: %s, res: %s, error type: %s, res type: %s' %(error, res, type(error), type(res)), *error.args)
        raise


In [7]:
### look at repos, commits and comments
def show_resource():
    """Print the resources available in the ``github.repos`` service."""
    resources = get_dataframe_from_query('show resources in github.repos')

    print(resources)

show_resource()


                                              id  \
0               github.repos.access_restrictions   
1           github.repos.admin_branch_protection   
2                        github.repos.app_access   
3                         github.repos.autolinks   
4                 github.repos.branch_protection   
5                          github.repos.branches   
6                     github.repos.clone_traffic   
7    github.repos.collaborator_permission_levels   
8                     github.repos.collaborators   
9                 github.repos.combined_statuses   
10                         github.repos.comments   
11                  github.repos.commit_branches   
12             github.repos.commit_pull_requests   
13                          github.repos.commits   
14                        github.repos.community   
15                  github.repos.content_traffic   
16                         github.repos.contents   
17                     github.repos.contributors   
18          

In [8]:
def describe_commit():
    """Print the DESCRIBE output for ``github.repos.commits``."""
    commit_fields = get_dataframe_from_query('DESCRIBE github.repos.commits;')

    print(commit_fields)

def describe_repos():
    """Print the DESCRIBE output for ``github.repos.repos``."""
    repo_fields = get_dataframe_from_query('DESCRIBE github.repos.repos;')

    print(repo_fields)

describe_repos()

describe_commit()


                name     type
0                 id  integer
1               name   string
2        description   string
3          forks_url   string
4            node_id   string
..               ...      ...
80  issue_events_url   string
81    milestones_url   string
82          keys_url   string
83              fork  boolean
84    default_branch   string

[85 rows x 2 columns]
            name    type
0          files   array
1         commit  object
2        node_id  string
3         author  object
4      committer  object
5            url  string
6          stats  object
7       html_url  string
8            sha  string
9   comments_url  string
10       parents   array


### explore commit activity 

In [9]:
# Repository and owner used as the running example by the cells that follow.
test_repo = 'daos'
test_owner = 'daos-stack'

In [10]:
def list_repos():
    """Print the id and name of each repository in the ``org_name`` organisation."""
    repos_sql = """
    select id, name
    from github.repos.repos
    where org = '%s';
    """ % org_name

    print(get_dataframe_from_query(repos_sql))
list_repos()

           id                         name
0   409393414       fullstackchronicles.io
1   424079013    docusaurus-plugin-hubspot
2   425113738      gatsby-plugin-smartlook
3   425366372  docusaurus-plugin-smartlook
4   441087132    stackql-provider-registry
5   443987542                      stackql
6   446769762            go-openapistackql
7   447051137                      go-spew
8   448123925                   go-sqlite3
9   448126348                       vitess
10  448127673                     readline
11  448127756                        color
12  455730530                    pystackql
13  456722161         stackql-jupyter-demo
14  469681593                    psql-wire
15  472680056            provider-doc-util
16  474504182                go-suffix-map
17  476492741              okta-pkce-login
18  487689765             openapi-doc-util
19  501046061      stackql-gcp-foundations
20  504338261          registry.stackql.io
21  506876993  google-discovery-to-openapi


In [11]:
def get_username_from_url(url):
    """Extract the login name from a GitHub API user URL.

    Example: ``https://api.github.com/users/jolivier23`` -> ``jolivier23``.

    Args:
        url: User URL string. ``None``, ``'null'`` and the empty string mean
            "no user".

    Returns:
        The text between the first ``/users/`` marker and the next one (or the
        end of the string), or ``None`` when there is no user or the value
        cannot be parsed. Unparseable values are reported on stdout.
    """
    if url is None:
        return None
    if not isinstance(url, str):
        # Non-string values (e.g. NaN from a DataFrame) cannot be split; report
        # and fall back to None instead of hiding the problem in a bare except.
        print('get user name from url error with url %s' % (url,))
        return None
    if url == 'null' or not url:
        return None
    parts = url.split('/users/')
    if len(parts) < 2:
        # The marker is missing, so there is no user segment to return.
        print('get user name from url error with url %s' % (url,))
        return None
    return parts[1]

In [12]:
def get_username_from_column(list_data: list):
    """Map each URL in ``list_data`` through ``get_username_from_url``.

    Returns a new list with one result per input element, in input order.
    """
    return [get_username_from_url(url) for url in list_data]


In [13]:
### get the developers
#github.repos.contributors
def get_contributors_of_repo(repo, owner):
    """Query ``github.repos.contributors`` for one repository.

    Prints the generated query, then returns the result of
    ``get_dataframe_from_query`` with the selected columns
    (name, email, id, type, url).
    """
    contributors_sql = """
    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    print(contributors_sql)
    return get_dataframe_from_query(contributors_sql)
contributor_data = get_contributors_of_repo(test_repo, test_owner)
# Derive the login name from each contributor's API URL.
contributor_data['username'] = get_username_from_column(contributor_data['url'].to_list())
print(contributor_data.head(5))


    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = 'daos' AND owner = 'daos-stack';
    
  email        id  name  type                                         url  \
0  null  10464486  null  User     https://api.github.com/users/jolivier23   
1  null  32652776  null  User        https://api.github.com/users/wangdi1   
2  null   5822721  null  User  https://api.github.com/users/ashleypittman   
3  null   3277648  null  User       https://api.github.com/users/tanabarr   
4  null   1791869  null  User     https://api.github.com/users/liuxuezhao   

        username  
0     jolivier23  
1        wangdi1  
2  ashleypittman  
3       tanabarr  
4     liuxuezhao  


In [14]:

def get_commits(repo, owner) :
    """Query ``github.repos.commits`` for one repository.

    The query selects the commit message, sha, author URL (as ``url``) and
    author date (as ``commit_date``); the result of
    ``get_dataframe_from_query`` is returned unchanged.
    """
    commits_sql = """
    SELECT 
    JSON_EXTRACT(commit, '$.message') as message, 
    sha, 
    JSON_EXTRACT(author, '$.url') as url,
    JSON_EXTRACT(commit, '$.author.date') as commit_date
    FROM github.repos.commits 
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    return get_dataframe_from_query(commits_sql)

commits = get_commits(test_repo, test_owner)
# Derive the login name from each commit author's API URL.
commits['username'] = get_username_from_column(commits['url'].to_list())




In [15]:
# Show the first commit row as a plain dict to inspect the extracted columns.
print(commits.head(1).to_dict())

{'commit_date': {0: '2022-07-15T21:15:58Z'}, 'message': {0: 'DAOS-10300 test: fix dmg_pool_query (#9720)\n\nUpdate dmg_pool_query tests with expected values\r\n\r\nSigned-off-by: Dalton Bohning <dalton.bohning@intel.com>'}, 'sha': {0: '0e243044b00d7c6ceb14f1cc415b8ba11de21307'}, 'url': {0: 'https://api.github.com/users/daltonbohning'}, 'username': {0: 'daltonbohning'}}


In [16]:
### get user
#github.repos.contributors
# NOTE(review): this re-defines get_contributors_of_repo, which an earlier cell
# already defines with the same query; consider removing one of the two.
def get_contributors_of_repo(repo, owner):
    """Return the contributors of ``owner/repo``.

    Builds a query on ``github.repos.contributors``, prints it, and returns
    whatever ``get_dataframe_from_query`` produces for it.
    """
    sql = """
    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    print(sql)
    result = get_dataframe_from_query(sql)
    return result
contributors = get_contributors_of_repo(test_repo, test_owner)
contributors['username'] = get_username_from_column(contributors['url'].to_list())
print(contributors.head(5))



    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = 'daos' AND owner = 'daos-stack';
    
  email        id  name  type                                         url  \
0  null  10464486  null  User     https://api.github.com/users/jolivier23   
1  null  32652776  null  User        https://api.github.com/users/wangdi1   
2  null   5822721  null  User  https://api.github.com/users/ashleypittman   
3  null   3277648  null  User       https://api.github.com/users/tanabarr   
4  null   1791869  null  User     https://api.github.com/users/liuxuezhao   

        username  
0     jolivier23  
1        wangdi1  
2  ashleypittman  
3       tanabarr  
4     liuxuezhao  


Other interesting resources in the repo that you can use to check on developers' activity:

In [17]:
def get_pull_requests(repo, owner):
    """Query ``github.pulls.pull_requests`` for one repository.

    The result has the pull number, assignee URL, author URL and state, plus
    two derived columns: ``assignee_username`` and ``username``.
    """
    #github.pulls.pull_requests
    pulls_sql = """
    SELECT 
    number as pull_number, 
    JSON_EXTRACT(assignee, '$.url') as assignee_url, 
    JSON_EXTRACT(user, '$.url') as user_url, 
    state
    FROM github.pulls.pull_requests
    where repo = '%s' AND owner = '%s'
    """ % (repo, owner)
    pulls = get_dataframe_from_query(pulls_sql)
    # (source URL column, derived login column), filled in this order.
    for url_column, name_column in (
        ('assignee_url', 'assignee_username'),
        ('user_url', 'username'),
    ):
        pulls[name_column] = get_username_from_column(pulls[url_column].to_list())
    return pulls
pull_requests = get_pull_requests(test_repo, test_owner)
print(pull_requests.head(2))


  assignee_url pull_number state                                   user_url  \
0         null        9722  open  https://api.github.com/users/JohnMalmberg   
1         null        9721  open       https://api.github.com/users/phender   

  assignee_username      username  
0              None  JohnMalmberg  
1              None       phender  


In [21]:
def get_pull_request_reviews(pull_number, repo, owner):
    """Query ``github.pulls.reviews`` for one pull request.

    Args:
        pull_number: Number of the pull request.
        repo: Repository name.
        owner: Repository owner.

    Returns:
        The query result (reviewer URL as ``user_url``, ``state``, ``body``)
        with a derived ``username`` column.

    Raises:
        TypeError: propagated unchanged from ``get_dataframe_from_query``,
            which raises it for an empty result.
    """
    #github.pulls.reviews
    query = """
    SELECT JSON_EXTRACT(user, '$.url') as user_url, state, body
    FROM github.pulls.reviews
    where repo = '%s' AND owner = '%s' AND pull_number = %s
    """ % (repo, owner, pull_number)
    # No try/except: catching TypeError only to raise a bare TypeError threw
    # away the original message and traceback. Callers that catch TypeError
    # still work, and now see the real cause.
    pull_requests_reviews = get_dataframe_from_query(query)
    pull_requests_reviews['username'] = get_username_from_column(pull_requests_reviews['user_url'].to_list())
    return pull_requests_reviews
        
  

# Example: fetch and print the reviews of pull request 9656 in the example repo.
print(get_pull_request_reviews(9656, test_repo, test_owner))


                                                 body      state  \
0               LGTM.  No errors found by checkpatch.  COMMENTED   
1               LGTM.  No errors found by checkpatch.  COMMENTED   
2               LGTM.  No errors found by checkpatch.  COMMENTED   
3               LGTM.  No errors found by checkpatch.  COMMENTED   
4               LGTM.  No errors found by checkpatch.  COMMENTED   
5               LGTM.  No errors found by checkpatch.  COMMENTED   
6               LGTM.  No errors found by checkpatch.  COMMENTED   
8                                                      COMMENTED   
9                                                      COMMENTED   
10                                                     COMMENTED   
11              LGTM.  No errors found by checkpatch.  COMMENTED   
12              LGTM.  No errors found by checkpatch.  COMMENTED   
13              LGTM.  No errors found by checkpatch.  COMMENTED   
14              LGTM.  No errors found by checkp

In [58]:
def get_reviews_for_pull_requests(pull_numbers: list, repo, owner, limit=100):
    """Collect the reviews of several pull requests into one DataFrame.

    Args:
        pull_numbers: Pull request numbers to fetch reviews for.
        repo: Repository name.
        owner: Repository owner.
        limit: Only the first ``limit`` pull numbers are fetched.

    Returns:
        The non-empty per-pull-request review frames concatenated in fetch
        order. When nothing was collected, an empty DataFrame with the columns
        ``user_url``, ``state``, ``body`` and ``username`` is returned, so the
        result is always a DataFrame.

    Error handling is best-effort and every error is printed:
        * ``TypeError`` (raised by ``get_dataframe_from_query`` for an empty
          result) skips that pull request.
        * Any other error (for example a rate-limit response) stops fetching;
          the reviews collected so far are still returned rather than
          discarded.
    """
    reviews_frames = []
    for pull_number in pull_numbers[0:limit]:
        try:
            print('getting reviews for pull number %s' % pull_number)
            reviews = get_pull_request_reviews(pull_number, repo, owner)
            if reviews is not None and not reviews.empty:
                reviews_frames.append(reviews)
        except TypeError as error:
            print(error)
            continue
        except Exception as error:
            print(error)
            break

    if not reviews_frames:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected columns so downstream column access works.
        return pd.DataFrame(columns=['user_url', 'state', 'body', 'username'])
    return pd.concat(reviews_frames)

# limit=1 restricts the fetch to the first pull request number in the list.
reviews_data = get_reviews_for_pull_requests(pull_requests['pull_number'].to_list(), test_repo, test_owner, limit=1)


getting reviews for pull number 9722
('StackQL execute error with error: Expecting \',\' delimiter: line 1 column 36 (char 35), res: [{"error": "HTTP response error: {"documentation_url":"https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting","message":"API rate limit exceeded for 14.201.196.52. (But here\'s the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)"}"}], error type: <class \'json.decoder.JSONDecodeError\'>, res type: <class \'str\'>', "Expecting ',' delimiter: line 1 column 36 (char 35)")


In [30]:
# Work on a copy so reviews_data itself is left untouched.
# NOTE(review): get_reviews_for_pull_requests has a code path that returns
# None (bare `return` on a non-TypeError error), in which case .copy() fails.
reviews = reviews_data.copy()
reviews['username'] = get_username_from_column(reviews['user_url'].to_list())
print(reviews.tail(1))

                                                body      state  \
2  LGTM.  No errors found by checkpatch.\n\nFYI: ...  COMMENTED   

                                  user_url    username  
2  https://api.github.com/users/daosbuild1  daosbuild1  


In [40]:
def aggregate_user_activity (row, commits: pd.DataFrame,  reviews: pd.DataFrame, pull_requests: pd.DataFrame):
    """Attach one user's commits, pull requests and reviews to ``row``.

    ``row`` must provide a ``'username'`` entry. Each frame is filtered to the
    rows whose ``username`` column equals it, and the filtered frame is stored
    on ``row`` under ``'commits'``, ``'pull_requests'`` and ``'reviews'``.
    ``row`` is modified in place and returned.
    """
    username = row['username']
    sources = (
        ('commits', commits),
        ('pull_requests', pull_requests),
        ('reviews', reviews),
    )
    for key, frame in sources:
        row[key] = frame[frame['username'] == username]
    return row

def get_user_activity(users: pd.DataFrame, commits: pd.DataFrame,  reviews: pd.DataFrame, pull_requests: pd.DataFrame, owner, repo):
    """Build per-user activity rows for one repository.

    Applies ``aggregate_user_activity`` to every row of ``users`` and then
    adds constant ``owner`` and ``repo`` columns to the result, which is
    returned.
    """
    #avoid rate limit
    per_user = users.apply(
        lambda row: aggregate_user_activity(row, commits, reviews, pull_requests),
        axis=1,
    )
    per_user['owner'] = owner
    per_user['repo'] = repo
    return per_user

# Combine contributors with their commits, reviews and pull requests.
# NOTE(review): the recorded output of this cell is
# "TypeError: 'NoneType' object is not subscriptable" — presumably one of the
# inputs was None (e.g. reviews); confirm the earlier cells succeeded first.
activities = get_user_activity(contributors, commits, reviews, pull_requests, test_owner, test_repo)
print(activities.head(1).to_dict())

TypeError: 'NoneType' object is not subscriptable

## Upload results 

In [43]:
import db.bigquery.data as bg
import db.bigquery.schemas as schemas
def upload_user_activity(activity_dataframe: pd.DataFrame):
    """Upload ``activity_dataframe`` to the ``github.user_activity`` table.

    Resolves the table id for dataset ``github`` / table ``user_activity``,
    builds a load-job config from ``schemas.activity_schema`` and passes all
    three to ``bg.upload_dataframe``. Returns ``None``.
    """
    target_table = bg.get_table_id('github', 'user_activity')
    load_config = bg.create_load_job_config(schemas.activity_schema)
    bg.upload_dataframe(activity_dataframe, target_table, load_config)


storyscore-356114.test_dataset.test_table
