## Set up the dependencies and configurations

### Assumption: you have downloaded stackql and it is on your system path
Follow [these instructions to install it](https://stackql.io/downloads)

### Append system path so we can use other modules

In [24]:
import sys 
import settings
# Resolve the repository root through the project's own `settings` helper.
project_root = str(settings.get_project_root())
print(project_root)
# Re-running this cell must not keep growing sys.path with duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

/Users/yunchengyang/Projects/Storyscore/storyscore-data


### Set up stackql

In [25]:
from pystackql import StackQL
import json 
import os
import pandas as pd

In [26]:
# SECURITY: credentials must never be stored in the notebook. Any credential
# pair that was ever committed here should be treated as leaked and revoked.
# GITHUB_CREDS is read from the environment; when it is missing the user is
# prompted once (the value is not echoed and is kept only in this process).
# NOTE(review): the expected format is presumably "<username>:<token>" for
# basic auth — confirm against the stackql github provider documentation.
if not os.environ.get('GITHUB_CREDS'):
    from getpass import getpass
    os.environ['GITHUB_CREDS'] = getpass('GITHUB_CREDS (username:token): ')

# The provider is told which environment variable holds the credentials,
# so the secret itself never appears in the auth document.
auth = {
    "github":
      { "type": "basic", "credentialsenvvar": "GITHUB_CREDS" }
}
# Not used by any cell visible in this notebook; kept for compatibility.
stackql_path = project_root + '/.stackql'
iql = StackQL(auth=json.dumps(auth))

To make the script easy to reproduce in any environment, we will use pystackql to pull the github provider from the registry.

In [27]:
## pull github registry
# Ask stackql which providers the registry offers and decode the JSON answer.
registry_res = iql.execute('REGISTRY LIST;')
registry_list = json.loads(registry_res)

# Keep the first entry whose provider is github (IndexError if there is none).
github_registry = [entry for entry in registry_list if entry["provider"] == 'github'][0]
github_version = github_registry['version']
print(github_version)

# Install exactly the version advertised by the registry.
pull_provider_query = """
REGISTRY PULL github %s;
""" % github_version
print(pull_provider_query)

res = iql.execute(pull_provider_query)
print(res)


v0.3.2

REGISTRY PULL github v0.3.2;

[{"error": "github provider, version 'v0.3.2' successfully installed"}]


## Using Github provider
We will use the [Intel open source DAOS project](https://github.com/daos-stack/daos) as an example.

In [28]:
# GitHub organisation whose repositories list_repos() queries.
org_name = 'stackql'

### Overview
First, let's look at what we can get from the GitHub provider.

In [29]:
def get_dataframe_from_query(query: str) -> "pd.DataFrame | None":
    """Run a StackQL query and return its rows as a DataFrame.

    Parameters
    ----------
    query : str
        StackQL statement, passed verbatim to ``iql.execute``.

    Returns
    -------
    pandas.DataFrame or None
        One row per object of the decoded JSON response, or ``None`` when
        the decoded response is empty.

    Raises
    ------
    TypeError
        When the response cannot be decoded or converted, or when its first
        element carries an ``"error"`` key. The raw response is the
        exception argument and the underlying failure is chained to it.
    """
    res = iql.execute(query)
    try:
        res_obj = json.loads(res)
        if not res_obj:
            return None
        if "error" in res_obj[0]:
            raise ValueError("stackql returned an error payload")
        return pd.DataFrame.from_dict(res_obj)
    except Exception as err:
        # `except Exception` rather than a bare `except`: interrupting the
        # kernel (KeyboardInterrupt) or SystemExit must not be turned into
        # a TypeError. Callers keep seeing TypeError for every real failure.
        print(type(res))
        print(res)
        raise TypeError(res) from err


In [30]:
### look at repos, commits and comments
def show_resource():
    """Print the resources available under the ``github.repos`` service."""
    print(get_dataframe_from_query('show resources in github.repos'))

show_resource()


                                              id  \
0               github.repos.access_restrictions   
1           github.repos.admin_branch_protection   
2                        github.repos.app_access   
3                         github.repos.autolinks   
4                 github.repos.branch_protection   
5                          github.repos.branches   
6                     github.repos.clone_traffic   
7    github.repos.collaborator_permission_levels   
8                     github.repos.collaborators   
9                 github.repos.combined_statuses   
10                         github.repos.comments   
11                  github.repos.commit_branches   
12             github.repos.commit_pull_requests   
13                          github.repos.commits   
14                        github.repos.community   
15                  github.repos.content_traffic   
16                         github.repos.contents   
17                     github.repos.contributors   
18          

In [31]:
def describe_commit():
    """Print the result of describing the ``github.repos.commits`` resource."""
    print(get_dataframe_from_query('DESCRIBE github.repos.commits;'))

def describe_repos():
    """Print the result of describing the ``github.repos.repos`` resource."""
    print(get_dataframe_from_query('DESCRIBE github.repos.repos;'))
    
describe_repos()

describe_commit()


                name     type
0                 id  integer
1               name   string
2        description   string
3          pulls_url   string
4           homepage   string
..               ...      ...
80    default_branch   string
81             owner   object
82      has_projects  boolean
83  stargazers_count  integer
84               url   string

[85 rows x 2 columns]
            name    type
0   comments_url  string
1          files   array
2            url  string
3        node_id  string
4         author  object
5         commit  object
6            sha  string
7       html_url  string
8      committer  object
9        parents   array
10         stats  object


### explore commit activity 

In [32]:
# Repository and owner passed to the contributor, commit and pull-request
# queries in the cells below.
test_repo = 'daos'
test_owner = 'daos-stack'

In [33]:
def list_repos():
    """Print the id and name of each repository of the ``org_name`` organisation."""
    repos_sql = """
    select id, name
    from github.repos.repos
    where org = '%s';
    """ % org_name
    print(get_dataframe_from_query(repos_sql))
list_repos()

           id                         name
0   409393414       fullstackchronicles.io
1   424079013    docusaurus-plugin-hubspot
2   425113738      gatsby-plugin-smartlook
3   425366372  docusaurus-plugin-smartlook
4   441087132    stackql-provider-registry
5   443987542                      stackql
6   446769762            go-openapistackql
7   447051137                      go-spew
8   448123925                   go-sqlite3
9   448126348                       vitess
10  448127673                     readline
11  448127756                        color
12  455730530                    pystackql
13  456722161         stackql-jupyter-demo
14  469681593                    psql-wire
15  472680056            provider-doc-util
16  474504182                go-suffix-map
17  476492741              okta-pkce-login
18  487689765             openapi-doc-util
19  501046061      stackql-gcp-foundations
20  504338261          registry.stackql.io
21  506876993  google-discovery-to-openapi


In [34]:
def get_username_from_url(url):
    """Extract the login from a GitHub API user URL.

    Example: ``https://api.github.com/users/jolivier23`` -> ``jolivier23``.

    Parameters
    ----------
    url : str or None
        User URL as returned by the provider; may be ``None``, the literal
        string ``'null'`` or empty when no user is attached.

    Returns
    -------
    str or None
        The text following ``/users/``, or ``None`` when no login can be
        extracted. Values that are present but malformed are reported on
        stdout before ``None`` is returned.
    """
    # Missing values are expected in provider output and are skipped quietly.
    if url is None or url == 'null' or url == '':
        return None
    if isinstance(url, str) and '/users/' in url:
        return url.split('/users/')[1]
    # Anything else (non-string, or a URL without /users/) is unexpected.
    # The value is interpolated explicitly; print() does not apply
    # logging-style "%s" arguments.
    print('get user name from url error with url %s' % (url,))
    return None

In [35]:
def get_username_from_column(list_data: list):
    """Map every URL in ``list_data`` through ``get_username_from_url``.

    Returns a new list with one entry per input element, in the same order.
    """
    return [get_username_from_url(url) for url in list_data]


In [36]:
### get the developers
#github.repos.contributors
def get_contributors_of_repo(repo, owner):
    """Query ``github.repos.contributors`` for ``owner/repo``.

    The query text is printed before it is executed; the value returned by
    ``get_dataframe_from_query`` is handed back unchanged.
    """
    contributors_sql = """
    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    print(contributors_sql)
    return get_dataframe_from_query(contributors_sql)
# Fetch the contributors of the example repository, then derive a
# `username` column from the `url` column selected by the query.
contributor_data = get_contributors_of_repo(test_repo, test_owner)
contributor_data['username'] = get_username_from_column(contributor_data['url'].to_list())
print(contributor_data.head(5))


    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = 'daos' AND owner = 'daos-stack';
    
  email        id  name  type                                         url  \
0  null  10464486  null  User     https://api.github.com/users/jolivier23   
1  null  32652776  null  User        https://api.github.com/users/wangdi1   
2  null   5822721  null  User  https://api.github.com/users/ashleypittman   
3  null   3277648  null  User       https://api.github.com/users/tanabarr   
4  null   1791869  null  User     https://api.github.com/users/liuxuezhao   

        username  
0     jolivier23  
1        wangdi1  
2  ashleypittman  
3       tanabarr  
4     liuxuezhao  


In [76]:

def get_commits(repo, owner):
    """Query ``github.repos.commits`` for ``owner/repo``.

    The query selects the commit message, the sha, the author's API URL
    (as ``url``) and the author date (as ``commit_date``). The value
    returned by ``get_dataframe_from_query`` is handed back unchanged.
    """
    commits_sql = """
    SELECT 
    JSON_EXTRACT(commit, '$.message') as message, 
    sha, 
    JSON_EXTRACT(author, '$.url') as url,
    JSON_EXTRACT(commit, '$.author.date') as commit_date
    FROM github.repos.commits 
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    return get_dataframe_from_query(commits_sql)

# Load the commits of the example repository and add a `username` column
# derived from the author URL that the query exposes as `url`.
commits = get_commits(test_repo, test_owner)
commits['username'] = get_username_from_column(commits['url'].to_list())




In [77]:
print(commits.head(1).to_dict())

{'commit_date': {0: '2022-07-14T12:53:35Z'}, 'message': {0: 'DAOS-11120 control: Fix block device size display (#9686)\n\nThe hwloc library produces size values in kB, but the\r\nprinting library expects bytes.\r\n\r\nSigned-off-by: Michael MacDonald <mjmac.macdonald@intel.com>'}, 'sha': {0: 'cc2c0aa679eca7eee5ab5aa11657dfa16ee605f2'}, 'url': {0: 'https://api.github.com/users/mjmac'}, 'username': {0: 'mjmac'}}


In [38]:
### get user
#github.repos.contributors
def get_contributors_of_repo(repo, owner):
    """Query ``github.repos.contributors`` for ``owner/repo``.

    NOTE(review): this repeats the definition from the earlier
    "get the developers" cell; the later definition shadows the earlier one.
    The query text is printed, then the value returned by
    ``get_dataframe_from_query`` is handed back unchanged.
    """
    contributors_sql = """
    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    print(contributors_sql)
    return get_dataframe_from_query(contributors_sql)
# `contributors` is the frame later passed to get_user_activity(); the
# `username` column is derived from each contributor's API `url`.
contributors = get_contributors_of_repo(test_repo, test_owner)
contributors['username'] =get_username_from_column(contributors['url'].to_list())
print(contributors.head(5))



    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = 'daos' AND owner = 'daos-stack';
    
  email        id  name  type                                         url  \
0  null  10464486  null  User     https://api.github.com/users/jolivier23   
1  null  32652776  null  User        https://api.github.com/users/wangdi1   
2  null   5822721  null  User  https://api.github.com/users/ashleypittman   
3  null   3277648  null  User       https://api.github.com/users/tanabarr   
4  null   1791869  null  User     https://api.github.com/users/liuxuezhao   

        username  
0     jolivier23  
1        wangdi1  
2  ashleypittman  
3       tanabarr  
4     liuxuezhao  


Other interesting resources in the repo that you can use to check on developers' activity:

In [112]:
def get_pull_requests(repo, owner):
    """List the pull requests of ``owner/repo`` with author and assignee logins.

    Parameters
    ----------
    repo, owner : str
        Repository name and owning account, interpolated into the query.

    Returns
    -------
    pandas.DataFrame
        Columns ``pull_number``, ``assignee_url``, ``user_url`` and ``state``
        from the provider, plus ``assignee_username`` and ``username``
        derived from the two URL columns. An empty frame with those columns
        is returned when the query yields no rows.
    """
    #github.pulls.pull_requests
    query = """
    SELECT 
    number as pull_number, 
    JSON_EXTRACT(assignee, '$.url') as assignee_url, 
    JSON_EXTRACT(user, '$.url') as user_url, 
    state
    FROM github.pulls.pull_requests
    where repo = '%s' AND owner = '%s'
    """ % (repo, owner)
    pull_requests = get_dataframe_from_query(query)
    if pull_requests is None:
        # get_dataframe_from_query returns None for an empty result set;
        # subscripting it would raise "'NoneType' object is not subscriptable".
        return pd.DataFrame(columns=[
            'pull_number', 'assignee_url', 'user_url', 'state',
            'assignee_username', 'username',
        ])
    pull_requests['assignee_username'] = get_username_from_column(pull_requests['assignee_url'].to_list())
    pull_requests['username'] = get_username_from_column(pull_requests['user_url'].to_list())
    return pull_requests
# Pull requests of the example repository; only the first two rows are shown.
pull_requests = get_pull_requests(test_repo, test_owner)
print(pull_requests.head(2))


  assignee_url pull_number state  \
0         null        9695  open   
1         null        9694  open   

                                          user_url assignee_username  \
0       https://api.github.com/users/brianjmurrell              None   
1  https://api.github.com/users/wangzhaorong-cestc              None   

             username  
0       brianjmurrell  
1  wangzhaorong-cestc  


In [115]:
def get_pull_request_reviews(pull_number, repo, owner):
    """Fetch the reviews of one pull request, with the reviewer login added.

    Parameters
    ----------
    pull_number : int or str
        Pull request number, interpolated unquoted into the query.
    repo, owner : str
        Repository name and owning account.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_url``, ``state`` and ``body`` from the provider, plus
        ``username`` derived from ``user_url``. An empty frame with those
        columns is returned when the pull request has no reviews.
    """
    #github.pulls.reviews
    query = """
    SELECT JSON_EXTRACT(user, '$.url') as user_url, state, body
    FROM github.pulls.reviews
    where repo = '%s' AND owner = '%s' AND pull_number = %s
    """ % (repo, owner, pull_number)
    pull_requests_reviews = get_dataframe_from_query(query)
    if pull_requests_reviews is None:
        # No reviews: get_dataframe_from_query returned None, which cannot
        # be subscripted. Callers that test `.empty` keep working.
        return pd.DataFrame(columns=['user_url', 'state', 'body', 'username'])
    pull_requests_reviews['username'] = get_username_from_column(pull_requests_reviews['user_url'].to_list())
    return pull_requests_reviews

print(get_pull_request_reviews(9656, test_repo, test_owner))


                                                 body      state  \
0               LGTM.  No errors found by checkpatch.  COMMENTED   
1               LGTM.  No errors found by checkpatch.  COMMENTED   
2               LGTM.  No errors found by checkpatch.  COMMENTED   
3               LGTM.  No errors found by checkpatch.  COMMENTED   
4               LGTM.  No errors found by checkpatch.  COMMENTED   
5               LGTM.  No errors found by checkpatch.  COMMENTED   
6               LGTM.  No errors found by checkpatch.  COMMENTED   
8                                                      COMMENTED   
9                                                      COMMENTED   
10                                                     COMMENTED   
11              LGTM.  No errors found by checkpatch.  COMMENTED   
12              LGTM.  No errors found by checkpatch.  COMMENTED   
13              LGTM.  No errors found by checkpatch.  COMMENTED   
14              LGTM.  No errors found by checkp

In [113]:
def get_reviews_for_pull_requests(pull_numbers: list, repo, owner, limit=100):
    """Collect the reviews of several pull requests into one DataFrame.

    Parameters
    ----------
    pull_numbers : list
        Pull request numbers; only the first ``limit`` entries are queried.
    repo, owner : str
        Repository name and owning account.
    limit : int, optional
        Maximum number of pull requests to query (one provider call each).

    Returns
    -------
    pandas.DataFrame
        The per-pull-request frames concatenated with their original
        indexes. When no pull request has any review, an empty frame with
        the columns ``user_url``, ``state``, ``body`` and ``username``.
    """
    ##loop pull requests data
    reviews_frames = []
    for pull_number in pull_numbers[0:limit]:
        print('getting reviews for pull number %s' % pull_number)
        reviews = get_pull_request_reviews(pull_number, repo, owner)
        if reviews is not None and not reviews.empty:
            reviews_frames.append(reviews)
    if not reviews_frames:
        # pd.concat raises ValueError on an empty list of frames.
        return pd.DataFrame(columns=['user_url', 'state', 'body', 'username'])
    return pd.concat(reviews_frames)

# limit=20: only the first 20 pull numbers are queried (one call per pull request).
reviews = get_reviews_for_pull_requests(pull_requests['pull_number'].to_list(), test_repo, test_owner, limit=20)


getting reviews for pull number 9695
getting reviews for pull number 9694
getting reviews for pull number 9693
getting reviews for pull number 9692
getting reviews for pull number 9691
getting reviews for pull number 9690
getting reviews for pull number 9689
getting reviews for pull number 9688
getting reviews for pull number 9685
getting reviews for pull number 9684
getting reviews for pull number 9683
getting reviews for pull number 9682
getting reviews for pull number 9681
getting reviews for pull number 9680
getting reviews for pull number 9679
getting reviews for pull number 9678
getting reviews for pull number 9676
getting reviews for pull number 9675
getting reviews for pull number 9674
getting reviews for pull number 9673


In [114]:
# Recompute `username` from `user_url` on the combined frame
# (get_pull_request_reviews also sets this column per pull request).
reviews['username'] = get_username_from_column(reviews['user_url'].to_list())
print(reviews.tail(1))

  body      state                                    user_url       username
2       COMMENTED  https://api.github.com/users/daltonbohning  daltonbohning


In [None]:
def aggregate_user_activity(row, commits: pd.DataFrame, reviews: pd.DataFrame, pull_requests: pd.DataFrame):
    """Attach to ``row`` the activity rows that belong to ``row['username']``.

    Three entries are written into ``row`` in this order: ``commits``,
    ``pull_requests`` and ``reviews``. Each is the subset of the matching
    frame whose ``username`` column equals the row's username. The same
    ``row`` object is returned.
    """
    username = row['username']
    sources = (
        ('commits', commits),
        ('pull_requests', pull_requests),
        ('reviews', reviews),
    )
    for column, frame in sources:
        row[column] = frame[frame['username'] == username]
    return row

def get_user_activity(users: pd.DataFrame, commits: pd.DataFrame, reviews: pd.DataFrame, pull_requests: pd.DataFrame, owner, repo):
    """Build one activity record per user and tag it with ``owner``/``repo``.

    ``aggregate_user_activity`` is applied to every row of ``users``; the
    resulting frame then receives constant ``owner`` and ``repo`` columns.
    Note the argument order here is ``owner, repo``.
    """
    def _per_user(row):
        return aggregate_user_activity(row, commits, reviews, pull_requests)

    activities = users.apply(_per_user, axis=1)
    activities['owner'] = owner
    activities['repo'] = repo
    return activities

activities = get_user_activity(contributors, commits, reviews, pull_requests, test_owner, test_repo)
# DataFrame.to_csv returns None when it is given a path, so wrapping the
# call in print() only emitted "None". Write the sample file directly.
activities.head(1).to_csv('example_activities.csv')

None


## Upload results 

In [43]:
import db.bigquery.data as bg
print(bg.get_table_id('test_dataset', 'test_table'))


storyscore-356114.test_dataset.test_table
