## Set up the dependencies and configurations

### Assumption: you have downloaded stackql and it is on your system path
Follow [these instructions to install it](https://stackql.io/downloads)

### Append system path so we can use other modules

In [1]:
import sys 
import settings
# Resolve the project root through the project's `settings` helper and show it.
project_root = str(settings.get_project_root())
print(project_root)
# Make the project root importable. The membership check keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

/Users/yunchengyang/Projects/Storyscore/storyscore-data


### setup stackql

In [2]:
from pystackql import StackQL
import json 
import os
import pandas as pd
import dotenv
import base64

In [3]:
# Load environment variables from a .env file; the StackQL auth config set up
# below names the GITHUB_CREDS environment variable as its credential source.
dotenv.load_dotenv()

True

In [4]:
# Authenticate the github provider with basic auth, taking the credentials
# from the environment variable named GITHUB_CREDS.
auth = dict(
    github=dict(type="basic", credentialsenvvar="GITHUB_CREDS"),
)
# The auth configuration is handed to StackQL serialised as a JSON string.
iql = StackQL(auth=json.dumps(auth))

To make the script easy to reproduce in any environment, we will use pystackql to pull the github provider from the registry.

In [5]:
## pull github registry
# List the providers known to the registry and pick out the github entry.
registry_res = iql.execute('REGISTRY LIST;')
registry_list = json.loads(registry_res)
github_registry = next(
    (reg for reg in registry_list if reg["provider"] == 'github'),
    None,
)
if github_registry is None:
    # Fail with an explicit message rather than the bare IndexError that
    # indexing an empty filter result used to produce.
    raise LookupError(
        'github provider not found in REGISTRY LIST output: %s' % registry_res
    )
github_version = github_registry['version']

# Pull the github provider at the version reported by the registry listing.
pull_provider_query = """
REGISTRY PULL github %s;
""" % github_version

res = iql.execute(pull_provider_query)


## Using Github provider
We will use the [Intel open source DAOS project](https://github.com/daos-stack/daos) as an example.

### Overview
First, let's look at what we can get from the GitHub provider.

In [6]:
def get_dataframe_from_query(query: str) -> pd.DataFrame:
    """Execute `query` with the shared `iql` client and return the decoded rows as a DataFrame.

    Raises:
        TypeError: when the decoded response is empty (the raw response is the
            exception argument), or when a TypeError occurs while processing
            the response. TypeErrors propagate unchanged.
        Exception: any other failure, including the AttributeError raised when
            "error" is found in the decoded response, is re-raised with a
            diagnostic message prepended to its args.
    """
    raw = iql.execute(query)
    try:
        payload = json.loads(raw)
        if "error" in payload:
            raise AttributeError()
        if not payload:
            raise TypeError(raw)
        return pd.DataFrame(payload)
    except TypeError:
        # TypeErrors are passed through untouched.
        raise
    except Exception as failure:
        # Prepend the raw response and type information to make the failure diagnosable.
        detail = 'StackQL execute error with error: %s, res: %s, error type: %s, res type: %s' % (
            failure, raw, type(failure), type(raw))
        failure.args = (detail, *failure.args)
        raise


In [7]:
### look at repos, commits and comments
def show_resource():
    """Print the resources listed for the github.repos service."""
    resources = get_dataframe_from_query('show resources in github.repos')
    print(resources)

show_resource()


                                              id  \
0               github.repos.access_restrictions   
1           github.repos.admin_branch_protection   
2                        github.repos.app_access   
3                         github.repos.autolinks   
4                 github.repos.branch_protection   
5                          github.repos.branches   
6                     github.repos.clone_traffic   
7    github.repos.collaborator_permission_levels   
8                     github.repos.collaborators   
9                 github.repos.combined_statuses   
10                         github.repos.comments   
11                  github.repos.commit_branches   
12             github.repos.commit_pull_requests   
13                          github.repos.commits   
14                        github.repos.community   
15                  github.repos.content_traffic   
16                         github.repos.contents   
17                     github.repos.contributors   
18          

In [8]:
def describe_commit():
    """Print the result of DESCRIBE for the github.repos.commits resource."""
    commit_schema = get_dataframe_from_query('DESCRIBE github.repos.commits;')
    print(commit_schema)

def describe_repos():
    """Print the result of DESCRIBE for the github.repos.repos resource."""
    repo_schema = get_dataframe_from_query('DESCRIBE github.repos.repos;')
    print(repo_schema)
    
# Print the repos schema first, then the commits schema.
describe_repos()

describe_commit()


                      name     type
0                       id  integer
1                     name   string
2              description   string
3                  svn_url   string
4                  private  boolean
..                     ...      ...
80                 ssh_url   string
81  delete_branch_on_merge  boolean
82          default_branch   string
83               pulls_url   string
84        temp_clone_token   string

[85 rows x 2 columns]
            name    type
0         author  object
1   comments_url  string
2        parents   array
3            sha  string
4        node_id  string
5          stats  object
6          files   array
7         commit  object
8            url  string
9      committer  object
10      html_url  string


### explore commit activity 

In [9]:
# Owner and repository name used as the running example by the cells that follow.
test_owner = 'daos-stack'
test_repo = 'daos'

In [10]:
def list_repos(org='site-mate'):
    """Print the id and name of the repositories returned for an organisation.

    Args:
        org: organisation login used in the query's `where org = ...` filter.
            Defaults to 'site-mate', the value that used to be hard-coded in
            the body, so a bare `list_repos()` behaves exactly as before.
    """
    query = """
    select id, name
    from github.repos.repos
    where org = '%s';
    """ % org
    data = get_dataframe_from_query(query)

    print(data)
list_repos()

           id                              name
0    86028319                     dashpivot-api
1    91520879              bootstrap-datepicker
2    92884149                     dashpivot-web
3    94492881                  dashpivot-mobile
4   112567950                   lambda-compress
5   173240161                         dashpivot
6   205999587                apple-certificates
7   224585474  redux-persist-filesystem-storage
8   245974039     dashpivot-tests-webui-cypress
9   246268185        react-native-sketch-canvas
10  252369067                          sitemate
11  252392599                   sitemate-mobile
12  266515994                  dashpivot-shared
13  354844778                    dashpivot-misc
14  374542821                     mobile-shared
15  457199306                sitemate-marketing
16  465981039                    sitemate-admin
17  470125310            react-native-blob-util
18  478925268          react-native-router-flux
19  501543630                           

In [11]:
def get_username_from_url(url):
    """Extract the username from a GitHub user API URL.

    Example: 'https://api.github.com/users/jolivier23' -> 'jolivier23'.

    Args:
        url: the user URL, or None / '' / 'null' when no user is attached
            (query results in this notebook report absent values as 'null').

    Returns:
        The text between the first and second '/users/' separators (everything
        after the separator for a normal user URL), or None when the value is
        missing or is not a user URL. Malformed values are reported on stdout.
    """
    # Missing values carry no user and are not worth a diagnostic.
    if url is None or url == 'null':
        return None
    if not isinstance(url, str):
        # Non-string cells (e.g. NaN) cannot be parsed; report and move on.
        print('get user name from url error with url %s' % (url,))
        return None
    if not url:
        return None
    parts = url.split('/users/')
    if len(parts) < 2:
        # No '/users/' segment: report the offending value (the previous
        # message passed the URL as a second print argument and never
        # interpolated the %s placeholder).
        print('get user name from url error with url %s' % (url,))
        return None
    return parts[1]

In [12]:
def get_username_from_column(list_data: list):
    """Return the username for every URL in `list_data`, in the same order."""
    return [get_username_from_url(url) for url in list_data]


In [14]:
### get the developers
#github.repos.contributors
def get_contributors_of_repo(repo, owner):
    """Query github.repos.contributors for one repository and return the rows.

    The generated query text is printed before it is executed.
    """
    contributors_sql = """
    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    print(contributors_sql)
    return get_dataframe_from_query(contributors_sql)
contributor_data = get_contributors_of_repo(test_repo, test_owner)
# Derive a username column from each contributor's API URL.
contributor_data['username'] = get_username_from_column(contributor_data['url'].to_list())
display(contributor_data)


    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = 'daos' AND owner = 'daos-stack';
    


Unnamed: 0,email,id,name,type,url,username
0,,10464486,,User,https://api.github.com/users/jolivier23,jolivier23
1,,32652776,,User,https://api.github.com/users/wangdi1,wangdi1
2,,5822721,,User,https://api.github.com/users/ashleypittman,ashleypittman
3,,3277648,,User,https://api.github.com/users/tanabarr,tanabarr
4,,1791869,,User,https://api.github.com/users/liuxuezhao,liuxuezhao
...,...,...,...,...,...,...
80,,5947790,,User,https://api.github.com/users/MartinVerges,MartinVerges
81,,7877036,,User,https://api.github.com/users/nayankumarp,nayankumarp
82,,37665970,,User,https://api.github.com/users/kalfizah,kalfizah
83,,34421528,,User,https://api.github.com/users/omaraziz255,omaraziz255


In [15]:

def get_commits(repo, owner):
    """Query github.repos.commits for one repository.

    The query selects the commit message, the sha, the author's URL and the
    commit's author date.
    """
    commits_sql = """
    SELECT 
    JSON_EXTRACT(commit, '$.message') as message, 
    sha, 
    JSON_EXTRACT(author, '$.url') as url,
    JSON_EXTRACT(commit, '$.author.date') as commit_date
    FROM github.repos.commits 
    where repo = '%s' AND owner = '%s';
    """ % (repo, owner)
    return get_dataframe_from_query(commits_sql)

commits = get_commits(test_repo, test_owner)
# Derive a username column from each commit author's API URL.
commits['username'] = get_username_from_column(commits['url'].to_list())




In [16]:
# Show the first commit row as a dict to inspect the extracted fields.
print(commits.head(1).to_dict())

{'commit_date': {0: '2022-07-29T18:49:28Z'}, 'message': {0: 'DAOS-11190 test: Fix deployment/network_failure.py by using NodeSet for hostnames (#9800)\n\ndeployment/network_failure.py is failing because the underlying\r\nrun_pcmd() was updated to use NodeSet. The test needs to handle\r\nthe hostnames with NodeSet instead of string.\r\n\r\nSigned-off-by: Makito Kano <makito.kano@intel.com>'}, 'sha': {0: 'e920877b33d7b5b923a4c7e1cc9f9f61de88286d'}, 'url': {0: 'https://api.github.com/users/shimizukko'}, 'username': {0: 'shimizukko'}}


In [17]:
### get user
#github.repos.contributors
# get_contributors_of_repo is already defined in the "get the developers" cell
# above. The identical second definition that used to live here only shadowed
# it, so this cell now just calls the existing function.
contributors = get_contributors_of_repo(test_repo, test_owner)
contributors['username'] = get_username_from_column(contributors['url'].to_list())
print(contributors.head(5))



    SELECT name, email, id, type, url
    FROM github.repos.contributors
    where repo = 'daos' AND owner = 'daos-stack';
    
  email        id  name  type                                         url  \
0  null  10464486  null  User     https://api.github.com/users/jolivier23   
1  null  32652776  null  User        https://api.github.com/users/wangdi1   
2  null   5822721  null  User  https://api.github.com/users/ashleypittman   
3  null   3277648  null  User       https://api.github.com/users/tanabarr   
4  null   1791869  null  User     https://api.github.com/users/liuxuezhao   

        username  
0     jolivier23  
1        wangdi1  
2  ashleypittman  
3       tanabarr  
4     liuxuezhao  


Other interesting resources in the repo that you can use to check on developers' activity

In [18]:
def get_pull_requests(repo, owner):
    """Query github.pulls.pull_requests for one repository.

    Adds `assignee_username` and `username` columns derived from the assignee
    and user URLs returned by the query.
    """
    #github.pulls.pull_requests
    pulls_sql = """
    SELECT 
    number as pull_number, 
    JSON_EXTRACT(assignee, '$.url') as assignee_url, 
    JSON_EXTRACT(user, '$.url') as user_url, 
    state,
    updated_at
    FROM github.pulls.pull_requests
    where repo = '%s' AND owner = '%s'
    """ % (repo, owner)
    frame = get_dataframe_from_query(pulls_sql)
    # (new column, source URL column) pairs, filled in this order.
    for target, source in (('assignee_username', 'assignee_url'), ('username', 'user_url')):
        frame[target] = get_username_from_column(frame[source].to_list())
    return frame
pull_requests = get_pull_requests(test_repo, test_owner)
print(pull_requests.head(1).to_dict())


{'assignee_url': {0: 'null'}, 'pull_number': {0: '9864'}, 'state': {0: 'open'}, 'updated_at': {0: '2022-07-30T23:15:15Z'}, 'user_url': {0: 'https://api.github.com/users/jgmoore-or'}, 'assignee_username': {0: None}, 'username': {0: 'jgmoore-or'}}


In [19]:
def get_pull_request_reviews(pull_number, repo, owner):
    """Query github.pulls.reviews for one pull request and add a `username` column.

    Args:
        pull_number: number of the pull request to look up.
        repo: repository name.
        owner: repository owner.

    Returns:
        DataFrame with user_url, state, body, submitted_at and username columns.

    Raises:
        TypeError: propagated unchanged from get_dataframe_from_query, which
            raises it when the query result is empty. The batch loop in this
            notebook catches it to skip such pull requests.
    """
    #github.pulls.reviews
    query = """
    SELECT JSON_EXTRACT(user, '$.url') as user_url, state, body, submitted_at
    FROM github.pulls.reviews
    where repo = '%s' AND owner = '%s' AND pull_number = %s
    """ % (repo, owner, pull_number)
    # Deliberately no try/except: the earlier `except TypeError: raise TypeError`
    # replaced the original exception with an empty one, so callers printing the
    # error saw a blank line instead of the response that caused it.
    pull_requests_reviews = get_dataframe_from_query(query)
    pull_requests_reviews['username'] = get_username_from_column(pull_requests_reviews['user_url'].to_list())
    return pull_requests_reviews



print(get_pull_request_reviews(9656, test_repo, test_owner))


                                                 body              state  \
0               LGTM.  No errors found by checkpatch.          COMMENTED   
1               LGTM.  No errors found by checkpatch.          COMMENTED   
2               LGTM.  No errors found by checkpatch.          COMMENTED   
3               LGTM.  No errors found by checkpatch.          COMMENTED   
4               LGTM.  No errors found by checkpatch.          COMMENTED   
5               LGTM.  No errors found by checkpatch.          COMMENTED   
6               LGTM.  No errors found by checkpatch.          COMMENTED   
8                                                              COMMENTED   
9                                                              COMMENTED   
10                                                             COMMENTED   
11              LGTM.  No errors found by checkpatch.          COMMENTED   
12              LGTM.  No errors found by checkpatch.          COMMENTED   
13          

In [20]:
def get_reviews_for_pull_requests(pull_numbers: list, repo, owner, limit=100):
    """Collect the reviews of up to `limit` pull requests into one DataFrame.

    Args:
        pull_numbers: pull request numbers; only the first `limit` are queried.
        repo: repository name passed to get_pull_request_reviews.
        owner: repository owner passed to get_pull_request_reviews.
        limit: maximum number of pull requests to query.

    Returns:
        The concatenated reviews. A pull request whose lookup raises TypeError
        is skipped. Any other error stops the loop, and the reviews gathered so
        far are returned (previously they were discarded and None was
        returned). When nothing was collected an empty DataFrame with the
        review columns is returned instead of letting pd.concat raise.
    """
    # Columns produced by get_pull_request_reviews; used only for the empty result
    # so downstream column lookups such as reviews['user_url'] keep working.
    review_columns = ['body', 'state', 'submitted_at', 'user_url', 'username']
    reviews_frames = []
    for pull_number in pull_numbers[:limit]:
        print('getting reviews for pull number %s' % pull_number)
        try:
            reviews = get_pull_request_reviews(pull_number, repo, owner)
        except TypeError as error:
            # Raised for an empty query result: skip this pull request.
            print(error)
            continue
        except Exception as error:
            # Unexpected failure: stop querying but keep what we already have.
            print(error)
            break
        if reviews is not None and not reviews.empty:
            reviews_frames.append(reviews)

    if not reviews_frames:
        return pd.DataFrame(columns=review_columns)
    return pd.concat(reviews_frames)

reviews_data = get_reviews_for_pull_requests(pull_requests['pull_number'].to_list(), test_repo, test_owner, limit=100)


getting reviews for pull number 9864
getting reviews for pull number 9863
getting reviews for pull number 9862
getting reviews for pull number 9861
getting reviews for pull number 9860

getting reviews for pull number 9859
getting reviews for pull number 9858
getting reviews for pull number 9857
getting reviews for pull number 9856
getting reviews for pull number 9855
getting reviews for pull number 9854
getting reviews for pull number 9853
getting reviews for pull number 9852
getting reviews for pull number 9850
getting reviews for pull number 9849
getting reviews for pull number 9848
getting reviews for pull number 9843
getting reviews for pull number 9842
getting reviews for pull number 9841
getting reviews for pull number 9839

getting reviews for pull number 9838
getting reviews for pull number 9837
getting reviews for pull number 9834
getting reviews for pull number 9830
getting reviews for pull number 9829
getting reviews for pull number 9828
getting reviews for pull number 9825

In [21]:
# Work on a copy so the fetched reviews_data frame is left untouched.
reviews = reviews_data.copy()
# Derive usernames from the reviewers' API URLs.
reviews['username'] = get_username_from_column(reviews['user_url'].to_list())
print(reviews.tail(1))

                                                body      state  \
0  LGTM.  No errors found by checkpatch.\n\nFYI: ...  COMMENTED   

           submitted_at                                 user_url    username  
0  2022-06-28T06:02:36Z  https://api.github.com/users/daosbuild1  daosbuild1  


In [22]:
def aggregate_user_activity(row, commits: pd.DataFrame, reviews: pd.DataFrame, pull_requests: pd.DataFrame):
    """Attach to `row` the commits, pull requests and reviews matching its username.

    Each of the three frames is filtered on its own `username` column and the
    filtered frame is stored on `row` under 'commits', 'pull_requests' and
    'reviews' (in that order). The same `row` object is returned.
    """
    login = row['username']
    sources = (
        ('commits', commits),
        ('pull_requests', pull_requests),
        ('reviews', reviews),
    )
    for field, frame in sources:
        row[field] = frame[frame['username'] == login]
    return row

def get_user_activity(users: pd.DataFrame, commits: pd.DataFrame, reviews: pd.DataFrame, pull_requests: pd.DataFrame, owner, repo):
    """Build one activity record per user and tag every record with owner and repo.

    Works purely on the frames passed in: each row of `users` is run through
    aggregate_user_activity, then `owner` and `repo` columns are added.
    """
    per_user = users.apply(
        aggregate_user_activity,
        axis=1,
        args=(commits, reviews, pull_requests),
    )
    per_user['owner'] = owner
    per_user['repo'] = repo
    return per_user

activities = get_user_activity(contributors, commits, reviews, pull_requests, test_owner, test_repo)

In [None]:
# Inspect the column dtypes of the aggregated activities frame.
activities.dtypes

email            object
id               object
name             object
type             object
url              object
username         object
commits          object
pull_requests    object
reviews          object
owner            object
repo             object
dtype: object

In [23]:
# Write the aggregated activities to user-activities.json, one JSON record per line.
activities.to_json('user-activities.json', orient='records', lines=True)