# GitHub & Government Loader

This notebook collects the <b>organizations</b> listed on the [GitHub & Government page](https://government.github.com/community/), together with their <b>repositories</b> and <b>contributors</b>, and stores them in the database.  
<hr>

In [None]:
import os

# Create a Personal Access Token at https://github.com/settings/tokens and expose it
# as the GITHUB_TOKEN environment variable so the secret is never saved in the notebook.
# The placeholder below is only used when that variable is not set.
access_token = os.environ.get("GITHUB_TOKEN", "<PERSONAL-ACCESS-TOKEN>")

#### Requirements

In [None]:
! pip install lxml
! pip install beautifulsoup4
! pip install iso3166

<hr>

#### GitHubApi instance

In [None]:
from lib.github import GitHubApi

# Client built from the personal access token; the cells below send their
# GitHub REST API requests through it.
github_api = GitHubApi(access_token)

#### Logger set up

In [None]:
import logging, os, datetime,sys
from pathlib import Path

# Log records go both to a per-day file under ./logs and to the cell output.
Path('logs').mkdir(parents=True,exist_ok=True)
log_file = os.path.join('logs','log-'+str(datetime.datetime.now().strftime("%d-%m-%Y")) +'.log')

# Logging Levels: DEBUG, INFO, WARNING, ERROR, CRITICAL
logging.basicConfig(
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s |:| LEVEL:%(levelname)-2s |:| FILE:notebook_1 (gov_org_loader).ipynb:%(lineno)-s |:| %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
    # Without force=True basicConfig() does nothing when the root logger already
    # has handlers (cell re-run, pre-configured kernel), and the FileHandler
    # created above would be opened but never used or closed.
    force=True,
)

# Keep urllib3's records out of the root logger's handlers.
logging.getLogger("urllib3").propagate = False

#### Get GitHub & Government list:

In [None]:
import requests
from bs4 import BeautifulSoup
import re


gh_gov_url='https://government.github.com/community/'

# Compiled once instead of on every loop iteration.
org_handle_re = re.compile(r'@([\w,-]+)')       # "@user-name" inside the org tile
category_re = re.compile(r'(.*)\([0-9]+\)')     # "Category name (42)" heading

# Fail loudly on network problems or HTTP errors instead of parsing an error page.
response = requests.get(gh_gov_url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml') #Obtain the BeautifulSoup object
orgs_names = soup.select('div.org-name') #Get GitHub organizations names
orgs=list()
for on in orgs_names: #For each org get username, section and category in the GitHub & Government page:
    handle = org_handle_re.search(on.text)
    section_tag = on.find_previous('h2')
    category_tag = on.find_previous('h3')
    if handle is None or section_tag is None or category_tag is None:
        # The page layout is not under our control: skip entries we cannot parse
        # rather than aborting the whole cell with an AttributeError.
        logging.warning('Skipping an organization entry that could not be parsed: "{}"'.format(on.text.strip()))
        continue

    # Headings normally look like "Name (N)"; fall back to the raw text otherwise.
    category_match = category_re.search(category_tag.text)
    category = category_match.group(1) if category_match is not None else category_tag.text

    orgs.append({
        'user_name': handle.group(1),
        'section': section_tag.text.strip(),
        'category': category.strip()
    })

# Report the entries that were actually parsed (these are organizations, not repositories).
logging.info('{} GitHub organizations related to Government agencies at the national, state and local level have been found.'.format(len(orgs)))

#### Get information about each organization

In [None]:
# Only the first MAX_ORGS organizations are processed by the following cells,
# each of which issues several GitHub API requests per organization.
MAX_ORGS = 5
orgs = orgs[:MAX_ORGS]

In [None]:
import time
import datetime
from iso3166 import countries


def _country_from_category(category):
    """Map a GitHub & Government category heading to a country name.

    Returns the ISO 3166 name when the category itself is a known country,
    'United States' when it contains 'U.S' or 'America', 'United Kingdom'
    when it contains 'U.K.', and '' otherwise.
    """
    if category in countries:
        return countries.get(category)[0]
    # Each substring needs its own membership test: "'U.S' or 'America' in x"
    # is always truthy and would label every category as United States.
    if 'U.S' in category or 'America' in category:
        return 'United States'
    if 'U.K.' in category:
        return 'United Kingdom'
    return ''


def _org_timestamp(iso_utc):
    """Convert a 'YYYY-MM-DDTHH:MM:SSZ' string to a float timestamp.

    The time part is parsed with an explicit %H:%M:%S (the locale-dependent %X
    is avoided).
    NOTE(review): time.mktime() treats the parsed fields as local time although
    the trailing 'Z' suggests UTC; kept so stored values do not shift - confirm
    whether calendar.timegm() is wanted instead.
    """
    parsed = datetime.datetime.strptime(iso_utc, "%Y-%m-%dT%H:%M:%SZ")
    return time.mktime(parsed.timetuple())


# Replace every scraped entry with a copy enriched with GitHub API data.
# Entries whose lookup fails are left untouched (and therefore have no 'url').
for position, org in enumerate(orgs):
    logging.info('Getting organization info from GitHub API for organization"{}"'.format(org['user_name']))
    try:
        data=github_api.req('https://api.github.com/orgs/{}'.format(org['user_name'].lower()))
        members= github_api.req('https://api.github.com/orgs/{}/public_members'.format(org['user_name']))

        created_at = data.get('created_at') or ''

        orgs[position]= org | {
            'url':'https://github.com/'+org['user_name'],
            # 'name' may be present but null: fall back to the login in that case too.
            'name': data.get('name') or data.get('login') or '',
            'location': data.get('location', ''),
            'repos': data.get('public_repos', ''),
            'followers': data.get('followers', ''),
            'created_at': created_at,
            'timestamp_created_at': _org_timestamp(created_at) if created_at else '',
            'country': _country_from_category(org['category']),
            # An error payload (dict with a 'message' key) must not be counted as members.
            # NOTE(review): only the first page of public members is requested here.
            'members': len(members) if 'message' not in members else 0
        }

    except Exception as e:
        logging.warning('A problem occur while using GitHub API for "{}" organization. Problem: {}'.format(org['user_name'],e))


### Store organizations in SQLite database

In [None]:
from lib.sqlite_utils import DBConnection 
import os

# The database file must already exist; this cell only opens it.
db_path = os.path.join('database','database.sqlite')
if not os.path.exists(db_path):
    raise Exception('Database does not exists! You need to create it first (db_builder.ipynb)')

conn = DBConnection(db_path)
# Sent before any row is inserted through this connection.
conn.query("PRAGMA foreign_keys = ON;")

logging.info('Connected with "database/database.sqlite" database.')

# One row per organization in the 'organization' table.
for org in orgs:
    conn.add('organization', org)

logging.info('Added {} organizations to the database.'.format(len(orgs)))

#### Get repositories info for each organization

In [None]:
import time
import datetime
from datetime import date

res_per_page,starting_page= 50,1


def _repo_timestamp(iso_utc):
    """Convert a 'YYYY-MM-DDTHH:MM:SSZ' string to a float timestamp.

    NOTE(review): time.mktime() treats the parsed fields as local time although
    the trailing 'Z' suggests UTC; kept so stored values do not shift - confirm.
    """
    parsed = datetime.datetime.strptime(iso_utc, "%Y-%m-%dT%H:%M:%SZ")
    return time.mktime(parsed.timetuple())


def _repo_record(repo, org_user_name, org_url):
    """Build the 'repository' row for one entry of the organization repos listing.

    Missing text fields default to '', missing counters to -1.
    """
    name = repo.get('name', '')
    return {
        'name': name,
        'organization': org_url,
        'description': repo.get('description', ''),
        'url': repo['html_url'] if 'html_url' in repo else 'https://github.com/{}/{}'.format(org_user_name, name),
        'is_fork': 1 if repo.get('fork', False) else 0,
        'size': repo.get('size', -1),
        'stars': repo['stargazers_count'] if 'stargazers_count' in repo else repo.get('watchers', -1),
        'watcher': repo.get('subscribers_count', -1),
        'language': repo.get('language', ''),
        'forks': repo.get('forks_count', -1),
        # Read the same key that is tested; fall back to the older 'open_issues' name.
        'open_issues': repo['open_issues_count'] if 'open_issues_count' in repo else repo.get('open_issues', -1),
        'created_at': repo.get('created_at', ''),
        'updated_at': repo.get('updated_at', ''),
        'stored_at': str(date.today()),
        'default_branch': repo.get('default_branch', 'master'),
        'timestamp_created_at': _repo_timestamp(repo['created_at']) if repo.get('created_at') else '',
        'timestamp_updated_at': _repo_timestamp(repo['updated_at']) if repo.get('updated_at') else ''
    }


for org in orgs:
    organization = org['user_name']
    # Always define the list so later cells can iterate it even if requests fail.
    org['repos_list'] = list()

    if 'url' not in org:
        # 'url' is only added when the organization lookup succeeded; without it
        # the repositories cannot reference their organization.
        logging.warning('Organization "{}" has no "url" field. Skipping its repositories.'.format(organization))
        continue

    # Pagination restarts for every organization; a shared counter would make
    # every organization after the first begin at a late page and lose repositories.
    page = starting_page
    while True:
        try:
            repos = github_api.req(f'https://api.github.com/orgs/{organization}/repos?page={page}&per_page={res_per_page}')
        except Exception:
            logging.warning('A problem occurs while using GitHub API for organization "{}". Skipping to the next organization.'.format(org['user_name']))
            break

        # Stop on an empty page or on an error payload (dict carrying a 'message' key).
        if not repos or 'message' in repos:
            break

        for repo in repos:
            logging.info('Getting info about "{}" repository of "{}" organization  ...'.format(repo['name'] if 'name' in repo else '-', org['user_name']))
            org['repos_list'].append(_repo_record(repo, org['user_name'], org['url']))
        page += 1


#### Store each repository of each organization

In [None]:
from lib.sqlite_utils import DBConnection 
import os

for org in orgs:
    # An organization may have no 'repos_list' key when its repository listing
    # could not be fetched; treat that as "no repositories" instead of raising KeyError.
    for repo in org.get('repos_list', []):
        conn.add('repository',repo)
logging.info(f'Stored all repositories into the database.')

#### Collect contributors for each repository

In [None]:
from collections import Counter, defaultdict
from dateutil.parser import parse
from time import strptime,strftime
from lib.github import get_contributors, get_commits, get_pullrequests


def _commit_author_key(commit):
    """Return the identity used to match a commit to a contributor.

    This is the GitHub login of the linked account when there is one, otherwise
    the author name recorded in the commit itself.
    """
    account = commit.get('author')
    if account is not None and 'login' in account:
        return account['login']
    return commit['commit']['author'].get('name')


users = list()
contribs = list()
user_metadata = dict()  # login -> profile payload, so each user is requested once

# (repository row, organization login) pairs for every collected repository.
instances = list()
for org in orgs:
    # .get(): the key is absent when the repository listing of the organization failed.
    instances.extend((r, org['user_name']) for r in org.get('repos_list', []))

for repo,organization in instances:
    try:
        branches = github_api.req('https://api.github.com/repos/{}/{}/branches'.format(organization,repo['name']))
    except Exception as e:
        logging.warning('Cannot use GitHub API for collecting branches of repository "{}" (Organization "{}"). Error:\n{}'.format(repo['name'],organization,e))
        continue
    branches = branches if 'message' not in branches else []

    # Nothing to collect for repositories without branches or with size 0.
    if len(branches)==0 or repo['size']==0:
        continue

    try:
        contributors = get_contributors(organization,repo['name'],github_api)
        commits = get_commits(organization,repo['name'],github_api)
        pull_requests = get_pullrequests(organization,repo['name'],github_api)
    except Exception as e:
        # Same policy as for branches: skip this repository, keep what was collected so far.
        logging.warning('Cannot collect contributors, commits or pull requests of repository "{}" (Organization "{}"). Error:\n{}'.format(repo['name'],organization,e))
        continue

    # Keep only entries that carry the fields used below.
    contributors = [user for user in contributors if 'login' in user]
    pull_requests = [pull for pull in pull_requests if 'user' in pull and 'login' in pull['user'] and 'state' in pull]
    commits = [com for com in commits if 'commit' in com and 'author' in com['commit'] and 'date' in com['commit']['author'] and ('login' in com['commit']['author'] or 'name' in com['commit']['author'])]

    # Group commit days by author once per repository instead of rescanning
    # every commit for every contributor.
    commit_days = defaultdict(list)
    for commit in commits:
        commit_days[_commit_author_key(commit)].append(parse(commit['commit']['author']['date']).date())

    for user in contributors:
        login = user['login']

        if login not in user_metadata:
            try:
                user_metadata[login] = github_api.req('https://api.github.com/users/{}'.format(login))
            except Exception as e:
                logging.warning('Cannot use GitHub API for collecting metadata of user "{}". Error:\n{}'.format(login,e))
                continue
            metadata = user_metadata[login]
            users.append({
                'user_name': metadata.get('login', login),
                'created_at': metadata.get('created_at', ''),
                'updated_at': metadata.get('updated_at', ''),
                'name': metadata.get('name', ''),
                'company': metadata.get('company', ''),
                'location': metadata.get('location', ''),
                'followers': metadata.get('followers', ''),
                'following': metadata.get('following', ''),
                'email': metadata.get('email', ''),
                'bio': metadata.get('bio', ''),
                'repos': metadata.get('public_repos', ''),
                'twitter_username': metadata.get('twitter_username', ''),
            })
        metadata = user_metadata[login]

        days = commit_days.get(login, [])

        # A closed pull request counts as rejected when it was never merged. The
        # 'merged_at' key is present even for unmerged pull requests (its value
        # is null), so the value has to be tested, not the presence of the key.
        rejected = sum(
            1 for pull in pull_requests
            if pull['user']['login']==login and pull['state']=='closed' and pull.get('merged_at') is None
        )

        contribs.append({
            'user_name': metadata.get('login', login),
            'repository': repo['url'],
            'contributions': user.get('contributions', 0),
            'rejected_pull_requests': rejected,
            'max_commit_in_a_day': max(Counter(days).values()) if len(days)>0 else '',
            'first_commit_date': str(min(days).strftime('%d/%m/%Y') if len(days)>0 else "")
        })
    logging.info('Collected {} contributors for repository "{}" ...'.format(len(contributors), repo['url']))


#### Store users and contributors in database

In [None]:
# Write all user rows first, then the contributor rows.
for table, rows in (('user', users), ('contributor', contribs)):
    for row in rows:
        conn.add_or_update(table, row)

#### Close database

In [None]:
# Close the connection opened in the "Store organizations" cell; run this last.
conn.close()