# Organization Loader

This notebook is used to store information about an <b>organization</b>, its <b>repositories</b> and their <b>contributors</b> in the database. Set the organization username below and run the notebook.
<hr>

In [None]:
# --- Notebook parameters: edit these before running the notebook ---

# GitHub username (login) of the organization to load.
organization = "italia"

# Optional: the country of the organization. When left empty, the notebook
# tries to derive it from the location returned by the GitHub API.
country = ''

# Personal Access Token used for the GitHub API requests
# (create one at https://github.com/settings/tokens).
access_token = "<PERSONAL-ACCESS-TOKEN>"

#### Requirements

In [None]:
#! pip install iso3166 python-dateutil

<hr>

#### Logger set up

In [None]:
import logging, os, datetime,sys
from pathlib import Path
# The file handler below writes into ./logs, so the directory has to exist first.
log_directory = Path('logs')
log_directory.mkdir(parents=True, exist_ok=True)

# One log file per calendar day: logs/log-<dd-mm-YYYY>.log
day_stamp = datetime.datetime.now().strftime("%d-%m-%Y")
daily_log_file = os.path.join('logs', 'log-' + day_stamp + '.log')

# Every record goes both to the daily file and to stdout (the notebook output).
# Available levels: DEBUG, INFO, WARNING, ERROR, CRITICAL
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s |:| LEVEL:%(levelname)-2s |:| FILE:notebook_1 (org_loader).ipynb:%(lineno)-s |:| %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler(daily_log_file),
        logging.StreamHandler(sys.stdout),
    ],
)

# Keep urllib3 records out of the root logger's handlers.
logging.getLogger("urllib3").propagate = False

#### GitHubApi instance

In [None]:
# GitHubApi (defined in lib/github) is the class this notebook uses to call the
# GitHub API endpoints. It is built from the Personal Access Token set in the
# parameters cell (tokens can be created at https://github.com/settings/tokens).
from lib.github import GitHubApi
# Shared client: every later cell sends its requests through github_api.req(...).
github_api=GitHubApi(access_token)

#### Get organization info

In [None]:
import time
import datetime
from iso3166 import countries

# Build the `org` record (one row of the 'organization' table) from the GitHub API.
# Any failure is logged at CRITICAL level and re-raised, so the notebook stops
# here instead of storing a half-empty organization.
org= None
try:
    logging.info(f'Getting organization info from GitHub API for organization"{organization}"')
    data=github_api.req('https://api.github.com/orgs/{}'.format(organization.lower()))

    # An API error (e.g. unknown organization) comes back as a payload with a
    # 'message' key and none of the organization fields: fail loudly instead of
    # silently building a record made of empty strings.
    if 'message' in data and 'login' not in data:
        raise ValueError('GitHub API returned an error: {}'.format(data['message']))

    # The public members endpoint is paginated; walk every page so that the
    # member count is not capped at the size of the first page.
    members = []
    members_page, members_per_page = 1, 100
    while True:
        batch = github_api.req('https://api.github.com/orgs/{}/public_members?page={}&per_page={}'.format(organization, members_page, members_per_page))
        if not isinstance(batch, list):
            # Error payload: keep what was collected so far.
            break
        members.extend(batch)
        if len(batch) < members_per_page:
            # A short (or empty) page is the last one.
            break
        members_page += 1

    # Country: the value set in the parameters cell wins; otherwise try to
    # resolve the organization location through the iso3166 lookup.
    location = data.get('location', '')
    if country != '':
        org_country = country
    elif location and location in countries:
        org_country = countries.get(location)[0]
    else:
        org_country = ''

    org= {
        'user_name': organization,
        'url':'https://github.com/'+organization,
        'name': data.get('name', data.get('login', '')),
        'location': location,
        'repos': data.get('public_repos', ''),
        'followers': data.get('followers', ''),
        'created_at': data.get('created_at', ''),
        # Explicit %H:%M:%S instead of %X: %X follows the current locale and
        # may not match GitHub's fixed "HH:MM:SS" format.
        # NOTE(review): time.mktime() reads the (UTC) value as local time, so the
        # result depends on the machine time zone - confirm before changing it,
        # other stored timestamps may follow the same convention.
        'timestamp_created_at': time.mktime(datetime.datetime.strptime( data['created_at'], "%Y-%m-%dT%H:%M:%SZ").timetuple()) if 'created_at' in data else '',
        'country': org_country,
        'members': len(members),
        'is_verified': data.get('is_verified', '')
    }
except Exception as e:
    logging.critical('A problem occur while using GitHub API for "{}" organization. Problem: {}'.format(organization,e))
    raise Exception ('A problem occur while using GitHub API for "{}" organization.'.format(organization)) from e
    

#### Store organization info in the database

In [None]:
from lib.sqlite_utils import DBConnection 
import os

# The SQLite file is expected to exist already (it is created by db_builder.ipynb);
# this notebook only adds or updates rows in it.
database_file = os.path.join('database', 'database.sqlite')
database_missing = not os.path.exists(database_file)
if database_missing:
    raise Exception('Database does not exists! You need to create it first (db_builder.ipynb)')

# `conn` stays open and is reused by the storing cells further down.
conn = DBConnection(database_file)
logging.info('Connected with "database/database.sqlite" database.')

# Store the organization record built in the previous cell.
conn.add_or_update('organization', org)
logging.info(f'Added organization "{organization}" to the database.')

#### Get repositories info

In [None]:
from datetime import date

# Collect one record per repository of the organization into `instances`
# (the rows later stored in the 'repository' table), walking the paginated
# "organization repositories" endpoint page by page.
res_per_page,starting_page= 20,1


def _github_timestamp(value):
    """Convert a GitHub 'YYYY-MM-DDTHH:MM:SSZ' string into a numeric timestamp.

    The time part is parsed with an explicit %H:%M:%S (not the locale-dependent %X).
    NOTE(review): time.mktime() reads the (UTC) value as local time, so the result
    depends on the machine time zone - kept as is, confirm before changing.
    """
    return time.mktime(datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ").timetuple())


instances = list()

while True:
    repos=github_api.req(f'https://api.github.com/orgs/{organization}/repos?page={starting_page}&per_page={res_per_page}')

    # A page of results is a list. Anything else is an API error payload:
    # stop instead of iterating over its keys and requesting pages forever.
    if not isinstance(repos, list):
        logging.error('Unexpected response while listing repositories of "{}": {}'.format(organization, repos))
        break
    # An empty page means that every repository has been read.
    if len(repos)==0:
        break

    for repo in repos:
        logging.info('Getting info about "{}" repository ...'.format(repo.get('name', '-')))
        instances.append({
            'name': repo.get('name', ''),
            'organization':org['url'],
            'description': repo.get('description', ''),
            'url': repo['html_url'] if 'html_url' in repo else 'https://github.com/{}/{}'.format(org['user_name'],repo.get('name', '')),
            'is_fork': 1 if repo.get('fork', False) else 0,
            # Numeric fallback (-1) like the other counters, not the string '-1'.
            'size': repo.get('size', -1),
            'stars': repo.get('stargazers_count', repo.get('watchers', -1)),
            'watcher': repo.get('subscribers_count', -1),
            'language': repo.get('language', ''),
            'forks': repo.get('forks_count', -1),
            # Read the same key that is checked ('open_issues_count').
            'open_issues': repo.get('open_issues_count', -1),
            'created_at': repo.get('created_at', ''),
            'updated_at': repo.get('updated_at', ''),
            'stored_at': str(date.today()),
            'default_branch': repo.get('default_branch', 'master'),
            'timestamp_created_at': _github_timestamp(repo['created_at']) if 'created_at' in repo else '',
            'timestamp_updated_at': _github_timestamp(repo['updated_at']) if 'updated_at' in repo else ''
        })
    starting_page+=1

#### Store repositories in the database

In [None]:
# Write every repository record collected above into the 'repository' table,
# one add_or_update call per record, in collection order.
logging.info(f'Storing repositories information for organization "{organization}"')
for repository_row in instances:
    conn.add_or_update('repository', repository_row)


#### Get contributors for each repository

In [None]:
from collections import Counter
from dateutil.parser import parse
from time import strptime,strftime
from lib.github import get_contributors, get_commits, get_pullrequests

# For every non-empty repository collect:
#   - `users`:    one profile record per contributor (rows of the 'user' table)
#   - `contribs`: one record per (contributor, repository) pair with the
#                 contribution statistics (rows of the 'contributor' table)
users = list()
contribs = list()

# Contributor login -> user name stored for it. A profile is requested from the
# API only the first time a login is met, instead of once per repository.
fetched_user_names = {}

for repo in instances:
    branches = github_api.req('https://api.github.com/repos/{}/{}/branches'.format(organization,repo['name']))
    # A response containing 'message' is an API error: treat it as "no branches".
    branches = branches if 'message' not in branches else []

    # Nothing to analyse for repositories without branches or without content.
    if len(branches)==0 or repo['size']==0:
        continue

    contributors = get_contributors(organization,repo['name'],github_api)
    commits = get_commits(organization,repo['name'],github_api)
    pull_requests = get_pullrequests(organization,repo['name'],github_api)

    # Keep only the entries carrying the fields that are read below.
    contributors = [user for user in contributors if 'login' in user]
    pull_requests = [pull for pull in pull_requests if 'user' in pull and 'login' in pull['user'] and 'state' in pull]
    commits = [com for com in commits if 'commit' in com and 'author' in com['commit'] and 'date' in com['commit']['author'] and ('login' in com['commit']['author'] or 'name' in com['commit']['author'])] 

    # Group the commit days by author once per repository (instead of scanning
    # all commits again for every contributor). A commit is attributed to the
    # GitHub login of its author when available, else to the git author name.
    commit_days_by_author = {}
    for com in commits:
        account = com.get('author')
        if account is not None and 'login' in account:
            author_key = account['login']
        else:
            author_key = com['commit']['author'].get('name')
        commit_day = parse(com['commit']['author']['date']).date()
        commit_days_by_author.setdefault(author_key, []).append(commit_day)

    # Rejected pull requests per login: closed without having been merged.
    # The API always returns the 'merged_at' key (null when not merged), so the
    # value has to be tested, not the presence of the key.
    rejected_pulls = Counter(
        pull['user']['login']
        for pull in pull_requests
        if pull['state']=='closed' and pull.get('merged_at') is None
    )

    for user in contributors:
        login = user['login']

        if login not in fetched_user_names:
            metadata = github_api.req('https://api.github.com/users/{}'.format(login))
            if not isinstance(metadata, dict):
                metadata = {}
            # Fall back to the contributor login when the profile request fails.
            fetched_user_names[login] = metadata.get('login', login)

            users.append({
                'user_name': fetched_user_names[login],
                'created_at': metadata.get('created_at', ''),
                'updated_at': metadata.get('updated_at', ''),
                'name': metadata.get('name', ''),
                'company': metadata.get('company', ''),
                'location': metadata.get('location', ''),
                'followers': metadata.get('followers', ''),
                'following': metadata.get('following', ''),
                'email': metadata.get('email', ''),
                'bio': metadata.get('bio', ''),
                'repos': metadata.get('public_repos', ''),
                'twitter_username': metadata.get('twitter_username', ''),
            })

        commit_days = commit_days_by_author.get(login, [])

        contribs.append({
            'user_name': fetched_user_names[login],
            'repository': repo['url'],
            'contributions': user['contributions'],
            'rejected_pull_requests': rejected_pulls[login],
            # Highest number of commits made on a single day.
            'max_commit_in_a_day': Counter(commit_days).most_common(1)[0][1] if commit_days else '',
            'first_commit_date': min(commit_days).strftime('%d/%m/%Y') if commit_days else ""
        })
    logging.info('Collected {} contributors for repository "{}" ...'.format(len(contributors), repo['url']))



#### Store users and contributors in the database

In [11]:
# Persist the collected records: first every user profile into the 'user'
# table, then every contribution record into the 'contributor' table.
for table_name, rows in (('user', users), ('contributor', contribs)):
    for row in rows:
        conn.add_or_update(table_name, row)


#### Close database

In [None]:
# Close the DBConnection opened when the organization was stored. Run this cell
# last: the storing cells above all go through `conn`.
conn.close()