### Metric Calculations
First we need to open a connection to the proper Elasticsearch (ES) instance. We use an external module to load credentials from a file that will not be shared. If you want to run this notebook, please use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.

In [1]:
import pandas

import plotly as plotly
import plotly.graph_objs as go

import util as ut

from util import ESConnection

# Connection object shared by every query cell below (passed to ut.create_search).
# NOTE(review): presumably reads the credentials from the '.settings' file
# described above -- confirm in util.ESConnection.
es_conn = ESConnection()

In [2]:
# Lower bound (inclusive, used as 'gte') of the date range filters applied
# by the time-based query cells below.
initial_date = '2010-01-01'

# Git Activity
## Git: Total Number of commits authored
Commits are contributions in terms of Git. Looking at them we can measure not only global activity of projects and organizations, but also how these projects and organizations evolve through time.

In [3]:
s = ut.create_search(es_conn, 'git')

# Unique count of Commits by Project
# Date window: from `initial_date` (inclusive) up to 'now/y' (exclusive),
# where 'now/y' is Elasticsearch date math for "now, rounded to the year".
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
# Buckets: project -> quarter; metric: number of distinct commit hashes.
s.aggs.bucket('project', 'terms', field='project', size=100000)\
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')\
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)

result = s.execute()

# Turn the nested aggregation result into a frame with the columns
# 'Project', 'Time' and 'Commits'.
projects_df = ut.stack_by(result=result, group_column='Project', subgroup_column='Time', value_column='Commits',
                          group_field='project', subgroup_field='time', value_field='contributions')

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Largest values first. `ascending` is documented as a bool, so pass False
# explicitly instead of the integer 0.
projects_df = projects_df.sort_values(by='Commits', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Commits', group_column='Project')

## Git: Number of commits authored by Non-Employees

In [4]:
s = ut.create_search(es_conn, 'git')

# Unique count of Commits by Project (NON_EMPLOYEES ONLY)
# Same date window as the total-commits query, but commits whose author
# organization is one of the listed staff organizations are excluded.
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})\
        .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])

# Buckets: project -> quarter; metric: number of distinct commit hashes.
s.aggs.bucket('project', 'terms', field='project', size=100000)\
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')\
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)

result = s.execute()

# Turn the nested aggregation result into a frame with the columns
# 'Project', 'Time' and 'Commits'.
projects_df = ut.stack_by(result=result, group_column='Project', subgroup_column='Time', value_column='Commits',
                          group_field='project', subgroup_field='time', value_field='contributions')

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Largest values first. `ascending` is documented as a bool, so pass False
# explicitly instead of the integer 0.
projects_df = projects_df.sort_values(by='Commits', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Commits',
                     group_column='Project')

## Git: Number of commits authored by Employees

In [5]:
s = ut.create_search(es_conn, 'git')

# Unique count of Commits by Project (EMPLOYEES ONLY)
# Same date window as the total-commits query, restricted to commits whose
# author organization is one of the listed staff organizations.
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})\
        .filter('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])

# Buckets: project -> quarter; metric: number of distinct commit hashes.
s.aggs.bucket('project', 'terms', field='project', size=100000)\
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')\
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)

result = s.execute()

# Turn the nested aggregation result into a frame with the columns
# 'Project', 'Time' and 'Commits'.
projects_df = ut.stack_by(result=result, group_column='Project', subgroup_column='Time', value_column='Commits',
                          group_field='project', subgroup_field='time', value_field='contributions')

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Largest values first. `ascending` is documented as a bool, so pass False
# explicitly instead of the integer 0.
projects_df = projects_df.sort_values(by='Commits', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Commits',
                     group_column='Project')

**Figure above: commits by project using Spreadsheet based project grouping**

## Git: Number of commits authored by Organization 

In [6]:
# Unique count of commits by author organization (10 largest organizations),
# restricted to the [initial_date, 'now/y') window.
s = ut.create_search(es_conn, 'git').filter(
    'range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# Buckets: organization -> quarter; metric: number of distinct commit hashes.
(s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=10)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000))

result = s.execute()

# Regroup organizations into 'Employees' (the listed staff organizations) and
# the rest; the helper is what produces the "<org> -> <group>" lines shown in
# the cell output.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization', group_field='organizations',
    subgroup_column='Time', subgroup_field='time',
    value_column='# Contributions', metric_field='contributions',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'], staff_org='Employees',
)

# Plot it
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='# Contributions',
    group_column='Organization',
)

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


## GitHub: Issues and Pull Requests

In [None]:
### GET AUTHORS IN GITHUB, THEIR PROJECTS AND NUMBER OF ISSUES AND PR's
# Index queried by BOTH searches below; kept in one place so the PR and the
# issue queries cannot drift apart.
source = 'github_issues'

# PULL REQUESTS
s_prs = ut.create_search(es_conn, source)

# Get only PRs
s_prs = s_prs.filter('terms', pull_request=['true'])

s_prs = s_prs.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
# Buckets: author (author_uuid) -> project
s_prs.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000) \
    .bucket('project', 'terms', field='project', size=100)
result_prs = s_prs.execute()


# ISSUES
s_iss = ut.create_search(es_conn, source)

# Get only Issues
s_iss = s_iss.filter('terms', pull_request=['false'])

s_iss = s_iss.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
# Buckets: author (author_uuid) -> project
s_iss.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000) \
    .bucket('project', 'terms', field='project', size=100)
result_iss = s_iss.execute()


# MERGE PR's and ISSUES

prs_df = ut.stack_by(result=result_prs, group_column='uuid', subgroup_column='github project',
                     value_column='pull requests', group_field='uuid', subgroup_field='project')
iss_df = ut.stack_by(result=result_iss, group_column='uuid', subgroup_column='github project',
                     value_column='issues', group_field='uuid', subgroup_field='project')

# Outer join keeps (author, project) pairs that appear on only one side;
# the resulting missing counts are replaced by 0.
github_df = pandas.merge(prs_df, iss_df, how='outer', on=['uuid', 'github project'])
github_df = github_df.fillna(0)

In [17]:
# Pull requests by project and state (at most 100 projects)
s_prs = ut.create_search(es_conn, 'github_issues').filter('terms', pull_request=['true'])
(s_prs.aggs
    .bucket('projects', 'terms', field='project', size=100)
    .bucket('status', 'terms', field='state', size=100))
result_prs = s_prs.execute()

# Issues by project and state (at most 100 projects)
s_iss = ut.create_search(es_conn, 'github_issues').filter('terms', pull_request=['false'])
(s_iss.aggs
    .bucket('projects', 'terms', field='project', size=100)
    .bucket('status', 'terms', field='state', size=100))
result_iss = s_iss.execute()

In [18]:
# One frame per item type, both keyed by ('Project', 'Status')
prs_df = ut.stack_by(
    result=result_prs,
    group_column='Project', group_field='projects',
    subgroup_column='Status', subgroup_field='status',
    value_column='# Pull Requests',
)
iss_df = ut.stack_by(
    result=result_iss,
    group_column='Project', group_field='projects',
    subgroup_column='Status', subgroup_field='status',
    value_column='# Issues',
)

# Outer join so pairs present on only one side are kept; their missing
# counts become 0. The bare name on the last line displays the table.
joined_df = pandas.merge(prs_df, iss_df, how='outer', on=['Project', 'Status']).fillna(0)
joined_df

Unnamed: 0,Project,Status,# Pull Requests,# Issues
0,Unknown,closed,63523.0,34378.0
1,Unknown,open,1425.0,9558.0
2,Rust,closed,26838.0,21483.0
3,Rust,open,217.0,4718.0
4,Web Properties,closed,25822.0,6020.0
5,Web Properties,open,204.0,1530.0
6,Servo,closed,16119.0,6558.0
7,Servo,open,166.0,2663.0
8,Addons/Web Extensions,closed,9001.0,3639.0
9,Addons/Web Extensions,open,49.0,696.0


### GitHub: Issues and Pull Requests by Organization

Below we show the number of Pull Requests and Issues, open and closed, **by Organization**:


In [19]:
# Pull requests by author organization and state (at most 100 organizations)
s_prs = ut.create_search(es_conn, 'github_issues').filter('terms', pull_request=['true'])
(s_prs.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100))
result_prs = s_prs.execute()

# Issues by author organization and state (at most 100 organizations)
s_iss = ut.create_search(es_conn, 'github_issues').filter('terms', pull_request=['false'])
(s_iss.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100))
result_iss = s_iss.execute()

In [20]:
# One frame per item type, both keyed by ('Organization', 'Status')
prs_df = ut.stack_by(
    result=result_prs,
    group_column='Organization', group_field='organizations',
    subgroup_column='Status', subgroup_field='status',
    value_column='# Pull Requests',
)
iss_df = ut.stack_by(
    result=result_iss,
    group_column='Organization', group_field='organizations',
    subgroup_column='Status', subgroup_field='status',
    value_column='# Issues',
)

# Outer join so pairs present on only one side are kept; their missing
# counts become 0. The bare name on the last line displays the table.
joined_df = pandas.merge(prs_df, iss_df, how='outer', on=['Organization', 'Status']).fillna(0)
joined_df

Unnamed: 0,Organization,Status,# Pull Requests,# Issues
0,Mozilla Staff,closed,114319.0,58710.0
1,Mozilla Staff,open,943.0,14838.0
2,Community,closed,56098.0,36722.0
3,Community,open,1792.0,11243.0
4,Mozilla Reps,closed,93.0,218.0
5,Mozilla Reps,open,7.0,109.0


### GitHub: Issues and Pull Requests made by people hired by Mozilla

To compare contributors **hired by Mozilla** to the rest of contributors we first show a list of Organizations we are considering as 'Mozilla Staff' or 'Others'. Next a table is shown with aggregated numbers to compare both contributor groups.

In [21]:
# Pull requests by author organization and state (at most 100 organizations)
s_prs = ut.create_search(es_conn, 'github_issues').filter('terms', pull_request=['true'])
(s_prs.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100))
result_prs = s_prs.execute()

# Issues by author organization and state (at most 100 organizations)
s_iss = ut.create_search(es_conn, 'github_issues').filter('terms', pull_request=['false'])
(s_iss.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100))
result_iss = s_iss.execute()

In [22]:
# Each heading is printed right before the matching helper call, so the
# "<org> -> <group>" lines in the cell output end up under the right title.
print("\nPRS\n")
prs_df = ut.stack_by_cusum(
    result=result_prs,
    group_column='Organization', group_field='organizations',
    subgroup_column='Status', subgroup_field='status',
    value_column='# Pull Requests',
    staff_org_names=['Mozilla Staff'], staff_org='Employees',
)
print("\nISSUES\n")
iss_df = ut.stack_by_cusum(
    result=result_iss,
    group_column='Organization', group_field='organizations',
    subgroup_column='Status', subgroup_field='status',
    value_column='# Issues',
    staff_org_names=['Mozilla Staff'], staff_org='Employees',
)

# Outer join on ('Organization', 'Status'); missing counts become 0.
# The bare name on the last line displays the table.
joined_df = pandas.merge(prs_df, iss_df, how='outer', on=['Organization', 'Status']).fillna(0)
joined_df


PRS

Mozilla Staff -> Employees
Community -> Non-Employees
Mozilla Reps -> Non-Employees

ISSUES

Mozilla Staff -> Employees
Community -> Non-Employees
Mozilla Reps -> Non-Employees


Unnamed: 0,Organization,Status,# Pull Requests,# Issues
0,Employees,closed,114319.0,58710.0
1,Employees,open,943.0,14838.0
2,Non-Employees,closed,56191.0,36940.0
3,Non-Employees,open,1799.0,11352.0


## Bugzilla: Bugs by Project 

In [19]:
s = ut.create_search(es_conn, 'bugzilla')

# Count of Bugs by Project
# Date window: from `initial_date` (inclusive) up to 'now/y' (exclusive),
# applied to the bug creation timestamp.
s = s.filter('range', creation_ts={'gte': initial_date, 'lt': 'now/y'})
# Buckets: project -> quarter. No explicit metric is attached here.
s.aggs.bucket('project', 'terms', field='project', size=100000)\
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')

result = s.execute()

# NOTE(review): no value_field is passed, so stack_by presumably falls back
# to the per-bucket document count -- confirm in util.stack_by.
projects_df = ut.stack_by(result=result, group_column='Project', subgroup_column='Time', value_column='Bugs',
                          group_field='project', subgroup_field='time')

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Largest values first. `ascending` is documented as a bool, so pass False
# explicitly instead of the integer 0.
projects_df = projects_df.sort_values(by='Bugs', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Bugs', group_column='Project')

**Above: Bugzilla bugs over time by Project**

## Bugzilla: Bugs by Organization

In [16]:
# Bugs per author organization (at most 100 organizations) and quarter,
# restricted to the [initial_date, 'now/y') window on the creation timestamp.
s = ut.create_search(es_conn, 'bugzilla').filter(
    'range', creation_ts={'gte': initial_date, 'lt': 'now/y'})

(s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter'))

result = s.execute()

# Regroup organizations into 'Employees' (the listed staff organizations)
# and the rest.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization', group_field='organizations',
    subgroup_column='Time', subgroup_field='time',
    value_column='Bugs',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'], staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


In [17]:
# Stacked bars: one bar per 'Time' value, split by 'Organization'
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='Bugs',
    group_column='Organization',
)

**Above: Bugzilla bugs over time by organization**

# Mailing lists:
## Number of e-mails sent by project
### TODO: enrich index with project information


In [3]:
s = ut.create_search(es_conn, 'mbox')

# Count of e-mails by mailing list ('origin') and quarter; lists are mapped
# to projects further down, after the query.
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
s.aggs.bucket('origin', 'terms', field='origin', size=100000)\
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')

result = s.execute()

lists_df = ut.stack_by(result=result, group_column='List', subgroup_column='Time', value_column='E-mails',
                       group_field='origin', subgroup_field='time')

# Group By project
# Left join: every list row is kept even when the 'Mailing lists' table has
# no matching 'List' entry.
projects = ut.get_projects()
merged_df = lists_df.merge(projects['Mailing lists'], on='List', how='left')

# Per (Project, Time): total e-mails and number of list rows that contributed.
projects_df = merged_df.groupby(['Project', 'Time']).agg({'E-mails': 'sum', 'List': 'count'})
# Largest values first. `ascending` is documented as a bool, so pass False
# explicitly instead of the integer 0.
projects_df = projects_df.sort_values(by='E-mails', ascending=False)

# Plot it
# reset_index() turns the ('Project', 'Time') group keys back into columns.
ut.print_stacked_bar(df=projects_df.reset_index(), time_column='Time', value_column='E-mails',
                     group_column='Project')

## Number of e-mails sent by organization

In [10]:
# E-mails per author organization (at most 100 organizations) and quarter,
# restricted to the [initial_date, 'now/y') window.
s = ut.create_search(es_conn, 'mbox').filter(
    'range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

(s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter'))

result = s.execute()

# Regroup organizations into 'Employees' (the listed staff organization)
# and the rest.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization', group_field='organizations',
    subgroup_column='Time', subgroup_field='time',
    value_column='E-Mails',
    staff_org_names=['Mozilla Staff'], staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Mozilla Reps -> Non-Employees


In [11]:
# Stacked bars: one bar per 'Time' value, split by 'Organization'
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='E-Mails',
    group_column='Organization',
)

**Discourse**:
  * Number of threads initiated
    * https://analytics.mozilla.community:443/goto/71771202d68a10cc422c6bda86c7cf3e
  * Number of comments posted
    * https://analytics.mozilla.community:443/goto/73c76412902180d14e0418d03fb30884
  
These metrics will be computed for the specified contributor groups, over time.