### Metric Calculations
First we need to open a connection to the proper ES instance. We use an external module to load credentials from a file that will not be shared. If you want to run this notebook, please use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.

To work with **survey results**, this notebook expects to find the following files:
* **UUID's**: ../data/uuids.csv
* **SURVEY**: ../data/survey-fake.csv

**If you need to use a different survey file, just modify its path in the second code cell below this one.**

In [1]:
import pandas

import util as ut

from util import ESConnection

# Connection object handed to every ut.create_search() call in this notebook.
# Per the introduction above, credentials are read from a local '.settings'
# file (see 'settings.sample'); that loading happens inside the util module.
es_conn = ESConnection()

In [2]:
# Load the survey answers together with the (response id, uuid)
# correspondence file into a single dataframe

############# MODIFY YOUR SURVEY CSV FILE HERE ######################################
survey_df = ut.load_survey_df(
    survey_filepath='../data/survey-fake.csv',
    uuids_filepath='../data/uuids.csv',
)
#####################################################################################

# Project every query below is restricted to (see add_common_filters)
project_name = 'Rust'

# Rows vs. distinct uuids: a difference would reveal duplicated identities
print('Identities with UUID found in survey file: ', len(survey_df))
print('Unique identities found: ', len(survey_df['uuid'].unique()))
print()

def add_common_filters(source, s):
    """Apply the filters shared by every per-source query in this notebook.

    Args:
        source: name of the data source being queried. It is not used by the
            body; it is kept so existing call sites keep working.
        s: search object to narrow down.

    Returns:
        The search after `ut.add_survey_filters` (presumably restricting it to
        identities present in `survey_df` -- confirm in util) and after a
        `term` filter on `project` equal to the module-level `project_name`.
    """
    survey_filtered = ut.add_survey_filters(s, survey_df)
    return survey_filtered.filter('term', project=project_name)



Identities with UUID found in survey file:  327
Unique identities found:  327



In [3]:
### GET AUTHORS IN GIT, THEIR PROJECTS AND COMMITS
source = 'git'

# Search on the git index with the notebook-wide survey/project filters applied
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: author uuid -> project -> number of distinct commit hashes
uuid_bucket = s.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000)
project_bucket = uuid_bucket.bucket('project', 'terms', field='project', size=100000)
project_bucket.metric('commits', 'cardinality', field='hash', precision_threshold=1000000)

result = s.execute()

# Flatten the nested buckets into one row per (uuid, project) with its commit count
author_commits_df = ut.stack_by(
    result=result,
    group_field='uuid', subgroup_field='project', value_field='commits',
    group_column='uuid', subgroup_column='git project', value_column='commits',
)

git_author_count = len(author_commits_df['uuid'].unique())
print('AUTHORS FOUND IN GIT: ', git_author_count)

AUTHORS FOUND IN GIT:  25


In [4]:
### GET AUTHORS IN BUGZILLA, THEIR PROJECTS AND NUMBER OF BUGS
source = 'bugzilla'

# Search on the bugzilla index with the notebook-wide survey/project filters applied
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: author uuid -> project (no explicit metric is requested)
uuid_bucket = s.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000)
uuid_bucket.bucket('project', 'terms', field='project', size=100000)

result = s.execute()

# No value_field is passed, so stack_by falls back to its default value
# (presumably the bucket document count -- confirm in util)
author_bugs_df = ut.stack_by(
    result=result,
    group_field='uuid', subgroup_field='project',
    group_column='uuid', subgroup_column='bugzilla project', value_column='bugs',
)

bugzilla_author_count = len(author_bugs_df['uuid'].unique())
print('AUTHORS FOUND IN BUGZILLA: ', bugzilla_author_count)

AUTHORS FOUND IN BUGZILLA:  0


In [5]:
### GET AUTHORS IN MAILING LISTS, THEIR PROJECTS AND NUMBER OF EMAILS
source = 'mbox'

# Search on the mbox index with the notebook-wide survey/project filters applied
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: author uuid -> project (no explicit metric is requested)
uuid_bucket = s.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000)
uuid_bucket.bucket('project', 'terms', field='project', size=100000)

result = s.execute()

# No value_field is passed, so stack_by falls back to its default value
# (presumably the bucket document count -- confirm in util)
author_emails_df = ut.stack_by(
    result=result,
    group_field='uuid', subgroup_field='project',
    group_column='uuid', subgroup_column='mbox project', value_column='emails',
)

mbox_author_count = len(author_emails_df['uuid'].unique())
print('AUTHORS FOUND IN MBOX: ', mbox_author_count)

AUTHORS FOUND IN MBOX:  0


In [6]:
### GET AUTHORS IN DISCOURSE, THEIR PROJECTS AND NUMBER OF MESSAGES
source = 'discourse'

# Search on the discourse index with the notebook-wide survey/project filters applied
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: author uuid -> project (no explicit metric is requested)
uuid_bucket = s.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000)
uuid_bucket.bucket('project', 'terms', field='project', size=100000)

result = s.execute()

# No value_field is passed, so stack_by falls back to its default value
# (presumably the bucket document count -- confirm in util)
author_discourse_df = ut.stack_by(
    result=result,
    group_field='uuid', subgroup_field='project',
    group_column='uuid', subgroup_column='discourse project', value_column='messages',
)

discourse_author_count = len(author_discourse_df['uuid'].unique())
print('AUTHORS FOUND IN DISCOURSE: ', discourse_author_count)

AUTHORS FOUND IN DISCOURSE:  0


In [7]:
### GET AUTHORS IN GITHUB, THEIR PROJECTS AND NUMBER OF ISSUES AND PR's
source = 'github_issues'


def _github_items_search(pull_request_flag):
    """Build the author/project search over the GitHub index for one item kind.

    Args:
        pull_request_flag: value matched against the `pull_request` field;
            'true' keeps pull requests only, 'false' keeps issues only.

    Returns:
        A search on `source` with the common notebook filters, the
        `pull_request` filter and a uuid -> project terms aggregation attached.
        The search is not executed here.
    """
    search = ut.create_search(es_conn, source)
    search = add_common_filters(source, search)
    search = search.filter('terms', pull_request=[pull_request_flag])

    # Same bucket sizes as the other per-source cells, so project buckets are
    # never truncated at a smaller limit here than elsewhere.
    search.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000) \
        .bucket('project', 'terms', field='project', size=100000)
    return search


# PULL REQUESTS
s_prs = _github_items_search('true')
result_prs = s_prs.execute()

# ISSUES
s_iss = _github_items_search('false')
result_iss = s_iss.execute()


# MERGE PR's and ISSUES

prs_df = ut.stack_by(result=result_prs, group_column='uuid', subgroup_column='github project',
                     value_column='pull requests', group_field='uuid', subgroup_field='project')
iss_df = ut.stack_by(result=result_iss, group_column='uuid', subgroup_column='github project',
                     value_column='issues', group_field='uuid', subgroup_field='project')

# Outer merge keeps authors that only have PRs or only have issues;
# the missing counter for those rows is then set to 0.
github_df = pandas.merge(prs_df, iss_df, how='outer', on=['uuid', 'github project'])
github_df = github_df.fillna(0)

print('AUTHORS FOUND IN GITHUB: ', len(github_df['uuid'].unique()))

AUTHORS FOUND IN GITHUB:  28


In [8]:
# Stack the per-source dataframes, labelling each block of rows with its source
frames_by_source = [
    ('git', author_commits_df),
    ('mbox', author_emails_df),
    ('github', github_df),
    ('bugzilla', author_bugs_df),
    ('discourse', author_discourse_df),
]
authors_global_df = pandas.concat([frame for _, frame in frames_by_source],
                                  keys=[label for label, _ in frames_by_source])

# Per author: add up the activity counters and count distinct projects per source
counter_columns = ['emails', 'commits', 'pull requests', 'issues', 'bugs', 'messages']
project_columns = ['git project', 'mbox project', 'github project',
                   'bugzilla project', 'discourse project']

aggregations = {column: 'sum' for column in counter_columns}
aggregations.update({column: pandas.Series.nunique for column in project_columns})

grouped_df = authors_global_df.groupby(['uuid']).agg(aggregations)

# Most active committers first
grouped_df = grouped_df.reset_index().sort_values(by=['commits'], ascending=[False])

# Fix the column order used by the rest of the notebook
ordered_columns = ['uuid', 'commits', 'pull requests', 'issues', 'bugs', 'emails', 'messages',
                   'git project', 'github project', 'bugzilla project', 'mbox project',
                   'discourse project']
grouped_df = grouped_df[ordered_columns]

print('AUTHORS FOUND IN GIT, GITHUB, BUGZILLA, MBOX, DISCOURSE: ', len(grouped_df['uuid'].unique()))

# Authors missing from a source have no value for its columns; treat that as 0
grouped_df = grouped_df.fillna(0)

AUTHORS FOUND IN GIT, GITHUB, BUGZILLA, MBOX, DISCOURSE:  30


In [9]:
### USERS WITH NO ACTIVITY OR COLLABORATING IN UNTRACKED PROJECTS
# The gap between both numbers is the set of surveyed people without any
# activity in the queried indexes for the selected project
surveyed_uuids = survey_df['uuid'].unique()
active_uuids = grouped_df['uuid'].unique()
print('Authors matched in Survey: ', len(surveyed_uuids))
print('Authors found in indexes & projects: ', len(active_uuids))


Authors matched in Survey:  327
Authors found in indexes & projects:  30


# Compute Experience Dataframe

In [10]:
## COMPUTE EXPERIENCE
###
## GET COMMITS BY YEAR AND AUTHOR
###

# Minimum number of commits an author needs in a year for that year to count
# as active. Used both in the query and in the ACTIVE CONDITION below.
min_commits = 1

s = ut.create_search(es_conn, 'git')

s = add_common_filters('git', s)

# Bucketize by year and author uuid, then count distinct commit hashes per bucket
s.aggs.bucket('time', 'date_histogram', field='grimoire_creation_date', interval='year') \
    .bucket('uuid', 'terms', field='author_uuid', size=100000, min_doc_count=min_commits) \
    .metric('commits', 'cardinality', field='hash', precision_threshold=1000)

r = s.execute()

# One row per (uuid, year bucket) with the commit count for that year
exp_df = ut.to_simple_df_by_time(r, 'uuid', 'time', 'commits', 'uuid', 'time', 'commits')

# Both columns start as the year of the row; the groupby below turns them into
# the latest and earliest active year of each author
exp_df['last commit'] = exp_df['time'].apply(lambda x: str(pandas.Period(x,'A')))
exp_df['first commit'] = exp_df['last commit']

## ACTIVE CONDITION #################################
## Filter those having less than min_commits commits per year
exp_df = exp_df[exp_df['commits'] >= min_commits]
#####################################################

## Group by author, get MAX YEAR, MIN YEAR and NUMBER OF ROWS FOR THE GIVEN AUTHOR
exp_df = exp_df.groupby(['uuid']).agg({'last commit': 'max', 'first commit': 'min', 'commits': 'count'})

exp_df = exp_df.reset_index()

# 'commits' now holds the number of active years per author: that is the experience
exp_df = exp_df.rename(columns={'commits': 'exp'})


# Merge exp with the rest of the information we have split by uuid.
# Left merge: authors without git activity keep their row, with empty exp columns.
exp_merged_df = grouped_df.merge(exp_df, on=['uuid'], how='left')

print('Before merging:', len(grouped_df), 'After merging:', len(exp_merged_df))

Before merging: 30 After merging: 30


# Metrics

Take survey fields and compare them one to one with activity information from Bitergia's indexes.

In [11]:
## GET GIT COMMITS DISTRIBUTION
source = 'git'

# Commits of the selected project, leaving out authors whose organization is
# 'Mozilla Staff' or 'Code Sheriff'. The survey filters are not applied here.
s = ut.create_search(es_conn, source) \
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff']) \
    .filter('term', project=project_name)

# Bucketize by author uuid and count distinct commit hashes for each one
uuid_bucket = s.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000)
uuid_bucket.metric('commits', 'cardinality', field='hash', precision_threshold=1000000)

result = s.execute()

# One row per author uuid with its commit count
nonemp_commits_df = ut.to_simple_df(
    result,
    group_field='uuid', value_field='commits',
    group_column='uuid', value_column='commits',
)

print('Authors in Git(Non-employees):', len(nonemp_commits_df))

Authors in Git(Non-employees): 1924


In [12]:
survey_fields = ['age','country', 'gender', 'disability', 'education level', 'language', 'english proficiency']
activity_fields = ['commits', 'pull requests', 'issues',
                   'git project', 'github project',
                   'exp', 'first commit', 'last commit']

# Activity fields that get a hammer plot; every other field gets a histogram
counter_fields = ['commits', 'pull requests', 'issues', 'bugs', 'emails', 'messages']

# The set of authors does not change between plots, so compute it once
author_uuids = exp_merged_df['uuid'].unique()

for survey_field in survey_fields:
    for activity_field in activity_fields:
        # Maps each survey answer to the list of activity values of the
        # authors who gave that answer
        traces = {}
        for uuid in author_uuids:
            survey_value = survey_df[survey_df['uuid'] == uuid][survey_field].values[0]
            activity_value = exp_merged_df[exp_merged_df['uuid'] == uuid][activity_field].values[0]

            # Skip unanswered entries. pandas.isnull covers None and also NaN,
            # which is how pandas represents empty CSV cells; NaN never
            # compares equal to None or '', so it must be tested explicitly.
            if pandas.isnull(survey_value) or survey_value == '':
                continue

            traces.setdefault(survey_value, []).append(activity_value)

        print(survey_field, activity_field)
        ut.print_boxplot(traces, survey_field, activity_field, min_population=1)
        ut.print_pie_chart(traces, survey_field, min_population=1)
        if activity_field in counter_fields:
            # NOTE(review): nonemp_commits_df only holds commit counts, yet it
            # is also passed for 'pull requests' and 'issues' -- confirm that
            # print_hammer_plot expects this.
            ut.print_hammer_plot(traces, nonemp_commits_df, activity_field, survey_field)
        else:
            ut.print_histogram(traces, activity_field)
        


age commits


age pull requests


age issues


age git project


age github project


age exp


age first commit


age last commit


country commits


country pull requests


country issues


country git project


country github project


country exp


country first commit


country last commit


gender commits


gender pull requests


gender issues


gender git project


gender github project


gender exp


gender first commit


gender last commit


disability commits


disability pull requests


disability issues


disability git project


disability github project


disability exp


disability first commit


disability last commit


education level commits


education level pull requests


education level issues


education level git project


education level github project


education level exp


education level first commit


education level last commit


language commits


language pull requests


language issues


language git project


language github project


language exp


language first commit


language last commit


english proficiency commits


english proficiency pull requests


english proficiency issues


english proficiency git project


english proficiency github project


english proficiency exp


english proficiency first commit


english proficiency last commit
