In [None]:
import pandas as pd
import numpy as np
import pygal

In [None]:
def fetch_ownership(project_name):
    """Load the line-ownership table and the algorithm list of a project.

    Two ';'-separated CSV files are read from ``./drilled-informations/``:
    ``<project_name>-ownership.csv`` and ``<project_name>-algorithms.csv``.

    Parameters
    ----------
    project_name : str
        Prefix shared by the two CSV file names.

    Returns
    -------
    tuple
        ``(ownership, algorithms)``. ``ownership`` contains only the rows
        whose ``FILE_NAME`` appears in ``algorithms`` and carries an extra
        ``FILE_OWNERSHIP`` column equal to ``AUTHOR_LINES / FILE_LINES``.
        ``algorithms`` is the ``ALGORITHM`` column of the second file.
    """
    csv_prefix = './drilled-informations/' + project_name

    # Fraction of each file's lines attributed to the author of the row.
    ownership_table = pd.read_csv(csv_prefix + '-ownership.csv', sep=';').assign(
        FILE_OWNERSHIP=lambda table: table['AUTHOR_LINES'] / table['FILE_LINES']
    )
    algorithm_files = pd.read_csv(csv_prefix + '-algorithms.csv', sep=';')['ALGORITHM']

    # Keep only the rows describing files listed as algorithms.
    in_algorithms = ownership_table['FILE_NAME'].isin(algorithm_files)
    return ownership_table.loc[in_algorithms], algorithm_files

# For each studied project: the ownership rows restricted to algorithm files,
# and the list of algorithm files itself.
scikit_learn, scikit_learn_algorithms = fetch_ownership(project_name='scikit-learn')
scikit_image, scikit_image_algorithms = fetch_ownership(project_name='scikit-image')
nltk, nltk_algorithms = fetch_ownership(project_name='nltk')

## La majorité des algorithmes sont maintenus par un contributeur majeur.

In [None]:
def _major_contributor_shares(ownership, algorithms):
    """Share of algorithms having a major contributor, per threshold.

    For each threshold from 10 % to 90 % (step 10 %), a file is counted once
    as soon as at least one row has FILE_OWNERSHIP greater than or equal to
    the threshold; the count is divided by the number of entries in
    `algorithms`.
    """
    shares = []
    for step in range(1, 10):
        at_or_above = ownership[ownership['FILE_OWNERSHIP'] >= step / 10]
        # Several authors may reach the threshold on one file: count it once.
        files_with_major = at_or_above.drop_duplicates('FILE_NAME')
        shares.append(len(files_with_major) / len(algorithms))
    return shares


chart = pygal.Bar(legend_at_bottom=True)

chart.title = 'Proportion of algorithms with a major contributor in function of the owned lines threshold'
chart.x_title = 'Major contributor threshold (in % of owned lines)'
chart.y_title = 'Proportion of algorithms with a major contributor'

# One bar group per threshold: '10%', '20%', ..., '90%'.
chart.x_labels = ['{}%'.format(step * 10) for step in range(1, 10)]

chart.add('scikit-learn', _major_contributor_shares(scikit_learn, scikit_learn_algorithms))
chart.add('scikit-image', _major_contributor_shares(scikit_image, scikit_image_algorithms))
chart.add('nltk', _major_contributor_shares(nltk, nltk_algorithms))

## Un contributeur majeur d'un algorithme n'est contributeur majeur que de cet algorithme.

In [None]:
def major_owner_on_several_files(dataset, threshold):
    """Count the authors that are major contributors on more than one row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with at least the ``FILE_OWNERSHIP`` and ``AUTHOR_NAME`` columns.
    threshold : float
        Minimum ``FILE_OWNERSHIP`` value for a row to count as major ownership.

    Returns
    -------
    int
        Number of distinct ``AUTHOR_NAME`` values appearing on a number of
        qualifying rows different from one (i.e. two or more).
    """
    is_major = dataset['FILE_OWNERSHIP'] >= threshold
    rows_per_author = dataset.loc[is_major, 'AUTHOR_NAME'].value_counts()

    # value_counts only reports authors seen at least once, so "!= 1" means
    # "two or more" here.
    return int((rows_per_author != 1).sum())

def _normalized_multi_file_majors(dataset, contributor_count):
    """Per threshold (10 % to 90 %), the result of
    major_owner_on_several_files divided by `contributor_count`.
    """
    return [
        major_owner_on_several_files(dataset, step / 10) / contributor_count
        for step in range(1, 10)
    ]


chart = pygal.Bar(legend_at_bottom=True)

chart.title = 'Normalized number of contributors which are major on several files in function of the owned lines threshold'
chart.x_title = 'Major contributor threshold (in % of owned lines)'
chart.y_title = 'Normalized number of contributors which are major on several files'

# One bar group per threshold: '10%', '20%', ..., '90%'.
chart.x_labels = ['{}%'.format(step * 10) for step in range(1, 10)]

# The hard-coded divisors are the number of contributors in each project.
chart.add('scikit-learn', _normalized_multi_file_majors(scikit_learn, 1025))
chart.add('scikit-image', _normalized_multi_file_majors(scikit_image, 247))
chart.add('nltk', _normalized_multi_file_majors(nltk, 229))