In [1]:
from enum import Enum 

class Modification(Enum):
    ADDED = "Lines added"
    REMOVED = "Lines removed"
    TOTAL = "Lines added + lines removed"
    DIFF = "Lines added - lines removed"

In [2]:
import pydriller
from collections import defaultdict
import tqdm
from pathlib import Path
from git import Repo

path_to_repo = "./mastodon"
repo_path = Path('./mastodon')

start_tag =  'v3.0.0'
current_tag = 'v3.5.0'
file_type = '.rb'


if repo_path.exists():
    print('Repository exists!')
else:
    # Clone the repository
    #  clone using the python git library
    Repo.clone_from('https://github.com/mastodon/mastodon', path_to_repo)

repo = pydriller.Repository(path_to_repo, from_tag=start_tag, to_tag=current_tag)

commits = list(repo.traverse_commits())
progress = tqdm.tqdm(unit="commit", total=len(commits))

files_by_author = defaultdict(set)
authors_per_file = defaultdict(set)
for commit in commits:
    for modified_file in commit.modified_files:
        author = commit.author.name.strip()
        file = str(modified_file.new_path)
        
        if not file:
            continue
            
        if not file.endswith(file_type):
            continue
        
        files_by_author[author].add(file)        
        authors_per_file[file].add(author)
    
    progress.update(1)


Repository exists!


100%|█████████▉| 3413/3428 [00:28<00:00, 84.39commit/s] 

In [3]:
# Number of total contributors

print("Total number of contributors: {}".format(len(files_by_author)))

print("Author with most contributions: {}".format(max(files_by_author, key=lambda x: len(files_by_author[x]))), "with {} contributions".format(len(files_by_author[max(files_by_author, key=lambda x: len(files_by_author[x]))])))

Total number of contributors: 86
Author with most contributions: Eugen Rochko with 910 contributions


In [4]:
files_by_author_count = { author : len(files_by_author[author]) for author in files_by_author}
#  sort
files_by_author_count = {k: v for k, v in sorted(files_by_author_count.items(), key=lambda item: item[1], reverse=True)}
files_by_author_count




{'Eugen Rochko': 910,
 'Claire': 428,
 'ThibG': 281,
 'Takeshi Umeda': 111,
 'abcang': 36,
 'luigi': 35,
 'santiagorodriguez96': 26,
 'Jeong Arm': 23,
 'Josh Soref': 23,
 'dependabot[bot]': 22,
 'Akihiko Odaki': 17,
 'Yamagishi Kazutoshi': 12,
 'luzpaz': 11,
 'OSAMU SATO': 10,
 'dependabot-preview[bot]': 8,
 'trwnh': 8,
 'Shubhendra Singh Chauhan': 8,
 'tateisu': 7,
 'Cecylia Bocovich': 7,
 'BSKY': 5,
 'Daigo 3 Dango': 5,
 'Justin Tracey': 5,
 'Faye Duxovni': 4,
 'Bèr Kessels': 4,
 'puckipedia': 3,
 'Gomasy': 3,
 'scd31': 3,
 'sternenseemann': 3,
 'Lerk': 3,
 'fuyu': 3,
 'chandrn7': 3,
 'Jennifer Glauche': 2,
 'Mathieu Brunot': 2,
 'Thomas Citharel': 2,
 'Alice Gaudon': 2,
 'Alexander': 2,
 'ysksn': 2,
 'notozeki': 2,
 'ButterflyOfFire': 2,
 'Sasha Sorokin': 2,
 'mayaeh': 2,
 'kaiyou': 2,
 'Taras Gogol': 2,
 'taicv': 2,
 'Stanislas': 2,
 'niwatori24': 2,
 'kawaguchi': 2,
 'Josh Leeb-du Toit': 2,
 'Mélanie Chauvel': 2,
 'Levi Bard': 2,
 'rinsuki': 2,
 'Holger': 2,
 'Daniel': 2,
 'Truong

In [5]:
#find unique authors in commits
authors = []
authors_total = []
modified_files = 0

for commit in commits:
    authors_total.append(commit.author.name)
    name = commit.author.name
    modified_files += len(commit.modified_files)

    if name not in authors:
        authors.append(name)

#print("Authors Total: ", authors_total)    
print("Number of unique authors: ", len(set(authors)))
print("Number of total authors: ", len(authors_total))
print('Total Number of modified files: ', modified_files)
print("Unique Authors: \n", authors)
# find number of commits by author





print("Total commits: ",len(commits))

# find number of commits per author
commits_per_author = {}
for author in authors_total:
    if author not in commits_per_author:
        commits_per_author[author] = 1
    else:
        commits_per_author[author] += 1

commits_per_author_sorted = sorted(commits_per_author.items(), key=lambda x:x[1], reverse=True)
print(commits_per_author_sorted)

Total_commits = list(commits_per_author.values())
print("Total commits: ",sum(Total_commits))

Number of unique authors:  156
Number of total authors:  3428
Total Number of modified files:  16291
Unique Authors: 
 ['Eugen Rochko', 'dependabot-preview[bot]', 'Jeong Arm', 'Hugo Gameiro', 'Mareena Kunjachan', 'trwnh', 'BSKY', 'ThibG', 'puckipedia', 'Soft. Dev', 'nightpool', 'kodai', 'Hinaloe', 'umonaca', 'BenisonSebastian', 'Nima Boscarino', 'Takeshi Umeda', 'Faye Duxovni', 'Yamagishi Kazutoshi', 'Nathaniel Suchy', 'David Caldwell', 'Scott Sweeny', 'Nolan Lawson', 'mayaeh', 'Gabriel Rubens', 'Darius Kazemi', 'koyu', 'sclaire-1', 'Jennifer Glauche', 'Gomasy', 'Dimitri Merejkowsky', 'noiob', 'Sasha Sorokin', 'Shlee', 'ntl-purism', 'Mathieu Brunot', 'tateisu', 'Bèr Kessels', 'Acid Chicken (硫酸鶏)', 'Thomas Citharel', 'Alice Gaudon', 'scd31', 'Marcin Mikołajczak', 'Dan Hunsaker', 'Aries', 'Matt Panaro', 'chr v1.x', 'Bastien Durel', 'Daigo 3 Dango', 'Alexander', 'ysksn', 'Stéphane Guillou', 'Kody', 'notozeki', 'Ben Lubar', 'ButterflyOfFire', 'Даниил Пронин', 'Renato "Lond" Cerqueira', 'ab

In [6]:
authors_per_file_count = { file : len(authors_per_file[file]) for file in authors_per_file}
#  sort 
authors_per_file_count = {k: v for k, v in sorted(authors_per_file_count.items(), key=lambda item: item[1], reverse=True)}
authors_per_file_count

#  count of unique authors in total



{'app/models/status.rb': 10,
 'db/schema.rb': 10,
 'app/lib/feed_manager.rb': 8,
 'app/services/fetch_link_card_service.rb': 8,
 'config/application.rb': 8,
 'app/lib/formatter.rb': 7,
 'app/lib/activitypub/activity.rb': 7,
 'app/models/user.rb': 7,
 'config/routes.rb': 7,
 'lib/mastodon/statuses_cli.rb': 7,
 'app/controllers/accounts_controller.rb': 7,
 'app/models/media_attachment.rb': 6,
 'spec/models/status_spec.rb': 6,
 'lib/mastodon/media_cli.rb': 6,
 'app/models/account.rb': 6,
 'app/mailers/user_mailer.rb': 6,
 'app/controllers/application_controller.rb': 6,
 'app/helpers/settings_helper.rb': 6,
 'app/serializers/rest/account_serializer.rb': 6,
 'config/initializers/rack_attack.rb': 6,
 'app/lib/request.rb': 6,
 'app/controllers/auth/sessions_controller.rb': 6,
 'config/environments/production.rb': 6,
 'app/lib/sanitize_config.rb': 5,
 'app/models/concerns/account_interactions.rb': 5,
 'app/services/backup_service.rb': 5,
 'app/lib/activitypub/activity/create.rb': 5,
 'app/mode

In [7]:
from bokeh.plotting import figure, show, output_notebook, output_file, reset_output
from bokeh.models import ColumnDataSource, CategoricalColorMapper, PrintfTickFormatter, NumeralTickFormatter, Legend, CDSView, GroupFilter, CustomJS, BoxSelectTool, FactorRange
from bokeh.layouts import gridplot, column
from bokeh.models.widgets import Div
from bokeh.palettes import Spectral6, Pastel1, Category20c, Inferno256
import math
import os

output_notebook()

In [8]:
# visualize the distribution of authors among files: the visualization should have
# on the x axis the number of authors per file (from 1 to max), and on the y axis
# the number of files with the given number of authors (so for example the first
# bar represent the number of files with single author)

import numpy as np

# we only need a list where every item represents a file, and its value is the number of authors of that file
files_authors = list(authors_per_file_count.values())
max_authors = max(files_authors)

# we create an histogram with numpy, specifying the bins explicitly (instead of automatically) to have round values
hist_files_authors = np.histogram(files_authors, bins=range(1,max_authors + 2))
p = figure(toolbar_location=None, title="Distribution of authors among files", height=300, width=600)
p.vbar(x=hist_files_authors[1][:-1], top=hist_files_authors[0], line_color="white")

# # styling
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.x_range.start = 0.5
p.yaxis.minor_tick_line_color=None
p.xaxis.minor_tick_line_color=None
p.xaxis.ticker = list(range(1,9))
p.yaxis[0].formatter = NumeralTickFormatter(format="0,0")

show(p)

In [9]:
# visualize the distribution of files among authors: the visualization should have
# on the x axis the number of files per author (from 1 to max), and on the y axis
# the number of authors that own the given number of files (so for example the first
# bar represent the minor contributors, i.e., the number of authors who own 1 file)


# we only need a list where every item represents an author, and its value is the number of files modified by that author
authors_file = list(files_by_author_count.values())
max_files = max(authors_file)

hist_authors_file = np.histogram(authors_file, bins=10)
hist_authors_file

(array([82,  1,  0,  1,  1,  0,  0,  0,  0,  1]),
 array([  1. ,  91.9, 182.8, 273.7, 364.6, 455.5, 546.4, 637.3, 728.2,
        819.1, 910. ]))

In [10]:
bins = [round(x) for x in hist_authors_file[1]]

hist_authors_file = np.histogram(authors_file, bins=bins)
hist_authors_file

(array([82,  1,  0,  1,  1,  0,  0,  0,  0,  1]),
 array([  1,  92, 183, 274, 365, 456, 546, 637, 728, 819, 910]))

In [11]:
bins_str = []
for index, el in enumerate(hist_authors_file[1][:-1]):
    bins_str.append(f"[{el},{hist_authors_file[1][index+1]})")
bins_str

['[1,92)',
 '[92,183)',
 '[183,274)',
 '[274,365)',
 '[365,456)',
 '[456,546)',
 '[546,637)',
 '[637,728)',
 '[728,819)',
 '[819,910)']

In [12]:
p = figure(x_range=bins_str, toolbar_location=None, title="Distribution of files among authors", height=300, width=600)
p.vbar(x=bins_str, top=hist_authors_file[0], line_color="white")

# styling
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.yaxis.minor_tick_line_color=None
p.xaxis.minor_tick_line_color=None

show(p)

In [13]:
class Modification(Enum):
    ADDED = "Lines added"
    REMOVED = "Lines removed"
    TOTAL = "Lines added + lines removed"
    DIFF = "Lines added - lines removed"
    
def calculate_author_contributions(repo, package, method):
    commits = list(repo.traverse_commits())
    progress = tqdm.tqdm(unit="commit", total=len(commits))
    
    result = defaultdict(int)
    for commit in commits:
        for modified_file in commit.modified_files:
            author = commit.author.name.strip()
            file = str(modified_file.new_path)
            
            if not file:
                continue
            
            # skip files outside the queried package
            if not package in file:
                continue
            
            if method == Modification.ADDED:
                result[author] += modified_file.added_lines
            elif method == Modification.REMOVED:
                result[author] += modified_file.deleted_lines
            elif method == Modification.TOTAL:
                result[author] += modified_file.added_lines + modified_file.deleted_lines
            elif method == Modification.DIFF:
                result[author] += modified_file.added_lines - modified_file.deleted_lines
        
        progress.update(1)

    return result

calculate_author_contributions(repo, "app/", Modification.ADDED)

100%|██████████| 3428/3428 [00:29<00:00, 116.33commit/s]


defaultdict(int,
            {'Hugo Gameiro': 4,
             'Eugen Rochko': 73631,
             'trwnh': 22,
             'dependabot-preview[bot]': 9,
             'BSKY': 40,
             'ThibG': 5548,
             'puckipedia': 19,
             'nightpool': 1,
             'Hinaloe': 8,
             'umonaca': 1,
             'Nima Boscarino': 28,
             'Takeshi Umeda': 1265,
             'Faye Duxovni': 23,
             'Yamagishi Kazutoshi': 57,
             'Nolan Lawson': 23,
             'mayaeh': 597,
             'Gabriel Rubens': 1,
             'Darius Kazemi': 3,
             'Jennifer Glauche': 1,
             'Gomasy': 4,
             'noiob': 2,
             'Dimitri Merejkowsky': 1,
             'Sasha Sorokin': 833,
             'Mathieu Brunot': 12,
             'Shlee': 5,
             'Jeong Arm': 106,
             'Acid Chicken (硫酸鶏)': 2,
             'Thomas Citharel': 1,
             'Alice Gaudon': 29,
             'scd31': 1,
             'Marcin Mik

In [14]:
method = Modification.TOTAL

# TODO

data_p1 = calculate_author_contributions(repo, "app/", method)
data_p2 = calculate_author_contributions(repo, "lib/", method)
data_p3 = calculate_author_contributions(repo, "public/", method)

100%|██████████| 3428/3428 [00:31<00:00, 107.85commit/s]
100%|██████████| 3428/3428 [00:31<00:00, 110.47commit/s]
100%|██████████| 3428/3428 [00:29<00:00, 116.77commit/s]


In [15]:
data_p1

defaultdict(int,
            {'Hugo Gameiro': 4,
             'Eugen Rochko': 107623,
             'trwnh': 37,
             'dependabot-preview[bot]': 16,
             'BSKY': 79,
             'ThibG': 7333,
             'puckipedia': 27,
             'nightpool': 2,
             'Hinaloe': 9,
             'umonaca': 2,
             'Nima Boscarino': 28,
             'Takeshi Umeda': 1840,
             'Faye Duxovni': 32,
             'Yamagishi Kazutoshi': 821,
             'Nolan Lawson': 26,
             'mayaeh': 741,
             'Gabriel Rubens': 1,
             'Darius Kazemi': 6,
             'Jennifer Glauche': 2,
             'Gomasy': 9,
             'noiob': 4,
             'Dimitri Merejkowsky': 2,
             'Sasha Sorokin': 1204,
             'Mathieu Brunot': 16,
             'Shlee': 10,
             'Jeong Arm': 133,
             'Acid Chicken (硫酸鶏)': 4,
             'Thomas Citharel': 2,
             'Alice Gaudon': 33,
             'scd31': 2,
             'Marci

In [16]:
def plot_author_contribution(data, title):
    sorted_data = sorted(data.items(), key=lambda item: item[1], reverse=True)
    x_range = [item[0] for item in sorted_data]
    top = [item[1] for item in sorted_data]

    p = figure(x_range=x_range, 
               height=350, title=title,
               toolbar_location=None, tools="")
    p.vbar(x=x_range, top=top, width=0.9)

    # styling
    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = "vertical"
    p.y_range.start = 0
    p.yaxis.minor_tick_line_color=None
    p.xaxis.minor_tick_line_color=None
    # p.xaxis.ticker = list(range(1,9))
    p.yaxis[0].formatter = NumeralTickFormatter(format="0,0")

    show(p)

In [17]:
plot_author_contribution(data_p1, "Author Contributions Search (Package 1)")
plot_author_contribution(data_p2, "Author Contributions Index (Package 2)")
plot_author_contribution(data_p3, "Author Contributions Action (Package 3)")


## Task 2: Knowledge loss

We now want to analyze the knowledge loss when the main contributor of the analyzed project would leave. For this we will use the circle packaging layout introduced in the "Code as a Crime Scene" book. This assignment includes the necessary `knowledge_loss.html` file as well as the `d3` folder for all dependencies. You task is to create the `output.json` file according to the specification below. This file can then be visualized with the files provided.

For showing the visualization, once you have the output as `output.json` you should
* make sure to have the `knowledge_loss.html` file in the same folder
* start a local HTTP server in the same folder (e.g. with python `python3 -m http.server`) to serve the html file (necessary for d3 to work)
* open the served `knowledge_loss.html` and look at the visualization

For the package you identify as the worst in terms of knowledge loss, investigate the author contributions using the function defined in the previous exercise and comment how the situation is, e.g. how big the gap between the main author and the second biggest contributor for the selected package is.

In [18]:
repo = pydriller.Repository(path_to_repo, from_tag=start_tag, to_tag=current_tag)

method = Modification.TOTAL

commits = list(repo.traverse_commits())
progress = tqdm.tqdm(unit="commit", total=len(commits))

files_authors = defaultdict(dict)
for commit in commits:
    for modified_file in commit.modified_files:
        author = commit.author.name.strip()
        file = str(modified_file.new_path)

        if not file:
            continue

        if not file.endswith(file_type):
            continue

        if author not in files_authors[file]:
            files_authors[file][author] = 0
            
        if method == Modification.ADDED:
            files_authors[file][author] += modified_file.added_lines
        elif method == Modification.REMOVED:
            files_authors[file][author] += modified_file.deleted_lines
        elif method == Modification.TOTAL:
            files_authors[file][author] += modified_file.added_lines + modified_file.deleted_lines
        elif method == Modification.DIFF:
            files_authors[file][author] += modified_file.added_lines - modified_file.deleted_lines
        
    
    
    progress.update(1)

files_authors

100%|██████████| 3428/3428 [02:33<00:00, 22.27commit/s]


defaultdict(dict,
            {'lib/mastodon/version.rb': {'Eugen Rochko': 74,
              'dependabot[bot]': 6,
              'Claire': 4},
             'config/deploy.rb': {'dependabot-preview[bot]': 6,
              'Bèr Kessels': 18,
              'dependabot[bot]': 8,
              'Claire': 2},
             'lib/mastodon/accounts_cli.rb': {'Jeong Arm': 10,
              'ThibG': 36,
              'Eugen Rochko': 64,
              'Claire': 2},
             'config/initializers/paperclip.rb': {'Eugen Rochko': 65,
              'tateisu': 4,
              'mayaeh': 10,
              'Takeshi Umeda': 2},
             'app/models/media_attachment.rb': {'Hugo Gameiro': 4,
              'ThibG': 35,
              'Yamagishi Kazutoshi': 2,
              'Eugen Rochko': 317,
              'Takeshi Umeda': 1,
              'Claire': 39},
             'app/controllers/api/v1/timelines/home_controller.rb': {'Eugen Rochko': 8},
             'app/lib/feed_manager.rb': {'Eugen Rochko': 375,


In [19]:
files_authors

defaultdict(dict,
            {'lib/mastodon/version.rb': {'Eugen Rochko': 74,
              'dependabot[bot]': 6,
              'Claire': 4},
             'config/deploy.rb': {'dependabot-preview[bot]': 6,
              'Bèr Kessels': 18,
              'dependabot[bot]': 8,
              'Claire': 2},
             'lib/mastodon/accounts_cli.rb': {'Jeong Arm': 10,
              'ThibG': 36,
              'Eugen Rochko': 64,
              'Claire': 2},
             'config/initializers/paperclip.rb': {'Eugen Rochko': 65,
              'tateisu': 4,
              'mayaeh': 10,
              'Takeshi Umeda': 2},
             'app/models/media_attachment.rb': {'Hugo Gameiro': 4,
              'ThibG': 35,
              'Yamagishi Kazutoshi': 2,
              'Eugen Rochko': 317,
              'Takeshi Umeda': 1,
              'Claire': 39},
             'app/controllers/api/v1/timelines/home_controller.rb': {'Eugen Rochko': 8},
             'app/lib/feed_manager.rb': {'Eugen Rochko': 375,


In [20]:
contribs_data = {}

for file, author_contribs in files_authors.items():
    total_contribs = sum(author_contribs.values())
    current_percentages = {}
    main_author = None
    max_author_percentage = 0
    if total_contribs != 0:
        for author, contribs in author_contribs.items():
            author_percentage = contribs / total_contribs
            if author_percentage > max_author_percentage:
                max_author_percentage = author_percentage
                main_author = author
        
        contribs_data[file] = {
            "main_author": main_author,
            "main_author_percentage": round(max_author_percentage, 2),
            "total_contribs": total_contribs
        }
        
        
contribs_data

{'lib/mastodon/version.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.88,
  'total_contribs': 84},
 'config/deploy.rb': {'main_author': 'Bèr Kessels',
  'main_author_percentage': 0.53,
  'total_contribs': 34},
 'lib/mastodon/accounts_cli.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.57,
  'total_contribs': 112},
 'config/initializers/paperclip.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.8,
  'total_contribs': 81},
 'app/models/media_attachment.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.8,
  'total_contribs': 398},
 'app/controllers/api/v1/timelines/home_controller.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 1.0,
  'total_contribs': 8},
 'app/lib/feed_manager.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.73,
  'total_contribs': 513},
 'app/models/home_feed.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.78,
  'total_contribs': 23},
 'app/mo

## Output Format for Visualization

* `root` is always the root of the tree
* `size` should be the total number of lines of contribution
* `weight` can be set to the same as `size`
* `ownership` should be set to the percentage of contributions from the main author (e.g. 0.98 for 98% if contributions coming from the main author)

```
{
  "name": "root",
  "children": [
    {
      "name": "test",
      "children": [
        {
          "name": "benchmarking",
          "children": [
            {
              "author_color": "red",
              "size": "4005",
              "name": "t6726-patmat-analysis.scala",
              "weight": 1.0,
              "ownership": 0.9,
              "children": []
            },
            {
              "author_color": "red",
              "size": "55",
              "name": "TreeSetIterator.scala",
              "weight": 0.88,
              "ownership": 0.9,
              "children": []
            }
          ]
        }
      ]
    }
  ]
}
```

## JSON Export

For exporting the data to JSON you can use the following snippet:

```
import json

with open("output.json", "w") as file:
    json.dump(tree, file, indent=4)
```

In [21]:
main_authors = defaultdict(int)
for info in contribs_data.values():
    main_authors[info['main_author']] += 1
    
sorted(main_authors.items(), key=lambda k: k[1], reverse=True)

[('Eugen Rochko', 791),
 ('Claire', 200),
 ('ThibG', 135),
 ('Takeshi Umeda', 31),
 ('santiagorodriguez96', 17),
 ('abcang', 12),
 ('luigi', 7),
 ('OSAMU SATO', 5),
 ('Bèr Kessels', 4),
 ('Shubhendra Singh Chauhan', 4),
 ('Akihiko Odaki', 4),
 ('dependabot[bot]', 4),
 ('dependabot-preview[bot]', 3),
 ('Jeong Arm', 3),
 ('tateisu', 3),
 ('Josh Soref', 3),
 ('Faye Duxovni', 2),
 ('Jennifer Glauche', 2),
 ('chandrn7', 2),
 ('luzpaz', 2),
 ('BSKY', 1),
 ('puckipedia', 1),
 ('noiob', 1),
 ('Mathieu Brunot', 1),
 ('Yamagishi Kazutoshi', 1),
 ('scd31', 1),
 ('Lerk', 1),
 ('ysksn', 1),
 ('notozeki', 1),
 ('Sasha Sorokin', 1),
 ('sternenseemann', 1),
 ('Taras Gogol', 1),
 ('rinsuki', 1),
 ('mayaeh', 1),
 ('Marcin Mikołajczak', 1),
 ('trwnh', 1),
 ('Cecylia Bocovich', 1),
 ('Sumak', 1)]

In [22]:
main_author_by_contribs = defaultdict(int)
for file, author_contribs in files_authors.items():
    for author, contribs in author_contribs.items():
        main_author_by_contribs[author] += contribs

sorted(main_author_by_contribs.items(), key=lambda k: k[1], reverse=True)        

[('Eugen Rochko', 34106),
 ('Claire', 8179),
 ('ThibG', 6291),
 ('Takeshi Umeda', 2007),
 ('santiagorodriguez96', 1432),
 ('abcang', 502),
 ('tateisu', 163),
 ('Bèr Kessels', 154),
 ('Jeong Arm', 143),
 ('Akihiko Odaki', 138),
 ('Josh Soref', 132),
 ('luigi', 131),
 ('chandrn7', 70),
 ('dependabot[bot]', 66),
 ('Yamagishi Kazutoshi', 63),
 ('Lerk', 63),
 ('OSAMU SATO', 53),
 ('sternenseemann', 50),
 ('Cecylia Bocovich', 42),
 ('dependabot-preview[bot]', 41),
 ('Marcin Mikołajczak', 40),
 ('scd31', 39),
 ('Taras Gogol', 39),
 ('trwnh', 32),
 ('Mathieu Brunot', 30),
 ('Levi Bard', 30),
 ('puckipedia', 27),
 ('luzpaz', 24),
 ('Nathaniel Suchy', 22),
 ('Thomas Citharel', 22),
 ('Justin Tracey', 22),
 ('Daigo 3 Dango', 20),
 ('fuyu', 20),
 ('Alexander', 18),
 ('Shubhendra Singh Chauhan', 18),
 ('dogelover911', 18),
 ('ntl-purism', 16),
 ('Jennifer Glauche', 15),
 ('notozeki', 14),
 ('ysksn', 13),
 ('mayaeh', 12),
 ('BSKY', 12),
 ('rinsuki', 12),
 ('Ben Lubar', 11),
 ('Gomasy', 11),
 ('Mélan

In [23]:
main_author = "uboness"


def add_to_tree(branch, tree, info):
    
    if len(branch) == 1:
        if info["main_author"] == main_author:
            color = "red"
        else:
            color = "navy"
                        
        tree["children"].append({
            "author_color": color,
            "size": info["total_contribs"],
            "weight": info["total_contribs"],
            "name": branch[0],
            "ownership": info["main_author_percentage"],
            "children": []
        })
    else:
        node = branch[0]
        others = branch[1:]
        
        child = None
        for c in tree["children"]:
            if c["name"] == node:
                child=c
                break                
        
        if not child:
            child = {
                "name": node,
                "children": []
            }
            tree["children"].append(child)
            
        add_to_tree(others, child, info)
        
tree = {
    "name": "root",
    "children": []
}

for file, info in contribs_data.items():
    parts = file.split("/")
    add_to_tree(parts, tree, info)


tree
import json

with open("output-2022.json", "w") as file:
    json.dump(tree, file, indent=4)

In [24]:
'app/account/models/account.rb'.split('/')

['app', 'account', 'models', 'account.rb']

In [25]:
data_bucket = calculate_author_contributions(repo, "app/accounts", Modification.TOTAL)
plot_author_contribution(data_bucket, "Author contributions for app/accounts")

100%|██████████| 3428/3428 [00:28<00:00, 122.36commit/s]


## Task 3: Code Churn Analysis

The third and last task is to analyze the code churn of the _elasticsearch_ project. For this analysis we look at the code churn, meaning the daily change in the total number of lines of the project. Visualize the code churn over time bucketing the data by day. Remember that you'll need to interpolate the data for days when there are no commits. Chose an interpolation strategy and justify it.

Look at the churn trend over time and identify two outliers. For each of them:
- identify if it was caused by a single or multiple commits (since you are bucketing the data by day)
- find the hash of the involved commit(s)
- find the involved files
- look at the actual diff

Based on the above, discuss if the outlier is a false positive or should be a reason for concern.

In [26]:
import datetime
import pytz
from bokeh.plotting import figure, show, output_notebook, output_file, reset_output
from bokeh.models import ColumnDataSource, CategoricalColorMapper, PrintfTickFormatter, NumeralTickFormatter, Legend, CDSView, GroupFilter, CustomJS, BoxSelectTool, FactorRange
from bokeh.layouts import gridplot, column
from bokeh.models.widgets import Div
from bokeh.palettes import Spectral6, Pastel1, Category20c, Inferno256
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, RangeTool, HoverTool
from bokeh.plotting import figure, show
from dateutil.rrule import rrule, DAILY


repo = pydriller.Repository(path_to_repo, from_tag=start_tag, to_tag=current_tag)

commits = list(repo.traverse_commits())
progress = tqdm.tqdm(unit="commit", total=len(commits))
timeseries_a = defaultdict(int)
timeseries_d = defaultdict(int)
timeseries_t = defaultdict(int)

for commit in commits:
    timeseries_a[commit.committer_date.date()] += commit.insertions
    timeseries_d[commit.committer_date.date()] += commit.deletions
    timeseries_t[commit.committer_date.date()] += commit.insertions + commit.deletions
    progress.update(1)
    

100%|██████████| 3428/3428 [00:59<00:00, 57.39commit/s]
100%|█████████▉| 3420/3428 [00:55<00:00, 59.85commit/s] 

In [27]:
timeseries_a

defaultdict(int,
            {datetime.date(2019, 10, 3): 5,
             datetime.date(2019, 10, 4): 41,
             datetime.date(2019, 10, 6): 140,
             datetime.date(2019, 10, 7): 323,
             datetime.date(2019, 10, 8): 14,
             datetime.date(2019, 10, 9): 110,
             datetime.date(2019, 10, 10): 1909,
             datetime.date(2019, 10, 21): 56,
             datetime.date(2019, 10, 22): 148,
             datetime.date(2019, 10, 24): 394,
             datetime.date(2019, 10, 25): 33,
             datetime.date(2019, 10, 26): 2,
             datetime.date(2019, 10, 27): 37,
             datetime.date(2019, 10, 28): 41,
             datetime.date(2019, 10, 29): 166,
             datetime.date(2019, 10, 30): 15,
             datetime.date(2019, 11, 4): 226,
             datetime.date(2019, 11, 5): 12,
             datetime.date(2019, 11, 7): 486,
             datetime.date(2019, 11, 8): 3,
             datetime.date(2019, 11, 11): 935,
             dateti

In [28]:
def extrapolate_ts_day(timeseries, start_date, end_date):
    extrapolated_ts = []
    for dt in rrule(DAILY, dtstart=start_date, until=end_date):
        value = timeseries.get(dt.date(), 0)
        extrapolated_ts.append([dt, value])
    return extrapolated_ts


first_commit_date = commits[0].committer_date
last_commit_date = commits[-1].committer_date
full_timeseries_a = extrapolate_ts_day(timeseries_a, first_commit_date, last_commit_date)
full_timeseries_d = extrapolate_ts_day(timeseries_d, first_commit_date, last_commit_date)
full_timeseries_t = extrapolate_ts_day(timeseries_t, first_commit_date, last_commit_date)


In [29]:
full_timeseries_a

[[datetime.datetime(2019, 10, 3, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  5],
 [datetime.datetime(2019, 10, 4, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  41],
 [datetime.datetime(2019, 10, 5, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  0],
 [datetime.datetime(2019, 10, 6, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  140],
 [datetime.datetime(2019, 10, 7, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  323],
 [datetime.datetime(2019, 10, 8, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  14],
 [datetime.datetime(2019, 10, 9, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  110],
 [datetime.datetime(2019, 10, 10, 22, 44, 22, tzinfo=<git.objects.util.tzoffset object at 0x7f8784966380>),
  1909],
 [datetime.datetime(2019, 10, 11, 22, 44, 22, tzinfo=<git.objects.util.tzoffset obje

In [30]:
data = {
    "day": [x[0] for x in full_timeseries_a],
    "addition": [x[1] for x in full_timeseries_a],
    "deletion": [x[1] for x in full_timeseries_d],
    "total": [x[1] for x in full_timeseries_t]    
}

source = ColumnDataSource(data)

p = figure(height=300, width=980, tools="xpan", toolbar_location=None,
           x_axis_type="datetime", x_axis_location="above",
           background_fill_color="#efefef", x_range=(data["day"][1], data["day"][10]))

hover = HoverTool(
    tooltips=[
        ("Day", "@day{%F}"),
        ("Lines added", "@addition{0,0}"),
        ("Lines removed", "@deletion{0,0}"),
        ("Lines total", "@total{0,0}")        
    ],
    formatters={
        '@day': 'datetime' # use 'datetime' formatter for '@date' field
    },

    mode='vline',
#     names=["addition"]
)

p.line('day', 'addition', source=source, name="addition", line_width=2)
p.line('day', 'deletion', source=source, line_color="red", line_width=2)
p.line('day', 'total', source=source, line_color="black", line_width=2)
p.add_tools(hover)
p.yaxis.axis_label = 'Churn'

select = figure(title="Zoom",
                height=130, width=980, y_range=p.y_range,
                x_axis_type="datetime", y_axis_type=None,
                tools="", toolbar_location=None, background_fill_color="#efefef")

range_tool = RangeTool(x_range=p.x_range)
range_tool.overlay.fill_color = "navy"
range_tool.overlay.fill_alpha = 0.2

select.line('day', 'addition', source=source)
select.line('day', 'deletion', source=source, line_color="red")
select.line('day', 'total', source=source, line_color="black")
select.ygrid.grid_line_color = None
select.add_tools(range_tool)
select.toolbar.active_multi = range_tool

show(column(p, select))


In [31]:
spike1 = [c for c in commits if c.committer_date.date() == datetime.date(2022,2,26)]
[c.hash for c in spike1]

['3d60708508c6bfc5b6635aff0482d640a5f318ca',
 '6aef76b5cde2315135d53215d13a9b2ec0a1adaa',
 '48caeb9d659abd58ec7a9dc04f7365b35e314b74',
 '0dc57ab6ed67657e0a77e08bcd99c7b809fe5e42',
 '57814a98a9c8e4b106d44a31e36561f585f73bac']

In [32]:
def print_diff(commits):
    additions = defaultdict(int)
    deletions = defaultdict(int)
    for commit in commits:
        for f in commit.modified_files:
            additions[str(f.new_path)] += f.added_lines
            deletions[str(f.new_path)] += f.deleted_lines

    all_files = set()
    all_files.update(additions.keys())
    all_files.update(deletions.keys())

    for f in sorted(all_files):
        print(f"+{additions.get(f,0)}\t -{deletions.get(f,0)} \t {f}")

In [33]:
print_diff(spike1)

+2	 -0 	 app/controllers/admin/email_domain_blocks_controller.rb
+10	 -6 	 app/javascript/styles/mastodon/admin.scss
+3	 -37 	 app/lib/activitypub/activity/delete.rb
+8	 -3 	 app/lib/activitypub/activity/update.rb
+65	 -0 	 app/lib/activitypub/forwarder.rb
+16	 -3 	 app/views/admin/reports/show.html.haml
+2	 -1 	 app/workers/scheduler/email_domain_block_refresh_scheduler.rb
+1	 -0 	 config/locales/en.yml
+1	 -1 	 dist/nginx.conf


In [34]:
spike2 = [c for c in commits if c.committer_date.date() == datetime.date(2022,2,11)]
[c.hash for c in spike2]

['aed98fd4ddc41b48522aeda609e0e30998f5e4e1',
 'f23c8dee2a43978e46310028907e382b7dd5c2cd',
 'ccf5bcc3dc884be88a10035305689f3857572599',
 '67034ea1ebc8c5d1df8f137fd67c8466aaf8d750',
 'f9f40069d5874948e7acf600179c2f602ee1e840',
 'b39127705714a732b0337b7a44211176cf2ad5a0',
 'ccad1a3caefe4f188e555d2bcab1f26f9b91bc4f',
 'c363fbe3a353f9f588af59e52aaf12a358e663bc',
 'cc7ae3932ca5542180d8a7200bc5c44cedaea976',
 '6f38765fccfeeed6c39a3b11962a518cae1cca4c',
 'd4e6774a0c88931d907f275821eb001e6dd2cb2d',
 'c9a52833b6840673bbed7454ca6b6b9cd88e7bfa',
 'a131f06e1299e21372f8f002c7959e54128be270',
 'a27729ee48aab4d75d562c2007b9967333c65d29',
 'd0fcf07436d158bcac2617d076a83d0aa49c39e6',
 '8f03b7a2fb4b420eb46942157160816185e81751',
 'ee47e2028bfc06ab9d35e5ea722073151e34042b',
 '4a0b6e3e5ea33fa1167f5820b442725bab16322f']

In [35]:
print_diff(spike2)

+2	 -2 	 Gemfile.lock
+50	 -0 	 app/controllers/admin/reports/actions_controller.rb
+11	 -1 	 app/controllers/api/web/push_subscriptions_controller.rb
+11	 -2 	 app/javascript/mastodon/actions/notifications.js
+11	 -0 	 app/javascript/mastodon/features/notifications/components/column_settings.js
+35	 -0 	 app/javascript/mastodon/features/notifications/components/notification.js
+19	 -3 	 app/javascript/mastodon/locales/af.json
+53	 -37 	 app/javascript/mastodon/locales/ar.json
+32	 -16 	 app/javascript/mastodon/locales/ast.json
+19	 -3 	 app/javascript/mastodon/locales/bg.json
+19	 -3 	 app/javascript/mastodon/locales/bn.json
+132	 -116 	 app/javascript/mastodon/locales/br.json
+35	 -19 	 app/javascript/mastodon/locales/ca.json
+23	 -7 	 app/javascript/mastodon/locales/co.json
+29	 -13 	 app/javascript/mastodon/locales/cs.json
+19	 -3 	 app/javascript/mastodon/locales/cy.json
+157	 -141 	 app/javascript/mastodon/locales/da.json
+27	 -11 	 app/javascript/mastodon/locales/de.json
+93	 -2

In [36]:
for commit in spike2:
    for f in commit.modified_files:
        if f.new_path is None:
            print(f.old_path)

In [37]:
for commit in spike2:
    print(commit.msg)
    print("================================================================")

Bump sidekiq-unique-jobs from 7.1.12 to 7.1.15 (#17505)

Bumps [sidekiq-unique-jobs](https://github.com/mhenrixon/sidekiq-unique-jobs) from 7.1.12 to 7.1.15.
- [Release notes](https://github.com/mhenrixon/sidekiq-unique-jobs/releases)
- [Changelog](https://github.com/mhenrixon/sidekiq-unique-jobs/blob/main/CHANGELOG.md)
- [Commits](https://github.com/mhenrixon/sidekiq-unique-jobs/compare/v7.1.12...v7.1.15)

---
updated-dependencies:
- dependency-name: sidekiq-unique-jobs
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Bump @babel/plugin-proposal-decorators from 7.16.7 to 7.17.0 (#17468)

Bumps [@babel/plugin-proposal-decorators](https://github.com/babel/babel/tree/HEAD/packages/babel-plugin-proposal-decorators) from 7.16.7 to 7.17.0.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://g

100%|██████████| 3428/3428 [01:06<00:00, 59.85commit/s]