# Fundamentals of Software Systems - SE Part I Assignment

By Andy Wiemeyer and Lucius Bachmann

### Setup tools
* checkout repo
* initialize Git utility
* You can recreate the Repository object with other parameters to analyze different time periods.
  The last year was used so that the setup is fast.

In [3]:
import os.path
from datetime import datetime
from os import path, mkdir

import pandas
import pandas as pd
import plotly.express as px
from pydriller import Repository, Git

# --- Configuration ---------------------------------------------------------
# Remote repository to mine and the local directory it is cloned into.
repo_remote_path = 'https://github.com/mastodon/mastodon.git'
repo_path = 'mastodon'
# Path handed to the Git utility below: "<repo_path>/<repo_path>".
repo_checkout_path = '/'.join((repo_path, repo_path))
filepath = 'app'

# Tag range analysed by the cells that follow.
from_tag, to_tag = 'v3.5.0', 'v4.0.0'

# Create the clone target directory when it is missing.
if not path.exists(repo_path):
    mkdir(repo_path)

repo = Repository(
    repo_remote_path,
    clone_repo_to=repo_path,
    from_tag=from_tag,
    to_tag=to_tag,
)

# Alternative: analyse everything since a date, limited to one sub-tree.
# repo = Repository(repo_remote_path, clone_repo_to=repo_path,
#                   since=datetime.fromisoformat('2022-10-01'), filepath=filepath)

# Fetch a single commit and stop right away: starting the traversal is what
# clones the repository if necessary.
for commit in repo.traverse_commits():
    break

git = Git(repo_checkout_path)

In [19]:
from pandas import DataFrame
from typing import Dict, Set
import plotly.express as px

# Maps each author name to the set of distinct ".rb" files that author touched.
authors_to_files: Dict[str, Set[str]] = {}

for commit in repo.traverse_commits():
    author_identifier = commit.author.name
    # Every author gets an entry, even without any ".rb" change, so that such
    # authors show up with a count of 0.
    touched_files = authors_to_files.setdefault(author_identifier, set())
    for file in commit.modified_files:
        # Entries without a new path are skipped; only ".rb" paths are counted.
        if file.new_path is not None and file.new_path.endswith(".rb"):
            touched_files.add(file.new_path)

# A list (not a set) of (author, file count) pairs keeps the row order and the
# resulting DataFrame index deterministic between runs.
author_file_count = [(author, len(files)) for author, files in authors_to_files.items()]
col_name_author = 'Author'
col_name_nr_of_files = 'Nr of files'
df_author_file_count = DataFrame(author_file_count, columns=[col_name_author, col_name_nr_of_files])
df_author_file_count_sorted = df_author_file_count.sort_values(by=[col_name_nr_of_files], ascending=False)

# Top 20 authors by number of distinct files.
display(df_author_file_count_sorted[:20])

# Distribution of the per-author file counts.
hist_author_count = px.histogram(df_author_file_count_sorted, x=col_name_nr_of_files)
hist_author_count.show()


Unnamed: 0,Author,Nr of files
84,Eugen Rochko,509
46,Claire,201
16,Takeshi Umeda,23
20,Yamagishi Kazutoshi,14
1,Jeong Arm,13
47,trwnh,12
43,James Tucker,9
71,prplecake,6
51,luzpaz,5
56,Alexander Ivanov,5


In [20]:
# Number of authors whose file count is greater than 11.
sum(1 for _author, nr_of_files in author_file_count if nr_of_files > 11)

6

In [8]:
# Per-day churn table: commit date -> lines added / lines deleted plus the
# hashes and messages of the commits of that day.
date_to_churn = {}

for commit in repo.traverse_commits():
    date = commit.committer_date.date()
    # Start each day at zero, then add this commit's figures to it.
    day = date_to_churn.setdefault(
        date,
        {"lines added": 0, "lines deleted": 0, "hashes": [], "messages": []},
    )
    day["lines added"] += commit.insertions
    day["lines deleted"] += commit.deletions
    day["hashes"].append(commit.hash)
    day["messages"].append(commit.msg)

# Give every day between the first and the last commit date an entry, so that
# days without commits appear with zero churn.
min_date, max_date = min(date_to_churn), max(date_to_churn)
date_list = [timestamp.date() for timestamp in pd.date_range(min_date, max_date)]
for date in date_list:
    date_to_churn.setdefault(
        date,
        {"lines added": 0, "lines deleted": 0, "hashes": [], "messages": []},
    )

display(date_to_churn)


{datetime.date(2022, 3, 30): {'lines added': 7,
  'lines deleted': 2,
  'hashes': ['8c7223f4eac80b5725485be742d3fa2c984f4670'],
  'messages': ['Bump version to 3.5.0 (#17911)']},
 datetime.date(2022, 3, 31): {'lines added': 16,
  'lines deleted': 15,
  'hashes': ['ef196c913c77338be5ebb1e02af2f6225f857080',
   'ea0cfd8e7ed41b32aaa47eabb1c73845ae843fcb',
   '24d446adf22644c61e4d61ef612458cf087326dd'],
  'messages': ['Fix error MethodError in Chewy::Strategy::Sidekiq::Worker (#17912)\n\nAlso refactor a bit to reduce code duplication.',
   'fix: PWA web manifest not changed to new routes (#17921)',
   'Bump puma from 5.6.2 to 5.6.4 (#17914)\n\nBumps [puma](https://github.com/puma/puma) from 5.6.2 to 5.6.4.\r\n- [Release notes](https://github.com/puma/puma/releases)\r\n- [Changelog](https://github.com/puma/puma/blob/master/History.md)\r\n- [Commits](https://github.com/puma/puma/compare/v5.6.2...v5.6.4)\r\n\r\n---\r\nupdated-dependencies:\r\n- dependency-name: puma\r\n  dependency-type: dire

In [16]:
from numpy import mean
import statistics

# Print the raw record of every day.
for day_record in date_to_churn.values():
    print(day_record)

# Lines added per day, in the iteration order of the table.
churn = [day_record['lines added'] for day_record in date_to_churn.values()]

# Median, maximum and mean of the daily "lines added" figures.
print(statistics.median(churn))
print(max(churn))
print(mean(churn))


{'lines added': 7, 'lines deleted': 2, 'hashes': ['8c7223f4eac80b5725485be742d3fa2c984f4670'], 'messages': ['Bump version to 3.5.0 (#17911)']}
{'lines added': 16, 'lines deleted': 15, 'hashes': ['ef196c913c77338be5ebb1e02af2f6225f857080', 'ea0cfd8e7ed41b32aaa47eabb1c73845ae843fcb', '24d446adf22644c61e4d61ef612458cf087326dd'], 'messages': ['Fix error MethodError in Chewy::Strategy::Sidekiq::Worker (#17912)\n\nAlso refactor a bit to reduce code duplication.', 'fix: PWA web manifest not changed to new routes (#17921)', 'Bump puma from 5.6.2 to 5.6.4 (#17914)\n\nBumps [puma](https://github.com/puma/puma) from 5.6.2 to 5.6.4.\r\n- [Release notes](https://github.com/puma/puma/releases)\r\n- [Changelog](https://github.com/puma/puma/blob/master/History.md)\r\n- [Commits](https://github.com/puma/puma/compare/v5.6.2...v5.6.4)\r\n\r\n---\r\nupdated-dependencies:\r\n- dependency-name: puma\r\n  dependency-type: direct:production\r\n...\r\n\r\nSigned-off-by: dependabot[bot] <support@github.com>\r\n

In [25]:
from pydriller.metrics.process.process_metric import ProcessMetric


class AuthorsContributions(ProcessMetric):
    """Per-file count of the lines each author added or deleted.

    The commits between ``from_tag`` and ``to_tag`` are traversed once, on
    construction. The result is stored in ``self.contributions`` as
    ``{file path: {author name: added + deleted lines}}``.

    NOTE(review): contributions are keyed by each modification's ``new_path``;
    renames are not followed, so a renamed file is tracked separately under
    each of its paths.
    """

    def __init__(self, path_to_repo: str,
                 from_tag: str = None,
                 to_tag: str = None):
        """Create the commit miner and build the contribution table.

        :param path_to_repo: path of the repository to analyse
        :param from_tag: forwarded to ``Repository`` as ``from_tag``
        :param to_tag: forwarded to ``Repository`` as ``to_tag``
        """
        # NOTE(review): the base-class initializer is not called; the miner is
        # created directly from the tag range.
        self.repo_miner = Repository(path_to_repo=path_to_repo, from_tag=from_tag, to_tag=to_tag)
        self._initialize()

    def _initialize(self):
        """Traverse the commits and accumulate authored lines per file and author."""
        self.contributions = {}

        # Iterate the traversal directly instead of first collecting every
        # commit object in a list.
        for commit in self.repo_miner.traverse_commits():
            for modified_file in commit.modified_files:
                # Entries without a new path are skipped.
                if modified_file.new_path is None:
                    continue
                author = commit.author.name.strip()
                lines_authored = modified_file.added_lines + modified_file.deleted_lines

                per_author = self.contributions.setdefault(modified_file.new_path, {})
                per_author[author] = per_author.get(author, 0) + lines_authored

    def author_contributions(self):
        """Return the ``{file path: {author name: lines}}`` table."""
        return self.contributions

    def count(self):
        """Return, per file, the largest single-author share of its lines in percent.

        :return: ``{file path: percentage}`` rounded to two decimals; files
                 whose total line count is zero are left out
        """
        count = {}
        for file_path, contrib in self.contributions.items():
            total = sum(contrib.values())
            if total != 0:
                count[file_path] = round(100 * max(contrib.values()) / total, 2)

        return count


# Analyse the local checkout configured in the setup cell instead of repeating
# its path as a literal.
contributions = AuthorsContributions(path_to_repo=repo_checkout_path, from_tag=from_tag, to_tag=to_tag)

display(contributions.author_contributions())


{'CHANGELOG.md': {'Eugen Rochko': 328,
  'Claire': 52,
  'Christian Clauss': 4,
  'Hampton Lintorn-Catlin': 1},
 'lib/mastodon/version.rb': {'Eugen Rochko': 22, 'Claire': 2},
 'app/chewy/statuses_index.rb': {'Claire': 2,
  'Jeong Arm': 5,
  'Eugen Rochko': 4},
 'app/helpers/formatting_helper.rb': {'Claire': 1, 'Eugen Rochko': 28},
 'app/lib/feed_manager.rb': {'Claire': 53,
  'dogelover911': 2,
  'Eugen Rochko': 86},
 'app/models/status.rb': {'Claire': 19, 'Jeong Arm': 2, 'Eugen Rochko': 10},
 'app/serializers/manifest_serializer.rb': {'Holger': 6, 'Eugen Rochko': 69},
 'Gemfile.lock': {'dependabot[bot]': 902, 'Claire': 46, 'Eugen Rochko': 15},
 'config/webpack/shared.js': {'Claire': 1},
 'config/application.rb': {'Holger': 1, 'Eugen Rochko': 7, 'gol-cha': 1},
 'config/initializers/paperclip.rb': {'Holger': 20,
  'David Hewitt': 6,
  'Matt Corallo': 2},
 'app/helpers/application_helper.rb': {'Claire': 15,
  'Stefano Pigozzi': 2,
  'Eugen Rochko': 26,
  'Yamagishi Kazutoshi': 4},
 'confi

In [30]:
from collections import Counter
from numpy import argmax

import pandas

author_contributions = contributions.author_contributions()

# For every file, take the author with the highest line count; on a tie the
# author listed first for that file wins.
authors_with_main_authorship = [
    max(per_author, key=per_author.get)
    for per_author in author_contributions.values()
]

# Number of files per main author, most frequent first.
counter = Counter(authors_with_main_authorship)

print(counter.most_common())


[('Eugen Rochko', 1267), ('Claire', 205), ('Yamagishi Kazutoshi', 38), ('Gaelan Steele', 38), ('Takeshi Umeda', 19), ('trwnh', 16), ('Alex Nordlund', 13), ('Erik Sundell', 12), ('pea-sys', 10), ('dependabot[bot]', 9), ('Jeong Arm', 7), ('F', 5), ('CommanderRoot', 4), ('Jeremy Kescher', 4), ('luzpaz', 3), ('Alexander Ivanov', 3), ('Shlee', 2), ('Hampton Lintorn-Catlin', 2), ('Holger', 1), ('0x2019', 1), ('rinsuki', 1), ('Sara Golemon', 1), ('Stefano Pigozzi', 1), ('Arthur Isac', 1), ('Vyr Cossont', 1), ('Meisam', 1), ('Ashish Kurmi', 1), ('tateisu', 1), ('Daniel Jakots', 1), ('Rens Groothuijsen', 1), ('The Stranjer', 1), ('zunda', 1), ('prplecake', 1), ('Yurii Izorkin', 1), ('Yarden Shoham', 1), ('Hayden', 1), ('Chris Rose', 1), ('Sunny Ripert', 1), ('Postmodern', 1), ('Moritz Hedtke', 1), ('Pierre Bourdon', 1)]


In [31]:
from typing import Dict

# NOTE: this rebinds the notebook-wide `repo` to the v3.5.0..v3.5.3 range;
# cells run afterwards that use `repo` see this range.
# The checkout path comes from the setup cell instead of a repeated literal.
repo = Repository(path_to_repo=repo_checkout_path, from_tag="v3.5.0", to_tag="v3.5.3")

# Commit hash -> number of deleted lines in that commit.
deleted_lines = {}

for commit in repo.traverse_commits():
    deleted_lines[commit.hash] = commit.deletions

# Hash of the commit with the most deleted lines (the first one on a tie).
print(max(deleted_lines, key=deleted_lines.get))


9250578e6bfe805e81e65fc5d9684bac3503f189


In [34]:

# ".rb" file path -> number of modifications of that file in the commits of `repo`.
modified_files = {}

for commit in repo.traverse_commits():
    for file in commit.modified_files:
        # Entries without a new path are skipped.
        if file.new_path is None:
            continue
        # Match the ".rb" extension including the dot; a bare "rb" suffix
        # would also count paths such as "*.erb".
        if file.new_path.endswith('.rb'):
            modified_files[file.new_path] = modified_files.get(file.new_path, 0) + 1

display(modified_files)
# Number of ".rb" files that were modified exactly once.
sum(1 for modifications in modified_files.values() if modifications == 1)


{'lib/mastodon/version.rb': 4,
 'app/chewy/statuses_index.rb': 4,
 'app/helpers/formatting_helper.rb': 3,
 'app/lib/feed_manager.rb': 4,
 'app/models/status.rb': 4,
 'app/serializers/manifest_serializer.rb': 1,
 'config/application.rb': 4,
 'config/initializers/paperclip.rb': 1,
 'app/helpers/application_helper.rb': 3,
 'app/serializers/rest/status_serializer.rb': 1,
 'app/controllers/api/v1/trends/tags_controller.rb': 2,
 'app/controllers/api/v1/admin/account_actions_controller.rb': 1,
 'app/controllers/api/v1/admin/accounts_controller.rb': 2,
 'app/controllers/api/v1/admin/dimensions_controller.rb': 1,
 'app/controllers/api/v1/admin/measures_controller.rb': 1,
 'app/controllers/api/v1/admin/reports_controller.rb': 1,
 'app/controllers/api/v1/admin/retention_controller.rb': 1,
 'app/controllers/api/v1/admin/trends/links_controller.rb': 1,
 'app/controllers/api/v1/admin/trends/statuses_controller.rb': 1,
 'app/controllers/api/v1/admin/trends/tags_controller.rb': 1,
 'app/services/remov

133