In [1]:
from enum import Enum 

class Modification(Enum):
    ADDED = "Lines added"
    REMOVED = "Lines removed"
    TOTAL = "Lines added + lines removed"
    DIFF = "Lines added - lines removed"


In [2]:
pip install tqdm


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pydriller
from collections import defaultdict
import tqdm
from pathlib import Path
from git import Repo
from datetime import datetime

#Path to Repo
path_to_repo = "./mastodon"
repo_path = Path('./mastodon')

#From/To Tags
start_tag =  'v3.5.0'
current_tag = 'v3.5.3'

#Starting/End date YYYY,M,D
start_date = datetime(2019,1,1)
end_date = datetime(2019,12,31)

#File type --> if all files need to be considered ""
file_type = '.rb'

#if we need to consider only one path 
#--> SE1="app\\""  --> SE2=whole
path_spec = ''

# Clone the repository
if repo_path.exists():
    print('Repository exists!')
else:
    #  clone using the python git library
    Repo.clone_from('https://github.com/mastodon/mastodon', path_to_repo)

#Repo from to start tag
repo = pydriller.Repository(path_to_repo, from_tag=start_tag, to_tag=current_tag)

#Repo from since start to --> time
#repo = pydriller.Repository(path_to_repo, since=start_date, to=end_date)

#List with all commits
commits = list(repo.traverse_commits())
progress = tqdm.tqdm(unit="commit", total=len(commits))

#unique files / authors (set)
files_by_author = defaultdict(set)
authors_per_file = defaultdict(set)
commit_hashes = []
files_2 = []


#detect only modified files --> created/deleted files are not intresting
for commit in commits:
    commit_hashes.append(commit.hash)
    for modified_file in commit.modified_files:
        author = commit.author.name.strip()
        file = str(modified_file.new_path)
        
        if not file:
            continue
            
        if not file.endswith(file_type):
            continue
        
        if not file.startswith(path_spec):
            continue

        files_2.append(modified_file)
        files_by_author[author].add(file)        
        authors_per_file[file].add(author)
    
    progress.update(1)


Repository exists!


 93%|█████████▎| 199/214 [00:02<00:00, 73.35commit/s]

In [4]:
#Accessing single commit
c = commits[0]

#Acessing path of modified file
print(c.modified_files[0].new_path)
print(c.author.name)
print(c.author.email)

CHANGELOG.md
Eugen Rochko
eugen@zeonfederated.com


# **Hash**

In [5]:
# CALL ANYTHING ON A SPECIFIC COMMIT by HASH

# ----------------------------------
#INDEX = commit_hashes.index('a01580f09f33c275fcc0ffe616b5b5b403f46cae')
#commits[INDEX].insertions

In [6]:
def get_commit_info_from_hash(c_hash):
    git = pydriller.Git(path_to_repo)
    commit = git.get_commit(c_hash)
    print("Name: ", commit.author.name)
    print("Date: ", commit.author_date)
    print("Commit message: ", commit.msg)
    print("Hash: ", commit.hash)
    print("Number of files: ", commit.files)
    print("Insertions: ", commit.insertions)
    print("Deletions: ", commit.deletions)
    print("Modified files: ")
    for f in commit.modified_files: print("* ", f.new_path, "   +",  f.added_lines, "   -", f.deleted_lines)

#get_commit_info_from_hash('a01580f09f33c275fcc0ffe616b5b5b403f46cae')

In [7]:
# EXAMPLE: INSERTIONS
# commit_xy = [(commit.hash, commit.insertions) for commit in commits]
# commit_xy

# EXAMPLE: MODIFIED_FILES
#commit_xx = [(commit.hash, commit.modified_files) for commit in commits]
#commit_xx

# **Unique Author**

In [8]:
#find unique authors in commits
authors = []
authors_total = []
modified_files = 0

for commit in commits:
    authors_total.append(commit.author.name)
    name = commit.author.name
    modified_files += len(commit.modified_files)

    if name not in authors:
        authors.append(name)

#print("Authors Total: ", authors_total)    
print("Number of unique authors: ", len(set(authors)))
print("Number of total authors: ", len(authors_total))
print('Total Number of modified files: ', modified_files)
print("Unique Authors: \n", authors)



Number of unique authors:  19
Number of total authors:  214
Total Number of modified files:  1524
Unique Authors: 
 ['Eugen Rochko', 'Claire', 'Holger', 'dependabot[bot]', 'Ondřej Pokorný', 'CommanderRoot', 'rinsuki', 'Yamagishi Kazutoshi', '0x2019', 'dogelover911', 'Alexandra Catalina', 'Jeong Arm', 'Chris Dzombak', 'Gaelan Steele', 'mayaeh', 'Sara Golemon', 'Stefano Pigozzi', 'luzpaz', 'James Smith']


# **Number of Commits per Author**

In [9]:
# find number of commits
print("Total commits: ",len(commits))

# find number of commits per author
commits_per_author = {}
for author in authors_total:
    if author not in commits_per_author:
        commits_per_author[author] = 1
    else:
        commits_per_author[author] += 1

commits_per_author_sorted = sorted(commits_per_author.items(), key=lambda x:x[1], reverse=True)
print(commits_per_author_sorted)

Total_commits = list(commits_per_author.values())
print("Total commits: ",sum(Total_commits))


Total commits:  214
[('dependabot[bot]', 85), ('Eugen Rochko', 56), ('Claire', 45), ('Yamagishi Kazutoshi', 9), ('Jeong Arm', 3), ('Holger', 2), ('rinsuki', 2), ('Ondřej Pokorný', 1), ('CommanderRoot', 1), ('0x2019', 1), ('dogelover911', 1), ('Alexandra Catalina', 1), ('Chris Dzombak', 1), ('Gaelan Steele', 1), ('mayaeh', 1), ('Sara Golemon', 1), ('Stefano Pigozzi', 1), ('luzpaz', 1), ('James Smith', 1)]
Total commits:  214


# **Number of Files per Author**

In [10]:
files_by_author_count = { author : len(files_by_author[author]) for author in files_by_author}
files_by_author_count = {k: v for k, v in sorted(files_by_author_count.items(), key=lambda item: item[1], reverse=True)}
files_by_author_count

{'Eugen Rochko': 127,
 'Claire': 51,
 'Holger': 3,
 '0x2019': 3,
 'Jeong Arm': 3,
 'Yamagishi Kazutoshi': 2,
 'Stefano Pigozzi': 2,
 'rinsuki': 1,
 'dogelover911': 1,
 'Sara Golemon': 1,
 'luzpaz': 1}

# **Number of Authors working per File**

In [11]:
authors_per_file_count = { file : len(authors_per_file[file]) for file in authors_per_file}
sorted(authors_per_file_count.items(), key=lambda x:x[1], reverse=True)


[('app/chewy/statuses_index.rb', 3),
 ('app/lib/feed_manager.rb', 3),
 ('app/helpers/application_helper.rb', 3),
 ('lib/mastodon/version.rb', 2),
 ('app/helpers/formatting_helper.rb', 2),
 ('app/models/status.rb', 2),
 ('config/application.rb', 2),
 ('app/services/remove_status_service.rb', 2),
 ('app/mailers/user_mailer.rb', 2),
 ('app/services/activitypub/fetch_featured_collection_service.rb', 2),
 ('app/services/activitypub/process_status_update_service.rb', 2),
 ('config/environments/production.rb', 2),
 ('app/lib/activitypub/activity/create.rb', 2),
 ('db/schema.rb', 2),
 ('app/services/resolve_account_service.rb', 2),
 ('config/initializers/stoplight.rb', 2),
 ('app/lib/activitypub/activity/announce.rb', 2),
 ('app/serializers/manifest_serializer.rb', 1),
 ('config/initializers/paperclip.rb', 1),
 ('app/serializers/rest/status_serializer.rb', 1),
 ('app/controllers/api/v1/trends/tags_controller.rb', 1),
 ('app/controllers/api/v1/admin/account_actions_controller.rb', 1),
 ('app/co

# **Lines added per Commit**

In [12]:
#Number added lines within a commit
commit_lines = dict()
for commit in commits:
    commit_lines[commit.hash] = 0
    for file in commit.modified_files:
        if file.filename.endswith(file_type):
            commit_lines[commit.hash] += file.added_lines

print("Number of commits:", len(commit_lines))
print(sorted(commit_lines.items(), key=lambda x:x[1], reverse=True))

Number of commits: 214
[('a9b64b24d6c076cb96a66307c07d4f0158dc07da', 294), ('3917353645b91dae04f7d9b81162fead6f73072a', 243), ('2b8dc58b7ff7fb708687c08a75c99b3fb30efc49', 233), ('8f91e304a5adb98b657a5c096359d0423a5d7e84', 220), ('440eb71310e41d668f00980b73358edd5f8df043', 194), ('71d02ffcf3a79dfc1c413dcc7ff45c77ce9cb94c', 146), ('6221b36b278c02cdbf5b6d1c0753654b506b44fd', 115), ('6cf57c676550068a59149ca82d63fcb5b5431158', 115), ('6476f7e4da4da7c353d497aae5a86fc3909ce532', 81), ('679b7158e3cd3881e8cbaf2d2c0c97725b3b5fd9', 59), ('0360135d4d39d838fa9b090abc1e76284eb8ff64', 53), ('8e20e16cf030fef48850df4764bbf13a317f6b0c', 50), ('f6d35ed57d156f4225338a89372c8e83721e46c9', 43), ('e0bdaeab657d9a320aaf506d98ca82d41e7bfdd5', 42), ('9b4024a3892c48aaf2f6e86fc014360f2dd098f4', 34), ('7b0fe4aef97c6a5f73a03146b669a415f396799c', 28), ('3e0e7a1cfb617837ccada330afc13ed804c3c47b', 24), ('39b489ba4c362997b41dd039971b8b510f6fe10d', 20), ('84d991988eb076a7d83c771b3266f66f1c8a9754', 20), ('465ee7792ff48905

# **Entities**

In [13]:
# find number of changes per file
changes = {}
for commit in commits:
    for mod in commit.modified_files:
        if mod.filename not in changes:
            changes[mod.filename] = 1
        else:
            changes[mod.filename] += 1
print("Number of changes: ", len(changes))
#print(changes)
print("Number of entities",  sum(list(changes.values())))

Number of changes:  520
Number of entities 1524


In [14]:
# find number of entities and number of unique entities
entities = []
entities_unique = set()
for commit in commits:
    for mod in commit.modified_files:
        entities.append(mod.filename)
        entities_unique.add(mod.filename)
        #print(mod.filename)
print("Number of Total entities: ", len(entities))
print("Number of different entities: ", len(entities_unique))

Number of Total entities:  1524
Number of different entities:  520


# **Knowledge loss**

In [15]:
repo = pydriller.Repository(path_to_repo, from_tag=start_tag, to_tag=current_tag)

method = Modification.TOTAL

commits = list(repo.traverse_commits())
progress = tqdm.tqdm(unit="commit", total=len(commits))

#files per author
files_authors = defaultdict(dict)

#detect lines added per file and author
for commit in commits:
    for modified_file in commit.modified_files:
        author = commit.author.name.strip()
        file = str(modified_file.new_path)

        if not file:
            continue

        if not file.endswith(file_type):
            continue

        if author not in files_authors[file]:
            files_authors[file][author] = 0
            
        if method == Modification.ADDED:
            files_authors[file][author] += modified_file.added_lines
        elif method == Modification.REMOVED:
            files_authors[file][author] += modified_file.deleted_lines
        elif method == Modification.TOTAL:
            files_authors[file][author] += modified_file.added_lines + modified_file.deleted_lines
        elif method == Modification.DIFF:
            files_authors[file][author] += modified_file.added_lines - modified_file.deleted_lines
        
    progress.update(1)

files_authors

100%|██████████| 214/214 [00:03<00:00, 66.08commit/s]


defaultdict(dict,
            {'lib/mastodon/version.rb': {'Eugen Rochko': 6, 'Claire': 2},
             'app/chewy/statuses_index.rb': {'Claire': 2,
              'Jeong Arm': 5,
              'Eugen Rochko': 4},
             'app/helpers/formatting_helper.rb': {'Claire': 1,
              'Eugen Rochko': 28},
             'app/lib/feed_manager.rb': {'Claire': 23,
              'dogelover911': 2,
              'Eugen Rochko': 6},
             'app/models/status.rb': {'Claire': 13, 'Jeong Arm': 2},
             'app/serializers/manifest_serializer.rb': {'Holger': 6},
             'config/application.rb': {'Holger': 1, 'Eugen Rochko': 5},
             'config/initializers/paperclip.rb': {'Holger': 20},
             'app/helpers/application_helper.rb': {'Claire': 7,
              'Stefano Pigozzi': 2,
              'Eugen Rochko': 4},
             'app/serializers/rest/status_serializer.rb': {'rinsuki': 4},
             'app/controllers/api/v1/trends/tags_controller.rb': {'Eugen Rochko': 

In [16]:
contribs_data = {}

for file, author_contribs in files_authors.items():
    total_contribs = sum(author_contribs.values())
    current_percentages = {}
    main_author = None
    max_author_percentage = 0
    if total_contribs != 0:
        for author, contribs in author_contribs.items():
            author_percentage = contribs / total_contribs
            if author_percentage > max_author_percentage:
                max_author_percentage = author_percentage
                main_author = author
        
        contribs_data[file] = {
            "main_author": main_author,
            "main_author_percentage": round(max_author_percentage, 2),
            "total_contribs": total_contribs
        }
        
        
contribs_data

{'lib/mastodon/version.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.75,
  'total_contribs': 8},
 'app/chewy/statuses_index.rb': {'main_author': 'Jeong Arm',
  'main_author_percentage': 0.45,
  'total_contribs': 11},
 'app/helpers/formatting_helper.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.97,
  'total_contribs': 29},
 'app/lib/feed_manager.rb': {'main_author': 'Claire',
  'main_author_percentage': 0.74,
  'total_contribs': 31},
 'app/models/status.rb': {'main_author': 'Claire',
  'main_author_percentage': 0.87,
  'total_contribs': 15},
 'app/serializers/manifest_serializer.rb': {'main_author': 'Holger',
  'main_author_percentage': 1.0,
  'total_contribs': 6},
 'config/application.rb': {'main_author': 'Eugen Rochko',
  'main_author_percentage': 0.83,
  'total_contribs': 6},
 'config/initializers/paperclip.rb': {'main_author': 'Holger',
  'main_author_percentage': 1.0,
  'total_contribs': 20},
 'app/helpers/application_helper.rb': {'main_aut

# **GIT UNIX Shell**

In [17]:
# run inside \mastodon\app
git ls-files | awk -F . '{print $NF}' | sort | uniq -c | sort -n -r | awk '{print $2,$1}' | head -10
rb 895
js 437
haml 214
json 178
png 37
erb 33
scss 32
svg 24
woff2 5
woff 5

# ls-files iterates through each line
# awk -F --> sepeartor -f-->reads from file not first line argument
# print $NF --> counts number of fields within current input record
# print $2$1 --> print second column first, then first column


find . -type f -name "*.rb" | awk -F / '{print $(NF)}' > mastodon_../rubyfiles.txt
# find names of ruby files and store them in a listcat


#week 8 slide 19
git log --numstat --since "Jan 1 2019" --until "DEC 31 2019" > ../commits2019.txt --> writes 
git log --numstat -n 1 cfd49dd646c0e13fa54645ff27a5c59c1f96dc8b
cat logs.log | grep ".java" | awk -F '\t' '{print $3}' | sort | uniq –c



SyntaxError: invalid character '–' (U+2013) (514944160.py, line 27)

