# Git Repository Analysis Notebook
This notebook analyzes Git commit data loaded from `commits.csv`, supplemented by branch information read directly from the enclosing Git repository.

## 1. Setup
Import necessary libraries.

In [21]:
import os
import pandas as pd
from git import Repo
import re
from collections import Counter

## 2. Load Data
Read commit data and locate the Git repository root.

In [22]:
# Read the commit table; the path is resolved relative to the current working directory.
commits_csv_path = 'commits.csv'
commits = pd.read_csv(commits_csv_path)

# Open the repository containing the working directory (parent directories are
# searched for it), then ask Git for that repository's top-level directory.
start_dir = os.getcwd()
repo = Repo(start_dir, search_parent_directories=True)
git_root = repo.git.rev_parse('--show-toplevel')
print(f"Using Git repository at: {git_root}")

Using Git repository at: /home/oldmartijntje/Documents/code/ICT1_4_ZwerfAfval


## 3. Show All Commits (Head)
Display the first few rows of the commit DataFrame.

In [23]:
# Preview the first five rows of the commit table.
commits.head(n=5)

Unnamed: 0,hexsha,author_name,author_email,date,message,files_changed,insertions,deletions,total_changes,is_merge,branches
0,4b638de3ae2a201e246255b15bc2a1fc93787808,ChrisHorler,csh.sammt@gmail.com,2025-05-16T13:05:11+02:00,Initial commit,1,1,0,1,False,"develop,feature/api_connection_backend,feature..."
1,0e8fc789c0ad05021d1c5495f2a4c8e3a4deeafc,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:17:41+02:00,template data,2,25,0,25,False,"develop,feature/api_connection_backend,feature..."
2,455b509487e9bcea5dd8494d151090b6de6f6313,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:18:00+02:00,Merge branch 'feature/json_templating' into de...,2,25,0,25,True,"develop,feature/api_connection_backend,feature..."
3,2f837d51a2a748908d2c26fc8bc8f73a72064f93,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:21:47+02:00,Add comment to clarify purpose of empty list i...,1,1,1,2,False,"develop,feature/api_connection_backend,feature..."
4,5d72621c38fb9b99601a6da468034e77c12b3391,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:22:01+02:00,Merge branch 'feature/json_templating' into de...,1,1,1,2,True,"develop,feature/api_connection_backend,feature..."


## 4. Stats Per Person
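Compute total commits, merges, insertions, and deletions per author, plus `branches_created`: the number of local and remote branches whose tip commit was authored by that person. (Branch diversity is not currently computed.)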

In [24]:
# Parse the comma-separated `branches` column into a list of branch names per commit,
# dropping empty entries (a missing value becomes an empty list).
commits['branch_list'] = commits['branches'].fillna('').apply(
    lambda names: [name for name in names.split(',') if name]
)

# "Branches created" is approximated as: branches (local heads plus every remote's refs)
# whose tip commit was authored by the person.
# NOTE(review): a branch present both locally and on a remote presumably shows up under
# two different names and would then be counted twice — confirm this is intended.
all_branches = list(repo.branches) + [ref for remote in repo.remotes for ref in remote.refs]
created = {}
skipped_refs = []
for br in all_branches:
    # Only the tip-commit lookup is guarded; a ref that cannot be resolved is
    # recorded and reported instead of being dropped silently.
    try:
        auth = br.commit.author.name
    except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit through
        skipped_refs.append((br.name, exc))
        continue
    created.setdefault(auth, []).append(br.name)

if skipped_refs:
    print(f"Skipped {len(skipped_refs)} ref(s) whose tip commit could not be read:")
    for ref_name, exc in skipped_refs:
        print(f"  {ref_name}: {exc!r}")

# One row per author: commit count, merge count, and total inserted/deleted lines.
author_stats = commits.groupby('author_name').agg(
    commits=('hexsha', 'count'),
    merges=('is_merge', 'sum'),
    insertions=('insertions', 'sum'),
    deletions=('deletions', 'sum'),
).reset_index()

# Authors with no branch tip to their name get 0.
author_stats['branches_created'] = author_stats['author_name'].map(
    lambda name: len(set(created.get(name, [])))
)
author_stats

Unnamed: 0,author_name,commits,merges,insertions,deletions,branches_created
0,Angel,1,1,13122,0,0
1,Angel Zeng,18,2,13815,2290,5
2,ChrisHorler,76,0,3683,1080,12
3,Christian Scott Horler,57,57,10342,5019,6
4,JurPurr,36,1,5423,1722,4
5,Martijn,59,57,65906,9303,6
6,OldMartijntje_Frituurpan,8,1,1298,1146,1
7,OldMartijntje_ubuntu,21,5,4430,2810,3
8,oldmartijntje_TH,94,13,7263,1809,21
9,whoskhoi,24,0,698,559,4


## 5. Most Used Commit Messages
Top 10 most frequent full commit messages.

In [25]:
# Frequency of each distinct full commit message, most frequent first; keep the top ten.
message_counts = commits['message'].value_counts()
message_counts.iloc[:10]

message
Merge branch 'develop'                                                                                    29
Merge remote-tracking branch 'origin/blazor_project_css' into develop                                      2
remove pycache                                                                                             2
wip                                                                                                        2
Merge branch 'feature/control_api_init' into develop                                                       2
Merge remote-tracking branch 'origin/feature/fetch-db-endpoints-v2' into feature/fetch-db-endpoints-v2     2
Merge branch 'feature/poi_model_changes' into develop                                                      2
Merge branch 'feature/setup_python_deployment' into develop                                                2
Merge branch 'feature/trash_prediction_api' into develop                                                   2
Merge branc

## 6. Most Used Words in Commits
Tokenize all commit messages and count the top 20 words.

In [26]:
# An empty message field is read from CSV as NaN (a float), which would make str.join
# raise TypeError; treat missing messages as empty text so they contribute no words.
all_messages = " ".join(commits['message'].fillna('').astype(str))
# Lower-case first so that e.g. "Merge" and "merge" are counted as the same word.
words = re.findall(r"\w+", all_messages.lower())
Counter(words).most_common(20)

[('merge', 139),
 ('develop', 135),
 ('branch', 79),
 ('feature', 67),
 ('from', 64),
 ('request', 58),
 ('pull', 57),
 ('chrishorler', 57),
 ('into', 52),
 ('to', 50),
 ('the', 46),
 ('updated', 43),
 ('added', 41),
 ('for', 32),
 ('and', 24),
 ('api', 22),
 ('deployment', 22),
 ('file', 21),
 ('in', 20),
 ('fixed', 18)]

## 7. Most Common Words in Branch Names
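Split branch names on `/`, `-`, or `_` and count the top 20 tokens. Each branch name is counted once per commit that lists it, so the totals are weighted by commit count rather than by the number of distinct branches.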

In [27]:
# Flatten the per-commit branch lists into one long list; a branch name appears
# once for every commit that lists it.
branches = [name for names in commits['branch_list'] for name in names]

# Break every branch name apart at '/', '-' or '_'.
branch_separator = re.compile(r'[\/\-_]')
tokens = [piece for name in branches for piece in branch_separator.split(name)]

# Ignore the empty strings produced by leading, trailing, or doubled separators.
Counter(piece for piece in tokens if piece).most_common(20)

[('origin', 9518),
 ('feature', 9400),
 ('api', 1520),
 ('blazor', 1187),
 ('fetch', 1080),
 ('db', 1080),
 ('endpoints', 1080),
 ('project', 987),
 ('develop', 968),
 ('connection', 906),
 ('release', 897),
 ('main', 695),
 ('backend', 626),
 ('v2', 606),
 ('trash', 599),
 ('prediction', 599),
 ('class', 566),
 ('diagram', 566),
 ('poi', 490),
 ('model', 490)]

## 8. Personal Stats
For each author: max/min/avg additions, deletions, message word count, and message character count per commit.

In [28]:
def word_count(msg):
    r"""Count the ``\w+`` tokens in a commit message.

    Args:
        msg: Commit message text.

    Returns:
        Number of regex word tokens; 0 when ``msg`` is not a string
        (e.g. NaN for a message field that was empty in the CSV).
    """
    if not isinstance(msg, str):
        return 0
    return len(re.findall(r"\w+", msg))


commits['word_count'] = commits['message'].apply(word_count)
# A missing message has length 0 rather than NaN, consistent with word_count.
commits['char_count'] = commits['message'].fillna('').str.len()

# (column in `commits`, suffix used in the output column names)
measured_columns = [
    ('insertions', 'additions'),
    ('deletions', 'deletions'),
    ('word_count', 'words'),
    ('char_count', 'chars'),
]
# (prefix used in the output column names, Series method that computes it)
summary_methods = [('max', 'max'), ('min', 'min'), ('avg', 'mean')]

# For every author, build max_/min_/avg_ entries for each measured column, in the
# order max_additions, min_additions, avg_additions, max_deletions, ...
personal = {}
for author, grp in commits.groupby('author_name'):
    personal[author] = {
        f'{prefix}_{suffix}': getattr(grp[column], method)()
        for column, suffix in measured_columns
        for prefix, method in summary_methods
    }
pd.DataFrame.from_dict(personal, orient='index')

Unnamed: 0,max_additions,min_additions,avg_additions,max_deletions,min_deletions,avg_deletions,max_words,min_words,avg_words,max_chars,min_chars,avg_chars
Angel,13122,13122,13122.0,0,0,0.0,6,6,6.0,56,56,56.0
Angel Zeng,13122,0,767.5,1867,0,127.222222,16,2,5.555556,102,12,37.166667
ChrisHorler,852,1,48.460526,248,0,14.210526,22,2,6.921053,162,14,45.697368
Christian Scott Horler,2500,1,181.438596,2015,0,88.052632,14,3,5.210526,121,22,41.578947
JurPurr,1850,1,150.638889,1108,0,47.833333,33,1,7.527778,182,11,46.25
Martijn,16368,0,1117.050847,1924,0,157.677966,17,2,8.983051,127,10,66.254237
OldMartijntje_Frituurpan,692,0,162.25,1078,0,143.25,11,2,5.125,70,13,35.125
OldMartijntje_ubuntu,1221,0,210.952381,1221,0,133.809524,11,1,4.428571,90,4,33.809524
oldmartijntje_TH,1239,0,77.265957,261,0,19.244681,19,1,4.010638,104,1,26.989362
whoskhoi,164,1,29.083333,218,0,23.291667,27,2,6.416667,155,17,41.125
