# Git Repository Analysis

This notebook analyzes the commits and contributors in a Git repository using the output from `git_stats.py`.

## Requirements:
- Ensure `commits.csv` and `author_summary.csv` are in the current directory.
- These files should be generated using the `git_stats.py` script provided earlier.

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud

sns.set(style='whitegrid')
df = pd.read_csv('commits.csv')
authors = pd.read_csv('author_summary.csv')

## 1. All Commits (head)

In [18]:
df.head()

Unnamed: 0,hexsha,author_name,author_email,date,message,files_changed,insertions,deletions,total_changes,is_merge,primary_branch
0,4b638de3ae2a201e246255b15bc2a1fc93787808,ChrisHorler,csh.sammt@gmail.com,2025-05-16T13:05:11+02:00,Initial commit,1,1,0,1,False,develop
1,0e8fc789c0ad05021d1c5495f2a4c8e3a4deeafc,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:17:41+02:00,template data,2,25,0,25,False,develop
2,455b509487e9bcea5dd8494d151090b6de6f6313,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:18:00+02:00,Merge branch 'feature/json_templating' into de...,2,25,0,25,True,develop
3,2f837d51a2a748908d2c26fc8bc8f73a72064f93,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:21:47+02:00,Add comment to clarify purpose of empty list i...,1,1,1,2,False,develop
4,5d72621c38fb9b99601a6da468034e77c12b3391,OldMartijntje_ubuntu,oldmartijntje@gmail.com,2025-05-27T12:22:01+02:00,Merge branch 'feature/json_templating' into de...,1,1,1,2,True,develop


## 2. Author Stats

In [19]:
# Branch diversity: count unique branches per author
branch_diversity = df.groupby('author_name')['primary_branch'].nunique().reset_index(name='branch_diversity')

# Merge with author summary
author_stats = pd.merge(authors, branch_diversity, on='author_name')
author_stats.sort_values(by='total_commits', ascending=False)

Unnamed: 0,author_name,total_commits,total_merges,insertions,deletions,total_changes,most_commits,most_merges,most_changes,branch_diversity
8,oldmartijntje_TH,164,22,19550,9431,28981,True,False,False,4
2,ChrisHorler,109,2,6918,2689,9607,False,False,False,2
3,Christian Scott Horler,82,82,60112,6697,66809,False,True,False,2
5,Martijn,81,74,155000,14009,169009,False,False,True,4
9,whoskhoi,46,1,1566,731,2297,False,False,False,3
4,JurPurr,36,1,5423,1722,7145,False,False,False,3
1,Angel Zeng,28,2,44303,16837,61140,False,False,False,5
7,OldMartijntje_ubuntu,21,5,4430,2810,7240,False,False,False,1
6,OldMartijntje_Frituurpan,8,1,1298,1146,2444,False,False,False,1
0,Angel,1,1,13122,0,13122,False,False,False,1


## 3. Most Used Commit Messages

In [20]:
df['message'].value_counts().head(10)

message
Merge branch 'develop'                                                                                    38
Merge branch 'feature/trash_prediction_api' into develop                                                   2
Merge remote-tracking branch 'origin/feature/frontend-analytics' into feature/waste_page_functionality     2
Merge branch 'feature/setup_python_deployment' into develop                                                2
CalendarFeaturesRequest                                                                                    2
remove comments                                                                                            2
Deleted barchart.razor                                                                                     2
Added more old code  In program.cs and from the graph and barchart                                         2
Added files from the old code  Added services and models from previous frontend code.                      2
working on 

## 4. Most Used Words in Commits

In [21]:
all_words = ' '.join(df['message']).lower()
words = re.findall(r'\b\w+\b', all_words)
common_words = Counter(words).most_common(20)
pd.DataFrame(common_words, columns=['word', 'count'])

Unnamed: 0,word,count
0,develop,199
1,merge,196
2,branch,113
3,feature,99
4,from,91
5,into,78
6,request,77
7,pull,76
8,chrishorler,76
9,to,65


## most common branch names

In [22]:
branches = df['primary_branch'].dropna().astype(str)
split_words = [re.split(r'[\/\-_]', branch) for branch in branches]
flat_words = [word.lower() for sublist in split_words for word in sublist if word]
common_branch_words = Counter(flat_words).most_common(20)
pd.DataFrame(common_branch_words, columns=['word', 'count'])

Unnamed: 0,word,count
0,develop,507
1,origin,54
2,head,41
3,> origin,41
4,main,41
5,feature,26
6,blazor,10
7,project,10
8,frontend,8
9,analytics,8


In [24]:
branches = df['primary_branch'].dropna().astype(str)
common_branch_words = Counter(branches).most_common(20)
pd.DataFrame(common_branch_words, columns=['word', 'count'])

Unnamed: 0,word,count
0,develop,507
1,origin/HEAD -> origin/main,41
2,feature/frontend-analytics-graph-barchart,8
3,origin/feature/blazor_project_WasteCalendar,7
4,feature/dbcontext_issue_aproach1,2
5,feature/dbcontext_issue_aproach2,2
6,feature/fetch-db-endpoints,2
7,origin/feature/C4_diagrams,2
8,origin/feature/blazor_project_first_layout,1
9,origin/blazor_project_css,1


## 6. Personal Commit Stats

In [23]:
def personal_commit_stats(author):
    # Make an explicit copy so that pandas knows you’re not working on a view
    subset = df[df['author_name'] == author].copy()

    # Now it’s safe to assign new columns
    subset['word_count'] = subset['message'].str.split().str.len()
    subset['char_count'] = subset['message'].str.len()

    return {
        'most_additions': subset['insertions'].max(),
        'least_additions': subset['insertions'].min(),
        'avg_additions': subset['insertions'].mean(),
        'most_words': subset['word_count'].max(),
        'least_words': subset['word_count'].min(),
        'avg_words': subset['word_count'].mean(),
        'most_chars': subset['char_count'].max(),
        'least_chars': subset['char_count'].min(),
        'avg_chars': subset['char_count'].mean(),
    }

pd.DataFrame({a: personal_commit_stats(a)
              for a in df['author_name'].unique()}).T


Unnamed: 0,most_additions,least_additions,avg_additions,most_words,least_words,avg_words,most_chars,least_chars,avg_chars
ChrisHorler,852.0,0.0,63.46789,24.0,1.0,6.284404,162.0,11.0,42.027523
OldMartijntje_ubuntu,1221.0,0.0,210.952381,11.0,1.0,4.047619,90.0,4.0,33.809524
JurPurr,1850.0,1.0,150.638889,33.0,1.0,7.277778,182.0,11.0,46.25
Martijn,34843.0,0.0,1913.580247,12.0,2.0,7.395062,127.0,10.0,67.012346
OldMartijntje_Frituurpan,692.0,0.0,162.25,11.0,2.0,5.0,70.0,13.0,35.125
Christian Scott Horler,15264.0,1.0,733.073171,11.0,3.0,4.45122,121.0,22.0,45.926829
oldmartijntje_TH,2659.0,0.0,119.207317,19.0,1.0,3.603659,104.0,1.0,26.390244
whoskhoi,209.0,0.0,34.043478,27.0,2.0,6.369565,155.0,11.0,41.608696
Angel Zeng,15195.0,0.0,1582.25,37.0,2.0,6.464286,217.0,12.0,45.5
Angel,13122.0,13122.0,13122.0,5.0,5.0,5.0,56.0,56.0,56.0
