In [1]:
import pandas as pd

# Descriptive statistic: e-mail provider types appearing in commit author fields.
# Loads the commit dataset from "full.csv" (path is relative to the kernel's
# working directory) into `df`; the code below reads its 'author' column.
df = pd.read_csv("full.csv")

# Extract Email Type From Author Commit
def extract_email_type(author):
    """Classify a commit author field by the e-mail provider it mentions.

    Matching is a case-insensitive substring test, tried in the order
    GitHub no-reply, Gmail, Yahoo, Outlook; the first hit wins.

    Parameters
    ----------
    author : object
        Value of the 'author' column for one commit. Non-string values
        (e.g. the NaN that pandas uses for a missing cell) are accepted.

    Returns
    -------
    str
        "Private Email", "Gmail", "Yahoo", "Outlook", or "Unknown" when no
        known domain is present or `author` is not a string.
    """
    # Missing cells arrive as float NaN; calling .lower() on them would raise
    # AttributeError, so classify anything that is not text as "Unknown".
    if not isinstance(author, str):
        return "Unknown"

    lowered = author.lower()

    # Order matters: checked top to bottom, first matching domain decides.
    domain_labels = (
        ("@users.noreply.github.com", "Private Email"),
        ("@gmail.com", "Gmail"),
        ("@yahoo.com", "Yahoo"),
        ("@outlook.com", "Outlook"),
    )
    for domain, label in domain_labels:
        if domain in lowered:
            return label
    return "Unknown"

# Tag every commit row with the e-mail category of its author field.
df['Author Email Type'] = df['author'].map(extract_email_type)

# Tally commits per e-mail category (most frequent first) and print the table.
email_counts = df['Author Email Type'].value_counts()
print(email_counts)

Author Email Type
Unknown          3749922
Gmail             497509
Private Email      79081
Outlook             6268
Yahoo               3519
Name: count, dtype: int64


In [5]:
import pandas as pd
import re

# Descriptive statistic: total number of commits per timezone offset.
# Re-reads "full.csv" into `df`, replacing whatever `df` held before this cell;
# the code below reads its 'date' column.
df = pd.read_csv("full.csv")

# Extract Timezone From Date
def extract_timezone(date_str):
    """Pull the first signed four-digit token (e.g. "+0530") out of a date string.

    Parameters
    ----------
    date_str : object
        Raw value of the 'date' column for one commit.

    Returns
    -------
    str
        The first substring made of '+' or '-' followed by four digits, or
        'unknown' when `date_str` is not a string or contains no such token.
    """
    # Non-text values (such as NaN for a missing cell) carry no offset.
    if not isinstance(date_str, str):
        return 'unknown'

    # Unanchored search: the leftmost sign-plus-four-digits run is taken.
    found = re.search(r'[-+]\d{4}', date_str)
    return found.group() if found else 'unknown'

# Derive the timezone token of every commit from its raw date string.
df['Timezone'] = df['date'].map(extract_timezone)

# Tally commits per timezone token (most frequent first) and print the table.
timezone_counts = df['Timezone'].value_counts()
print(timezone_counts)

Timezone
+0000    2060543
-0700     627215
-0800     347714
+0200     320798
+0100     287057
-0400     156506
-0500     128272
+0800      91295
+0300      75476
+0900      50249
+0530      46481
-0300      38877
-0600      37784
+1000      25150
+0400      13255
+1100      13168
-0200       4948
+0700       2796
-1000       1873
+1200       1835
+1300       1779
+1030       1088
+0930        791
+0500        698
+0600        294
+0430        130
+0330         76
+0545         46
-0900         27
+0206         26
-1100         13
-0230          9
+0106          9
-0100          8
-0430          4
+1400          3
-0330          3
+0159          2
+0630          1
Name: count, dtype: int64


In [5]:
import pandas as pd
import re

# Descriptive statistic: the 20 repositories with the most commits.
# Re-reads "full.csv" into `df`, replacing whatever `df` held before this cell;
# the code below reads its 'repo' column.
df = pd.read_csv("full.csv")

# Extract Repo Name From Repo
def extract_repo_name(repo):
    """Return the last '/'-separated segment of a repo identifier.

    Parameters
    ----------
    repo : object
        Raw value of the 'repo' column for one commit.

    Returns
    -------
    str or None
        The text after the final '/', stripped of surrounding whitespace
        (an empty string when `repo` ends with '/'). None when `repo` is not
        a string or contains no '/'.

    Notes
    -----
    Everything before the final '/' is discarded, so identifiers that differ
    only in that prefix map to the same name.
    """
    # Reject missing values and strings without a path separator up front.
    if not isinstance(repo, str) or '/' not in repo:
        return None

    # Only the tail after the last separator is kept.
    tail = repo.rsplit('/', 1)[-1]
    return tail.strip()

# Reduce every repo identifier to its bare name (text after the last '/').
df['Repo Name'] = df['repo'].map(extract_repo_name)

# Tally commits per repo name (most frequent first) and print the top 20.
repo_counts = df['Repo Name'].value_counts()
print(repo_counts.iloc[:20])

Repo Name
linux           998326
chromium        995884
llvm-project    386114
freebsd-src     258157
src             212189
gcc             184430
rust            142093
swift           119903
cpython         109619
tensorflow      109154
kubernetes       99961
vscode           80577
v8               68575
postgres         51334
go               47713
matplotlib       38649
pytorch          35726
rstudio          34681
node             33535
httpd            32515
Name: count, dtype: int64


In [21]:
import pandas as pd
from datetime import datetime, timezone
import re

# Descriptive statistic: the 20 most-committed repositories with their first
# commit date and age in days.
# Re-reads "full.csv" into `df`, replacing whatever `df` held before this cell;
# the code below reads its 'repo' and 'date' columns.
df = pd.read_csv("full.csv")

# Extract Repo Name From Repo
def extract_repo_name(repo):
    """Return the last '/'-separated segment of a repo identifier.

    Parameters
    ----------
    repo : object
        Raw value of the 'repo' column for one commit.

    Returns
    -------
    str or None
        The text after the final '/', stripped of surrounding whitespace
        (an empty string when `repo` ends with '/'). None when `repo` is not
        a string or contains no '/'.

    Notes
    -----
    Everything before the final '/' is discarded, so identifiers that differ
    only in that prefix map to the same name.
    """
    # Reject missing values and strings without a path separator up front.
    if not isinstance(repo, str) or '/' not in repo:
        return None

    # Only the tail after the last separator is kept.
    tail = repo.rsplit('/', 1)[-1]
    return tail.strip()

# Reduce every repo identifier to its bare name (text after the last '/').
df['Repo Name'] = df['repo'].map(extract_repo_name)

# Parse commit dates as timezone-aware UTC timestamps; values that cannot be
# parsed become NaT instead of raising.
df['Parsed Date'] = pd.to_datetime(df['date'], utc=True, errors='coerce')

# Earliest parsed commit timestamp per repo name.
first_commit_dates = df.groupby('Repo Name')['Parsed Date'].min()

# Number of commits per repo name.
repo_counts = df['Repo Name'].value_counts()

# Align both series on repo name, drop rows with a missing value in either
# column, then order by commit count (largest first).
summary = pd.DataFrame({
    'Total Commits': repo_counts,
    'First Commit Date': first_commit_dates
})
summary = summary.dropna()
summary = summary.sort_values(by='Total Commits', ascending=False)

# Age is measured against the wall clock at execution time, so the
# 'Age (Days)' values change from run to run.
now = datetime.now(timezone.utc)
elapsed = now - summary['First Commit Date']
summary['Age (Days)'] = elapsed.dt.days

print(summary[['First Commit Date', 'Age (Days)']].head(20))

                     First Commit Date  Age (Days)
Repo Name                                         
linux        1970-01-01 00:00:01+00:00       20233
chromium     2001-08-24 14:11:42+00:00        8675
llvm-project 2001-06-06 20:29:01+00:00        8753
freebsd-src  1993-06-12 14:49:13+00:00       11670
src          1995-10-18 08:37:01+00:00       10812
gcc          1988-11-23 07:17:23+00:00       13332
rust         2010-06-16 21:30:45+00:00        5456
swift        2010-07-17 23:50:59+00:00        5425
cpython      1990-08-09 14:25:15+00:00       12708
tensorflow   2015-11-07 00:27:58+00:00        3487
kubernetes   2014-06-06 23:40:48+00:00        4005
vscode       2015-11-13 13:39:38+00:00        3481
v8           2008-06-30 17:16:55+00:00        6172
postgres     1996-07-09 06:22:35+00:00       10547
go           1972-07-19 00:05:45+00:00       19303
matplotlib   2003-05-12 15:20:38+00:00        8049
pytorch      2012-01-25 13:55:20+00:00        4869
rstudio      2010-12-07 21:31:5