# Code

In [1]:
# Install the third-party packages for this notebook with the %pip magic.
# NOTE(review): versions are unpinned, and matplotlib / seaborn are not imported
# by any cell visible here — confirm they are still needed.
%pip install PyGithub python-dotenv pandas tqdm matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import datetime
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import getenv

import pandas as pd
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm


In [3]:
# GitHub organizations whose repositories are harvested by generate_metrics_csv().
# NOTE(review): the trailing numbers look like approximate repository counts per
# organization at the time of writing — confirm.
ORGANIZATIONS = [
    "spring-guides", # 74
    "spring-cloud-samples", # 29
    "googlesamples", # 71
    "googlearchive", # 973
    "Azure-Samples", # 2.6k
    "aws-samples", # 6.3k
]
# Repositories, written as "owner/name", that are skipped outright: process_repo
# compares repo.full_name against this list before doing any other work.
EXCLUDED_REPOS = [
    "googlearchive/digits-migration-helper-android",
    "googlearchive/play-apk-expansion",
    "googlearchive/tiger",
    "googlearchive/two-token-sw",
    "googlearchive/Abelana-Android",
    "googlearchive/solutions-mobile-backend-starter-java"
]

In [4]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Report a missing token up front instead of letting the run degrade into a
# stream of 403 responses: unauthenticated GitHub REST access has a much lower
# rate limit than token-authenticated access, and this notebook issues several
# requests per repository.
if not getenv('GITHUB_TOKEN'):
    print("Warning: GITHUB_TOKEN is not set or empty; GitHub API requests will be heavily rate-limited.")

# Shared client used by every cell below; per_page=100 requests large pages so
# paginated listings need fewer round trips.
github_client = Github(getenv('GITHUB_TOKEN'), per_page=100)

In [5]:
def fetch_organization_repositories(organization_name, language=None, max_workers=None):
    """Collect per-repository metrics for every repository of a GitHub organization.

    Repositories are processed concurrently on a thread pool. A repository is
    skipped when it is listed in ``EXCLUDED_REPOS``, fails the keyword filter
    applied to the ``googlearchive`` / ``SAP-samples`` organizations, has size 0,
    or raises while being processed (the error is printed, not re-raised).

    Args:
        organization_name: Login of the GitHub organization to scan.
        language: Currently unused; kept so existing callers keep working.
        max_workers: Upper bound on worker threads, forwarded to
            ``ThreadPoolExecutor``. ``None`` (the default) keeps the executor's
            own default, i.e. the previous behavior. Lower it to reduce the
            number of concurrent requests when GitHub answers with 403 errors.

    Returns:
        pandas.DataFrame with one row per retained repository (empty when no
        repository is retained). Rows appear in completion order, not in
        listing order.
    """
    organization = github_client.get_organization(organization_name)
    repositories = organization.get_repos("all")
    total_repositories = repositories.totalCount

    def process_repo(repo):
        """Build the metrics row for one repository, or return None to skip it."""
        try:
            if repo.full_name in EXCLUDED_REPOS:
                return None

            # Organization-specific keyword filters.
            if organization_name == "googlearchive" and not is_valid_googlearchive_repo(repo):
                return None
            if organization_name == "SAP-samples" and not is_valid_sap_sample_repo(repo):
                return None

            if repo.size == 0:
                return None

            repo_languages = repo.get_languages()
            # NOTE(review): the GitHub languages endpoint reports bytes per
            # language, so this is presumably a byte total despite the
            # "total_lines" column name — confirm before interpreting it.
            total_lines = sum(repo_languages.values())
            language_percentages = {
                lang: f'{(lines/total_lines):.2%}'
                for lang, lines in repo_languages.items()
            }

            # Let the API count for us instead of materializing every issue.
            # NOTE(review): GitHub's issues listing also returns pull requests,
            # so both figures include PRs (as they did before) — confirm that
            # is the intended metric.
            issues_count = repo.get_issues(state="all").totalCount
            closed_issues_count = repo.get_issues(state="closed").totalCount

            pulls = repo.get_pulls(state="all")
            total_pulls_count = pulls.totalCount
            # Single pass over the pull requests for all three tallies.
            open_pulls_count = 0
            closed_pulls_count = 0
            merged_pulls_count = 0
            for pr in pulls:
                if pr.state == "open":
                    open_pulls_count += 1
                if pr.state == "closed":
                    closed_pulls_count += 1
                if pr.merged_at is not None:
                    merged_pulls_count += 1

            commits = repo.get_commits()
            commits_count = commits.totalCount
            # Records when this row was collected (local time).
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

            if commits_count == 0:
                first_commit_date = None
                last_commit_date = None
            else:
                # Commits are listed newest first, so the oldest one is the
                # first element of the reversed listing.
                first_commit_date = commits.reversed[0].commit.author.date
                last_commit_date = commits[0].commit.author.date

            branches_count = repo.get_branches().totalCount
            contributors_count = repo.get_contributors().totalCount

            topics = repo.get_topics()

            return {
                "full_name": repo.full_name,
                "name": repo.name,
                "owner": repo.owner.login,
                "html_url": repo.html_url,
                "description": repo.description,
                "language": repo.language,
                "created_at": repo.created_at,
                "updated_at": repo.updated_at,
                "pushed_at": repo.pushed_at,
                "size": repo.size,
                "stargazers_count": repo.stargazers_count,
                "watchers_count": repo.watchers_count,
                "forks_count": repo.forks_count,
                "open_issues_count": repo.open_issues_count,
                "subscribers_count": repo.subscribers_count,
                "network_count": repo.network_count,
                "archived": repo.archived,
                "total_lines": total_lines,
                "langs_percentage": language_percentages,
                "issues_count": issues_count,
                "closed_issues_count": closed_issues_count,
                "closed_pulls_count": closed_pulls_count,
                "open_pulls_count": open_pulls_count,
                "total_pulls_count": total_pulls_count,
                "merged_pulls_count": merged_pulls_count,
                "commits_count": commits_count,
                "first_commit_date": first_commit_date,
                "last_commit_date": last_commit_date,
                "branches_count": branches_count,
                "topics": topics,
                "contributors_count": contributors_count,
                "timestamp": timestamp
            }
        except Exception as e:
            # Best effort: one failing repository must not abort the whole
            # organization, so report it and move on.
            print(f"Error processing repository {repo.full_name}: {e}")
            return None

    repo_data = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_repo, repo) for repo in repositories]
        progress = tqdm(
            as_completed(futures),
            desc=organization_name,
            unit=" repos",
            total=total_repositories,
            ncols=100,
        )
        for future in progress:
            result = future.result()
            if result:
                repo_data.append(result)

    return pd.DataFrame(repo_data)

def _repo_mentions_any_keyword(repo, keywords):
    """Return True when a described repository mentions one of ``keywords``.

    The match is a case-insensitive substring test against the repository's
    description and its full name. A repository with no (or an empty)
    description is rejected outright, even if its name would match.
    """
    if not repo.description:
        return False
    description = repo.description.lower()
    full_name = repo.full_name.lower()
    return any(keyword in description or keyword in full_name for keyword in keywords)


def is_valid_googlearchive_repo(repo):
    """Return True if ``repo`` passes the keyword filter used for googlearchive.

    Args:
        repo: Object exposing ``description`` and ``full_name`` attributes.

    Returns:
        bool: True when the repository has a description and "example",
        "sample" or "migrated" occurs in its description or full name.
    """
    return _repo_mentions_any_keyword(repo, ["example", "sample", "migrated"])


def is_valid_sap_sample_repo(repo):
    """Return True if ``repo`` passes the keyword filter used for SAP-samples.

    Args:
        repo: Object exposing ``description`` and ``full_name`` attributes.

    Returns:
        bool: True when the repository has a description and "cloud" occurs
        in its description or full name.
    """
    return _repo_mentions_any_keyword(repo, ["cloud"])


In [6]:
def save_organization_results(dataframe, organization, output_dir="results"):
    """Write one organization's metrics to a timestamped CSV file.

    The file is named ``<organization>_<YYYYmmdd_HHMMSS>.csv`` (local time) and
    is written without the DataFrame index.

    Args:
        dataframe: pandas.DataFrame to persist.
        organization: Organization name used as the file-name prefix.
        output_dir: Directory receiving the file; created if it does not
            exist. Defaults to ``"results"``, the previously hard-coded
            location.
    """
    # to_csv does not create missing parent directories, so without this a
    # fresh checkout lacking the output folder fails only after the (slow)
    # data collection has already finished.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(output_dir, f"{organization}_{timestamp}.csv")
    dataframe.to_csv(file_path, index=False)

In [7]:
def generate_metrics_csv():
    """Collect metrics for every organization in ``ORGANIZATIONS`` and save them.

    Each organization is fetched on its own worker thread. Non-empty results
    are written to a per-organization CSV via ``save_organization_results``,
    then all of them are concatenated and written to
    ``results/codesamples_<YYYYmmdd_HHMMSS>.csv``.

    Returns:
        pandas.DataFrame holding the rows of all organizations (empty when
        nothing was collected; the combined CSV is still written).

    Raises:
        Whatever ``fetch_organization_repositories`` or the CSV writers raise;
        ``future.result()`` re-raises worker exceptions here.
    """
    # Create the output folder before any network work starts: to_csv does not
    # create missing directories, and failing after the collection would waste
    # the whole run.
    os.makedirs("results", exist_ok=True)

    def fetch_data_for_organization(organization):
        """Fetch one organization, persist it when non-empty, and return its frame."""
        print(f'Retrieving repos from {organization}...')
        organization_repos_data = fetch_organization_repositories(organization)
        if organization_repos_data.empty:
            return pd.DataFrame()
        save_organization_results(organization_repos_data, organization)
        return organization_repos_data

    all_repos_data = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_data_for_organization, org) for org in ORGANIZATIONS]
        # as_completed() is a generator with no length, so tqdm needs an
        # explicit total to show real progress instead of "0 orgs".
        progress = tqdm(
            as_completed(futures),
            desc="Processing organizations",
            unit=" orgs",
            total=len(futures),
        )
        for future in progress:
            org_data = future.result()
            if not org_data.empty:
                all_repos_data.append(org_data)

    all_repos_data_df = pd.concat(all_repos_data, ignore_index=True) if all_repos_data else pd.DataFrame()
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = f"results/codesamples_{timestamp}.csv"
    all_repos_data_df.to_csv(file_path, index=False)

    return all_repos_data_df

# Results

In [8]:
# Run the full collection for every configured organization; this writes the
# per-organization and combined CSV files under results/ and returns the
# combined frame.
dataframe = generate_metrics_csv()
# Bare last expression so the notebook renders the DataFrame.
dataframe

Retrieving repos from googlearchive...


Processing organizations: 0 orgs [00:00, ? orgs/s]Request GET /repos/googlearchive/drive-ios-quickeditor/contributors?per_page=1 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/appengine-try-python-webapp2/branches?per_page=1 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/android-credentials/pulls?state=all&per_page=1 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/udacity-60fps-samples/commits?per_page=100 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/android-XYZTouristAttractions/commits?per_page=100 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/android-AlwaysOn/commits?per_page=100 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/attendee-checkin/contributors?per_page=1 failed with 403: Forbidden
Retrying after 60 seconds
Request GET /repos/googlearchive/

Unnamed: 0,full_name,name,owner,html_url,description,language,created_at,updated_at,pushed_at,size,...,open_pulls_count,total_pulls_count,merged_pulls_count,commits_count,first_commit_date,last_commit_date,branches_count,topics,contributors_count,timestamp
0,googlearchive/storage-bulk-delete-python,storage-bulk-delete-python,googlearchive,https://github.com/googlearchive/storage-bulk-...,Example program to showcase the use of bulk re...,Python,2013-05-15 06:03:13+00:00,2023-01-28 09:19:36+00:00,2015-09-23 21:35:44+00:00,155,...,0,1,1,15,2013-05-15 05:38:18+00:00,2015-09-23 21:35:44+00:00,1,[],2,20240809_225242
1,googlearchive/instantbuy-sample-xyz-java,instantbuy-sample-xyz-java,googlearchive,https://github.com/googlearchive/instantbuy-sa...,Mobile Web Sample using Instant Buy API,Java,2013-09-27 22:19:45+00:00,2024-07-29 10:34:43+00:00,2018-02-23 00:53:42+00:00,4413,...,0,0,0,5,2013-09-27 22:19:45+00:00,2018-02-23 00:53:41+00:00,1,[],2,20240809_225242
2,googlearchive/gplus-verifytoken-python,gplus-verifytoken-python,googlearchive,https://github.com/googlearchive/gplus-verifyt...,This sample demonstrates how to verify authent...,HTML,2013-05-05 09:12:47+00:00,2023-01-28 18:21:12+00:00,2017-02-14 21:15:31+00:00,11,...,0,3,3,15,2013-05-05 09:12:48+00:00,2017-02-14 21:15:30+00:00,1,[],6,20240809_225234
3,googlearchive/storage-getting-started-go,storage-getting-started-go,googlearchive,https://github.com/googlearchive/storage-getti...,This is a simple example of calling the Google...,Go,2013-06-05 03:14:23+00:00,2023-01-28 09:23:20+00:00,2016-11-29 19:36:06+00:00,20,...,0,5,4,23,2013-06-05 03:14:23+00:00,2016-11-29 19:36:06+00:00,1,[],4,20240809_225242
4,googlearchive/compute-getting-started-java,compute-getting-started-java,googlearchive,https://github.com/googlearchive/compute-getti...,This sample java command line application demo...,Java,2013-11-22 22:42:03+00:00,2023-01-28 09:23:21+00:00,2016-04-19 16:45:43+00:00,29,...,0,4,4,26,2013-11-22 22:42:03+00:00,2016-04-19 16:45:38+00:00,3,[],5,20240809_225249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,googlearchive/android-SharingShortcuts,android-SharingShortcuts,googlearchive,https://github.com/googlearchive/android-Shari...,Migrated:,Java,2019-03-13 18:32:49+00:00,2024-05-19 05:34:17+00:00,2019-10-04 18:16:48+00:00,1162,...,0,1,0,4,2019-03-13 18:35:59+00:00,2019-10-04 18:16:48+00:00,1,[],2,20240809_225444
236,googlearchive/android-viewpager2,android-viewpager2,googlearchive,https://github.com/googlearchive/android-viewp...,Migrated:,,2019-03-14 15:33:54+00:00,2024-02-03 18:46:52+00:00,2019-10-21 21:35:22+00:00,769,...,0,6,4,35,2019-01-15 17:00:30+00:00,2019-10-18 21:07:10+00:00,1,[],6,20240809_225444
237,googlearchive/android-Bubbles,android-Bubbles,googlearchive,https://github.com/googlearchive/android-Bubbles,Migrated:,Kotlin,2019-04-03 16:33:18+00:00,2023-01-28 15:25:27+00:00,2020-01-09 04:03:20+00:00,1009,...,0,3,1,4,2019-04-03 16:43:48+00:00,2020-01-09 04:03:19+00:00,2,[],3,20240809_225444
238,googlearchive/android-DarkTheme,android-DarkTheme,googlearchive,https://github.com/googlearchive/android-DarkT...,migrated:,Java,2019-05-07 18:45:05+00:00,2024-05-26 17:16:02+00:00,2019-10-08 16:55:27+00:00,497,...,0,7,0,3,2019-05-07 18:45:56+00:00,2019-10-08 16:55:27+00:00,1,[],3,20240809_225444
