#### Outputs a **.csv** file with metrics on repositories from the following organizations:
- googlesamples
- aws-samples
- Azure-Samples
- spring-guides 
- googlearchive
- spring-cloud-samples
- spring-io
#### The metrics include:
- full_name
- name
- owner
- html_url
- description
- created_at
- updated_at
- pushed_at
- size
- language
- forks_count
- stargazers_count
- subscribers_count
- watchers_count
- network_count
- archived
- total_lines
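- langs_percentage
- issues_count
- closed_issues_count
- open_issues_count
- total_issues_count
- closed_pulls_count
- open_pulls_count
- total_pulls_count
- merged_pulls_count
- commits_count
- first_commit_date
- last_commit_date
- branches_count
- topics
- contributors_count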
<div class="alert alert-box alert-info">
    <b>Note:</b> You can change the organizations by modifying the list of organizations in the code cell below.
</div>

In [8]:
# GitHub organizations whose repositories are scanned. Commented-out entries
# are skipped. The trailing figures look like approximate repository counts
# per organization -- NOTE(review): confirm.
ORGANIZATIONS = [
    # "aws-samples", # 6.3k
    # "Azure-Samples", # 2.6k
    # "googlesamples", # 71
    # "spring-guides", # 74
    # "googlearchive", # 973
    "spring-cloud-samples", # 29
]

# Repositories, written as "owner/name", that are compared against
# `repo.full_name` and left out of the results.
EXCLUDED_REPOS = [
    "googlearchive/digits-migration-helper-android",
    "googlearchive/play-apk-expansion",
    "googlearchive/tiger",
    "googlearchive/two-token-sw",
    "googlearchive/Abelana-Android",
    "googlearchive/solutions-mobile-backend-starter-java"
]

In [9]:
%pip install PyGithub python-dotenv pandas tqdm cachetools matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [10]:
import datetime
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import getenv

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from cachetools import cached, TTLCache
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm

In [11]:
# Load settings from a .env file (if present) into the environment, then build
# the shared GitHub client from the GITHUB_TOKEN variable, asking for 100
# items per page on paginated requests.
load_dotenv()
g = Github(getenv('GITHUB_TOKEN'), per_page=100)

In [12]:
# Shared result cache: at most 1024 entries, each kept for 300 seconds.
cache = TTLCache(maxsize=1024, ttl=300)
# cachetools caches are not thread-safe; this lock guards `cache`, which is
# read and written by several worker threads at the same time.
_cache_lock = threading.Lock()


def fetch_organization_repositories(organization_name, language=None):
    """Collect metrics for the repositories of one GitHub organization.

    Every repository of the organization is processed on a thread pool.
    Repositories are skipped when they are listed in ``EXCLUDED_REPOS``, when
    ``language`` is given and differs from the repository's primary language,
    or when an organization-specific filter ("googlearchive", "SAP-samples")
    rejects them. A repository whose processing raises is reported on stdout
    and skipped, so one failure does not abort the whole organization.

    Args:
        organization_name: Login of the GitHub organization to scan.
        language: Primary language to keep (case-sensitive), or ``None`` to
            keep repositories of every language.

    Returns:
        A ``pandas.DataFrame`` with one row per retained repository; empty
        when no repository is retained.
    """
    organization = g.get_organization(organization_name)
    repos = organization.get_repos("all")

    # The result depends on `language` and `organization_name` as well as on
    # the repository, so all three are part of the cache key; keying on the
    # repository alone would return stale rows after a call with a different
    # language filter.
    @cached(
        cache,
        key=lambda repo: (repo.full_name, language, organization_name),
        lock=_cache_lock,
    )
    def process_repo(repo):
        """Return the metrics dict for ``repo``, or ``None`` if filtered out."""
        if repo.full_name in EXCLUDED_REPOS:
            return None
        if language and repo.language != language:
            return None
        if organization_name == "googlearchive" and not is_valid_googlearchive_repo(repo):
            return None
        elif organization_name == "SAP-samples" and not is_valid_sap_sample_repo(repo):
            return None

        # NOTE(review): the GitHub languages endpoint reports bytes of code per
        # language, so "total_lines" is presumably a byte count -- confirm.
        repo_languages = repo.get_languages()
        total_lines = sum(repo_languages.values())
        language_percentages = {lang: f'{(lines/total_lines):.2%}' for lang, lines in repo_languages.items()}

        # NOTE(review): GitHub's issues endpoint also returns pull requests, so
        # these issue counts presumably include PRs -- confirm this is intended.
        all_issues = list(repo.get_issues(state="all"))
        open_issues_count = sum(1 for issue in all_issues if issue.state == "open")
        closed_issues_count = len(all_issues) - open_issues_count

        all_pulls = list(repo.get_pulls(state='all'))
        open_pulls_count = sum(1 for pull in all_pulls if pull.state == "open")
        closed_pulls_count = len(all_pulls) - open_pulls_count
        # `merged_at` is part of the pull-request list payload, whereas reading
        # `merged` makes PyGithub fetch every pull request individually.
        merged_pulls_count = sum(1 for pull in all_pulls if pull.merged_at is not None)

        # Commits are listed newest first: index 0 is the latest, -1 the oldest.
        commits = list(repo.get_commits())
        commits_count = len(commits)
        first_commit_date = commits[-1].commit.author.date if commits else None
        last_commit_date = commits[0].commit.author.date if commits else None

        return {
            "full_name": repo.full_name,
            "name": repo.name,
            "owner": repo.owner.login,
            "html_url": repo.html_url,
            "description": repo.description,
            "created_at": repo.created_at,
            "updated_at": repo.updated_at,
            "pushed_at": repo.pushed_at,
            "size": repo.size,
            "language": repo.language,
            "forks_count": repo.forks_count,
            "stargazers_count": repo.stargazers_count,
            "subscribers_count": repo.subscribers_count,
            "watchers_count": repo.watchers_count,
            "network_count": repo.network_count,
            "archived": repo.archived,
            "total_lines": total_lines,
            "langs_percentage": language_percentages,
            "issues_count": len(all_issues),
            "closed_issues_count": closed_issues_count,
            "open_issues_count": open_issues_count,
            "total_issues_count": closed_issues_count + open_issues_count,
            "closed_pulls_count": closed_pulls_count,
            "open_pulls_count": open_pulls_count,
            "total_pulls_count": closed_pulls_count + open_pulls_count,
            "merged_pulls_count": merged_pulls_count,
            "commits_count": commits_count,
            "first_commit_date": first_commit_date,
            "last_commit_date": last_commit_date,
            "branches_count": int(repo.get_branches().totalCount),
            "topics": str(tuple(repo.topics)),
            "contributors_count": int(repo.get_contributors().totalCount),
        }

    repo_data = []
    with ThreadPoolExecutor() as executor:
        # Map each future back to its repository so a failure can be reported
        # by name (a list of futures cannot be indexed by a future).
        future_to_repo = {executor.submit(process_repo, repo): repo for repo in repos}
        for future in tqdm(as_completed(future_to_repo), total=len(future_to_repo), desc=organization_name, unit=" repos", ncols=100, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}] {percentage:3.0f}%'):
            repo = future_to_repo[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"Error fetching data for repo: {repo.full_name}, error: {e}")
                continue
            # Filtered-out repositories yield None and must not become rows.
            if result is not None:
                repo_data.append(result)

    return pd.DataFrame(repo_data)

def is_valid_googlearchive_repo(repo):
    """Tell whether a googlearchive repository looks like a code sample.

    A repository without a description is always rejected. Otherwise it is
    accepted when its description or its full name contains, ignoring case,
    one of "example", "sample" or "migrated".
    """
    markers = ("example", "sample", "migrated")

    description = repo.description
    if not description:
        return False

    lowered_description = description.lower()
    for marker in markers:
        if marker in lowered_description:
            return True

    # Only fall back to the name when the description gave no match.
    lowered_name = repo.full_name.lower()
    for marker in markers:
        if marker in lowered_name:
            return True
    return False

def is_valid_sap_sample_repo(repo):
    """Tell whether a SAP-samples repository is cloud related.

    A repository without a description is always rejected. Otherwise it is
    accepted when its description or its full name contains "cloud",
    ignoring case.
    """
    description = repo.description
    if not description:
        return False

    if "cloud" in description.lower():
        return True
    # Only fall back to the name when the description gave no match.
    return "cloud" in repo.full_name.lower()

<div class='alert alert-box alert-info'>
    Below is the code that generates the <b>.csv</b> metrics file.
    You can restrict the results to a single language through the <i style='color: red'>language</i> argument in the code cell below.
</div>
<div class='alert alert-box alert-warning'>
    <b>Note:</b> The <i style='color: blue'>language</i> value is case-sensitive; set it to <i style='color: blue'>None</i> to get all the repositories.
</div>

In [13]:

def generate_metrics_csv(name, language=None):
    """Fetch metrics for every configured organization and save them as CSV.

    The organizations listed in ``ORGANIZATIONS`` are fetched concurrently,
    their per-repository metrics are concatenated, and the result is written
    to ``codesamples/<name>_<YYYYmmdd_HHMMSS>.csv``. The ``codesamples``
    directory is created when missing. The file is written even when no
    repository was retained.

    Args:
        name: Prefix of the generated CSV file name.
        language: Primary language to keep (case-sensitive), or ``None``
            (the default) to keep repositories of every language.

    Returns:
        A ``pandas.DataFrame`` holding the rows that were written to the file.
    """
    # exist_ok avoids the check-then-create race of a separate existence test.
    os.makedirs("codesamples", exist_ok=True)

    def fetch_data_for_organization(organization):
        print(f'Retrieving repos from {organization}...')
        return fetch_organization_repositories(organization, language=language)

    all_repos_data = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_data_for_organization, org) for org in ORGANIZATIONS]
        for future in tqdm(as_completed(futures), desc="Processing organizations", unit=" orgs", total=len(ORGANIZATIONS), ncols=100, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}] {percentage:3.0f}%'):
            org_data = future.result()
            # Skip organizations that produced no rows.
            if not org_data.empty:
                all_repos_data.append(org_data)

    # pd.concat rejects an empty list, hence the explicit empty fallback.
    all_repos_data_df = pd.concat(all_repos_data, ignore_index=True) if all_repos_data else pd.DataFrame()
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    file_path = f"codesamples/{name}_{timestamp}.csv"
    all_repos_data_df.to_csv(file_path, index=False)

    return all_repos_data_df

In [14]:
# Generate the CSV (file name prefixed with 'spring_cloud') and show the
# resulting DataFrame as the cell output.
dataframe = generate_metrics_csv('spring_cloud')
dataframe

Retrieving repos from spring-cloud-samples...


Processing organizations:   0%|                                       | 0/1 [00:00<?, ? orgs/s]   0%

