#### Outputs a **.csv** file with metrics on repositories from the following organizations:
- googlesamples
- aws-samples
- Azure-Samples
- spring-guides 
- googlearchive
- spring-cloud-samples
#### The metrics include:
- full_name
- created_at
- description
- description
- forks_count
- language
- open_issues_count
- size
- stargazers_count
- subscribers_count
- updated_at
- watchers_count
- langs_percentage
<div class="alert alert-box alert-info">
    <b>Note:</b> You can change the organizations by modifying the list of organizations in the code cell below.
</div>

In [1]:
organizations = ["googlesamples", #"aws-samples", "Azure-Samples", "spring-guides", "googlearchive", "spring-cloud-samples"
                 ]

In [2]:
%pip install PyGithub
%pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\oheit\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\oheit\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
from github import Github
import pandas as pd
from dotenv import load_dotenv
from os import getenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [4]:
load_dotenv()
g = Github(getenv('GITHUB_TOKEN'))

In [5]:
def get_org_repos(organization_name, language=None):
    non_samples = ["googlearchive/digits-migration-helper-android", "googlearchive/play-apk-expansion", "googlearchive/tiger", "googlearchive/two-token-sw", "googlearchive/Abelana-Android", "googlearchive/solutions-mobile-backend-starter-java"]
    organization = g.get_organization(organization_name)
    repos = organization.get_repos("all")
    total_repos = len(list(repos))
    dataframe = pd.DataFrame(columns=["full_name", "created_at", "description", "forks_count", "language", "open_issues_count", "size", "stargazers_count", "subscribers_count", "updated_at", "watchers_count", "langs_percentage"])
    for repo in tqdm(repos, desc=organization_name, unit=" repos", ncols=100, total=total_repos, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}] {percentage:3.0f}%'):
        if repo.full_name in non_samples:
            continue
        
        if (language == None or repo.language == language):
            if (organization_name == "googlearchive"):
                if (not repo.description == None and not ("example" in repo.description.lower() or "sample" in repo.description.lower() or "example" in repo.full_name.lower() or "sample" in repo.full_name.lower() or "migrated" in repo.description.lower())):
                    continue
            elif (organization_name == "SAP-samples"):
                if (not repo.description == None and not ("cloud" in repo.description.lower() or "cloud" in repo.full_name.lower())):
                    continue
        else:
            continue
        
        repo_langs = repo.get_languages()
        total_lines = sum(repo_langs.values())
        langs_percentage = {lang: f'{(lines/total_lines):.2%}' for lang, lines in repo_langs.items()}
        dataframe = pd.concat([dataframe, pd.DataFrame(data={"framework": [organization_name],"full_name": [repo.full_name], "created_at": [repo.created_at], "description": [repo.description], "forks_count": [repo.forks_count], "language": [repo.language], "open_issues_count": [repo.open_issues_count], "size": [repo.size], "stargazers_count": [repo.stargazers_count], "subscribers_count": [repo.subscribers_count], "updated_at": [repo.updated_at], "watchers_count": [repo.watchers_count], "langs_percentage": [langs_percentage]})], ignore_index=True)
        # print(g.get_rate_limit())
    return dataframe

<div class='alert alert-box alert-info'>
    Below is the code that generates the <b>.csv</b> file.
    You can change the language by modifying the <i style='color: red'>language</i> variable in the code cell below.
</div>
<div class='alert alert-box alert-warning'>
    <b>Note:</b> The <i style='color: blue'>language</i> variable is case sensitive and can be <i style='color: blue'>None</i>, if you want to get all the repositories.
</div>

In [6]:
dataframe = pd.DataFrame()
language = None
for organization in organizations:
    print(f'Retrieving repos from {organization} so that their github data is taken...')
    dataframe = pd.concat([dataframe, get_org_repos(organization, language)])
    print(f'done with {organization}')
dataframe.to_csv("codesamples.csv", index=False)

Retrieving repos from googlesamples so that their github data is taken...


  dataframe = pd.concat([dataframe, pd.DataFrame(data={"framework": [organization_name],"full_name": [repo.full_name], "created_at": [repo.created_at], "description": [repo.description], "forks_count": [repo.forks_count], "language": [repo.language], "open_issues_count": [repo.open_issues_count], "size": [repo.size], "stargazers_count": [repo.stargazers_count], "subscribers_count": [repo.subscribers_count], "updated_at": [repo.updated_at], "watchers_count": [repo.watchers_count], "langs_percentage": [langs_percentage]})], ignore_index=True)
googlesamples: 100%|███████████████████████████████████████| 71/71 [01:21<00:00,  1.15s/ repos] 100%

done with googlesamples





In [7]:
language_usage = {}
for repo in dataframe["langs_percentage"]:
    for language, percentage in repo.items():
        language_usage[language] = language_usage.get(language, 0) + float(percentage.strip("%"))

total_usage = sum(language_usage.values())

formatted_languages = sorted(
    [(language, f'{percentage / total_usage:.2%}') for language, percentage in language_usage.items()],
    key=lambda item: float(item[1].strip('%')),
    reverse=True
)

pd.DataFrame(formatted_languages, columns=["Language", "Usage Percentage"]).to_csv("languages_usage_from_organizations.csv", index=False)