#### Outputs a **.csv** file with metrics on repositories from the following organizations:
- googlesamples
- aws-samples
- Azure-Samples
- spring-guides 
- googlearchive
- spring-cloud-samples
- spring-io
#### The metrics include:
- full_name
- created_at
- description
- forks_count
- language
- open_issues_count
- size
- stargazers_count
- subscribers_count
- updated_at
- watchers_count
- langs_percentage
- contributors count
- issues and pull reqs open and closed
- accepted pull reqs?
- num of commits
- num of branches
- last and first commit dates in the .csv file
- tags
<div class="alert alert-box alert-info">
    <b>Note:</b> You can change the organizations by modifying the list of organizations in the code cell below.
</div>

In [None]:
# GitHub organizations whose repositories are collected; edit this list to
# change which organizations are scanned.
organizations = [
    "googlesamples",
    "aws-samples",
    "Azure-Samples",
    "googlearchive",
    "spring-guides",
    "spring-cloud-samples",
    "spring-io",
]

In [None]:
%pip install PyGithub
%pip install python-dotenv
%pip install pandas
%pip install tqdm
%pip install cachetools

In [None]:
from github import Github
import pandas as pd
from dotenv import load_dotenv
from os import getenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from cachetools import cached, TTLCache

In [None]:
# Load variables from a local .env file into the environment so that the
# token lookup below can find GITHUB_TOKEN.
load_dotenv()
# GitHub API client used by the cells below, built from the GITHUB_TOKEN
# environment variable.
# NOTE(review): getenv returns None when GITHUB_TOKEN is unset; presumably
# PyGithub then falls back to unauthenticated access — confirm.
g = Github(getenv('GITHUB_TOKEN'))

In [None]:
from threading import Lock

# Memoized results are kept for 300 seconds, with at most 100 entries.
cache = TTLCache(maxsize=100, ttl=300)
# fetch_repo_data is submitted to a ThreadPoolExecutor, and cachetools caches
# are not thread-safe on their own, so every cache access is serialized.
cache_lock = Lock()


@cached(cache, lock=cache_lock)
def fetch_repo_data(repo):
    """Collect the metrics of a single repository into a flat dictionary.

    Args:
        repo: Repository object exposing ``get_languages()`` and the
            attributes read below (presumably a PyGithub ``Repository``).

    Returns:
        dict: The repository metrics. ``langs_percentage`` maps each language
        name to its share of the repository, formatted as a percentage string
        with two decimals (for example ``"42.00%"``); it is empty when the
        repository reports no languages.
    """
    repo_langs = repo.get_languages()
    # The values are per-language sizes as reported by the API, not line counts.
    total_size = sum(repo_langs.values())
    langs_percentage = {
        # Guard against a zero total so an all-zero report cannot raise
        # ZeroDivisionError and drop the whole repository.
        lang: f'{(size / total_size if total_size else 0.0):.2%}'
        for lang, size in repo_langs.items()
    }

    return {
        "full_name": repo.full_name,
        "description": repo.description,
        "created_at": repo.created_at,
        "updated_at": repo.updated_at,
        "size": repo.size,
        "main_language": repo.language,
        "forks_count": repo.forks_count,
        "stargazers_count": repo.stargazers_count,
        "subscribers_count": repo.subscribers_count,
        "watchers_count": repo.watchers_count,
        "langs_percentage": langs_percentage,
    }

In [None]:
def get_org_repos(organization_name, language=None):
    """Fetch the metrics of an organization's repositories as a DataFrame.

    Private repositories, archived repositories and a fixed list of known
    non-sample repositories are skipped, as are repositories rejected by the
    organization-specific description rules below.

    Args:
        organization_name: GitHub organization login to scan.
        language: When given, keep only repositories whose main language
            equals this value exactly (case sensitive). ``None`` keeps all.

    Returns:
        pandas.DataFrame: One row per successfully fetched repository, with
        the columns produced by ``fetch_repo_data`` plus ``framework`` (the
        organization name). Repositories whose fetch raised are reported on
        stdout and left out.
    """
    # Known non-sample repositories; a set gives O(1) membership tests.
    non_samples = {
        "googlearchive/digits-migration-helper-android",
        "googlearchive/play-apk-expansion",
        "googlearchive/tiger",
        "googlearchive/two-token-sw",
        "googlearchive/Abelana-Android",
        "googlearchive/solutions-mobile-backend-starter-java",
    }

    def filter_repo(repo):
        """Return True when the repository passes the language and description rules."""
        if language and repo.language != language:
            return False
        description = (repo.description or "").lower()
        if organization_name == "googlearchive":
            # NOTE(review): this drops repositories whose description mentions
            # "example", "sample" or "migrated" — confirm that exclusion
            # (rather than inclusion) is what is intended.
            if any(keyword in description for keyword in ("example", "sample", "migrated")):
                return False
        elif organization_name == "SAP-samples":
            # Repositories without a description are kept, as before.
            if description and "cloud" not in description:
                return False
        return True

    organization = g.get_organization(organization_name)
    # NOTE(review): archived repositories are skipped here, yet the exclusion
    # list and one filter rule target the "googlearchive" organization —
    # confirm its repositories are not all archived, otherwise nothing is
    # collected for it.
    repos = [
        repo
        for repo in organization.get_repos()
        if repo.full_name not in non_samples
        and not repo.private
        and not repo.archived
        and filter_repo(repo)
    ]

    data_list = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_repo = {executor.submit(fetch_repo_data, repo): repo for repo in repos}
        # The total is the number of submitted tasks, so the bar can actually
        # reach 100% even when repositories were filtered out.
        progress = tqdm(
            as_completed(future_to_repo),
            total=len(future_to_repo),
            desc=organization_name,
            unit=" repos",
            ncols=100,
            bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}] {percentage:3.0f}%',
        )
        for future in progress:
            try:
                data = future.result()
            except Exception as e:
                # Best effort: one failing repository must not abort the whole organization.
                print(f"Error fetching data for repo: {future_to_repo[future].full_name}, error: {e}")
            else:
                # Build a new dict instead of mutating the one held by the memoization cache.
                data_list.append({**data, "framework": organization_name})

    return pd.DataFrame(data_list)

<div class='alert alert-box alert-info'>
    Below is the code that generates the <b>.csv</b> file.
    You can change the language by modifying the <i style='color: red'>language</i> variable in the code cell below.
</div>
<div class='alert alert-box alert-warning'>
    <b>Note:</b> The <i style='color: blue'>language</i> variable is case-sensitive; set it to <i style='color: blue'>None</i> if you want to get all the repositories.
</div>

In [None]:
# ANSI escape sequences used to colour the progress messages. They live in
# variables because a backslash inside an f-string replacement field is a
# SyntaxError before Python 3.12.
MAGENTA = "\033[95m"
GREEN = "\033[92m"
RESET = "\033[0m"

# Main language to keep (case sensitive); None keeps every repository.
language = None

frames = []
for organization in organizations:
    print(f'Retrieving repos from {MAGENTA}{organization} {RESET}so that their GitHub data is processed...')
    frames.append(get_org_repos(organization, language))
    print(f'{GREEN}Data from {MAGENTA}{organization} {GREEN}was processed successfully!{RESET}')

# Concatenate once instead of re-copying the accumulated frame on every
# iteration; pd.concat rejects an empty list, hence the fallback.
dataframe = pd.concat(frames) if frames else pd.DataFrame()
dataframe.to_csv("codesamples.csv", index=False)