In [None]:
%pip install PyGithub python-dotenv pandas tqdm matplotlib seaborn

In [None]:
from os import getenv
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm


In [None]:
# GitHub organizations whose repositories are collected by generate_metrics_csv().
# The commented-out assignment below is a longer list of organizations; only the
# active assignment is used.
# ORGANIZATIONS = ["googlesamples", "aws-samples", "Azure-Samples", "spring-guides", "googlearchive", "spring-cloud-samples"]
ORGANIZATIONS = ["googlesamples"]
# Repositories, written as "owner/name", that fetch_organization_repositories()
# skips by comparing against repo.full_name. Every entry is under "googlearchive",
# which is not in the active ORGANIZATIONS list above.
EXCLUDED_REPOS = [
    "googlearchive/digits-migration-helper-android",
    "googlearchive/play-apk-expansion",
    "googlearchive/tiger",
    "googlearchive/two-token-sw",
    "googlearchive/Abelana-Android",
    "googlearchive/solutions-mobile-backend-starter-java"
]

In [None]:
# Load environment variables via python-dotenv, then build the module-level
# GitHub client that fetch_organization_repositories() uses.
# NOTE(review): getenv returns None when GITHUB_TOKEN is unset; presumably the
# client then makes unauthenticated requests with a much lower rate limit — confirm.
load_dotenv()
github_client = Github(getenv('GITHUB_TOKEN'))

In [None]:
def fetch_organization_repositories(organization_name, language=None):
    """Collect per-repository metrics for one GitHub organization.

    Repositories are skipped when they are listed in EXCLUDED_REPOS, when they
    do not match the optional ``language`` filter, or when an
    organization-specific validator rejects them ("googlearchive" and
    "SAP-samples" each have one).

    Args:
        organization_name: Login of the GitHub organization to scan.
        language: Optional primary language to keep (compared with
            ``repo.language``). ``None`` or any other falsy value disables the
            filter, so every repository is kept.

    Returns:
        pandas.DataFrame with one row per retained repository. Besides the
        attributes copied from the repository object it contains:

        * ``total_lines`` - sum of the per-language values returned by
          ``repo.get_languages()``. GitHub documents these values as bytes of
          code, so despite the column name this is a byte count.
        * ``langs_percentage`` - dict mapping each language to its share of
          ``total_lines`` formatted as a percentage string such as "12.34%".
    """
    organization = github_client.get_organization(organization_name)
    repositories = organization.get_repos("all")
    repo_data = []

    progress = tqdm(
        repositories,
        desc=organization_name,
        unit=" repos",
        total=repositories.totalCount,
        ncols=100,
    )
    for repo in progress:
        if repo.full_name in EXCLUDED_REPOS:
            continue

        # Honour the caller's language filter; a falsy value keeps everything.
        if language and repo.language != language:
            continue

        if organization_name == "googlearchive" and not is_valid_googlearchive_repo(repo):
            continue
        if organization_name == "SAP-samples" and not is_valid_sap_sample_repo(repo):
            continue

        repo_languages = repo.get_languages()
        total_lines = sum(repo_languages.values())
        # Guard the division so a zero total can never raise ZeroDivisionError.
        language_percentages = {
            lang: f'{(lines / total_lines if total_lines else 0):.2%}'
            for lang, lines in repo_languages.items()
        }
        repo_data.append({
            "full_name": repo.full_name,
            "name": repo.name,
            "owner": repo.owner.login,
            "html_url": repo.html_url,
            "description": repo.description,
            "language": repo.language,
            "created_at": repo.created_at,
            "updated_at": repo.updated_at,
            "pushed_at": repo.pushed_at,
            "size": repo.size,
            "stargazers_count": repo.stargazers_count,
            "watchers_count": repo.watchers_count,
            "forks_count": repo.forks_count,
            "open_issues_count": repo.open_issues_count,
            "subscribers_count": repo.subscribers_count,
            "network_count": repo.network_count,
            "archived": repo.archived,
            "total_lines": total_lines,
            "langs_percentage": language_percentages
        })

    return pd.DataFrame(repo_data)

def is_valid_googlearchive_repo(repo):
    """Return True when a googlearchive repository looks like a code sample.

    A repository without a description is always rejected. Otherwise it is
    accepted when "example", "sample" or "migrated" occurs (case-insensitively)
    in its description or, failing that, in its full name.
    """
    description = repo.description
    if not description:
        return False

    markers = ("example", "sample", "migrated")

    def mentions_marker(text):
        lowered = text.lower()
        for marker in markers:
            if marker in lowered:
                return True
        return False

    # `or` keeps the full-name lookup lazy: it only runs when the description
    # did not match.
    return mentions_marker(description) or mentions_marker(repo.full_name)

def is_valid_sap_sample_repo(repo):
    """Return True when an SAP-samples repository mentions "cloud".

    A repository without a description is always rejected. Otherwise it is
    accepted when "cloud" occurs (case-insensitively) in its description or,
    failing that, in its full name.
    """
    description = repo.description
    if not description:
        return False

    markers = ("cloud",)

    def mentions_marker(text):
        lowered = text.lower()
        for marker in markers:
            if marker in lowered:
                return True
        return False

    # `or` keeps the full-name lookup lazy: it only runs when the description
    # did not match.
    return mentions_marker(description) or mentions_marker(repo.full_name)


In [None]:
def generate_metrics_csv():
    """Fetch metrics for every organization in ORGANIZATIONS and save them.

    Each organization is fetched with fetch_organization_repositories() (no
    language filter), the per-organization frames are combined into one frame
    with a fresh index, and the result is written to "results/codesamples.csv"
    without the index column.

    Returns:
        pandas.DataFrame holding the rows of all organizations; an empty frame
        when ORGANIZATIONS is empty.
    """
    language_filter = None
    organization_frames = []

    for organization in ORGANIZATIONS:
        print(f'Retrieving repos from {organization}...')
        organization_frames.append(
            fetch_organization_repositories(organization, language_filter)
        )
        print(f'Done with {organization}')

    # Concatenate once instead of growing a frame inside the loop; pd.concat
    # rejects an empty list, so fall back to an empty frame in that case.
    if organization_frames:
        all_repos_data = pd.concat(organization_frames, ignore_index=True)
    else:
        all_repos_data = pd.DataFrame()

    file_path = Path("results/codesamples.csv")
    # to_csv does not create missing directories.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    all_repos_data.to_csv(file_path, index=False)

    return all_repos_data

In [None]:
def calculate_language_usage(dataframe):
    """Aggregate language usage across all repositories and save it as CSV.

    For every row, each entry of ``langs_percentage`` (a percentage string such
    as "12.34%") is converted back into an absolute amount using the row's
    ``total_lines``; the amounts are then summed per language. Because the
    percentages were rounded to two decimals, the reconstructed amounts are
    approximations.

    Args:
        dataframe: Frame with a ``total_lines`` column and a
            ``langs_percentage`` column holding dicts of language -> percentage
            string.

    Returns:
        pandas.DataFrame with columns "Language", "Usage Percentage" (share of
        the grand total, formatted like "12.34%") and "Lines" (the summed
        amount rounded to the nearest integer), ordered from the most to the
        least used language. The same frame is written to
        "results/languages_usage.csv".
    """
    language_usage = {}

    for repo in dataframe.itertuples():
        total_lines = repo.total_lines
        for language, percentage in repo.langs_percentage.items():
            lines = total_lines * (float(percentage.strip('%')) / 100)
            language_usage[language] = language_usage.get(language, 0) + lines

    total_lines_of_code = sum(language_usage.values())

    # Rank on the numeric totals rather than on the rounded percentage text,
    # so languages whose percentages round to the same value are still ordered.
    ranked_languages = sorted(
        language_usage.items(),
        key=lambda item: item[1],
        reverse=True
    )

    formatted_languages = [
        (
            language,
            # Guard the division so a zero grand total cannot raise.
            f'{(lines / total_lines_of_code if total_lines_of_code else 0):.2%}',
            # round() instead of int(): float error such as 99.99999 must not
            # be truncated down to 99.
            round(lines),
        )
        for language, lines in ranked_languages
    ]

    file_path = Path("results/languages_usage.csv")
    # to_csv does not create missing directories.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    dataframeLanguages = pd.DataFrame(formatted_languages, columns=["Language", "Usage Percentage", "Lines"])
    dataframeLanguages.to_csv(file_path, index=False)
    return dataframeLanguages


In [None]:
def generate_statistics(dataframe):
    """Compute summary statistics for the METRICS columns and save them as CSV.

    The result has one row per metric: the columns produced by
    ``DataFrame.describe()`` plus a "median" column and a "mode" column (the
    first mode reported for each metric).

    Args:
        dataframe: Frame containing every column named in METRICS.

    Returns:
        pandas.DataFrame with the statistics, which is also written (index
        included) to "results/statistics.csv".
    """
    metrics = dataframe[METRICS]
    statistics = metrics.describe().transpose()

    statistics['median'] = metrics.median()

    modes = metrics.mode()
    # iloc[0] raises IndexError when mode() yields no rows; record NaN instead.
    statistics['mode'] = modes.iloc[0] if not modes.empty else float('nan')

    file_path = Path("results/statistics.csv")
    # to_csv does not create missing directories.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    statistics.to_csv(file_path)

    return statistics

# Results

In [None]:
# Columns summarized by generate_statistics(). Each name is one of the keys
# written per repository by fetch_organization_repositories(). Kept as a list
# because it is used directly as a column selector (dataframe[METRICS]).
METRICS = [
    'size', 
    'stargazers_count', 
    'watchers_count', 
    'forks_count', 
    'open_issues_count', 
    'subscribers_count', 
    'network_count', 
    'total_lines'
]

In [None]:
# Fetch the metrics for every organization in ORGANIZATIONS (this also writes
# results/codesamples.csv) and display the resulting frame.
dataframe = generate_metrics_csv()
dataframe

In [None]:
# Aggregate language usage across the fetched repositories (this also writes
# results/languages_usage.csv) and display the ranking.
dataframeLanguages = calculate_language_usage(dataframe)
dataframeLanguages

In [None]:
# Summarize the METRICS columns (this also writes results/statistics.csv) and
# display the statistics table.
dataframe_statistics = generate_statistics(dataframe)
dataframe_statistics