#### Outputs a **.csv** file with additional metrics for each repository (`full_name`) in the dataset. The metrics are:

- issues_count
- closed_issues_count
- open_issues_count
- total_issues_count
- closed_pulls_count
- open_pulls_count
- total_pulls_count
- merged_pulls_count
- commits_count
- first_commit_date
- last_commit_date
- branches_count
- topics
- contributors_count

In [None]:
%pip install PyGithub python-dotenv pandas tqdm cachetools

In [None]:
import datetime
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import getenv
from pathlib import Path

import pandas as pd
from cachetools import cached, TTLCache
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm

In [None]:
# Populate the process environment via python-dotenv (by default it looks for
# a .env file) so that GITHUB_TOKEN can be kept out of the notebook itself.
load_dotenv()
# Module-level GitHub client used by get_repo_data. getenv returns None when
# GITHUB_TOKEN is unset, in which case None is what gets passed to Github().
g = Github(getenv('GITHUB_TOKEN'))

In [None]:
# Short-lived memo of per-repository metrics: at most 1024 entries, each kept
# for 300 seconds.
cache = TTLCache(maxsize=1024, ttl=300)

# cachetools caches are not thread-safe on their own, and fetch_repo_data is
# invoked from a thread pool, so cache access is serialized with this lock.
_cache_lock = threading.Lock()


@cached(cache, lock=_cache_lock)
def fetch_repo_data(repo):
    """Collect issue, pull-request, commit and miscellaneous metrics for a repo.

    Args:
        repo: Repository object exposing ``get_issues``, ``get_pulls``,
            ``get_commits``, ``get_branches``, ``get_contributors`` and
            ``topics`` (presumably a PyGithub ``Repository``, as returned by
            ``g.get_repo``). It doubles as the cache key, so it must be
            hashable.

    Returns:
        dict: One value per metric, keyed by ``issues_count``,
        ``closed_issues_count``, ``open_issues_count``,
        ``total_issues_count``, ``closed_pulls_count``, ``open_pulls_count``,
        ``total_pulls_count``, ``merged_pulls_count``, ``commits_count``,
        ``first_commit_date``, ``last_commit_date``, ``branches_count``,
        ``topics`` and ``contributors_count``. The two commit dates are
        ``None`` when the commit listing is empty.
    """
    # NOTE(review): GitHub's issues endpoint also returns pull requests, so
    # these issue counts may include PRs -- confirm whether that is intended.
    all_issues = list(repo.get_issues(state="all"))
    open_issues_count = sum(1 for issue in all_issues if issue.state == "open")
    closed_issues_count = len(all_issues) - open_issues_count

    all_pulls = list(repo.get_pulls(state='all'))
    open_pulls_count = sum(1 for pull in all_pulls if pull.state == "open")
    closed_pulls_count = len(all_pulls) - open_pulls_count
    # A PR is merged exactly when it has a merge timestamp. `merged_at` is part
    # of the list-pulls payload, whereas `merged` is not, so reading
    # `pull.merged` would cost one extra API request per pull request.
    merged_pulls_count = sum(1 for pull in all_pulls if pull.merged_at is not None)

    # The listing is indexed newest-first here: [0] is taken as the latest
    # commit and [-1] as the earliest.
    commits = list(repo.get_commits())
    commits_count = len(commits)
    first_commit_date = commits[-1].commit.author.date if commits else None
    last_commit_date = commits[0].commit.author.date if commits else None

    return {
        "issues_count": len(all_issues),
        "closed_issues_count": closed_issues_count,
        "open_issues_count": open_issues_count,
        "total_issues_count": closed_issues_count + open_issues_count,
        "closed_pulls_count": closed_pulls_count,
        "open_pulls_count": open_pulls_count,
        "total_pulls_count": closed_pulls_count + open_pulls_count,
        "merged_pulls_count": merged_pulls_count,
        "commits_count": commits_count,
        "first_commit_date": first_commit_date,
        "last_commit_date": last_commit_date,
        "branches_count": int(repo.get_branches().totalCount),
        # Stored as the string form of a tuple so it fits in a single CSV cell.
        "topics": str(tuple(repo.topics)),
        "contributors_count": int(repo.get_contributors().totalCount),
    }

def get_repo_data(full_name):
    """Look up the repository named *full_name* and return its metrics.

    The repository object is obtained from the module-level client ``g`` and
    handed to ``fetch_repo_data``, whose result is returned unchanged.
    """
    return fetch_repo_data(g.get_repo(full_name))

In [None]:

def fetch_all_repo_data(full_names, max_workers=20):
    """Fetch metrics for many repositories concurrently.

    Each name is passed to ``get_repo_data`` on a worker thread while a tqdm
    progress bar tracks completion. A repository whose fetch raises is
    reported with ``print`` and left out of the result (best effort).

    Args:
        full_names: Iterable of repository names accepted by
            ``get_repo_data``.
        max_workers: Size of the thread pool. Defaults to 20.

    Returns:
        pandas.DataFrame: One row per successfully fetched repository, with
        the metric columns plus ``full_name``. When nothing was fetched, an
        empty frame that still has a ``full_name`` column is returned so that
        a later merge on that column does not fail.
    """
    data_list = []
    bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}] {percentage:3.0f}%"

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_repo = {
            executor.submit(get_repo_data, full_name): full_name
            for full_name in full_names
        }
        progress = tqdm(
            as_completed(future_to_repo),
            # Counting the submitted futures works for any iterable input,
            # including ones that do not support len().
            total=len(future_to_repo),
            unit="repo",
            ncols=100,
            bar_format=bar_format,
        )
        for future in progress:
            full_name = future_to_repo[future]
            try:
                data = future.result()
            except Exception as e:
                # Best effort: report the failure and keep processing the rest.
                print(f"Error fetching data for repo: {full_name}, error: {e}")
                continue
            # Build a new dict instead of mutating `data`, which may be an
            # object shared with the metrics cache.
            data_list.append({**data, "full_name": full_name})

    if not data_list:
        return pd.DataFrame(columns=["full_name"])
    return pd.DataFrame(data_list)

In [None]:
# Load the sample dataset and collect the distinct repository names to query.
dataframe = pd.read_csv('codesamples.csv')
full_names = dataframe['full_name'].unique()

repo_data_df = fetch_all_repo_data(full_names)

# Left join keeps every original row, including repositories whose metrics
# could not be fetched (their metric columns are left empty).
dataframe = pd.merge(dataframe, repo_data_df, on='full_name', how='left')
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
file_path = f'results/codesamples_full_{timestamp}.csv'
# Create the output directory up front so the results of the long fetch are
# not lost to a missing-directory error from to_csv.
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
dataframe.to_csv(file_path, index=False)