In [8]:
import pandas as pd
import requests
import time
from datetime import datetime, timezone

In [9]:
# Загрузим исходную таблицу
repos = pd.read_csv('https://raw.githubusercontent.com/AndreyG75/AI-DeepFund_Ethereum/refs/heads/main/Contest_2_POND/Datasets/data_contest_all_repos.csv')

In [10]:
# GitHub токен
GITHUB_TOKEN = "GITHUB_TOKEN"
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

In [11]:
# Функция для извлечения основных метрик репозитория
def get_repo_metrics(repo_url):
    try:
        repo_path = "/".join(repo_url.split("/")[-2:])
        api_url = f"https://api.github.com/repos/{repo_path}"
        
        response = requests.get(api_url, headers=HEADERS)

        # Обработка ошибок
        if response.status_code == 403:
            print(f"Rate limit exceeded or forbidden access for {repo_url}. Retrying after a pause...")
            time.sleep(10)
            return {}
        elif response.status_code == 404:
            print(f"Repository {repo_url} not found.")
            return {}
        elif response.status_code != 200:
            print(f"Error fetching data for {repo_url}: {response.status_code}")
            return {}

        data = response.json()

        # Извлекаем основные метрики
        metrics = {
            "size": data.get("size", 0),
            "forks_count": data.get("forks_count", 0),
            "stars_count": data.get("stargazers_count", 0),
            "open_issues_count": data.get("open_issues_count", 0),
            "subscribers_count": data.get("subscribers_count", 0),
            "watchers_count": data.get("watchers_count", 0),
        }
        
        # Дата создания и последнего обновления
        created_at = data.get("created_at")
        updated_at = data.get("updated_at")
        if created_at:
            metrics["age_days"] = (datetime.now(timezone.utc) - datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)).days
        else:
            metrics["age_days"] = None

        if updated_at:
            metrics["days_since_update"] = (datetime.now(timezone.utc) - datetime.strptime(updated_at, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)).days
        else:
            metrics["days_since_update"] = None

        return metrics

    except Exception as e:
        print(f"Exception: {e}")
        return {}


In [12]:
# Функция для извлечения дополнительных метрик (спонсоры, релизы и т.д.)
def get_additional_metrics(repo_url):
    try:
        repo_path = "/".join(repo_url.split("/")[-2:])
        additional_metrics = {}

        # Считаем спонсоров
        sponsors_api = f"https://api.github.com/repos/{repo_path}/sponsors"
        sponsors_response = requests.get(sponsors_api, headers=HEADERS)
        additional_metrics["sponsor_count"] = sponsors_response.json().get("sponsorships_as_maintainer", 0) if sponsors_response.status_code == 200 else 0

        # Считаем релизы
        releases_api = f"https://api.github.com/repos/{repo_path}/releases"
        releases_response = requests.get(releases_api, headers=HEADERS)
        additional_metrics["releases_count"] = len(releases_response.json()) if releases_response.status_code == 200 else 0

        # Считаем pull requests
        pulls_api = f"https://api.github.com/repos/{repo_path}/pulls?state=all"
        pulls_response = requests.get(pulls_api, headers=HEADERS)
        additional_metrics["pull_requests_count"] = len(pulls_response.json()) if pulls_response.status_code == 200 else 0

        # Считаем контрибьюторов
        contributors_api = f"https://api.github.com/repos/{repo_path}/contributors"
        contributors_response = requests.get(contributors_api, headers=HEADERS)
        additional_metrics["contributors_count"] = len(contributors_response.json()) if contributors_response.status_code == 200 else 0

        # Считаем деплои
        deployments_api = f"https://api.github.com/repos/{repo_path}/deployments"
        deployments_response = requests.get(deployments_api, headers=HEADERS)
        additional_metrics["deployments_count"] = len(deployments_response.json()) if deployments_response.status_code == 200 else 0

        # Количество используемых пакетов (если доступно через API)
        packages_api = f"https://api.github.com/repos/{repo_path}/packages"
        packages_response = requests.get(packages_api, headers=HEADERS)
        additional_metrics["num_packages_count"] = len(packages_response.json()) if packages_response.status_code == 200 else 0

        return additional_metrics

    except Exception as e:
        print(f"Exception: {e}")
        return {}


In [13]:
# Применение функций к каждой строке таблицы
all_repo_data = []

for _, row in repos.iterrows():
    repo_url = row['repo']
    print(f"Fetching data for {repo_url}...")

    # Получаем основные метрики
    metrics = get_repo_metrics(repo_url)

    # Получаем дополнительные метрики
    additional_metrics = get_additional_metrics(repo_url)

    # Объединяем все метрики
    combined_metrics = {**metrics, **additional_metrics, "repo_url": repo_url}
    all_repo_data.append(combined_metrics)


Fetching data for https://github.com/mochajs/mocha...
Fetching data for https://github.com/chzyer/readline...
Fetching data for https://github.com/gulpjs/gulp...
Fetching data for https://github.com/webpack/webpack...
Fetching data for https://github.com/redux-saga/redux-saga...
Fetching data for https://github.com/babel/babel...
Fetching data for https://github.com/debug-js/debug...
Fetching data for https://github.com/vuejs/vue...
Fetching data for https://github.com/marak/colors.js...
Fetching data for https://github.com/reactivex/rxjs...
Fetching data for https://github.com/webreflection/flatted...
Fetching data for https://github.com/level/levelup...
Fetching data for https://github.com/rollup/rollup...
Fetching data for https://github.com/go-task/slim-sprig...
Fetching data for https://github.com/mikemcl/bignumber.js...
Fetching data for https://github.com/electron/electron...
Fetching data for https://github.com/xtuc/webassemblyjs...
Fetching data for https://github.com/gregberg

In [14]:
# Преобразуем результаты в DataFrame
result_df = pd.DataFrame(all_repo_data)


In [15]:
# Вывод первых строк результата для проверки
print(result_df.head())


    size  forks_count  stars_count  open_issues_count  subscribers_count  \
0  26733         3014        22698                198                397   
1   3576          281         2109                108                 41   
2   1118         4217        33061                 34               1025   
3  67146         8933        64999                242               1456   
4   9473         1967        22535                 40                249   

   watchers_count  age_days  days_since_update  sponsor_count  releases_count  \
0           22698      5084                  0              0              30   
1            2109      3426                 15              0               6   
2           33061      4235                  0              0              23   
3           64999      4716                  0              0              30   
4           22535      3356                  0              0              30   

   pull_requests_count  contributors_count  deployments_

In [16]:
# Сохраняем результаты в CSV файл
result_df.to_csv('deepfund_metric_git_max.csv', index=False)
