In [1]:
import concurrent.futures
import json
import pickle
import re
from datetime import datetime
from pathlib import Path
from typing import Any

import networkx as nx
import numpy as np
import requests
import torch
from dateutil.parser import parse
from sklearn.linear_model import LinearRegression
from torch.utils.data import Dataset
from torch_geometric.data import Batch, Data
from torch_geometric.loader import DataLoader

from mt.definitions import DATA_DIR, REPO_DIR
from mt.helper import flatten

In [None]:
# On-disk JSON cache read and written by get_node_type_to_int().
# NOTE(review): despite the "EDGE" in its name, this file holds the *node* type
# mapping; the name is left as-is because other cells may reference it.
EDGE_TYPE_PATH = DATA_DIR / "type_to_int.json"


def get_node_type_to_int() -> dict[str, int]:
    """Return the mapping from tree-sitter Python node type name to integer id.

    The mapping is loaded from ``EDGE_TYPE_PATH`` when that file exists.
    Otherwise the type names are extracted from tree-sitter-python's
    ``node-types.json``, numbered starting at 1, written to ``EDGE_TYPE_PATH``
    and returned.

    Returns:
        Dict of node type name -> id (ids start at 1).

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if (path := EDGE_TYPE_PATH).exists():
        with open(path) as f:
            return json.load(f)

    response = requests.get(
        "https://raw.githubusercontent.com/tree-sitter/tree-sitter-python/master/src/node-types.json",
        # Without a timeout a stalled connection would block the cell forever.
        timeout=30,
    )
    # Fail loudly: otherwise an error page yields zero matches and an empty
    # mapping would be cached on disk and reused on every later run.
    response.raise_for_status()

    types = re.findall(r'"type": "(.+)"', response.text)
    # sorted() keeps ids reproducible; bare set order depends on string hash
    # randomisation, so regenerating the cache could silently renumber types.
    node_type_to_int = {t: i + 1 for i, t in enumerate(sorted(set(types)))}
    with open(path, "w") as f:
        json.dump(node_type_to_int, f)
    return node_type_to_int


# Lookup tables shared by the graph-conversion helpers in this cell.
# Node type name -> integer id, loaded from cache or downloaded on first use.
node_type_to_int = get_node_type_to_int()
# Edge "type" attribute -> integer code. The keys are looked up verbatim from
# the edge data of the stored graphs, so their spelling must not be altered.
edge_type_to_int = dict(child=0, occurance_of=1, may_next_use=2)


def get_commit_paths(repo: Path) -> dict[str, Path]:
    """Index the JSON files in ``repo / "commit_data"`` by commit identifier.

    File names are split on ``"_"``: the first piece must be an integer and is
    used as the sort key, the second piece (minus a trailing ``".json"``) is
    used as the dictionary key.

    Args:
        repo: Directory that contains the ``commit_data`` folder.

    Returns:
        Dict of identifier -> file path, inserted in ascending order of the
        numeric file-name prefix.
    """

    def sequence_number(json_path: Path) -> int:
        # Leading "<number>_" part of the file name.
        return int(json_path.name.split("_")[0])

    ordered = sorted((repo / "commit_data").glob("*.json"), key=sequence_number)

    by_id: dict[str, Path] = {}
    for json_path in ordered:
        commit_id = json_path.name.split("_")[1].removesuffix(".json")
        by_id[commit_id] = json_path
    return by_id


def issue_open_at(issue: dict[str, Any], date: datetime) -> bool:
    """Tell whether ``issue`` was open at the instant ``date``.

    An issue counts as open when it was created strictly before ``date`` and
    is either still unclosed (falsy ``closed_at``) or was closed strictly
    after ``date``.

    Args:
        issue: Mapping with ``"created_at"`` and ``"closed_at"`` entries holding
            parseable timestamps (``closed_at`` may be falsy).
        date: Reference instant. Parsed timestamps have their tzinfo stripped
            (not converted) before comparison, so this is compared against
            naive datetimes.

    Returns:
        True if the issue was open at ``date``.
    """
    opened = parse(issue["created_at"])
    closed_raw = issue["closed_at"]
    closed = parse(closed_raw) if closed_raw else None

    if not opened.replace(tzinfo=None) < date:
        return False
    if closed is None:
        return True
    return closed.replace(tzinfo=None) > date


def number_of_issues_open(issues: dict[str, Any], date: datetime) -> int:
    """Count the entries of ``issues`` that ``issue_open_at`` reports as open.

    Args:
        issues: Iterable of issue mappings. NOTE(review): annotated as a dict,
            but each iterated element is passed to ``issue_open_at`` as an
            issue mapping, so callers presumably pass a list - confirm.
        date: Instant at which openness is evaluated.

    Returns:
        Number of issues open at ``date``.
    """
    open_count = 0
    for issue in issues:
        if issue_open_at(issue, date):
            open_count += 1
    return open_count


def number_of_stars_at(stars: list[dict[str, Any]], date: datetime) -> int:
    """Count the stars whose ``"starred_at"`` timestamp is at or before ``date``.

    Args:
        stars: Star records, each with a parseable ``"starred_at"`` entry.
        date: Cut-off instant (inclusive). Parsed timestamps have their tzinfo
            stripped before comparison.

    Returns:
        Number of stars given up to and including ``date``.
    """
    return sum(
        1 for star in stars if parse(star["starred_at"]).replace(tzinfo=None) <= date
    )


def nx_to_pyg_graph(nx_graph: nx.DiGraph) -> tuple[Data, dict[int, str]]:
    """Convert a typed networkx graph into a PyTorch Geometric ``Data`` object.

    Every node must carry a ``"type"`` attribute and every edge a ``"type"``
    attribute that is a key of ``edge_type_to_int``. Node types missing from
    ``node_type_to_int`` are encoded as 0.

    Args:
        nx_graph: Graph to convert.

    Returns:
        A pair ``(data, original_ids)`` where ``data`` holds

        * ``x``: long tensor of shape ``(num_nodes, 1)`` with node type ids,
        * ``edge_index``: long tensor of shape ``(2, num_edges)``,
        * ``edge_attr``: long tensor of shape ``(num_edges, 1)`` with edge type ids,

        and ``original_ids`` maps each row index back to the networkx node id.

    Raises:
        KeyError: If a node or edge lacks ``"type"`` or an edge type is unknown.
    """
    nodes = list(nx_graph.nodes())
    node_mapping = {node: i for i, node in enumerate(nodes)}
    original_ids = dict(enumerate(nodes))

    x = torch.tensor(
        [node_type_to_int.get(nx_graph.nodes[node]["type"], 0) for node in nodes],
        dtype=torch.long,
    ).unsqueeze(1)  # Shape (num_nodes, 1)

    edge_list = []
    edge_attr = []
    for u, v, data in nx_graph.edges(data=True):
        edge_list.append([node_mapping[u], node_mapping[v]])
        edge_attr.append(edge_type_to_int[data["type"]])

    # reshape(-1, 2) keeps the result 2-D even without edges: a graph with no
    # edges would otherwise give a 1-D tensor of shape (0,) instead of (2, 0).
    edge_index = (
        torch.tensor(edge_list, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )
    edge_attr = torch.tensor(edge_attr, dtype=torch.long).unsqueeze(1)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr), original_ids


def file_features_to_graph(
    features: dict[str, str | bool | dict[str, int] | list[str]]
) -> tuple[Data, dict[int, str]]:
    """Load the stored AST graph of one file and convert it for PyG.

    Args:
        features: Per-file feature record; its ``"feature_file"`` entry is the
            path of a JSON file whose ``"ast"`` entry is a node-link graph.

    Returns:
        The ``(Data, index -> original node id)`` pair from ``nx_to_pyg_graph``.
    """
    with open(features["feature_file"]) as feature_file:
        payload = json.load(feature_file)
    ast_graph = nx.node_link_graph(payload["ast"])
    return nx_to_pyg_graph(ast_graph)


def fit_reg_calc_res(
    no_issues: dict[str, int], no_stars: dict[str, int]
) -> dict[str, float]:
    """Regress issue counts on star counts and return the per-commit residuals.

    A linear regression ``issues ~ stars`` is fitted over all commits; the
    residual of a commit is its observed issue count minus the prediction.

    Args:
        no_issues: Commit sha -> number of open issues.
        no_stars: Commit sha -> number of stars. Its keys define which commits
            are used and the order of the result.

    Returns:
        Commit sha -> residual (floating point). Empty if ``no_stars`` is empty.

    Raises:
        KeyError: If a sha of ``no_stars`` is missing from ``no_issues``.
    """
    # Nothing to fit; the regression cannot be fitted on zero samples.
    if not no_stars:
        return {}

    # Pair the two series by sha rather than by dict position, so that
    # differently ordered inputs cannot silently misalign issues and stars.
    shas = list(no_stars)
    stars = np.array([no_stars[sha] for sha in shas]).reshape(-1, 1)
    issues = np.array([no_issues[sha] for sha in shas])

    model = LinearRegression()
    model.fit(stars, issues)

    residuals_list = issues - model.predict(stars)
    return dict(zip(shas, residuals_list))


def process_commit(
    commit: dict[str, Any],
    path: Path,
    idx: int,
    pt_dir: Path,
    issues: dict[str, Any],
    stars: dict[str, Any],
) -> tuple[str, int, int]:
    """Serialise the file graphs of one commit and count its issues and stars.

    Writes two files into ``pt_dir``:

    * ``batch_{idx}.pt``: ``(sha, Batch)`` with one graph per file of the commit,
    * ``map_{idx}.pkl``: list with one index -> original node id map per graph.

    Args:
        commit: Commit record providing ``["sha"]`` and
            ``["commit"]["author"]["date"]``.
        path: JSON file whose values are the per-file feature records.
        idx: Number used in the output file names.
        pt_dir: Output directory.
        issues: Issue records handed to ``number_of_issues_open``.
        stars: Star records handed to ``number_of_stars_at``.

    Returns:
        ``(message, open issue count, star count)``, both counts evaluated at
        the commit's author date (tzinfo stripped).
    """
    commit_date = parse(commit["commit"]["author"]["date"]).replace(tzinfo=None)

    with open(path) as commit_file:
        raw_data = json.load(commit_file)

    # One graph and one id map per file touched by the commit, in file order.
    graphs, id_maps = [], []
    for file_features in raw_data.values():
        graph, id_map = file_features_to_graph(file_features)
        id_maps.append(id_map)
        graphs.append(graph)

    merged = Batch.from_data_list(graphs)
    torch.save((commit["sha"], merged), pt_dir / f"batch_{idx}.pt")
    with open(pt_dir / f"map_{idx}.pkl", "wb") as map_file:
        pickle.dump(id_maps, map_file)

    message = f"{commit['sha']} Processed"
    open_issues = number_of_issues_open(issues, commit_date)
    star_count = number_of_stars_at(stars, commit_date)
    return message, open_issues, star_count


def process_repo(repo: Path) -> None:
    """Convert every known commit of ``repo`` to graph batches plus targets.

    Reads ``stars.json``, ``commits.json`` and ``issues.json`` from ``repo``
    (each a list of pages with an ``"items"`` entry), processes every commit
    that has a file in ``repo / "commit_data"`` via ``process_commit`` and
    writes into ``repo / "pts"``:

    * ``batch_{i}.pt`` / ``map_{i}.pkl`` per processed commit (``i`` counts
      processed commits from 0),
    * ``no_stars.pkl``: sha -> star count,
    * ``no_issues.pkl``: sha -> open issue count,
    * ``residuals.pkl``: sha -> residual of the issues-on-stars regression.

    Args:
        repo: Repository data directory.
    """
    # issues/stars are re-scanned for every commit (and from several threads),
    # so they are materialised as lists in case flatten() returns a lazy iterable.
    with open(repo / "stars.json") as f:
        stars = list(flatten([page["items"] for page in json.load(f)]))

    with open(repo / "commits.json") as f:
        commits = flatten([page["items"] for page in json.load(f)])

    with open(repo / "issues.json") as f:
        issues = list(flatten([page["items"] for page in json.load(f)]))

    pt_dir = repo / "pts"
    pt_dir.mkdir(exist_ok=True)
    commit_paths = get_commit_paths(repo)

    all_issues, all_stars = {}, {}

    # A thread pool (rather than a process pool) is used so that the worker
    # function defined in this notebook does not have to be importable.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures, counter = {}, 0
        for commit in commits:
            if path := commit_paths.get(commit["sha"]):
                future = executor.submit(
                    process_commit, commit, path, counter, pt_dir, issues, stars
                )
                futures[future] = commit["sha"]
                # Distinct index per processed commit so output files never collide.
                counter += 1

        for future in concurrent.futures.as_completed(futures):
            sha = futures[future]
            # result() re-raises any exception that occurred in the worker.
            msg, no_issues, no_stars = future.result()
            all_issues[sha] = no_issues
            all_stars[sha] = no_stars
            print(msg)

    # The regression cannot be fitted without samples; store empty residuals then.
    residuals = fit_reg_calc_res(all_issues, all_stars) if all_stars else {}

    # Persist the per-sha dictionaries (not the last commit's scalar counts).
    with open(pt_dir / "no_stars.pkl", "wb") as f:
        pickle.dump(all_stars, f)

    with open(pt_dir / "no_issues.pkl", "wb") as f:
        pickle.dump(all_issues, f)

    with open(pt_dir / "residuals.pkl", "wb") as f:
        pickle.dump(residuals, f)



# Run the conversion for the data directory REPO_DIR / "pytorch/vision";
# process_repo writes its results into that directory's "pts" sub-folder.
repo = REPO_DIR / "pytorch/vision"
process_repo(repo)