In [2]:
import asyncio
import os
from time import sleep

import pandas as pd
from github import Github
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport

In [3]:
# Load the seed-user table; the crawl cell below reads its 'login' column
# to pick the first layer of users.
top_users = pd.read_csv('./data/top-users.csv')

In [4]:
# Credentials must never live in the notebook source: read the personal access
# token from the environment instead. Any token that was previously committed
# here has to be treated as compromised and revoked on GitHub.
github_token = os.environ.get("GITHUB_TOKEN")
if not github_token:
    raise RuntimeError(
        "Set the GITHUB_TOKEN environment variable to a GitHub personal access token"
    )

# Select your transport with a defined url endpoint
transport = AIOHTTPTransport(
    url="https://api.github.com/graphql",
    headers={'Authorization': 'bearer {}'.format(github_token)},
)

# Create a GraphQL client using the defined transport
client = Client(transport=transport, fetch_schema_from_transport=True, execute_timeout=120)

# REST client (PyGithub), used by the crawl for rate-limit checks and contributor listings
g = Github(github_token, per_page=100)


In [5]:
# GraphQL query executed once per crawled user (variable: $login).
# It requests, in a single round trip:
#   - profile fields: id, name, company, bio, location, email
#   - the first 100 followers and the first 100 followed users (logins only)
#   - the first 100 starred repositories, most recently starred first;
#     `nodes` carries the repo details and `edges` carries `starredAt`
#   - the first 100 public repositories the user contributed commits to,
#     ordered by stargazer count (descending)
# For every repository: id, name, owner login, description, and the first 10
# topics and languages.
# No cursors / pageInfo are requested, so only the first page of each
# connection is ever retrieved.
userRepoQuery = gql(
"""
    query getUserConnections($login: String!)
    {
        user(login: $login) {
            id
            name
            company
            bio
            location
            email
            followers (first: 100) {
                nodes {
                    login
                }
            }
            following (first: 100) {
                nodes {
                    login
                }
            }
            starredRepositories(first: 100, orderBy: {field: STARRED_AT, direction: DESC}) {
                nodes {
                    id
                    name
                    owner {
                        login
                    }
                    description
                    repositoryTopics(first: 10) {
                        nodes {
                            topic {
                                name
                            }
                        }
                    }
                    languages(first: 10) {
                        nodes {
                            name
                        }
                    }
                }
                edges {
                    starredAt
                }
            }
            repositoriesContributedTo(first: 100, orderBy: {field: STARGAZERS, direction: DESC}, privacy: PUBLIC, contributionTypes: [COMMIT]) {
                nodes {
                    id
                    name
                    owner {
                        login
                    }
                    description
                    repositoryTopics(first: 10) {
                        nodes {
                            topic {
                                name
                            }
                        }
                    }
                    languages(first: 10) {
                        nodes {
                            name
                        }
                    }
                }
            }
        }
    }
"""
)

In [5]:
# --- Crawl configuration -------------------------------------------------
L = 4                # search depth: number of user layers explored
NUM_SEED_USERS = 10  # rows of top_users that form the first layer
# (A SEARCH_SINCE date cutoff of 2023-01-01 was sketched here but is not enabled.)

# Seed logins: the first NUM_SEED_USERS entries of the 'login' column.
logins = top_users['login'].iloc[:NUM_SEED_USERS]

# --- Frontiers -----------------------------------------------------------
# U[k] holds the users still to visit at depth k, R[k] the repos to expand.
U, R = [set(logins)], []

# --- Visited markers -----------------------------------------------------
searched_users, info_searched_repos, searched_repos = set(), set(), set()

# --- Result accumulators (lists of records) --------------------------------
fetched_users_data, fetched_commits_data = [], []
fetched_repos_data, fetched_contributions_data = [], []

for i in range(L):
    print("Search depth:", i + 1)
    U.append(set())
    R.append(set())

    # Stage I: Collect user and repo information
    print('#Users to search:', len(U[i]))
    while len(U[i]) > 0:  
        try:
            while g.get_rate_limit().graphql.remaining < 50:
                print("rate limit almost approached, staling...")
                sleep(10)
                
            cur_user = U[i].pop()
            if cur_user in searched_users:
                continue
            searched_users.add(cur_user)
            print("Searching", cur_user)
        
            res = await client.execute_async(userRepoQuery, { "login": cur_user })
            
            cur_user_repos = ["{login}/{name}".format(login=node["owner"]["login"], name=node["name"]) for node in res["user"]["repositoriesContributedTo"]["nodes"]]
            cur_user_starred_repos = [{
                "name": "{login}/{name}".format(login=node["owner"]["login"], name=node["name"]), 
                "starred_at": edge["starredAt"]
                }
                for node, edge in zip(res["user"]["starredRepositories"]["nodes"], res["user"]["starredRepositories"]["edges"])]
            
            # Store new repo information
            cur_user_repo_data = [{
                "id": node["id"],
                "name": "{login}/{name}".format(login=node["owner"]["login"], name=node["name"]),
                "description": node["description"],
                "topics": [topic_node["topic"]["name"] for topic_node in node["repositoryTopics"]["nodes"]],
                "languages": [language_node["name"] for language_node in node["languages"]["nodes"]]
            } for node in res["user"]["starredRepositories"]["nodes"] + res["user"]["repositoriesContributedTo"]["nodes"] if node["id"] not in info_searched_repos]
            info_searched_repos = info_searched_repos.union(set(repo["id"] for repo in cur_user_repo_data))

            cur_user_followers = [node["login"] for node in res["user"]["followers"]["nodes"]]
            cur_user_following = [node["login"] for node in res["user"]["following"]["nodes"]]
            fetched_users_data.append({
                    "id": res["user"]["id"],
                    "login": cur_user, 
                    "username": res["user"]["name"], 
                    "company": res["user"]["company"], 
                    "bio": res["user"]["bio"], 
                    "location": res["user"]["location"], 
                    "email": res["user"]["email"], 
                    "followers": cur_user_followers,
                    "following": cur_user_following,
                    "repos": cur_user_repos,
                    "starred_repos": cur_user_starred_repos,
                }
            )
            fetched_repos_data = fetched_repos_data + cur_user_repo_data

            # Add repos to search
            R[i] = R[i].union(set(cur_user_repos[:10]))
        except Exception as e:
            print(e)
            continue

    # Stop expansion when reach last layer
    if i == L - 1:
        break

    # Stage II: Expansion based on top repos by users
    print('#Repos to search:', len(R[i]))
    while len(R[i]) > 0:
        try:
            while g.rate_limiting[0] < 50:
                print("rate limit almost approached, staling...")
                sleep(10)
                g.get_rate_limit()

            cur_repo = R[i].pop()
            if cur_repo in searched_repos:
                continue
            searched_repos.add(cur_repo)
            print("Searching", cur_repo)

            # Search by contributors
            contributor_objs = g.get_repo(cur_repo).get_contributors()
            contributors = [user.login for user in contributor_objs[:10] if "bot" not in user.login]
        
            cur_contributions_data = [{ "user_login": user, "repo": cur_repo } for user in contributors]
            fetched_contributions_data = fetched_contributions_data + cur_contributions_data
            U[i + 1] = U[i + 1].union(set(contributors))

        except Exception as e:
            print(e)
            continue


In [6]:
# Display the first element of g.rate_limiting - the same value the crawl
# loop compares against 50 before issuing REST calls (i.e. it is treated as
# the remaining request quota).
g.rate_limiting[0]

5000

In [7]:
# Persist the crawl results as CSV files.
# Create the target directory first: without it to_csv raises and the
# results of a long crawl would not be written.
os.makedirs('./output', exist_ok=True)
pd.DataFrame.from_records(fetched_users_data).to_csv('./output/users.csv')
pd.DataFrame.from_records(fetched_repos_data).to_csv('./output/repos.csv')
pd.DataFrame.from_records(fetched_contributions_data).to_csv('./output/contributions.csv')