You need to set up your own notebook/Python environment in order to run this. We do not include a guide on the versions used. However, pydriller==1.15.5 was used; newer versions of pydriller use 'Repository' instead of 'RepositoryMining'.

In [None]:
# Token sent as the Bearer credential on every GitHub API request made below.
# Replace the placeholder locally; do not commit a real token.
github_token = 'YOUR_GITHUB_TOKEN'
# GitHub account name; sent as the User-Agent header and used to build the
# rate-limit probe URL.
github_username = 'YOUR_GITHUB_USERNAME'

In [5]:

from git import Repo
import os
import shutil

def cloneRepo(repo_name):
    """Clone the GitHub repository ``owner/name`` into a local directory.

    The target directory is the second ``/``-separated segment of
    *repo_name*, taken as a relative path.
    """
    target_dir = repo_name.split("/")[1]
    clone_url = "https://github.com/{rn}.git".format(rn=repo_name)
    Repo.clone_from(clone_url, target_dir)

def deleteRepo(repo_name):
    """Remove the local path named after the repository, if it exists.

    The path is ``./<name>`` where ``<name>`` is the second ``/``-separated
    segment of *repo_name*. Regular files and symlinks are unlinked,
    directories are removed recursively, and a missing path is ignored.
    """
    target = './' + repo_name.split("/")[1]

    # Symlinks are handled here (even ones pointing at directories) so the
    # link itself is removed rather than the tree it points to.
    if os.path.isfile(target) or os.path.islink(target):
        os.remove(target)
        return

    if os.path.isdir(target):
        shutil.rmtree(target)


In [6]:
import requests
import json
import math
from jsonmerge import merge

class BearerAuth(requests.auth.AuthBase):
    """Auth hook for ``requests`` that adds a Bearer token to each request.

    Besides the ``authorization`` header, the ``user-agent`` header is set
    to the module-level ``github_username``.
    """

    def __init__(self, token):
        # Token placed after the "Bearer " prefix on every request.
        self.token = token

    def __call__(self, r):
        """Set the auth and user-agent headers on request *r* and return it."""
        headers = r.headers
        headers["authorization"] = "Bearer " + self.token
        headers["user-agent"] = github_username
        return r

def get_full_name_from_gh_user(user_name):
    """Return the ``name`` field of a GitHub user, or ``None``.

    Queries ``https://api.github.com/users/<user_name>`` and returns the
    ``name`` value from the JSON response only when it is present and a
    string; in every other case ``None`` is returned.
    """
    profile_url = 'https://api.github.com/users/{un}'.format(un=user_name)
    response = requests.get(profile_url, auth=BearerAuth(github_token))
    payload = response.json()

    full_name = payload["name"] if "name" in payload else None
    # The API may return null (or another non-string) for "name".
    return full_name if isinstance(full_name, str) else None

def formatForkDetails(n):
    """Serialize the relevant details of one fork record to a JSON string.

    *n* is a mapping with an ``owner`` mapping (containing ``login``) and a
    ``created_at`` value. The result is a JSON object with the keys
    ``owner_login``, ``name`` (looked up via the GitHub users API) and
    ``created_at``.
    """
    login = n["owner"]["login"]
    details = {
        "owner_login": login,
        "name": get_full_name_from_gh_user(login),
        "created_at": n["created_at"],
    }
    return json.dumps(details)

def getForksOfRepo(repo_name):
    """Return the forks of a GitHub repository as a list of JSON strings.

    The repository's ``forks_count`` determines how many pages of 100 forks
    are requested. Every fork record is passed through
    ``formatForkDetails``, which performs one extra API request per fork.

    Args:
        repo_name: Repository in ``owner/name`` form.

    Returns:
        A list of JSON strings, one per fork; empty when the repository has
        no forks.

    Raises:
        KeyError: If the repository response has no ``forks_count`` (for
            example when the API returned an error object).
        RuntimeError: If a forks page is not a JSON list.
    """
    repo_url = 'https://api.github.com/repos/{rn}'.format(rn=repo_name)
    repo_info = requests.get(repo_url, auth=BearerAuth(github_token)).json()
    # Kept from the original: echoes the repository payload, which also
    # shows the API's error message if the lookup below fails.
    print(repo_info)

    page_count = math.ceil(repo_info["forks_count"] / 100)

    # Start from an empty list so a repository without forks yields []
    # instead of failing when the results are iterated.
    forks = []
    for page in range(1, page_count + 1):
        page_url = 'https://api.github.com/repos/{rn}/forks?per_page=100&page={p}'.format(rn=repo_name, p=page)
        page_items = requests.get(page_url, auth=BearerAuth(github_token)).json()
        if not isinstance(page_items, list):
            # A non-list payload is an API error object, not fork data.
            raise RuntimeError(
                "Unexpected response for forks page {p} of {rn}: {body}".format(
                    p=page, rn=repo_name, body=page_items
                )
            )
        forks.extend(page_items)

    return [formatForkDetails(fork) for fork in forks]

 

In [7]:
from pydriller import RepositoryMining
from datetime import datetime
from dateutil.relativedelta import relativedelta

def commitsFromUser(path_to_repo, usernames, from_date, to_date):
    """Collect commits by the given authors within a date range.

    Traverses the repository at *path_to_repo* with ``RepositoryMining``,
    restricted to *usernames* as authors and to commits between
    *from_date* and *to_date*. Returns a list of dicts with the commit's
    ``lines`` value and its committer date as an ISO-8601 string.
    """
    miner = RepositoryMining(
        path_to_repo,
        only_authors=usernames,
        since=from_date,
        to=to_date,
    )
    return [
        {"lines": commit.lines, "date": commit.committer_date.isoformat()}
        for commit in miner.traverse_commits()
    ]

In [None]:
import csv
import json
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz

utc = pytz.UTC

# Probe request: only its rate-limit header is used, to decide whether a
# mining run can start at all.
ratelimiturl = 'https://api.github.com/users/{un}'.format(un=github_username)
limitremaining = requests.get(ratelimiturl, auth=BearerAuth(github_token)).headers['X-RateLimit-Remaining']

if str(limitremaining) != '0':
    import os
    import pandas as pd

    # Read the repository list. The context manager closes the file even if
    # parsing fails; the first row is the CSV header.
    with open('datareps.csv') as file:
        csvreader = csv.reader(file)
        header = next(csvreader, [])
        rows = list(csvreader)
    list_of_forkCommits = []

    # Intervals due to the rate limit, to allow for manual restarts
    # (start=inclusive, end=exclusive).
    intervals = [(start, start + 5) for start in range(0, 100, 5)]
    shorter_intervals = [(start, start + 3) for start in range(35, 50, 3)]
    # Change this to False after the 0th/first run.
    overwrite = False

    # Change this index by 1 for each run: 20 intervals of 5 repositories
    # each (only 0-9 were used). For the last repositories we switched to
    # 5 intervals of 3 each because of the size of the data; adjust
    # according to your needs.
    interval = intervals[0]
    # New intervals from 35-50 [0,1,2,3,4]
    # New done=[0,1,2,3,4]
    #interval = shorter_intervals[0]
    MiningDate = datetime(2022, 8, 24).replace(tzinfo=utc)

    for r in rows[interval[0]:interval[1]]:
        reponame = r[1].split("/")[1]
        print("clone start")
        cloneRepo(r[1])
        print("clone end")
        try:
            forks = getForksOfRepo(r[1])
            for f in forks:
                res_json = json.loads(f)
                created_at = res_json["created_at"]
                # Assumes the timestamp ends in "Z": the last character is
                # swapped for an explicit UTC offset before parsing.
                dt1 = datetime.fromisoformat(created_at[:-1] + '+00:00')
                if dt1 >= MiningDate:
                    # Only forks created before the mining date are analysed.
                    continue
                # Commits are collected for the three months after the fork.
                dt2 = dt1 + relativedelta(months=+3)
                owner_login = res_json["owner_login"]
                name = res_json["name"]
                # Author identities to match: the login and, if set, the
                # profile name.
                # NOTE(review): the "[]" substring test looks like it is
                # meant to skip bot accounts - confirm the intended pattern.
                nameArr = []
                if "[]" not in owner_login:
                    nameArr.append(owner_login)
                if name and "[]" not in name:
                    nameArr.append(name)
                commits = commitsFromUser(reponame, nameArr, dt1, dt2)
                list_of_forkCommits.append({
                    "repository": r[1],
                    "owner_login": owner_login,
                    "created_at": dt1,
                    "end_of_range": dt2,
                    "commits": commits,
                })
        finally:
            # Always remove the clone; a leftover directory would make the
            # next run's clone of the same repository fail.
            deleteRepo(r[1])

    df = pd.DataFrame(list_of_forkCommits)
    csv_path = 'forkCommits.csv'
    if df.empty:
        # Nothing to write; an overwrite run still starts from an empty file.
        if overwrite:
            open(csv_path, 'w').close()
    else:
        # Write the header only when starting a fresh file, so appending
        # further intervals does not insert extra header rows into the data.
        start_fresh = (
            overwrite
            or not os.path.isfile(csv_path)
            or os.path.getsize(csv_path) == 0
        )
        df.to_csv(
            csv_path,
            mode='w' if start_fresh else 'a',
            index=False,
            header=start_fresh,
        )
else:
    print("GitHub API rate limit exhausted; nothing was mined.")

In [53]:
# This can be used if writing to csv fails as it allows to dump the jupyter variable
import pickle
# Binary mode is required: pickle writes bytes, regardless of the .txt name.
with open('missingdata.txt', 'wb') as dump_file:
    pickle.dump(list_of_forkCommits, dump_file)

In [3]:
# this allows to read the missingdata file and add it to the csv
import pandas as pd
import os
import pickle

# The dump cell in this notebook writes 'missingdata.txt', so that is the
# file that has to be read back here.
missing_path = 'missingdata.txt'
csv_path = 'forkCommits.csv'

# NOTE: pickle.load can execute arbitrary code; only load a file that was
# produced by this notebook.
with open(missing_path, 'rb') as f:
    data = pickle.load(f)

df_missing = pd.DataFrame(data)
if not df_missing.empty:
    # Only write the header when the CSV is new or empty, so the appended
    # rows are not preceded by a second header line.
    write_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    df_missing.to_csv(csv_path, mode='a', index=False, header=write_header)

In [None]:
import time
def check_rate_limit():
    """Print when the GitHub API rate limit resets and the minutes left.

    Sends one authenticated request to the GitHub users endpoint and reads
    the ``X-RateLimit-Reset`` response header, which is parsed as an epoch
    timestamp in seconds. Prints the reset time in UTC and the remaining
    minutes, rounded up and never negative.

    Raises:
        KeyError: If the response carries no ``X-RateLimit-Reset`` header.
    """
    # Local import: the notebook only imports the ``datetime`` class.
    from datetime import timezone

    # The original format string ended in '}}' after the field, which makes
    # str.format raise ValueError before any request is sent.
    ratelimiturl = 'https://api.github.com/users/{un}'.format(un=github_username)
    response = requests.get(ratelimiturl, auth=BearerAuth(github_token))
    reset_epoch = int(response.headers['X-RateLimit-Reset'])

    # Convert explicitly to UTC so the value matches the "GMT" label.
    reset_time = datetime.fromtimestamp(reset_epoch, tz=timezone.utc)

    # Clamp at zero: the reset moment may already have passed.
    seconds_left = max(0.0, reset_epoch - time.time())
    minutes_left = math.ceil(seconds_left / 60)

    print("GMT timestamp until ready: ", reset_time.strftime('%Y-%m-%d %H:%M:%S'))
    print("minutes left: ", minutes_left)


check_rate_limit()