In [None]:
import time
from github import Github
import requests
import os
from tqdm import tqdm
from datetime import datetime, timedelta, date
import numpy as np
import pandas as pd
import codecs

ACCESS_TOKEN = 'ADD YOUR TOKEN WITH REPO ACCESS RIGHTS HERE'

In [None]:
g = Github(ACCESS_TOKEN)

In [2]:
def get_file_extension(path):
    return path.rsplit(".",1)[-1]

In [3]:
def pre_github_request_checker():
    rate_data = g.get_rate_limit().core.raw_data
    if rate_data['remaining'] < 50:
        time_to_reset = rate_data['reset'] - int(time.time()) + 1
        print(f"Sleeping for {time_to_reset} seconds")
        time.sleep(time_to_reset)

## Repository search

In [14]:
data_list = [
    "id",
    "clone_url",
    "created_at",
    "description",
    "full_name",
    "language",
    "name",
    "size",
    "stargazers_count",
    "updated_at",
    "forks_count"
]

data_funcs_list = ["get_topics","get_license"]

data_list.sort()

df = pd.DataFrame(columns=data_list + ["topics","license_url"])

def add_repo_to_df(df, repo):
    data = [getattr(repo,attr) for attr in data_list]
    data.append(repo.get_topics())
    try:
        license_url = repo.get_license().license.url
    except:
        license_url = "None"
    data.append(license_url)
    df.loc[len(df)] = data
    return df

In [46]:
def search_github(language, start_date, end_date):
    """
    More info:
    https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates
    """
    assert(language == "VHDL" or language == 'Verilog' or language == 'SystemVerilog')
    date_q = f"{start_date.strftime('%Y-%m-%d')}..{end_date.strftime('%Y-%m-%d')}" 
    result = g.search_repositories("",language='Verilog',created=date_q) 
    print(f"Found {result.totalCount} repos for: {language}, {date_q}")
    return result

def process_repo_search_results(df,results):
    for i in range(1000):

        rate_data = g.get_rate_limit().core.raw_data
        now_seconds = int(time.time())
        if rate_data['remaining'] < 100:
            time_to_reset = rate_data['reset'] - int(time.time()) + 1
            print(f"Sleeping for {time_to_reset} seconds")
            time.sleep(time_to_reset)
        page = results.get_page(i)
        page_size = len(page)
        for j in range(page_size):
            df = add_repo_to_df(df,page[j])
        if page_size < 30:
            break   
    return df

In [30]:
def find_repos(df, language, start_date, end_date):
    repo_search_results = search_github(language, start_date, end_date)
    if repo_search_results.totalCount > 0:
        if repo_search_results.totalCount == 1000:
            # Reduce date range (recursively?)
            delta = (end_date - start_date) / 2
            df = find_repos(df, language,start_date, end_date - delta)
            df = find_repos(df, language, start_date + delta, end_date)
        else:
            df = process_repo_search_results(df,repo_search_results)
    print(f"Done: {start_date.strftime('%Y-%m-%d')}..{end_date.strftime('%Y-%m-%d')}, df length: {len(df)}")
    return df

In [None]:
df = pd.DataFrame(columns=data_list + ["topics","license_url"])
language = "Verilog"
# You can split up the search to make it more manageable by splitting your search over certain years
start_date = datetime(1980,1,1)
end_date = datetime.now()
print(df)
df = find_repos(df, language, start_date,end_date)

In [40]:
len(df)

51321

In [41]:
df.to_csv(f"{language}_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv")

In [None]:
df2 = pd.DataFrame(columns=data_list + ["topics","license_url"])
language = 'SystemVerilog'
start_date = datetime(1980,1,1)
end_date = datetime.now()
print(df2)
df2 = find_repos(df2, language, start_date, end_date)

In [53]:
len(df2)

16258

In [54]:
df2.to_csv(f"{language}_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv")

## Finding licenses

In [43]:
def combine_and_deduplicate_gh_search_results(csvs):
    df = pd.concat(map(lambda x: pd.read_csv(x,na_values=['None']), csvs), ignore_index=True)
    df = df.drop(['Unnamed: 0'],axis=1)
    df = df.drop_duplicates([c for c in df.columns if c != 'updated_at'])
    return df

In [None]:
verilog_df = combine_and_deduplicate_gh_search_results([os.path.relpath('./data/search_repo_indices/Verilog_1980-01-01_2013-01-01.csv'), os.path.relpath('./data/search_repo_indices/Verilog_1980-01-01_2022-10-12_16576.csv'),os.path.relpath('./data/search_repo_indices/Verilog_2018-12-08_2022-10-13.csv')])
systemverilog_df = combine_and_deduplicate_gh_search_results([os.path.relpath('./data/search_repo_indices/SystemVerilog_1980-01-01_2022-10-15_part.csv'),os.path.relpath('./data/search_repo_indices/SystemVerilog_2021-05-14_2022-10-15_part2.csv')])

print(len(verilog_df))
print(len(systemverilog_df))

In [45]:
repo_indices_dir = 'data/search_repo_indices'
verilog_df.to_csv(os.path.join(repo_indices_dir,"full_verilog_repos.csv"))
systemverilog_df.to_csv(os.path.join(repo_indices_dir,"full_systemverilog_repos.csv"))

In [46]:
def add_licenses_from_repo_df_to_dict(repo_df,licenses_dict):
    df_with_unique_licenses = repo_df.loc[repo_df['license_url'].dropna().drop_duplicates().index]
    repo_ids = list(df_with_unique_licenses['id'])
    for rid in repo_ids:
        pre_github_request_checker()
        license_data = g.get_repo(rid).get_license().license.raw_data
        licenses_dict[license_data['url']] = license_data

In [47]:
licenses_dict = {}
add_licenses_from_repo_df_to_dict(verilog_df,licenses_dict)
add_licenses_from_repo_df_to_dict(systemverilog_df,licenses_dict)
df = pd.DataFrame.from_dict(licenses_dict,orient='index')
df.to_csv(os.path.join('data/search_repo_indices','licenses.csv'))

In [48]:
def get_licenses_with_permissions_conditions(license_df,permissions=[],conditions=[]):
    indices = []
    for i, row in license_df.iterrows():
        if len(permissions) == 0 or set(permissions).issubset(set(row['permissions'])):
            if len(conditions) == 0 or len(set(conditions).intersection(set(row['conditions']))) > 0 :
                indices.append(i)
    return license_df.loc[indices]

In [49]:
permissions = ['modifications','distribution']
special_conditions = ['same-license--file','same-license--library','same-license']
permissive_licenses_df = get_licenses_with_permissions_conditions(df,permissions=permissions,conditions=[])
distributive_licenses_df = get_licenses_with_permissions_conditions(df,permissions=['distribution'],conditions=[])

In [102]:
print(len(permissive_licenses_df))
print(len(distributive_licenses_df))

22
22


### Conclusion, all licenses found are permissive in that they allow modifications and distribution! Repos without licenses are not included

In [50]:
p_sv_df = systemverilog_df.dropna(subset=['license_url'])
p_ve_df = verilog_df.dropna(subset=['license_url'])

p_sv_df.to_csv(os.path.join(repo_indices_dir,"permissive_systemverilog_repos.csv"))
p_ve_df.to_csv(os.path.join(repo_indices_dir,"permissive_verilog_repos.csv"))

In [51]:
all_df = pd.concat([verilog_df,systemverilog_df]).drop_duplicates(subset=[c for c in verilog_df.columns if not c in ['language']])
p_all_df = pd.concat([p_ve_df,p_sv_df]).drop_duplicates(subset=[c for c in verilog_df.columns if not c in ['language']])

all_df.to_csv("all_deduplicated_repos.csv")
p_all_df.to_csv("permissive_all_deduplicated_repos.csv")

In [37]:
print(len(all_df))
print(len(p_all_df))

50171
7516


In [89]:
len(df)
all_permissions = []
all_conditions = []
all_limitations = []
for i,row in df.iterrows():
    all_permissions.append(row['permissions'])
    all_conditions.append(row['conditions'])
    all_limitations.append(row['limitations'])

In [72]:
all_conditions

[['include-copyright', 'document-changes', 'disclose-source', 'same-license'],
 ['include-copyright', 'document-changes'],
 ['include-copyright', 'document-changes', 'disclose-source', 'same-license'],
 ['include-copyright'],
 ['include-copyright'],
 ['include-copyright',
  'disclose-source',
  'document-changes',
  'same-license--library'],
 ['include-copyright',
  'disclose-source',
  'document-changes',
  'same-license--library'],
 [],
 ['include-copyright'],
 ['include-copyright',
  'document-changes',
  'disclose-source',
  'network-use-disclose',
  'same-license'],
 [],
 ['include-copyright'],
 [],
 ['disclose-source', 'include-copyright', 'same-license--file'],
 ['include-copyright', 'document-changes'],
 ['include-copyright', 'document-changes', 'same-license'],
 ['disclose-source', 'include-copyright', 'same-license'],
 ['include-copyright--source'],
 ['include-copyright--source', 'document-changes'],
 ['include-copyright', 'document-changes'],
 ['disclose-source', 'include-co

## Clone repos

In [54]:
def create_clone_command(clone_url,directory, depth=1,branch='master'):
    command_parts = [f'git clone']
    # command_parts.append(f"-b {branch}") # TODO: rerun search without master branch (causes error for some searches!)
    command_parts.append(f"--depth {depth}")
    command_parts.append(f"--no-tags")
    # command_parts.append(f"--no-checkout")
    command_parts.append(clone_url)
    command_parts.append(directory)
    return " ".join(command_parts)

def create_clone_script_for_df(repo_df,script_out_path,clone_out_dir):
    with open(script_out_path,'w+') as f:
        for i,row in repo_df.iterrows():
            clone_url = row['clone_url']
            out_dir = os.path.join(clone_out_dir,str(row['id']))
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            f.write(create_clone_command(clone_url,str(os.path.abspath(out_dir))).replace("\\","/") + "\n")


In [55]:
create_clone_script_for_df(p_all_df,"./clone_all_p.sh","data/full_repos/permissive")

In [19]:
create_clone_command("https://github.com/mrehkopf/sd2snes.git","./data/full_repos/exp")

'git clone -b master --depth 1 --no-tags --no-checkout https://github.com/mrehkopf/sd2snes.git ./data/full_repos/exp'

In [33]:
all_verilog_df = pd.concat([p_ve_df,p_sv_df])


In [34]:
print(len(p_sv_df))

7291


In [35]:
print(len(all_verilog_df))
print(len(all_verilog_df.drop_duplicates(subset=[c for c in all_verilog_df.columns if not c in ['language']])))

14537
7478


## Filter repos

In [67]:
def delete_all_files_without_right_extension(start_dir, extensions_to_keep):
    errors = []
    for root, dirs, files in os.walk(start_dir):
        for file in [sf for sf in files if not get_file_extension(sf) in extensions_to_keep]:
            file_path = os.path.join(root,file)
            try:
                os.remove(file_path)
            except Exception as e:
                errors.append(e)
    return errors

def delete_empty_dirs(start_dir):
    errors = []
    for root, dirs, files in os.walk(start_dir,topdown=False):
        for d in dirs:
            dir_path = os.path.join(root,d)
            if len(os.listdir(dir_path)) == 0:
                try:
                    os.rmdir(dir_path)
                except Exception as e:
                    errors.append(e)
    return errors

In [70]:
# https://www.intel.com/content/www/us/en/programmable/quartushelp/17.0/reference/glossary/glosslist.htm
# https://marketplace.visualstudio.com/items?itemName=eirikpre.systemverilog
verilog_extension_files = ['v','verilog','vlg','vh']
system_verilog_extension_files = ['sv','svh','svp']
extra_file_types = ['vo','vt'] # verilog output, verilog test bench

extensions_to_keep = verilog_extension_files + system_verilog_extension_files + extra_file_types

start_dir = os.path.relpath("data/full_repos/permissive")

files_errors = delete_all_files_without_right_extension(start_dir,extensions_to_keep)
dirs_errors = delete_empty_dirs(start_dir)

## Process github searches

In [3]:
def combine_and_deduplicate_gh_search_results(csvs):
    df = pd.concat(map(pd.read_csv, csvs), ignore_index=True)
    df = df.drop(['Unnamed: 0'],axis=1)
    df = df.drop_duplicates([c for c in df.columns if c != 'updated_at'])
    return df

In [4]:
verilog_df = combine_and_deduplicate_gh_search_results([os.path.relpath('./data/search_repo_indices/Verilog_1980-01-01_2013-01-01.csv'), os.path.relpath('./data/search_repo_indices/Verilog_1980-01-01_2022-10-12_16576.csv'),os.path.relpath('./data/search_repo_indices/Verilog_2018-12-08_2022-10-13.csv')])
systemverilog_df = combine_and_deduplicate_gh_search_results([os.path.relpath('./data/search_repo_indices/SystemVerilog_1980-01-01_2022-10-15_part.csv'),os.path.relpath('./data/search_repo_indices/SystemVerilog_2021-05-14_2022-10-15_part2.csv')])

print(len(verilog_df))
print(len(systemverilog_df))

49166
49277
49250


In [6]:
global file_count
file_count = 0

global files_dict
files_dict = {}

In [32]:
def pre_github_request_checker():
    rate_data = g.get_rate_limit().core.raw_data
    if rate_data['remaining'] < 50:
        time_to_reset = rate_data['reset'] - int(time.time()) + 1
        print(f"Sleeping for {time_to_reset} seconds")
        time.sleep(time_to_reset)

In [11]:


def update_files_dict(count, content):
    global files_dict
    files_dict[count] = {
        "path": content.raw_data['path'],
        "size": content.raw_data['size'],
        "count_id": count
    }

def download_files_from_repo(repo,extensions,out_dir):
    global files_dict
    global file_count
    file_count = 0
    files_dict = {}
    pre_github_request_checker()
    contents = repo.get_contents("/")
    for content in contents:
        download_content(repo,content,extensions,out_dir)
    
    df = pd.DataFrame.from_dict(files_dict,orient='index')
    print(f"Saving csv index with {len(df)} entries")
    df.to_csv(os.path.join(out_dir,"index.csv"))
    
def download_content(repo,content,extensions,out_dir):
    content_raw_data = content.raw_data
    content_type = content_raw_data['type']
    if content_type == 'dir':
        pre_github_request_checker()
        new_contents = repo.get_contents(content_raw_data['path'])
        for new_content in new_contents:
            download_content(repo,new_content,extensions,out_dir)
    elif content_type == 'file':
        extension = get_file_extension(content_raw_data['name'])
        if extension in extensions:
            global file_count
            update_files_dict(file_count,content)
            pre_github_request_checker()
            try:
                with open(os.path.join(out_dir,str(file_count) + "." + extension),'wb') as f:
                    f.write(content.decoded_content)
                file_count += 1
            except Exception as e:
                print(f"Caught exception while trying to write content:\n{e}")
    # raise Exception(f"Content type not recognized: {content_type}")

def download_all_repos(df,extensions,out_dir):
    all_repo_ids = list(df['id'])
    for i in range(14,len(df['id'])):
        repo_id = all_repo_ids[i]
        pre_github_request_checker()
        repo = g.get_repo(repo_id)
        repo_dir = os.path.join(out_dir,str(repo_id))
        if not os.path.exists(repo_dir):
            os.makedirs(repo_dir)
        print(f"Searching repo {i} with id: {repo_id}")
        download_files_from_repo(repo,extensions,repo_dir)

In [37]:
download_all_repos(verilog_df,verilog_extension_files,os.path.relpath("data/repos"))

Searching repo 14 with id: 4519428
Caught exception while trying to write content:
unsupported encoding: none
Sleeping for 2158 seconds
Saving csv index with 983 entries
Searching repo 15 with id: 753580
Sleeping for 1946 seconds


In [10]:
def create_files_df(downloaded_repo_dir,extensions_to_keep):
    extensions_map = {ext: True for ext in extensions_to_keep}
    df = pd.DataFrame(columns=['directory','repo_id','file_name','extension'])
    for repo_id in os.listdir(downloaded_repo_dir):
        for root,dirs,files in os.walk(os.path.join(downloaded_repo_dir,repo_id)):
            for file in files:
                extension = get_file_extension(file)
                try:
                    if extensions_map[extension]:
                        directory = os.path.join(root,file)
                        df.loc[len(df)] = [directory, repo_id, file, extension]
                except Exception as e:
                    print(f"Error: {e}")
                    extensions_map[extension] = False
        print(f"Done with repo: {repo_id}")
    print(f"Extensions: {extensions_map}")
    return df

In [None]:
verilog_files_df = create_files_df('data/full_repos/permissive',verilog_extension_files + system_verilog_extension_files)

In [13]:
verilog_files_df.to_csv('./files_index.csv')

## Partition dataset for processing

In [169]:
files_index = pd.read_csv('data/search_repo_indices/files_index.csv',index_col=0)
# files_index

In [170]:
verilog_extension_files = ['v','verilog','vlg','vh']
system_verilog_extension_files = ['sv','svh','svp']

files_index = files_index[files_index['extension'].isin(verilog_extension_files + system_verilog_extension_files)]
# files_index = files_index.reset_index(drop=True)
len(files_index)

314877

In [171]:
files_index = files_index.reset_index(drop=True)
# files_index

In [173]:
few_indices = np.random.choice(len(files_index),replace=False,size=200)
remaining_files_index = files_index.drop(index=few_indices)

In [174]:
len(remaining_files_index)

314677

In [175]:
number_of_partitions = 10
tot_len = len(remaining_files_index)
for i in range(number_of_partitions):
    partition_df = remaining_files_index.iloc[list(range(i*tot_len//number_of_partitions,(i+1)*tot_len//number_of_partitions))]
    partition_df.to_csv(f"data/verilog_partitions/files_index_part_{i}.csv")

In [176]:
tot_len = len(remaining_files_index)
total = 0
for i in range(number_of_partitions):
    length = len(pd.read_csv(f"data/verilog_partitions/files_index_part_{i}.csv"))
    total += length

## Fill partitions with source code

In [189]:
def read_source_code(directory):
    # Biggest error = utf-8 encoding problem
    try:
        # return open(directory,'r').read()
        with codecs.open(directory,encoding='utf-8', errors='replace', mode = 'r') as f:
            data = f.read()
        return data.replace("\x00","") # replacing this might not be needed but someone online said it helps...
    except Exception as e:
        e_string = f"0:FOUND ERROR: {e}"
        print(e_string)
        return e_string

def clean_row_directory(row):
    return row['directory'].replace("\\","/")

def add_source_code_to_index_df(df):
    df['directory'] = df.apply(lambda row: clean_row_directory(row),axis=1)
    df['code'] = ""
    tqdm.pandas(desc='Apply read_source_code')
    df['code'] = df.progress_apply(lambda row: read_source_code(row['directory']),axis=1)
    return df

In [190]:
number_of_partitions = 10
for i in range(number_of_partitions):
    df_dir = f"data/verilog_partitions/files_index_part_{i}.csv"
    print(f"Starting {i}")
    partition_df = pd.read_csv(df_dir,index_col=0)
    new_partition_df = add_source_code_to_index_df(partition_df)
    new_partition_df.to_csv(df_dir)
    del partition_df, new_partition_df
print("All done!")

Starting 9


Apply read_source_code:  94%|█████████▎| 29496/31468 [05:13<00:28, 70.05it/s] 

e_string


Apply read_source_code: 100%|██████████| 31468/31468 [05:48<00:00, 90.35it/s]


All done!
