# This module contains code for filtering the data parsed with java parser

In [2]:
import pandas as pd
from glob import glob
from tqdm.auto import tqdm
tqdm.pandas()

## Gatherig all data in one table, removing duplicate imports and java.lang imports

In [2]:
# list of compilation units inside java.lang package that should be ignored when loadig the data
java_lang = {"Appendable", "AutoCloseable", "CharSequence", "Cloneable", "Comparable", "Iterable", "Readable", 
             "Runnable", "Boolean", "Byte", "Character", "Class", "ClassLoader", "ClassValue", "Compiler", "Double", 
             "Enum", "Float", "InheritableThreadLocal", "Integer", "Long", "Math", "Number", "Object", "Package", 
             "Process", "ProcessBuilder", "Runtime", "RuntimePermission", "SecurityManager", "Short", 
             "StackTraceElement", "StrictMath", "String", "StringBuffer", "StringBuilder", "System", "Thread", 
             "ThreadGroup", "ThreadLocal", "Throwable", "Void", "ArithmeticException", 
             "ArrayIndexOutOfBoundsException", "ArrayStoreException", "ClassCastException", "ClassNotFoundException", 
             "CloneNotSupportedException", "EnumConstantNotPresentException", "Exception", "IllegalAccessException", 
             "IllegalArgumentException", "IllegalMonitorStateException", "IllegalStateException", 
             "IllegalThreadStateException", "IndexOutOfBoundsException", "InstantiationException", 
             "InterruptedException", "NegativeArraySizeException", "NoSuchFieldException", "NoSuchMethodException", 
             "NullPointerException", "NumberFormatException", "ReflectiveOperationException", "RuntimeException", 
             "SecurityException", "StringIndexOutOfBoundsException", "TypeNotPresentException", 
             "UnsupportedOperationException", "AbstractMethodError", "AssertionError", "BootstrapMethodError", 
             "ClassCircularityError", "ClassFormatError", "Error", "ExceptionInInitializerError", "IllegalAccessError",
             "IncompatibleClassChangeError", "InstantiationError", "InternalError", "LinkageError", 
             "NoClassDefFoundError", "NoSuchFieldError", "NoSuchMethodError", "OutOfMemoryError", "StackOverflowError",
             "ThreadDeath", "UnknownError", "UnsatisfiedLinkError", "UnsupportedClassVersionError", "VerifyError", 
             "VirtualMachineError", "Deprecated", "Override", "SafeVarargs", "SuppressWarnings"}
java_lang = {"java.lang." + x for x in java_lang}

In [3]:
def removeDuplicateImports(imports):
    """
    Get a list of strings (imports), remove all duplicates, and return a set of strings (unique imports). 
    """
    unique_imports = []
    for clazz in imports:
        if '.' not in clazz or clazz in java_lang:
            continue
        unique_imports.append(clazz)
    return unique_imports

### Join all data into one table

In [4]:
JSON_FILES = sorted(glob('../data/GitHubNewOriginalParsed/*.json'))

df = []
for filename in tqdm(JSON_FILES):
    tmp = pd.read_json(filename, lines=True)
    tmp.classImports = tmp.classImports.apply(removeDuplicateImports)
    df.append(tmp)

HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))




In [5]:
df = pd.concat(df)

In [6]:
df.sort_values(by="repo", inplace=True)  # sorting by repository makes later preprocessing steps easier

## Removing repositories that do not define packages. This is paramount to filtering out duplicates in the future

In [7]:
def remove_repos(df, bad_repos):
    print("Total repos: %d, total files: %d, repos to be removed: %d" 
      %(len(df.repo.unique()), len(df), len(bad_repos)))
    df = df[~df.repo.isin(bad_repos)]
    print("Total repos: %d, total files: %d" %(len(df.repo.unique()), len(df)))
    return df

In [8]:
bad_repos = []
for i in range(len(df)):
    if df.package.values[i] == "":
        bad_repos.append(df.repo.values[i])
bad_repos = set(bad_repos)
df = remove_repos(df, bad_repos)

Total repos: 43048, total files: 8410676, repos to be removed: 6483
Total repos: 36565, total files: 4342539


## Removing repos with duplicate packages. If there are two repositories with the same packages, the one that has been forked most is kept.

In [9]:
forks = pd.read_json("../data/forks.json", lines=True)
# stars = pd.read_json("../data/stars.json", lines=True)
forks = dict(zip(forks.original, forks.f0_))
# stars = dict(zip(stars.repo, forks.f0_))

In [10]:
packages = {} # dictionary of the kind package_name -> list of repository names
for i in tqdm(range(len(df))):
    if df.package.values[i] in packages:
        if df.repo.values[i] not in packages[df.package.values[i]]:
            packages[df.package.values[i]].append(df.repo.values[i])
    else:
        packages[df.package.values[i]] = [df.repo.values[i]]

HBox(children=(FloatProgress(value=0.0, max=4342539.0), HTML(value='')))




In [11]:
bad_repos = {"",} # set of repositories to be removed
for package in tqdm(packages.keys()):
    # list of repositories with the same package: Only one is kept in the end
    repos_tmp = [x for x in packages[package] if x not in bad_repos]
    if len(repos_tmp) <= 1:
        continue
    best_repo = repos_tmp[0]
    max_forks = forks[repos_tmp[0]]
    for repo in repos_tmp:  # search for the most forked repository among potential candidates for removal and keep it
        if forks[repo] >= max_forks:
            bad_repos.add(best_repo)
            max_forks = forks[repo]
            best_repo = repo
        else:
            bad_repos.add(repo)

HBox(children=(FloatProgress(value=0.0, max=578057.0), HTML(value='')))




In [12]:
df= remove_repos(df, bad_repos)

Total repos: 36565, total files: 4342539, repos to be removed: 8997
Total repos: 27569, total files: 2542734


## Removing repos that contain duplicate files. Keeping these repository messes up graph creation process in the future

In [16]:
first = 0
curr_classes = []
duplicate_repos = {"", } # repositories to be removed

for i in tqdm(range(first, len(df))):
    curr_classes.append(df.package.values[i] + '.' + df.name.values[i])
    if first == i:
        curr_repo = df.repo.values[first]
    if (i == len(df) - 1) or (df.repo.values[i+1] != curr_repo):
        # if the number of classes does not equal the number of unique classes
        if len(set(curr_classes)) != len(curr_classes): 
            duplicate_repos.add(curr_repo)
        first = i + 1
        curr_classes = []

HBox(children=(IntProgress(value=0, max=2542734), HTML(value='')))




In [17]:
df = remove_repos(df, duplicate_repos)

Total repos: 27569, total files: 2542734, repos to be removed: 2109
Total repos: 25461, total files: 2106230


In [18]:
df.to_json('../data/all_data_new.json', lines=True, orient="records")