# This module contains code for filtering the data parsed with the Java parser

In [1]:
import pandas as pd
from glob import glob
from tqdm.auto import tqdm
import joblib
tqdm.pandas()

## Gathering all data in one table, removing duplicate imports and java.lang imports

In [2]:
# list of compilation units inside java.lang package that should be ignored when loadig the data
java_lang = {"Appendable", "AutoCloseable", "CharSequence", "Cloneable", "Comparable", "Iterable", "Readable", 
             "Runnable", "Boolean", "Byte", "Character", "Class", "ClassLoader", "ClassValue", "Compiler", "Double", 
             "Enum", "Float", "InheritableThreadLocal", "Integer", "Long", "Math", "Number", "Object", "Package", 
             "Process", "ProcessBuilder", "Runtime", "RuntimePermission", "SecurityManager", "Short", 
             "StackTraceElement", "StrictMath", "String", "StringBuffer", "StringBuilder", "System", "Thread", 
             "ThreadGroup", "ThreadLocal", "Throwable", "Void", "ArithmeticException", 
             "ArrayIndexOutOfBoundsException", "ArrayStoreException", "ClassCastException", "ClassNotFoundException", 
             "CloneNotSupportedException", "EnumConstantNotPresentException", "Exception", "IllegalAccessException", 
             "IllegalArgumentException", "IllegalMonitorStateException", "IllegalStateException", 
             "IllegalThreadStateException", "IndexOutOfBoundsException", "InstantiationException", 
             "InterruptedException", "NegativeArraySizeException", "NoSuchFieldException", "NoSuchMethodException", 
             "NullPointerException", "NumberFormatException", "ReflectiveOperationException", "RuntimeException", 
             "SecurityException", "StringIndexOutOfBoundsException", "TypeNotPresentException", 
             "UnsupportedOperationException", "AbstractMethodError", "AssertionError", "BootstrapMethodError", 
             "ClassCircularityError", "ClassFormatError", "Error", "ExceptionInInitializerError", "IllegalAccessError",
             "IncompatibleClassChangeError", "InstantiationError", "InternalError", "LinkageError", 
             "NoClassDefFoundError", "NoSuchFieldError", "NoSuchMethodError", "OutOfMemoryError", "StackOverflowError",
             "ThreadDeath", "UnknownError", "UnsatisfiedLinkError", "UnsupportedClassVersionError", "VerifyError", 
             "VirtualMachineError", "Deprecated", "Override", "SafeVarargs", "SuppressWarnings"}
java_lang = {"java.lang." + x for x in java_lang}

In [3]:
def removeDuplicateImports(imports):
    """
    Get a list of strings (imports), remove all duplicates, and return a set of strings (unique imports). 
    """
    unique_imports = []
    for clazz in imports:
        if '.' not in clazz or clazz in java_lang:
            continue
        unique_imports.append(clazz)
    return unique_imports

In [4]:
JSON_FILES = sorted(glob('../data/GitHubNewOriginalParsed/*.json'))

df = []
for filename in tqdm(JSON_FILES[:1]):
    tmp = pd.read_json(filename, lines=True)
    tmp.classImports = tmp.classImports.apply(removeDuplicateImports)
    df.append(tmp)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [5]:
df = pd.concat(df)

In [6]:
df.sort_values(by="repo", inplace=True)

## Removing repos that do not have packages

In [7]:
def remove_repos(df, bad_repos):
    print("Total repos: %d, total files: %d, repos to be removed: %d" 
      %(len(df.repo.unique()), len(df), len(bad_repos)))
    df = df[~df.repo.isin(bad_repos)]
    print("Total repos: %d, total files: %d" %(len(df.repo.unique()), len(df)))

In [8]:
bad_repos = []
for i in range(len(df)):
    if df.package.values[i] == "":
        bad_repos.append(df.repo.values[i])
bad_repos = set(bad_repos)
remove_repos(df, bad_repos)

Total repos: 20241, total files: 92510, repos to be removed: 657
Total repos: 19584, total files: 77901


## Removing repos with duplicate packages

In [9]:
forks = pd.read_json("../data/forks.json", lines=True)
# stars = pd.read_json("../data/stars.json", lines=True)
forks = dict(zip(forks.original, forks.f0_))
# stars = dict(zip(stars.repo, forks.f0_))

In [10]:
packages = {}
for i in tqdm(range(len(df))):
    if df.package.values[i] in packages:
        if df.repo.values[i] not in packages[df.package.values[i]]:
            packages[df.package.values[i]].append(df.repo.values[i])
    else:
        packages[df.package.values[i]] = [df.repo.values[i]]

HBox(children=(IntProgress(value=0, max=92510), HTML(value='')))




In [13]:
bad_repos = {"",}
for package in tqdm(packages.keys()):
    repos_tmp = [x for x in packages[package] if x not in bad_repos] 
    if len(repos_tmp) <= 1:
        continue
    best_repo = repos_tmp[0]
    max_forks = forks[repos_tmp[0]]
    for repo in repos_tmp:
        if forks[repo] >= max_forks:
            bad_repos.add(best_repo)
            max_forks = forks[repo]
            best_repo = repo
        else:
            bad_repos.add(repo)

HBox(children=(IntProgress(value=0, max=65711), HTML(value='')))




In [14]:
remove_repos(df, bad_repos)

Total repos: 20241, total files: 92510, repos to be removed: 2928
Total repos: 17314, total files: 54331


In [None]:
df.to_json('../data/all_data_new.json', lines=True, orient="records")