# This module contains code for filtering the data parsed with java parser

In [1]:
import pandas as pd
from glob import glob
from tqdm.auto import tqdm
import joblib
tqdm.pandas()

## Gatherig all data in one table, removing duplicate imports and java.lang imports

In [2]:
# list of compilation units inside java.lang package that should be ignored when loadig the data
java_lang = {"Appendable", "AutoCloseable", "CharSequence", "Cloneable", "Comparable", "Iterable", "Readable", 
             "Runnable", "Boolean", "Byte", "Character", "Class", "ClassLoader", "ClassValue", "Compiler", "Double", 
             "Enum", "Float", "InheritableThreadLocal", "Integer", "Long", "Math", "Number", "Object", "Package", 
             "Process", "ProcessBuilder", "Runtime", "RuntimePermission", "SecurityManager", "Short", 
             "StackTraceElement", "StrictMath", "String", "StringBuffer", "StringBuilder", "System", "Thread", 
             "ThreadGroup", "ThreadLocal", "Throwable", "Void", "ArithmeticException", 
             "ArrayIndexOutOfBoundsException", "ArrayStoreException", "ClassCastException", "ClassNotFoundException", 
             "CloneNotSupportedException", "EnumConstantNotPresentException", "Exception", "IllegalAccessException", 
             "IllegalArgumentException", "IllegalMonitorStateException", "IllegalStateException", 
             "IllegalThreadStateException", "IndexOutOfBoundsException", "InstantiationException", 
             "InterruptedException", "NegativeArraySizeException", "NoSuchFieldException", "NoSuchMethodException", 
             "NullPointerException", "NumberFormatException", "ReflectiveOperationException", "RuntimeException", 
             "SecurityException", "StringIndexOutOfBoundsException", "TypeNotPresentException", 
             "UnsupportedOperationException", "AbstractMethodError", "AssertionError", "BootstrapMethodError", 
             "ClassCircularityError", "ClassFormatError", "Error", "ExceptionInInitializerError", "IllegalAccessError",
             "IncompatibleClassChangeError", "InstantiationError", "InternalError", "LinkageError", 
             "NoClassDefFoundError", "NoSuchFieldError", "NoSuchMethodError", "OutOfMemoryError", "StackOverflowError",
             "ThreadDeath", "UnknownError", "UnsatisfiedLinkError", "UnsupportedClassVersionError", "VerifyError", 
             "VirtualMachineError", "Deprecated", "Override", "SafeVarargs", "SuppressWarnings"}
java_lang = {"java.lang." + x for x in java_lang}

In [3]:
def removeDuplicateImports(imports):
    """
    Get a list of strings (imports), remove all duplicates, and return a set of strings (unique imports). 
    """
    unique_imports = []
    for clazz in imports:
        if '.' not in clazz or clazz in java_lang:
            continue
        unique_imports.append(clazz)
    return unique_imports

In [4]:
JSON_FILES = sorted(glob('../data/GitHubNewOriginalParsed/*.json'))

df = []
for filename in tqdm(JSON_FILES):
    tmp = pd.read_json(filename, lines=True)
    tmp.classImports = tmp.classImports.apply(removeDuplicateImports)
    df.append(tmp)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [5]:
df = pd.concat(df)

In [6]:
df.sort_values(by="repo", inplace=True)

## Removing repos that do not have packages

In [7]:
bad_repos = []
for i in range(len(df.package.values)):
    if df.package.values[i] == "":
        bad_repos.append(df.repo.values[i])
bad_repos = set(bad_repos)
print("Total repos: %d, total files: %d, repos without packages: %d" 
      %(len(df.repo.value_counts()), len(df), len(bad_repos)))

Total repos: 20241, total files: 92510, repos without packages: 657


In [8]:
df = df[~df.repo.isin(bad_repos)]
print("Total repos: %d, total files: %d" %(len(df.repo.value_counts()), len(df)))

Total repos: 19584, total files: 77901


## Removing repos with duplicate packages

In [None]:
forks = pd.read_json("../data/forks.json", lines=True)
stars = pd.read_json("../data/stars.json", lines=True)

forks = {forks.original.values[i]: forks.f0_.values[i] for i in range(len(forks))}
stars = {stars.original.values[i]: stars.f0_.values[i] for i in range(len(stars))}

In [12]:
forks.head()

Unnamed: 0,f0_,original
0,4,lscsoft/docker-ligo-lalsuite-dev
1,20,gabrielfeitosa/ci-spring-boot
2,19,univie-tnt-2018-summer/univie-tnt-2018-summer....
3,7,lcg0124/clouddo-view
4,65,Azoy/Sword


In [37]:
# df.to_json('../data/all_data_new.json', lines=True, orient="records")