In [5]:
import pandas as pd
import os
import javalang

# Build one row per Java class declaration found under the Xerces source tree
# and flag "god classes": classes whose method count lies far above the mean.

# How many standard deviations above the mean method count a class must be
# to get flagged as a god class.
GOD_CLASS_STD_FACTOR = 6

# Records are accumulated in a plain list and turned into a DataFrame once;
# enlarging a DataFrame row by row copies data on every insertion.
class_records = []
for path_walk, _, files_walk in os.walk('./resources/xerces2-j-src/'):
    for file in files_walk:
        if not file.endswith('.java'):
            continue
        with open(os.path.join(path_walk, file), 'r') as f:
            tree = javalang.parse.parse(f.read())
        # tree.filter yields (path, node) pairs; only the node is needed.
        for _, class_declaration in tree.filter(javalang.tree.ClassDeclaration):
            class_records.append({
                # '/'-joined on purpose: later cells split this path on '/'.
                'path': path_walk + '/' + file,
                'class_name': class_declaration.name,
                'number_of_methods': len(class_declaration.methods),
            })

df = pd.DataFrame(
    class_records, columns=['path', 'class_name', 'number_of_methods']
).astype({'number_of_methods': 'int64'})

# The threshold does not depend on the row, so compute it once and compare
# the whole column in a single vectorised operation.
method_counts = df['number_of_methods']
god_threshold = method_counts.mean() + GOD_CLASS_STD_FACTOR * method_counts.std()
df['is_god'] = method_counts > god_threshold


In [7]:
# Keep only the rows flagged as god classes, restricted to the two columns
# that identify the class (source path and class name).
is_god_mask = df['is_god'].eq(True)
df_filtered = df.loc[is_god_mask, ['path', 'class_name']]


In [20]:
# Plain Python list of the source-file paths of the flagged classes.
# Despite the `df_` prefix this name holds a list, not a DataFrame.
df_concatenated = df_filtered['path'].tolist()

In [21]:
import javalang
import pandas as pd
from typing import List
import sys

def get_fields(class_declaration: javalang.tree.ClassDeclaration) -> set[str]:
    """Return the names of the fields exposed by ``class_declaration.fields``.

    A single Java field declaration may introduce several variables
    (``int a, b;``), so every declarator of every field declaration is
    collected rather than only the first one.

    Args:
        class_declaration: Parsed class node whose ``fields`` are inspected.

    Returns:
        The set of declared field names.
    """
    return {
        declarator.name
        for field_declaration in class_declaration.fields
        for declarator in field_declaration.declarators
    }

def get_methods(class_declaration: javalang.tree.ClassDeclaration) -> set[str]:
    """Collect the distinct method names listed in ``class_declaration.methods``.

    Because the result is a set, methods sharing a name (overloads)
    contribute a single entry.
    """
    return {method.name for method in class_declaration.methods}

def get_fields_accessed_by_method(method_declaration: javalang.tree.MethodDeclaration) -> set[str]:
    """Return the names referenced through member references inside a method.

    For a qualified reference the qualifier is recorded; for an unqualified
    one the member name itself is recorded.

    Args:
        method_declaration: Parsed method node to search for
            ``MemberReference`` nodes.

    Returns:
        The set of referenced names (qualifier when present, else member).
    """
    accessed_names = set()
    for _, reference in method_declaration.filter(javalang.tree.MemberReference):
        # The qualifier is '' for a bare identifier, but it can also be None
        # (NOTE(review): javalang appears to leave it unset for references
        # reached through a selector such as `this.count`). Treat both as
        # "no qualifier" so the member name is recorded instead of None.
        accessed_names.add(reference.qualifier or reference.member)
    return accessed_names

def get_methods_accessed_by_method(method_declaration: javalang.tree.MethodDeclaration) -> set[str]:
    """Collect the names of all methods invoked inside ``method_declaration``.

    Every ``MethodInvocation`` node found by ``filter`` contributes its
    ``member`` attribute; duplicates collapse because a set is returned.
    """
    invocations = method_declaration.filter(javalang.tree.MethodInvocation)
    return {invocation.member for _, invocation in invocations}

def extract_feature_vectors(path_java_file: str, save_directory_path: str = './') -> pd.DataFrame:
    """Build and save the method/feature matrix of the class named after a file.

    The Java file is parsed and the class whose name equals the file's base
    name is located. Its field and method names become the feature columns.
    Each row describes one method, with 1 where the method references the
    feature and 0 otherwise. The matrix is written to
    ``<save_directory_path>/<class_name>.csv``.

    Args:
        path_java_file: '/'-separated path to the ``.java`` file.
        save_directory_path: Directory receiving the CSV; a missing trailing
            '/' is added. An empty string writes relative to the current
            directory.

    Returns:
        The feature matrix that was written. If the file contains no class
        named after it, nothing is written and an empty frame holding only
        the ``method_name`` column is returned.
    """
    with open(path_java_file, 'r') as f:
        tree = javalang.parse.parse(f.read())
    class_name = path_java_file.split('/')[-1].split('.')[0]

    # Ensure the CSV lands inside the directory rather than beside it with
    # the directory name glued onto the file name.
    if save_directory_path and not save_directory_path.endswith('/'):
        save_directory_path += '/'

    for _, node in tree.filter(javalang.tree.ClassDeclaration):
        if node.name != class_name:
            continue
        # Sorted so the column order of the CSV is the same on every run;
        # plain set iteration order varies between interpreter sessions.
        idx_to_features = sorted(get_fields(node) | get_methods(node))
        feature_matrix = generate_feature_dataframe(node, idx_to_features).fillna(0)
        feature_matrix[idx_to_features] = feature_matrix[idx_to_features].astype(int)
        feature_matrix.to_csv(save_directory_path + class_name + '.csv', index=False)
        return feature_matrix

    return pd.DataFrame(columns=['method_name'])

def generate_feature_dataframe(node: javalang.tree.ClassDeclaration, idx_to_features: List[str]) -> pd.DataFrame:
    """Build the sparse method/feature matrix for one class.

    Args:
        node: Parsed class node whose ``methods`` are inspected.
        idx_to_features: Feature names (fields and methods of the class);
            they become the columns after ``method_name``.

    Returns:
        A frame with columns ``['method_name'] + idx_to_features``. There is
        one row per method name that references at least one feature, in
        order of first appearance; methods sharing a name are merged into a
        single row. Cells hold 1 where the feature is referenced and NaN
        elsewhere.
    """
    known_features = set(idx_to_features)  # constant-time membership tests
    rows_by_method = {}  # method name -> row record, in insertion order
    for method in node.methods:
        referenced = get_fields_accessed_by_method(method) | get_methods_accessed_by_method(method)
        hits = referenced & known_features
        if not hits:
            # Methods touching none of the class's own features get no row.
            continue
        row = rows_by_method.setdefault(method.name, {'method_name': method.name})
        for feature in hits:
            row[feature] = 1
    # Build the frame in one go; dtype=object keeps the 1 / NaN cell values
    # exactly as they were written instead of coercing columns to float.
    return pd.DataFrame(
        list(rows_by_method.values()),
        columns=['method_name'] + idx_to_features,
        dtype=object,
    )



In [22]:
# Write one feature-vector CSV per flagged source file, using the default
# output directory of extract_feature_vectors.
for java_source_path in df_concatenated:
    extract_feature_vectors(java_source_path)