In [None]:
from util.graph_util import GraphUtil
import matplotlib.pyplot as plt
import json
import numpy as np

In [None]:
# Location of the PDG export (JSON) that the loading cell below opens.
filename = "../data/JavaFX-Point-of-Sales_pdg.json"

In [None]:
# Parse the export once; json.load reads straight from the open handle.
with open(filename, "r") as f:
    data = json.load(f)

# Both collections live under the top-level 'pdg' key of the payload.
_pdg = data['pdg']
entities = _pdg['entities']
relations = _pdg['relations']

# Index the entity records by their 'entityId' for direct lookup in later cells.
entities_dict = dict((entity['entityId'], entity) for entity in entities)

In [None]:
# Size of the loaded data: entity count on the first line, relation count on the second.
print(len(entities), len(relations), sep="\n")

In [None]:
# Graph helper used by the cells below (entity grouping and graph construction).
# NOTE(review): the meaning of the "PYTHON_2" argument is defined inside GraphUtil and is not visible here.
gu = GraphUtil("PYTHON_2")

In [None]:
# Group the entity records by type through the GraphUtil helper and show
# which type keys are present (the next cell indexes 'CLASS' and 'INTERFACE').
entities_by_type = gu.get_entities_by_type(entities)
print(entities_by_type.keys())

In [None]:
# Number of entities per type of interest, printed one per line (CLASS first).
for _entity_type in ('CLASS', 'INTERFACE'):
    print(len(entities_by_type[_entity_type]))

In [None]:
def _build_graph(relation_type):
    """Build the dependency graph restricted to a single relation type.

    All other options are shared by both graphs of this notebook: no entity
    type filter, directed multigraph, isolated nodes kept, self loops dropped,
    edge weights stored under "weight".
    """
    # Argument lists are created fresh on every call, as with literal call sites.
    return gu.build_program_dependency_graph(
        entities,
        relations,
        allowed_entity_types=[],
        allowed_relation_types=[relation_type],
        entity_attributes=[gu.ENTITY_ID, gu.ENTITY_TYPE, gu.ENTITY_FQN],
        directed=True,
        isolated_nodes=True,
        self_loops=False,
        weight="weight",
        multi_edges=True)


# Call graph (fan-in / fan-out) and ORM graph (persistence) over the same entities.
MG = _build_graph('CALL')
MG_ORM = _build_graph('ORM')

In [None]:
# Show the CALL graph using the graph object's own string representation.
print(MG)

In [None]:
def _max_cyclomatic_complexity(entity):
    """Largest 'CyclomaticComplexity' value over the entity's methods (0 if none is reported)."""
    highest = 0
    for method in entity['methods']:
        method_cc = 0
        # If a method lists the metric more than once, the last entry wins.
        for metric in method['metric_container']:
            if metric['name'] == 'CyclomaticComplexity':
                method_cc = metric['value']
        highest = method_cc if method_cc > highest else highest
    return highest


# entityId -> maximum cyclomatic complexity of any of its methods.
entities_cc:dict = {entity['entityId']: _max_cyclomatic_complexity(entity) for entity in entities}

In [None]:
# Dump the per-entity maxima, then show their distribution.
print(entities_cc)
_cc_fig, _cc_ax = plt.subplots()
_cc_ax.hist(entities_cc.values())
_cc_ax.set_xlabel('cyclomatic complexity')
_cc_ax.set_ylabel('count')
plt.show()

In [None]:
# Weighted degrees of the CALL graph as (entity_id, degree) pairs.
# High in-degree means high fan-in; high out-degree means high fan-out.

def _degree_of(pair):
    """Sort key: the degree component of an (entity_id, degree) pair."""
    return pair[1]


# Fan-in, largest first.
in_degree = sorted(
    ((node_id, node_degree) for node_id, node_degree in MG.in_degree(weight="weight")),
    key=_degree_of,
    reverse=True)
# Fan-out, smallest first.
out_degree = sorted(
    ((node_id, node_degree) for node_id, node_degree in MG.out_degree(weight="weight")),
    key=_degree_of)

In [None]:
# Print each sorted degree list and draw its histogram: fan-in first, then fan-out.
for _degree_pairs, _axis_label in ((in_degree, 'in degree'), (out_degree, 'out degree')):
    print(_degree_pairs)
    plt.hist([_pair[1] for _pair in _degree_pairs])
    plt.xlabel(_axis_label)
    plt.ylabel('count')
    plt.show()

In [None]:
# Thresholds for the heuristics below. Each trailing comment carries a second
# value the author noted next to the threshold (its purpose is not stated).
_high_in_degree = 10#15
_very_high_in_degree = 20#200
_low_out_degree = 3#3
_low_in_degree = 2#4
_high_c_complexity = 3#4

allowed_types = ['CLASS', 'INTERFACE']

# not persistent == no ORM relation

# Only entities of an allowed type are candidates; every set below is restricted to them.
allowed_types_set = {entity_id for entity_id in entities_dict
                     if entities_dict[entity_id]['entityType'] in allowed_types}
print('allowed_types_set: ' + repr(allowed_types_set))

# Fan-in strictly above the "high" threshold.
high_in_degree = {entity_id for (entity_id, degree) in in_degree
                  if degree > _high_in_degree} & allowed_types_set
print('high_in_degree: ' + repr(high_in_degree))

# Fan-in strictly above the "very high" threshold.
very_high_in_degree = {entity_id for (entity_id, degree) in in_degree
                       if degree > _very_high_in_degree} & allowed_types_set
print('very_high_in_degree: ' + repr(very_high_in_degree))

# Persistent: the ORM graph reports at least one edge for the entity.
persistent_set = {entity_id for entity_id in entities_dict
                  if len(MG_ORM.edges(entity_id, data=True)) > 0} & allowed_types_set
print('persistent_set: ' + repr(persistent_set))
print('not_persistent_set: ' + repr(allowed_types_set - persistent_set))

# Fan-out strictly below the "low" threshold.
low_out_degree = {entity_id for (entity_id, degree) in out_degree
                  if degree < _low_out_degree} & allowed_types_set
print('low_out_degree: ' + repr(low_out_degree))

# Fan-in strictly below the "low" threshold.
low_in_degree = {entity_id for (entity_id, degree) in in_degree
                 if degree < _low_in_degree} & allowed_types_set
print('low_in_degree: ' + repr(low_in_degree))

# Error handling: at least one method body contains the text 'catch'.
err_handling_set = {entity_id for entity_id in entities_dict
                    if any('catch' in method['body']
                           for method in entities_dict.get(entity_id)['methods'])} & allowed_types_set
print('err_handling_set: ' + repr(err_handling_set))

# Max cyclomatic complexity strictly above the threshold.
high_c_complexity = {entity_id for entity_id in entities_cc
                     if entities_cc.get(entity_id) > _high_c_complexity} & allowed_types_set
print('high_c_complexity: ' + repr(high_c_complexity))

# Entities that import Lombok's UtilityClass are treated as utilities regardless of degree.
additional_util_set = {entity_id for entity_id in entities_dict
                       if 'lombok.experimental.UtilityClass' in entities_dict.get(entity_id)['imports']
                       } & allowed_types_set
print('additional_util_set: ' + repr(additional_util_set))
print(len(persistent_set))

In [None]:
# Utility services: very high fan-in AND low fan-out AND not persistent,
# plus every entity collected in additional_util_set.
utility_services = ((very_high_in_degree & low_out_degree) - persistent_set) | additional_util_set
for _utility_id in utility_services:
    print(_utility_id, entities_dict[_utility_id]['entityFullQualifiedName'])

In [None]:
# Persistent-entity services: not a utility service AND high fan-in AND low fan-out AND persistent.
# NOTE(review): the original heading also listed "Access to infrastructure" and
# "Fine grained"; no filter for either is applied in this cell.
persistentEntity_services:set = ((high_in_degree - utility_services) & low_out_degree) & persistent_set

for _persistent_id in persistentEntity_services:
    print(_persistent_id, entities_dict[_persistent_id]['entityFullQualifiedName'])

In [None]:
# Fully qualified names of the persistent-entity services; an invocation that
# contains one of these names counts as a "call to an entity".
entity_names = [str(entities_dict[entity_id]['entityFullQualifiedName'])
                for entity_id in persistentEntity_services]


def _calls_persistent_entity(entity_id) -> bool:
    """Return True if any invocation of any method of the entity mentions a persistent-entity name.

    The scan stops at the first match. The previous nested loops only broke out
    of the innermost loop and kept scanning every remaining invocation and
    method of an entity that had already been added.
    """
    return any(
        persistent_name in invocation
        for method in entities_dict.get(entity_id)['methods']
        for invocation in method['invocations']
        for persistent_name in entity_names)


# Candidate entities (allowed types only) that call at least one persistent entity.
calls_entity_set = {entity_id for entity_id in allowed_types_set
                    if _calls_persistent_entity(entity_id)}
print('calls_entity_set: ' + repr(calls_entity_set))

In [None]:
# Application services: (calls an entity OR high cyclomatic complexity OR error handling)
# AND low fan-in AND not a persistent-entity service AND not a utility service.
_application_candidates = calls_entity_set | high_c_complexity | err_handling_set
application_services:set = ((_application_candidates & low_in_degree)
                            - persistentEntity_services) - utility_services
for _application_id in application_services:
    print(_application_id, entities_dict[_application_id]['entityFullQualifiedName'])

In [None]:
# Share of the candidate entities that received one of the three labels, in percent.
_labeled_entities = application_services.union(utility_services).union(persistentEntity_services)
_labeled_percentage = (len(_labeled_entities) / len(allowed_types_set)) * 100
print('Percentage of labeled ' + repr(allowed_types) + ': ' + repr(_labeled_percentage))

## Create second view for co-learning

In [None]:
# Second feature view per candidate entity, in this column order:
#   [in-degree, out-degree, max cyclomatic complexity,
#    number of methods whose body contains 'catch', persistent flag (0/1), loc]
#
# The error-handling count and the persistent flag are computed for the entity
# itself. The earlier version looped over *all* entities inside the per-entity
# loop, so every row received the same two values.
_in_degree_by_id = dict(in_degree)    # one lookup table instead of a linear scan per entity
_out_degree_by_id = dict(out_degree)

second_vew_dict_dict:dict = {}
for _entity in allowed_types_set:
    _err = sum(1 for method in entities_dict.get(_entity)['methods'] if 'catch' in method['body'])
    _pers = 1 if len(MG_ORM.edges(_entity, data=True)) > 0 else 0
    _vew_list:list = [_in_degree_by_id[_entity],
                      _out_degree_by_id[_entity],
                      entities_cc.get(_entity),
                      _err,
                      _pers,
                      entities_dict[_entity]['loc']]
    second_vew_dict_dict[_entity] = np.array(_vew_list)
print(second_vew_dict_dict)

## Get Entity Wording

In [None]:
import re


def _split_identifier(word: str) -> list:
    """Split a camelCase / PascalCase identifier into its words.

    Uses two regex passes: first a space is put in front of every run of
    capitals, then in front of every capital followed by lowercase letters.
    The result is split on whitespace.
    """
    spaced = re.sub('([A-Z]+)', r' \1', word)
    spaced = re.sub('([A-Z][a-z]+)', r' \1', spaced)
    return spaced.split()


# Words that carry no domain meaning and are left out of the count.
_filtered_words:list = ['id','count','name','date']

# Collect the words of every TABLE field name. Each field is split exactly once.
# The earlier version deleted from and appended to the list while iterating over
# it (which skips items) and re-ran that pass for every entity.
# Assumes TABLE entities store their fields as plain strings -- TODO confirm.
_list_persistent_words:list = []
for entity_id in entities_dict:
    if entities_dict[entity_id]['entityType'] == "TABLE":
        for _field_name in entities_dict[entity_id]['fields']:
            _list_persistent_words.extend(_split_identifier(_field_name))
_list_persistent_words = [x.lower() for x in _list_persistent_words]

s:set = set(_filtered_words)
temp3 = [x for x in _list_persistent_words if x not in s]

# word -> number of occurrences, keyed in order of first appearance.
_persistent_word_dict = {}
for _word in temp3:
    _persistent_word_dict[_word] = _persistent_word_dict.get(_word, 0) + 1
print(_persistent_word_dict)

## Regeneration of code

In [None]:
def get_position_invert(code:int, pos:int)-> bool:
    """Return True when bit `pos` of `code` is NOT set, False when it is set.

    The feature codes used in this notebook switch a feature *off* by setting
    its bit, hence the inverted result.
    """
    return (code >> pos) & 1 == 0

def get_method(method_dict, **kwargs) -> str:
    """Render one method record as Java-like source text.

    Args:
        method_dict: method record; the keys read here are 'javaDoc',
            'annotations', 'modifiers', 'returnTypes', 'name', 'parameters',
            'body' and 'comments'.
        **kwargs: optional boolean switches, all defaulting to True, that
            deactivate parts of the output when passed as False:
            annotations, annotation_arguments, method_head, method_modifiers,
            return_types (legacy spelling returnTypes), method_parameters,
            parameter_annotations, parameter_types, parameter_names,
            method_body, comments, method_java_doc.

    Returns:
        The assembled text, always terminated by a newline.
    """
    annotations = kwargs.get('annotations',True)
    annotation_arguments = kwargs.get('annotation_arguments',True)
    method_head = kwargs.get('method_head',True)
    method_modifiers = kwargs.get('method_modifiers',True)
    # get_class passes this switch as `return_types`. Only `returnTypes` used to
    # be read, so the switch was silently ignored. Both spellings are accepted,
    # and the snake_case one wins when both are given.
    return_types = kwargs.get('return_types', kwargs.get('returnTypes',True))
    method_parameters = kwargs.get('method_parameters',True)
    parameter_annotations = kwargs.get('parameter_annotations',True)
    parameter_types = kwargs.get('parameter_types',True)
    parameter_names = kwargs.get('parameter_names',True)
    method_body = kwargs.get('method_body',True)
    comments = kwargs.get('comments',True)
    method_java_doc = kwargs.get('method_java_doc',True)

    methodText:str = ''
    #javadoc, then one line per annotation
    if method_java_doc:
        methodText += method_dict['javaDoc']
    if annotations:
        for annotation in method_dict['annotations']:
            methodText += annotation['type'] + ' '
            if annotation_arguments:
                for argument in annotation['arguments']:
                    methodText = methodText + str(argument) + ' '
            methodText += '\n'
    #method head
    if method_head:
        if method_modifiers:
            for modifier in method_dict['modifiers']:
                methodText += modifier + ' '
        if return_types:
            for returnType in method_dict['returnTypes']:
                methodText += returnType + ' '
        if method_parameters:
            # the method name is only emitted together with the parameter list
            methodText += method_dict['name'] + '('
            parameters = method_dict['parameters']
            for i,parameter in enumerate(parameters):
                if parameter_annotations:
                    for annotation in parameter['annotations']:
                        # NOTE(review): two annotation shapes are handled -- an indexable whose
                        # first item is a str, or a dict-like with 'type'/'arguments'. Which one
                        # the exporter produces is not visible here; confirm.
                        if type(annotation[0]) == str:
                            methodText += annotation[0] + ' '
                        else:
                            methodText += annotation['type'] + ' '
                            for argument in annotation['arguments']:
                                methodText += str(argument) + ' '
                if parameter_types:
                    methodText += parameter['type'] + ' '
                if parameter_names:
                    methodText += parameter['name']
                if not i == len(parameters)-1:
                    methodText += ', '
            # close the list once, whether or not there were parameters
            methodText += ')'
    #method body
    if method_body:
        methodText += method_dict['body']
    #comments are appended after the body, each preceded by a space
    if comments:
        for comment in method_dict['comments']:
            methodText += ' ' + comment
    methodText += '\n'
    return methodText

def get_class(entity_dict, **kwargs) -> str:
    """Render one entity record as Java-like source text.

    Args:
        entity_dict: entity record; the keys read here are 'package',
            'imports', 'comments', 'javaDoc', 'modifiers', 'entityType',
            'entityName', 'extends', 'interfaces', 'fields' and 'methods'.
        **kwargs: boolean switches (default True) that deactivate parts of the
            output when False: inherited_methods, package, imports, comments,
            java_doc, instance_head, instance_modifiers, instance_type,
            instance_name, instance_extension, instance_implementations,
            fields, field_annotations, field_modifiers, field_type,
            field_name, methods. `method_code` (default 0) is a bit mask
            forwarded to get_method; a set bit turns the matching method
            feature off.

    Returns:
        The assembled text, always ending with a closing brace line.
    """
    def enabled(switch):
        # Every switch defaults to "on".
        return kwargs.get(switch, True)

    chunks = []

    # package
    if enabled('package'):
        chunks.append('package ' + entity_dict['package'] + '\n\n')

    # imports, followed by one blank line
    if enabled('imports'):
        chunks.extend('import ' + imported + '\n' for imported in entity_dict['imports'])
        chunks.append('\n')

    # comments, then javadoc
    if enabled('comments'):
        chunks.extend(comment + '\n' for comment in entity_dict['comments'])
    if enabled('java_doc'):
        chunks.extend(doc + '\n' for doc in entity_dict['javaDoc'])

    # head: modifiers, type keyword, name, extends / implements clauses, opening brace
    if enabled('instance_head'):
        if enabled('instance_modifiers'):
            chunks.extend(modifier + ' ' for modifier in entity_dict['modifiers'])
        if enabled('instance_type'):
            chunks.append(repr(entity_dict['entityType']).strip('\'').lower() + ' ')
        if enabled('instance_name'):
            chunks.append(entity_dict['entityName'] + ' ')
        if enabled('instance_extension'):
            parents = entity_dict['extends']
            if len(parents) > 0:
                chunks.append('extends ' + ', '.join(parents) + ' ')
        if enabled('instance_implementations'):
            implemented = entity_dict['interfaces']
            if len(implemented) > 0:
                chunks.append('implements ' + ', '.join(implemented) + ' ')
        chunks.append('{\n')

    # fields, followed by one blank line
    if enabled('fields'):
        for field in entity_dict['fields']:
            if enabled('field_annotations'):
                chunks.extend(annotation['type'] + '\n' for annotation in field['annotations'])
            if enabled('field_modifiers'):
                chunks.extend(modifier + ' ' for modifier in field['modifiers'])
            if enabled('field_type'):
                chunks.append(field['type'] + ' ')
            if enabled('field_name'):
                chunks.append(field['name'] + '\n')
        chunks.append('\n')

    # methods
    if enabled('methods'):
        method_code = kwargs.get('method_code', 0)
        keep_inherited = enabled('inherited_methods')
        # get_method switch names in bit order: bit k of method_code controls entry k.
        method_switches = ('annotations', 'annotation_arguments', 'method_head',
                           'method_modifiers', 'return_types', 'method_parameters',
                           'parameter_annotations', 'parameter_types', 'parameter_names',
                           'method_body', 'comments', 'method_java_doc')
        for method in entity_dict['methods']:
            # inherited methods are skipped unless inherited_methods is on
            if method['inherited_from_superclass'] and not keep_inherited:
                continue
            switches = {name: get_position_invert(method_code, bit)
                        for bit, name in enumerate(method_switches)}
            chunks.append(get_method(method, **switches))

    # everything after methods
    chunks.append('}\n')
    return ''.join(chunks)

## Setup of torch and GPU device testing

In [None]:
import torch
# Report the CUDA setup. Device queries are only made when CUDA is actually
# available, because torch.cuda.current_device() raises on CPU-only machines.
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
if torch.cuda.is_available():
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device: {cuda_id}")
    print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")
else:
    cuda_id = None
    print("No CUDA device available.")

# NOTE(review): the original comment said "Set max_split_size_mb to 512 MB", but
# `split_kernel_size` does not appear to be a documented torch.backends.cuda option,
# so this assignment most likely has no effect. The allocator's max_split_size_mb
# is normally configured through the PYTORCH_CUDA_ALLOC_CONF environment variable
# before CUDA is initialised -- confirm the intent.
torch.backends.cuda.split_kernel_size = 8

## Setup of codebert model and tokenizer

In [None]:
from transformers import AutoTokenizer, RobertaModel, RobertaConfig

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ask for the hidden states of every layer, not just the last one.
config = RobertaConfig.from_pretrained("microsoft/codebert-base")
config.output_hidden_states = True
# `config` must be passed by keyword: a second positional argument goes into
# from_pretrained's *model_args and is not used as the model configuration,
# which is why the setting above previously appeared not to work.
model = RobertaModel.from_pretrained("microsoft/codebert-base", config=config)
model.config.output_hidden_states = True  # kept as a harmless safeguard
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

## Getting all embeddings for the different classified and unclassified entities

In [None]:
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions

def get_embeddings_class_options(_service, _code_for_class: int, _code_for_method: int) -> torch.Tensor:
    """Embed the regenerated source text of one entity with the loaded model.

    The entity text is rebuilt with get_class (bits set in `_code_for_class` /
    `_code_for_method` switch features off), tokenized, cut into slices of at
    most 510 tokens, and each slice is run through `model`. One vector is taken
    per hidden layer and slice, the vectors of layers 0-12 are summed per
    slice, and the per-slice sums are averaged over all slices.

    Args:
        _service: key into `entities_dict` of the entity to embed.
        _code_for_class: bit mask for the class-level switches of get_class.
        _code_for_method: bit mask forwarded as `method_code`.

    Returns:
        A tensor created from torch.zeros(768) on `device` and accumulated as
        described above.
    """
    _embeddings_dict: dict = {}

    text = get_class(entities_dict[_service], inherited_methods=get_position_invert(_code_for_class, 0),
                     package=get_position_invert(_code_for_class, 1),
                     imports=get_position_invert(_code_for_class, 2),
                     comments=get_position_invert(_code_for_class, 3),
                     java_doc=get_position_invert(_code_for_class, 4),
                     instance_head=get_position_invert(_code_for_class, 5),
                     instance_modifiers=get_position_invert(_code_for_class, 6),
                     instance_type=get_position_invert(_code_for_class, 7),
                     instance_name=get_position_invert(_code_for_class, 8),
                     instance_extension=get_position_invert(_code_for_class, 9),
                     instance_implementations=get_position_invert(_code_for_class, 10),
                     fields=get_position_invert(_code_for_class, 11),
                     field_annotations=get_position_invert(_code_for_class, 12),
                     field_modifiers=get_position_invert(_code_for_class, 13),
                     field_type=get_position_invert(_code_for_class, 14),
                     field_name=get_position_invert(_code_for_class, 15),
                     methods=get_position_invert(_code_for_class, 16),
                     method_code=_code_for_method)
    #print(text)
    tokenized_text = tokenizer.tokenize(text)

    # 510 tokens per slice leaves room for the cls and sep tokens added below.
    chunks, chunk_size = len(tokenized_text), 510
    split_tokenized_text: list = [tokenized_text[i:i + chunk_size] for i in range(0, chunks, chunk_size)]
    for n, tokenized_text_slice in enumerate(split_tokenized_text):
        tokenized_text_slice = [tokenizer.cls_token] + tokenized_text_slice + [tokenizer.sep_token]
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text_slice)
        segments_ids = [1] * len(tokenized_text_slice)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(device)
        segments_tensor = torch.tensor([segments_ids])
        segments_tensor = segments_tensor.to(device)
        with torch.no_grad():
            # NOTE(review): the all-ones tensor is passed as the second positional argument;
            # for RobertaModel that is presumably attention_mask rather than segment ids -- confirm.
            model_output: BaseModelOutputWithPoolingAndCrossAttentions = model(tokens_tensor, segments_tensor)
        _all_hidden_layers_of_slice: dict = {}
        _mean: torch.Tensor = torch.zeros(768).to(device)
        for i, slice_hidden_state_tensor in enumerate(model_output.hidden_states):
            # [-1][-1] takes the last entry along the first two dimensions; assuming
            # (batch, tokens, hidden) this is the vector of the final (sep) token -- TODO confirm.
            _all_hidden_layers_of_slice[i] = (slice_hidden_state_tensor[-1][-1])
            if i < 13:
                _mean = torch.add(_mean, slice_hidden_state_tensor[-1][-1])
                #print(f"added {i}")
        # Slots 0-12 hold the per-layer vectors; slot 13 holds their accumulated value.
        # Despite the name `_mean`, the layers are summed here and never divided by the layer count.
        _all_hidden_layers_of_slice[13] = _mean
        _embeddings_dict[n] = _all_hidden_layers_of_slice
    print(f"Made {len(_embeddings_dict.keys())} embeddings for {entities_dict[_service]['entityFullQualifiedName']} with class code {bin(_code_for_class)} and method code {bin(_code_for_method)}")
    _mean: torch.Tensor = torch.zeros(768).to(device)
    for _mean_embedding_of_slice in _embeddings_dict.keys():
        _mean = torch.add(_mean, _embeddings_dict.get(_mean_embedding_of_slice).get(13))  # sum the slot-13 vectors of all slices
    # Average over the number of slices to get one vector per entity.
    return _mean/len(_embeddings_dict)

embeddings_list_dict: dict = {}

# Feature codes used for every entity in this run: the class code has no bit
# set, the method code has only bit 10 set.
_CLASS_CODE = 0 << 1
_METHOD_CODE = 1 << 10


def _embed_services(_services) -> dict:
    """Map each entity id in `_services` to its embedding for the codes above."""
    return {_service: get_embeddings_class_options(_service, _CLASS_CODE, _METHOD_CODE)
            for _service in _services}


# Same order as before: persistent entities, application, utility, then the rest.
persistentEntity_services_embeddings: dict = _embed_services(persistentEntity_services)
application_services_embeddings: dict = _embed_services(application_services)
utility_services_embeddings: dict = _embed_services(utility_services)
unclassified_services_embeddings: dict = _embed_services(
    allowed_types_set.difference(utility_services)
                     .difference(persistentEntity_services)
                     .difference(application_services))

# NOTE(review): the key is (0, 8) although the codes passed above are 0 and 1 << 10 -- confirm intent.
embeddings_list_dict[(0, 8)] = [persistentEntity_services_embeddings, application_services_embeddings,
                                utility_services_embeddings, unclassified_services_embeddings]

In [None]:
classes = ['persistentEntity', 'application', 'utility']

# Ground-truth files, one per class. The context managers close the handles
# again (they were previously left open for the rest of the session).
with open("../data/posapplication.txt", "r") as file_app:
    app_text = file_app.read()
with open("../data/posutility.txt", "r") as file_util:
    util_text = file_util.read()
with open("../data/posentity.txt", "r") as file_entity:
    entity_text = file_entity.read()

# [persistent, application, utility, unclassified] embeddings of the first configuration.
_first_code_embeddings_list:list = list(embeddings_list_dict.values())[0]


def _count_true_positives(_services: dict, own_truth_text: str) -> int:
    """Count the entities of `_services` whose fully qualified name occurs in `own_truth_text`.

    Names are matched as substrings of the file content. A name that occurs in
    none of the three truth files is reported on stdout.
    """
    true_positives = 0
    for _out in _services.keys():
        name = entities_dict[_out]['entityFullQualifiedName']
        if name in own_truth_text:
            true_positives += 1
        elif name not in entity_text and name not in app_text and name not in util_text:
            print(f"{name} could not be found in truth files.")
    return true_positives


def _report_class_metrics(label: str, tp: int, fp: int, fn: int) -> tuple:
    """Print precision, recall and F-1 for one class and return (precision, recall)."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print(f"precision {label}: {precision} | recall {label}: {recall} | F-1 measure: {2 * ((precision * recall) / (precision + recall))}")
    return precision, recall


# The "actual" counts are the number of newline characters in each truth file,
# so a last line without a trailing newline is not counted.
_services: dict = _first_code_embeddings_list[2]#classified util
_number_of_classified_util = len(_services)
_number_of_actual_util = util_text.count('\n')
TP_util = _count_true_positives(_services, util_text)

_services: dict = _first_code_embeddings_list[0]#classified pers
_number_of_classified_pers = len(_services)
_number_of_actual_pers = entity_text.count('\n')
TP_pers = _count_true_positives(_services, entity_text)

_services: dict = _first_code_embeddings_list[1]#classified app
_number_of_classified_app = len(_services)
_number_of_actual_app = app_text.count('\n')
TP_app = _count_true_positives(_services, app_text)

if TP_pers + TP_app + TP_util > 0:
    FP_util = _number_of_classified_util - TP_util
    FN_util = _number_of_actual_util - TP_util
    TN_util =  _number_of_actual_app + _number_of_actual_pers - FP_util
    FP_pers = _number_of_classified_pers - TP_pers
    FN_pers = _number_of_actual_pers - TP_pers
    TN_pers = _number_of_actual_app + _number_of_actual_util - FP_pers
    FP_app = _number_of_classified_app - TP_app
    FN_app = _number_of_actual_app - TP_app
    TN_app = _number_of_actual_util + _number_of_actual_pers - FP_app
    _accuracy = (TP_util + TP_app + TP_pers + TN_app + TN_util + TN_pers) / (TP_util + TP_app + TP_pers + TN_app + TN_util + TN_pers + FP_app + FP_pers + FP_util + FN_util + FN_app + FN_pers)
    print(f"accuracy: {_accuracy}")
    # Per-class metrics are skipped for a class without true positives (F-1 would divide by zero).
    if not TP_util == 0:
        _util_precision, _util_recall = _report_class_metrics('util', TP_util, FP_util, FN_util)
    if not TP_app == 0:
        _app_precision, _app_recall = _report_class_metrics('application', TP_app, FP_app, FN_app)
    # Guarded by TP_pers (was TP_app), and the message now names the right class for the recall value.
    if not TP_pers == 0:
        _pers_precision, _pers_recall = _report_class_metrics('persistent entity', TP_pers, FP_pers, FN_pers)
    _total_precision = (TP_util + TP_app + TP_pers) / (TP_util + TP_app + TP_pers + FP_app + FP_util + FP_pers)
    _total_recall = (TP_util + TP_app + TP_pers) / (TP_util + TP_app + TP_pers + FN_app + FN_util + FN_pers)
    print(f"precision total: {_total_precision} | recall total: {_total_recall} | F-1 measure total: {2 * ((_total_precision * _total_recall) / (_total_precision + _total_recall))}")
else: print("Could not get truth.")

## Creating data structures for sklearn with numpy

In [None]:
classes = ['persistentEntity', 'application', 'utility']

def move_embeddings_to_numpy() -> dict:
    """Turn the labelled embeddings into numpy training arrays.

    For every key of `embeddings_list_dict` the first three embedding dicts
    (persistent entity, application, utility, in that order) are flattened into
    parallel lists. The fourth dict (unclassified) is not used here.

    Returns:
        dict mapping each key to a tuple (X, y, X2): X holds the embedding
        vectors moved to the CPU as numpy arrays, y the class labels taken from
        `classes`, and X2 the matching rows of `second_vew_dict_dict`.
    """
    _learn_dict = {}
    for _key, _per_class_embeddings in embeddings_list_dict.items():
        _vectors: list = []
        _second_view: list = []
        _labels: list = []
        # Slot k of the list belongs to classes[k] for k = 0, 1, 2.
        for _class_index in range(3):
            _class_embeddings = _per_class_embeddings[_class_index]
            for _entity_id, _embedding in _class_embeddings.items():
                _labels.append(classes[_class_index])
                _vectors.append(_embedding.cpu().detach().numpy())
                _second_view.append(second_vew_dict_dict.get(_entity_id))
        _learn_dict[_key] = (np.array(_vectors), np.array(_labels), np.array(_second_view))
    return _learn_dict

## Making a pipeline for SVM and fitting(training)

In [None]:
def remove_duplicates(lst):
    """Keep only the first pair for every distinct first element.

    `lst` is an iterable of two-item sequences. The result is a list of tuples
    in their original order; later pairs whose first element was already seen
    are dropped.
    """
    seen_firsts = set()
    unique_pairs = []
    for first, second in lst:
        if first in seen_firsts:
            continue  # a pair with this first element was already kept
        seen_firsts.add(first)
        unique_pairs.append((first, second))
    return unique_pairs

In [None]:
def train_clf(kernel:str,X,y,Z,random_state=None):
    """Fit a scaled SVM on (X, y) and return class probabilities for Z.

    Args:
        kernel: SVC kernel name (the notebook uses 'linear' and 'poly').
        X: training feature matrix.
        y: training labels, one per row of X.
        Z: rows to predict; converted with np.array before prediction.
        random_state: optional seed forwarded to SVC. With probability=True the
            probability calibration shuffles the data, so passing a fixed value
            makes repeated runs reproducible. The default (None) keeps the
            previous behaviour.

    Returns:
        The array returned by predict_proba, one row per row of Z.
        NOTE(review): scikit-learn orders these columns by the fitted
        classifier's classes_ (sorted labels), which need not match the order
        of the notebook's `classes` list -- confirm at the call sites.
    """
    clf = make_pipeline(StandardScaler(),
                        SVC(C=1.0, kernel=kernel, probability=True, random_state=random_state))
    clf.fit(X, y)
    return clf.predict_proba(np.array(Z))

In [None]:
def get_highest_N(L,n)-> list:
    """Locate the `n` largest values of the 2D probability table `L`.

    Returns a list of (row, column) index pairs ordered from the largest value
    downwards. When a row occurs more than once, only its highest-ranked pair
    is kept (see remove_duplicates).
    """
    probabilities = np.array(L)                       # one row per prediction, one column per class
    ascending_flat = probabilities.flatten().argsort()  # flat positions, smallest value first
    top_positions = []
    for flat_index in ascending_flat[-n:][::-1]:      # the n largest, biggest first
        # translate the flat position back into its (row, column) location
        top_positions.append(np.unravel_index(flat_index, probabilities.shape))
    return remove_duplicates(top_positions)

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# ANSI colour escape used when reporting a prediction, keyed by class index
# (0 -> red, 1 -> green, 2 -> blue).
_CLASS_COLOURS = {0: "\x1b[31m", 1: "\x1b[32m", 2: "\x1b[34m"}

# Iteratively label the still-unclassified services. Each round retrains both
# classifiers on everything labelled so far, takes the most confident
# predictions of each, and moves those services into the predicted class.
# NOTE: this mutates the dicts inside embeddings_list_dict in place, so the
# cell is not idempotent when re-run without rebuilding the embeddings.
for _embedding_list_key in embeddings_list_dict.keys():
    _unclassified_services_embeddings = embeddings_list_dict.get(_embedding_list_key)[3]
    _Z = []   # first-view feature vectors of the unclassified services
    _Z2 = []  # second-view feature vectors, kept index-aligned with _Z
    for _unclassified_services_embedding_key in _unclassified_services_embeddings.keys():
        _Z.append(_unclassified_services_embeddings.get(_unclassified_services_embedding_key).cpu().detach().numpy())
        _Z2.append(second_vew_dict_dict.get(_unclassified_services_embedding_key))
    while len(_Z) > 0 and len(_Z2) > 0:
        # rebuild the training arrays: they grow with every service labelled below
        _np_dict = move_embeddings_to_numpy()
        _X, _y, _X2 = _np_dict.get(_embedding_list_key)
        _L2 = train_clf('linear',_X2,_y,_Z2)
        _L = train_clf('poly',_X,_y,_Z) #make predictions with probability
        top_1:list = get_highest_N(_L,5)
        top_2:list = get_highest_N(_L2,5)
        # second-view picks come first, so they win when both views pick the same service
        _combined_top_list:list = top_2 + top_1
        _tuple_list_highest_cleansed = remove_duplicates(_combined_top_list)
        # process the highest list index first so the deletions below do not
        # shift the positions of entries that are still to be handled
        for (list_index, probability_array_index) in sorted(_tuple_list_highest_cleansed,reverse=True):
            _source = _L if (list_index, probability_array_index) in top_1 else _L2
            _probability_array = _source[list_index]
            print(_probability_array)
            _probability_value = _probability_array[probability_array_index]
            if _probability_value < 0.33334:
                # below 1/3 a three-class prediction cannot be the most likely class
                raise Exception("prediction err")

            # id of the service at this position, looked up once per iteration
            _entity_id = list(_unclassified_services_embeddings.keys())[list_index]
            _colour = _CLASS_COLOURS.get(probability_array_index)
            if _colour is not None:
                print(f"unclassified service: {(entities_dict[_entity_id]['entityFullQualifiedName'], _entity_id)} was classified:{_colour} {classes[probability_array_index]}\x1b[0m with probability of: {_probability_value*100}%.")

            _some_services_embeddings = embeddings_list_dict.get(_embedding_list_key)[probability_array_index] #get list of respective class
            _some_services_embeddings[_entity_id] = _unclassified_services_embeddings.get(_entity_id) #map unclassified service to new list
            del _Z[list_index] #remove from results
            del _Z2[list_index]
            del _unclassified_services_embeddings[_entity_id] #remove from unclassified
            print(f"left to classify: {len(_Z),len(_Z2)}.")

In [None]:
classes = ['persistentEntity', 'application', 'utility']

# Ground-truth listings. The number of '\n' characters is used below as the
# number of expected services per class — presumably one fully qualified name
# per line; verify against the data files.
with open("../data/posapplication.txt", "r") as file_app:
    app_text = file_app.read()
with open("../data/posutility.txt", "r") as file_util:
    util_text = file_util.read()
with open("../data/posentity.txt", "r") as file_entity:
    entity_text = file_entity.read()

# only the first embedding configuration is evaluated
_first_code_embeddings_list:list = list(embeddings_list_dict.values())[0]


def _count_true_positives(_services: dict, _own_truth_text: str) -> int:
    """Count the services whose qualified name occurs in ``_own_truth_text``.

    A service whose name occurs in none of the three truth texts is reported.
    NOTE(review): names are matched as substrings, so a name that is a prefix
    of a longer listed name also counts as found.
    """
    _true_positives = 0
    for _out in _services.keys():
        name = entities_dict[_out]['entityFullQualifiedName']
        if name in _own_truth_text:
            _true_positives = _true_positives + 1
        elif name not in entity_text and name not in app_text and name not in util_text:
            print(f"{name} could not be found in truth files.")
    return _true_positives


def _ratio(_numerator, _denominator) -> float:
    """Return ``_numerator / _denominator``, or 0.0 when the denominator is zero."""
    return _numerator / _denominator if _denominator else 0.0


def _f1(_precision, _recall) -> float:
    """Return the F1 measure, or 0.0 when precision and recall are both zero."""
    if _precision + _recall == 0:
        return 0.0
    return 2 * ((_precision * _recall) / (_precision + _recall))


_number_of_classified_util = len(_first_code_embeddings_list[2])
_number_of_actual_util = util_text.count('\n')
TP_util = _count_true_positives(_first_code_embeddings_list[2], util_text)  #classified util

_number_of_classified_pers = len(_first_code_embeddings_list[0])
_number_of_actual_pers = entity_text.count('\n')
TP_pers = _count_true_positives(_first_code_embeddings_list[0], entity_text)  #classified pers

_number_of_classified_app = len(_first_code_embeddings_list[1])
_number_of_actual_app = app_text.count('\n')
TP_app = _count_true_positives(_first_code_embeddings_list[1], app_text)  #classified app

if TP_pers + TP_app + TP_util > 0:
    FP_util = _number_of_classified_util - TP_util
    FN_util = _number_of_actual_util - TP_util
    TN_util =  _number_of_actual_app + _number_of_actual_pers - FP_util
    FP_pers = _number_of_classified_pers - TP_pers
    FN_pers = _number_of_actual_pers - TP_pers
    TN_pers = _number_of_actual_app + _number_of_actual_util - FP_pers
    FP_app = _number_of_classified_app - TP_app
    FN_app = _number_of_actual_app - TP_app
    TN_app = _number_of_actual_util + _number_of_actual_pers - FP_app
    _accuracy = (TP_util + TP_app + TP_pers + TN_app + TN_util + TN_pers) / (TP_util + TP_app + TP_pers + TN_app + TN_util + TN_pers + FP_app + FP_pers + FP_util + FN_util + FN_app + FN_pers)
    print(f"accuracy: {_accuracy}")
    # a class without any true positive now reports 0.0 instead of raising ZeroDivisionError
    _util_precision = _ratio(TP_util, TP_util + FP_util)
    _util_recall = _ratio(TP_util, TP_util + FN_util)
    print(f"precision util: {_util_precision} | recall util: {_util_recall} | F-1 measure: {_f1(_util_precision, _util_recall)}")
    _app_precision = _ratio(TP_app, TP_app + FP_app)
    _app_recall = _ratio(TP_app, TP_app + FN_app)
    print(f"precision application: {_app_precision} | recall application: {_app_recall} | F-1 measure: {_f1(_app_precision, _app_recall)}")
    _pers_precision = _ratio(TP_pers, TP_pers + FP_pers)
    _pers_recall = _ratio(TP_pers, TP_pers + FN_pers)
    # label fixed: this line reports the persistent-entity recall, not the application recall
    print(f"precision persistent entity: {_pers_precision} | recall persistent entity: {_pers_recall} | F-1 measure: {_f1(_pers_precision, _pers_recall)}")
    _total_precision = _ratio(TP_util + TP_app + TP_pers, TP_util + TP_app + TP_pers + FP_app + FP_util + FP_pers)
    _total_recall = _ratio(TP_util + TP_app + TP_pers, TP_util + TP_app + TP_pers + FN_app + FN_util + FN_pers)
    print(f"precision total: {_total_precision} | recall total: {_total_recall} | F-1 measure total: {_f1(_total_precision, _total_recall)}")
else: print("Could not get truth.")

In [None]:
# Merge the classified ids of every embedding configuration into the three
# service sets, printing the whole set each time a new id is added.
for _embedding_list_key in embeddings_list_dict.keys():
    _embedding_groups = embeddings_list_dict.get(_embedding_list_key)
    _persistentEntity_services_embeddings = _embedding_groups[0]    #pers
    _application_services_embeddings = _embedding_groups[1]        #app
    _utility_services_embeddings = _embedding_groups[2]        #util
    #print(embeddings_list_dict.get(_embedding_list_key)[3]) #unclassed
    for _entity in _persistentEntity_services_embeddings:
        if _entity in persistentEntity_services:
            continue
        persistentEntity_services.add(_entity)
        print(f"pers {persistentEntity_services}")
    for _entity in _application_services_embeddings:
        if _entity in application_services:
            continue
        application_services.add(_entity)
        print(f"app {application_services}")
    for _entity in _utility_services_embeddings:
        if _entity in utility_services:
            continue
        utility_services.add(_entity)
        print(f"util {utility_services}")

In [None]:
import networkx as nx

# Weight contributed by one relation of each type between two classes.
RELATION_WEIGHTS = {'CALL': 5, 'INTERFACE': 100, 'INHERITANCE': 100, 'FIELD': 25}

# G: undirected class graph restricted to the allowed entities; parallel
# relations between the same pair of classes are summed into one edge weight.
G = nx.Graph()

for entity in entities:
    if entity['entityId'] not in allowed_types_set:
        continue
    if entity['entityId'] in persistentEntity_services:
        color = "#691313"  #red
        entity_service = "pers"
    elif entity['entityId'] in utility_services:
        color = "#133569"  #blue
        entity_service = "util"
    elif entity['entityId'] in application_services:
        color = "#13692a"  #green
        entity_service = "app"
    else:
        # previously `color` was left unset here: NameError for the first
        # entity, or the colour of the preceding entity for later ones
        color = "#808080"  #grey
        entity_service = "undefined"
        print(f"unexpected entity type {entity['entityId']}.")
    G.add_node(entity['entityId'], entity_service=entity_service, name=entity['entityFullQualifiedName'],
               color=color)

# R starts with G's nodes and receives every considered relation unfiltered and
# unsummed.
# NOTE(review): G.copy() yields an undirected nx.Graph, so R is not a
# MultiDiGraph here even though later cells annotate it as one — confirm intent.
R = G.copy()
for relation in relations:
    source_id = relation['from']
    target_id = relation['to']
    relation_type = relation['relationType']
    base_weight = RELATION_WEIGHTS.get(relation_type)
    if base_weight is None:
        print(f"unconsidered relation type {relation_type}.")
        continue
    R.add_edge(source_id, target_id, weight=base_weight)
    if source_id in G.nodes and target_id in G.nodes:  #filter unwanted nodes
        # accumulate on top of whatever earlier relations already contributed
        old_weight = G.edges[source_id, target_id]['weight'] if G.has_edge(source_id, target_id) else 0
        weight = base_weight + old_weight
        G.add_edge(source_id, target_id, weight=weight)
print(G)

In [None]:
# List every node id of G together with the attributes stored on it.
for node_id, node_attributes in G.nodes(data=True):
    print(f"{node_id:3d} {node_attributes}")

In [None]:
def visualize_graph(_graph: nx.Graph,size=6,iter=50):
    """Draw ``_graph`` with a seeded spring layout and weight-labelled edges.

    Parameters:
        _graph: graph to draw; nodes may carry a ``color`` attribute and every
            edge must carry a ``weight`` attribute.
        size: width and height of the square figure.
        iter: number of spring-layout iterations (the name shadows the builtin
            but is kept so existing keyword callers continue to work).

    Nodes whose ``color`` attribute is missing or empty are drawn grey, so the
    colour list always has exactly one entry per node.
    """
    plt.figure(figsize=(size, size))
    pos = nx.spring_layout(_graph, seed=42, iterations=iter)
    # one colour per node, in node order
    node_colors = [attributes.get("color") or "#808080" for _, attributes in _graph.nodes(data=True)]
    nx.draw_networkx_nodes(_graph, pos, node_color=node_colors, node_size=500)
    nx.draw_networkx_edges(_graph, pos, edge_color="grey")
    nx.draw_networkx_labels(_graph, pos, font_size=9, font_family="sans-serif", font_color="#ffffff")
    edge_labels = {(u, v): d["weight"] for u, v, d in _graph.edges(data=True)}
    nx.draw_networkx_edge_labels(_graph, pos, edge_labels=edge_labels)
    plt.axis("off")
    plt.show()


visualize_graph(G,16,200)

In [None]:
# Partition the nodes of G by their 'entity_service' attribute and build one
# induced subgraph per service role.
_sublist_app: list = []
_sublist_util: list = []
_sublist_pers: list = []
_sublists_by_role = {'app': _sublist_app, 'util': _sublist_util, 'pers': _sublist_pers}
for node in G.nodes():
    _role_members = _sublists_by_role.get(G.nodes[node]['entity_service'])
    if _role_members is None:
        print('unexpected node entity_service type!')
    else:
        _role_members.append(node)

subgraph_app = nx.subgraph(G, _sublist_app)
print(subgraph_app)
subgraph_util = nx.subgraph(G, _sublist_util)
print(subgraph_util)
subgraph_pers = nx.subgraph(G, _sublist_pers)
print(subgraph_pers)
# the order app, pers, util is the order in which clusters are enumerated later
sub_graphs: list = [subgraph_app, subgraph_pers, subgraph_util]
#for graph in sub_graphs:
#    visualize_graph(graph)

In [None]:
# Split every role subgraph into groups: Louvain communities when the subgraph
# has at least one edge, otherwise one group per node.
service_part_graph_list: list = []
for sub_graph in sub_graphs:
    if len(sub_graph.edges) >= 1:
        groupings = nx.community.louvain_communities(sub_graph, seed=42)
    else:
        # no edges: every node is passed on its own as the node selection
        groupings = list(sub_graph.nodes)
    for subset in groupings:
        grouping_graph = nx.subgraph(sub_graph, subset)
        for node_id, node_attributes in grouping_graph.nodes(data=True):
            print(f"{node_id:3d} {G.nodes[node_id]}")
        service_part_graph_list.append(grouping_graph)

In [None]:
# Collapse every group of classes into one node of new_summ_graph and sum the
# weights of the G edges that connect different groups.
new_summ_graph: nx.Graph = nx.Graph()
_appl = 1
_util = 1
_pers = 1
all_edges = G.edges
for i, _service_graph in enumerate(service_part_graph_list):
    name = ''
    color = ''
    contained_nodes: list = list(_service_graph.nodes.keys())
    # roles present in this group, collected once instead of once per branch
    _roles_in_group = set(nx.get_node_attributes(_service_graph, 'entity_service').values())
    if 'app' in _roles_in_group:
        name = f'appl_{_appl}'
        _appl = _appl + 1
        color = "#13692a"  #green
    elif 'pers' in _roles_in_group:
        name = f'pers_{_pers}'
        _pers = _pers + 1
        color = "#691313"  #red
    elif 'util' in _roles_in_group:
        name = f'util_{_util}'
        _util = _util + 1
        color = "#133569"  #blue
    else:
        # was print(print(...)), which additionally printed "None"
        print('unexpected node entity_service type!')
    # Edges of G with exactly one endpoint inside this group. The tuples are
    # taken from G.edges unchanged so both groups touching an edge record the
    # very same tuple and the set union below counts the edge once.
    _inter_service_edges: list = []
    for _node_id in _service_graph:
        for edge in all_edges:
            if _node_id == edge[0] and not edge[1] in _service_graph.nodes:
                _inter_service_edges.append(edge)
            elif _node_id == edge[1] and not edge[0] in _service_graph.nodes:
                _inter_service_edges.append(edge)
    new_summ_graph.add_node(i, name=name, color=color, contained_nodes=contained_nodes,
                            inter_service_edges=_inter_service_edges)

_inter_service_edges: set = set()
for new_node in new_summ_graph.nodes():
    _inter_service_edges.update(new_summ_graph.nodes.get(new_node)['inter_service_edges'])

# class id -> id of the summary node that contains it
_dict_old_to_new: dict = {}
for new_node in new_summ_graph.nodes():
    for old_node in new_summ_graph.nodes.get(new_node)['contained_nodes']:
        _dict_old_to_new[old_node] = new_node

for _inter_service_edge in _inter_service_edges:
    start_node = _dict_old_to_new.get(_inter_service_edge[0])
    end_node = _dict_old_to_new.get(_inter_service_edge[1])
    if start_node is None or end_node is None:
        # an endpoint belongs to no group; add_edge rejects None as a node
        print(f"skipping edge {_inter_service_edge}: an endpoint is not part of any service group.")
        continue
    weight = G.edges.get(_inter_service_edge)['weight']
    old_weight = new_summ_graph.edges[start_node, end_node]['weight'] if new_summ_graph.has_edge(start_node, end_node) else 0
    weight = weight + old_weight
    new_summ_graph.add_edge(start_node, end_node, weight=weight)

print(new_summ_graph)
visualize_graph(new_summ_graph,16,200)

In [None]:
# Largest edge weight of the summary graph (0 when there is no edge) and every
# edge weight expressed as a fraction of it, rounded to five decimals.
_weights_by_edge: dict = {edge: new_summ_graph.edges.get(edge)['weight'] for edge in new_summ_graph.edges()}
heaviest = max([0, *_weights_by_edge.values()])
static_weight_dict: dict = {
    edge: round(_edge_weight / heaviest, 5) for edge, _edge_weight in _weights_by_edge.items()
}
print(static_weight_dict)
print(heaviest)

In [None]:
import re

sorted_text_dict_dict: dict = {}


def get_words_for_cluster(node_id: int, graph: nx.Graph, filter_list: list) -> dict:
    """Count the words found in the source text of all classes of one cluster.

    Parameters:
        node_id: id of the cluster node in ``graph``.
        graph: graph whose node ``node_id`` carries a ``contained_nodes`` list
            of entity ids.
        filter_list: lower-case words to drop from the result.

    Returns:
        A dict mapping each remaining lower-case word to its number of
        occurrences, ordered by descending count.
    """
    from collections import Counter  # local import so the cell does not rely on a later cell

    _code_for_class: int = 24775  #00110000011000111
    # NOTE(review): 218 is 0b000011011010, but the original comment spelled the
    # pattern as 000011011000 (= 216) — confirm which one is intended.
    _code_for_method: int = 218

    # METHOD VALUES (positions listed for _code_for_method):
    #   annotations = 0, annotation_arguments = 1, method_head = 2,
    #   method_modifiers = 3, return_types = 4, method_parameters = 5,
    #   parameter_annotations = 6, parameter_types = 7, parameter_names = 8,
    #   method_body = 9, comments = 10, method_java_doc = 11

    # keyword arguments of get_class; each receives
    # get_position_invert(_code_for_class, <its index in this tuple>)
    _class_options = ('inherited_methods', 'package', 'imports', 'comments', 'java_doc',
                      'instance_head', 'instance_modifiers', 'instance_type', 'instance_name',
                      'instance_extension', 'instance_implementations', 'fields',
                      'field_annotations', 'field_modifiers', 'field_type', 'field_name',
                      'methods')

    _class_texts: list = []
    for contained_node in graph.nodes.get(node_id)['contained_nodes']:
        _selection = {option: get_position_invert(_code_for_class, position)
                      for position, option in enumerate(_class_options)}
        _class_texts.append(get_class(entities_dict[contained_node],
                                      method_code=_code_for_method,
                                      **_selection))
    # every class text is prefixed with one space, as before
    _cluster_text: str = "".join(" " + _class_text for _class_text in _class_texts)

    # keep letters only, then break camelCase / UPPERCASE runs into separate words
    _cluster_text = re.sub(r'[^A-Za-z ]+', ' ', _cluster_text)
    content_list = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', _cluster_text)).split()
    s: set = set(filter_list)
    _kept_words = [word for word in (x.lower() for x in content_list) if word not in s]
    # Counter counts in one pass (list.count per word was quadratic) and keeps
    # the words in first-occurrence order
    my_dict = Counter(_kept_words)
    keys = list(my_dict.keys())
    values = list(my_dict.values())
    sorted_value_index = np.argsort(values)[::-1]
    return {keys[i]: values[i] for i in sorted_value_index}


# Words to ignore when building the vocabulary, one per line of the file.
with open("../data/java_keywords.txt", "r") as file_entity:
    filter_list: list = [line.rstrip() for line in file_entity]

# Build the word table of every cluster once and print the stored result
# (the table was previously computed a second time just for printing).
for node_id in new_summ_graph.nodes():
    sorted_text_dict_dict[node_id] = get_words_for_cluster(node_id, new_summ_graph, filter_list)
    print(sorted_text_dict_dict[node_id])

In [None]:
from numpy import ndarray
import gensim.downloader

google_model = gensim.downloader.load('word2vec-google-news-300')

# For every cluster, fold its words into a running average of their word2vec
# vectors, weighted by how often each word occurs. Words unknown to the model
# are reported and removed from the cluster's word table (in-place mutation of
# sorted_text_dict_dict).
weighted_service_average_vectors_dict: dict = {}
for node in new_summ_graph.nodes():
    _word_counts = sorted_text_dict_dict.get(node)
    vec = np.zeros((300,))
    summ_of_weights = 0
    # iterate over a copy of the keys because entries may be deleted below
    for word in list(_word_counts.keys()):
        multiplicity = _word_counts.get(word)
        #if word in _persistent_word_dict:
        #    multiplicity = multiplicity*_persistent_word_dict.get(word)*2
        try:
            _word_vector = google_model[word]
        except KeyError:
            print(f"\x1b[33m word not found: {word}\x1b[0m")
            del _word_counts[word]
            continue
        vec = ((_word_vector * multiplicity) + (vec * summ_of_weights)) / (multiplicity + summ_of_weights)
        summ_of_weights = summ_of_weights + multiplicity
    weighted_service_average_vectors_dict[node] = vec

In [None]:
from scipy import spatial

# Cosine similarity (1 - cosine distance) between the averaged word vectors of
# the two clusters joined by each summary-graph edge, stored under the edge tuple.
linguistic_cosine_similarity_edges: dict = {}
_temp_done_list: list = []
for edge in new_summ_graph.edges():
    _first_cluster, _second_cluster = edge[0], edge[1]
    _first_vector = weighted_service_average_vectors_dict.get(_first_cluster)
    _second_vector = weighted_service_average_vectors_dict.get(_second_cluster)
    distance = 1 - spatial.distance.cosine(_first_vector, _second_vector)
    print(f"for {edge[0]} and {edge[1]} the cosine lingual distance is {distance} ")
    linguistic_cosine_similarity_edges[edge] = distance
# Disabled variant that also scores cluster pairs without a connecting edge:
#for node_1 in new_summ_graph.nodes():
#    for node_2 in new_summ_graph.nodes():
#        if not (node_1, node_2) in linguistic_cosine_similarity_edges.keys() and not (node_2, node_1) in linguistic_cosine_similarity_edges.keys() and not node_1 == node_2:
#            distance = 1 - spatial.distance.cosine(weighted_service_average_vectors_dict.get(node_1), weighted_service_average_vectors_dict.get(node_2))
#            print(f"for {node_1} and {node_2} the cosine lingual distance is {distance} ")
#            linguistic_cosine_similarity_edges[(node_1,node_2)] = distance

In [None]:
import decimal

def drange(x, y, jump) -> list:
    """Return evenly spaced floats starting at ``x`` while the value is below ``y``.

    The stepping is done with ``decimal.Decimal`` so that repeated addition of
    ``jump`` does not accumulate binary floating-point error.

    Parameters:
        x: start value; always the first element of the result.
        y: upper bound; stepping stops once the exact Decimal value is no
            longer below it.
        jump: step size, given as anything ``decimal.Decimal`` accepts
            (a string such as ``'0.01'`` keeps it exact).

    Returns:
        ``[float(x), x + jump, x + 2 * jump, ...]`` with every stepped value
        rounded to 9 decimals. The start value appears exactly once.

    Raises:
        ValueError: if ``x`` is below ``y`` and ``jump`` is not positive,
            which would never terminate.
    """
    step = decimal.Decimal(jump)
    start = decimal.Decimal(x)
    if start < y and step <= 0:
        raise ValueError("jump must be positive")
    result: list = [float(x)]
    # begin one step after the start: the start itself is already in the list
    current = start + step
    while current < y:
        result.append(round(float(current), 9))
        current += step
    return result

l = list(drange(0.0, 1.01, '0.01'))

In [None]:
# Histogram of the linguistic similarity values over the bin edges in `l`.
fig, ax = plt.subplots()
ax.hist(list(linguistic_cosine_similarity_edges.values()), bins=l)
ax.set_xlabel('w(i,j)')
ax.set_ylabel('occurrence')
plt.show()

In [None]:
# Summary nodes whose name contains 'appl' act as the service centres.
centers: list = [
    node for node in new_summ_graph.nodes()
    if 'appl' in new_summ_graph.nodes.get(node)['name']
]
print(centers)
# adjacency matrix of the summary graph, shown scaled by the heaviest edge
A = nx.to_numpy_array(new_summ_graph)
print(A.shape)
print(A / heaviest)

In [None]:
def get_weight_of_pair(i: int, j: int, alpha: float = 1.0, beta: float = 1.0) -> float:
    """Return the combined static and linguistic weight between clusters ``i`` and ``j``.

    The static weight is ``A[(i, j)] / heaviest``; the linguistic weight is
    read from ``linguistic_cosine_similarity_edges`` under either key order
    (0 when the pair has no entry). The result is their weighted mean
    ``(alpha * static + beta * lingual) / (alpha + beta)``.

    Parameters:
        i, j: cluster ids.
        alpha: weight of the static part.
        beta: weight of the linguistic part.

    Returns:
        1.0 when ``i == j``, otherwise the weighted mean described above.
    """
    if i == j:
        return 1.0
    weight_static = A[(i, j)] / heaviest
    weight_lingual = (linguistic_cosine_similarity_edges.get((i, j), 0)
                      + linguistic_cosine_similarity_edges.get((j, i), 0))
    # the whole weighted sum is divided; previously only the linguistic term was
    return ((alpha * weight_static) + (beta * weight_lingual)) / (alpha + beta)

In [None]:
# Membership of every non-application cluster in every centre: the squared
# share of the pair weight in the sum of the cluster's weights to all centres.
a: float = 0.7
b: float = 0.3
# MS is indexed directly by node id and centre id, so it relies on every
# centre id being smaller than len(centers).
MS = np.zeros((len(new_summ_graph.nodes()), len(centers)))
for center_id in centers:
    for node_id in new_summ_graph.nodes():
        _pair_weight = get_weight_of_pair(center_id, node_id, a, b)
        if _pair_weight == 0 or 'app' in new_summ_graph.nodes.get(node_id)['name']:
            MS[node_id, center_id] = 0
            continue
        # explicit accumulation keeps the floating-point summation order unchanged
        summ: float = 0.0
        for c in centers:
            summ = summ + get_weight_of_pair(c, node_id, a, b)
        MS[node_id, center_id] = round(pow((_pair_weight / summ), 2), 6)
print(MS)

In [None]:
# One star-shaped graph per centre: the centre cluster plus every cluster whose
# membership value for that centre exceeds 0.02.
final_services: dict = {}
for center_id in centers:
    _service_graph = nx.Graph()
    _center_attributes = new_summ_graph.nodes.get(center_id)
    #add center application cluster
    _service_graph.add_node(center_id,
                            name=_center_attributes['name'],
                            color=_center_attributes['color'],
                            contained_nodes=_center_attributes['contained_nodes'],
                            inter_service_edges=_center_attributes['inter_service_edges'])
    for node_id in new_summ_graph.nodes():
        if MS[node_id, center_id] > 0.02:
            _member_attributes = new_summ_graph.nodes.get(node_id)
            _service_graph.add_node(node_id,
                                    name=_member_attributes['name'],
                                    color=_member_attributes['color'],
                                    contained_nodes=_member_attributes['contained_nodes'],
                                    inter_service_edges=_member_attributes['inter_service_edges'])
            _service_graph.add_edge(node_id, center_id, weight=MS[node_id, center_id])
    final_services[_center_attributes['name']] = _service_graph

# Count in how many final services each class ends up while printing their content.
all_classes_used: dict = {}
for _service_index, _final_service in enumerate(final_services.values()):
    print(f"{_service_index}.")
    for _cluster_id in _final_service.nodes():
        print(_cluster_id,_final_service.nodes.get(_cluster_id)['name'])
        for _class_id in new_summ_graph.nodes.get(_cluster_id)['contained_nodes']:
            print(G.nodes.get(_class_id)['name'])
            all_classes_used[_class_id] = all_classes_used.get(_class_id, 0) + 1
    visualize_graph(_final_service)

In [None]:
# Report for every allowed class in how many final services it is used;
# classes used in none are listed with their edge count and total edge weight in G.
multiple_used_aplication: list = []
for i in allowed_types_set:
    if i not in all_classes_used:
        _numbers = G.edges(i)
        _edges: list = [G.edges.get(c) for c in _numbers]
        _weight = sum(x["weight"] for x in _edges)
        print(
            f"\x1b[31m\033[01m\033[04m{G.nodes.get(i)['name']} was not used in any service and has {len(_edges)} edges originally with a weight {_weight} !\x1b[0m")  #print it red
        continue

    if all_classes_used[i] > 1 and i in application_services:
        # application classes shared between services are collected separately
        print(f"\x1b[33m{G.nodes.get(i)['name']} was used in {all_classes_used[i]} services.\x1b[0m")
        multiple_used_aplication.append(i)
        continue

    _role = G.nodes.get(i)['entity_service']
    if _role == "util":
        print(f"\x1b[34m{G.nodes.get(i)['name']}\x1b[0m was used in {all_classes_used[i]} services.")
    elif _role == "pers":
        print(f"\x1b[31m{G.nodes.get(i)['name']}\x1b[0m was used in {all_classes_used[i]} services.")
    elif _role == "app":
        print(f"\x1b[32m{G.nodes.get(i)['name']}\x1b[0m was used in {all_classes_used[i]} services.")
    else:
        print(f"{G.nodes.get(i)['name']} was used in {all_classes_used[i]} services.")

In [None]:
from collections import Counter
from networkx import NetworkXError, MultiDiGraph
# Build one detailed graph per final service and a contracted overview graph in
# which every service candidate is merged into a single node named "MS<index>".
yellow = "#B2B200"
# NOTE(review): R was last assigned via G.copy() in the graph-building cell, so
# on a fresh run this copy is an undirected nx.Graph despite the annotation.
_resulting_ms_graph:MultiDiGraph = R.copy()
_final_services_detailed:dict = {}
# total internal edge weight per candidate, keyed "MS<index>"
_internal_weights:dict = {}
# filled by the metrics code after this block
_external_weights:dict = {}
# one {reference node: "MS<index>"} mapping per candidate, applied after the loop
_rename_mappings:list = []
# class-name set of every candidate, compared with the ground truth later
_ms_sets:list = []
# per candidate: number of member classes counted as used from outside (see below)
_used_node_from_outer_scope:dict = {}
for o,i in enumerate(final_services.values()):
    print(f"Service Candidate: {o}.")
    _new_ms_candidate_set = set()
    _new_ms_names_set = set()
    _used_node_from_outer_scope[o] = 0
    _reference_node = None
    for c in i.nodes():
        print(c,i.nodes.get(c)['name'])
        # the first class of the 'appl' cluster becomes the node all others are merged into
        if 'appl' in i.nodes.get(c)['name']:
            _reference_node = new_summ_graph.nodes.get(c)['contained_nodes'][0]
        for n in new_summ_graph.nodes.get(c)['contained_nodes']:
            print(R.nodes.get(n)['name'])
            _new_ms_candidate_set.add(n)
            _new_ms_names_set.add(G.nodes.get(n)['name'])
    # Count member classes that have an incoming edge from outside the candidate
    # and list 'javafx.fxml.Initializable' in their 'interfaces' entry; each
    # such class is counted once.
    # R is rebound to a MultiDiGraph here on every iteration (and stays one for
    # later cells), so re-running this cell starts from a different R.
    R:MultiDiGraph = nx.MultiDiGraph(R)
    _already_exposed:set = set()
    for _inner_node in _new_ms_candidate_set:
        for _in_edge in R.in_edges(_inner_node):
            if not _inner_node in _already_exposed and not set(_in_edge).difference(_new_ms_candidate_set) == set() and 'javafx.fxml.Initializable' in entities_dict.get(_inner_node)['interfaces']:
                _used_node_from_outer_scope[o] = _used_node_from_outer_scope[o] + 1
                _already_exposed.add(_inner_node)

    _ms_sets.append(_new_ms_names_set.copy())
    # detailed graph of this candidate: induced subgraph of R without self-loops
    _temp_copy_graph = nx.subgraph(R, _new_ms_candidate_set)
    _temp_copy_graph = nx.MultiDiGraph(_temp_copy_graph)
    _temp_copy_graph.remove_edges_from(nx.selfloop_edges(_temp_copy_graph))
    _final_services_detailed[o] = _temp_copy_graph
    visualize_graph(_final_services_detailed.get(o),8,50)
    _internal_weights[f"MS{o}"] = _final_services_detailed.get(o).size(weight="weight")
    # NOTE(review): raises KeyError if no 'appl' cluster was found (_reference_node is None)
    _new_ms_candidate_set.remove(_reference_node)
    # merge every remaining member into the reference node of the overview graph;
    # contractions that raise NetworkXError are skipped
    while len(_new_ms_candidate_set) >= 1:
        try:
            _resulting_ms_graph = nx.contracted_nodes(_resulting_ms_graph, _reference_node, _new_ms_candidate_set.pop())
        except NetworkXError:
            continue
    _resulting_ms_graph.nodes.get(_reference_node)['color'] = yellow
    _rename_mappings.append({_reference_node : f"MS{o}"})

# relabel the merged reference nodes only now, so the loop above can keep
# addressing them by their original ids
while len(_rename_mappings) >= 1:
    _resulting_ms_graph = nx.relabel_nodes(_resulting_ms_graph, _rename_mappings.pop())
_resulting_ms_graph.remove_edges_from(nx.selfloop_edges(_resulting_ms_graph))
_resulting_ms_graph:MultiDiGraph = MultiDiGraph(_resulting_ms_graph)

print(f"Representative call graph for all forgotten nodes and ms candidates (yellow).")
visualize_graph(_resulting_ms_graph,10,80)

# Recall and precision against the ground-truth services.
# Each non-empty line of the file is one ground-truth service: a
# comma-separated list of class names.
_bounded_sets:list = []
_set_mappings:dict = {}
_ms_recall:dict = {}
_ms_precision:dict = {}
with open("../data/service_candidates_GT.txt", "r") as file_MS_GT:
    for line in file_MS_GT:
        if not line.strip():
            # a blank line would otherwise become a phantom service {''}
            continue
        _bounded_context_list:list = line.strip('\n').split(',')
        _bounded_sets.append(set(_bounded_context_list))
# map every candidate to the ground-truth service with the best recall score
for s,_ms in enumerate(_ms_sets):
    _max_value = 0
    for b,_bs in enumerate(_bounded_sets):
        _score = len(_bs)/(len(_bs)+len(set(_bs).difference(_ms)))
        if _score > _max_value:
            _max_value = _score
            _set_mappings[s] = b
    _ms_recall[s] = _max_value
#print(_set_mappings)
for map_key in _set_mappings.keys():
    _mapped_bounded_set = _bounded_sets[_set_mappings.get(map_key)]
    _ms_precision[map_key] = (len(_mapped_bounded_set)/len(set(_mapped_bounded_set).union(_ms_sets[map_key])))

# an empty mapping yields 0.0 instead of a ZeroDivisionError
_average_precision = sum(_ms_precision.values())/len(_ms_precision) if _ms_precision else 0.0
_average_recall = sum(_ms_recall.values())/len(_ms_recall) if _ms_recall else 0.0
_more_ms = len(_ms_sets)-len(_bounded_sets)
# ground-truth services that more than one candidate was mapped to
_multi_uses = list(filter(lambda x: (x > 1),Counter(_set_mappings.values()).values()))
_actual_multiple_uses = sum(_multi_uses)-len(_multi_uses)
_non_used_bs = len(_bounded_sets)-len(Counter(_set_mappings.values()))
#for map in _set_mappings.keys():
#    print(map,_ms_recall.get(map)+_ms_precision.get(map))


# gathering external weights: weighted out-degree of every node of the contracted graph
for _node in _resulting_ms_graph.nodes():
    _external_weights[_node] = float(_resulting_ms_graph.out_degree(_node, weight="weight"))
#print(_internal_weights,_external_weights)
# nodes that were not merged into any "MS<index>" candidate
_list_of_independent_nodes = list(filter(lambda e: 'MS' not in str(e),_resulting_ms_graph.nodes()))
_list_of_independent_interfaces = list(filter(lambda e: 'javafx.fxml.Initializable' in entities_dict.get(e)['interfaces'],_list_of_independent_nodes))
print(f"IFN: {round((sum(_used_node_from_outer_scope.values())+len(_list_of_independent_interfaces)) / (len(_used_node_from_outer_scope.values())+len(_list_of_independent_nodes)),4)}")
# cohesion of a candidate = internal edge weight / external (outgoing) edge weight
# NOTE(review): raises if a candidate has no outgoing weight or no node in the
# contracted graph — decide how such candidates should be scored.
_cohesion_dict:dict = {}
for inner_key in _internal_weights.keys():
    #print(_external_weights.get(inner_key),_internal_weights.get(inner_key))
    _cohesion_dict[inner_key] = _internal_weights.get(inner_key)/_external_weights.get(inner_key)
#print(_cohesion_dict)
print(f"average cohesion is: {round(sum(list(_cohesion_dict.values()))/len(list(_cohesion_dict.values())),4)}")
print(f"average coupling is: {round(sum(list(_external_weights.values()))/len(list(_external_weights.values())),4)}")
print(f"average precision is: {_average_precision} compared to Ground Truth services.")
print(f"average recall is: {_average_recall} compared to Ground Truth services.")
print(f"there were: {_more_ms} more microservice candidates identified than anticipated.\n"
      f"{len(_multi_uses)} Ground Truth services were found {_actual_multiple_uses} times too often, and {_non_used_bs} Ground Truth services were not found.")