In [None]:
import xml.etree.ElementTree as ET
import sys
import os
import gensim
import re

In [None]:
# An entity is either a class-like element (typ 0) or a method (typ 1)
class Entity:
    """Record for one extracted entity and the text fragments attached to it.

    The constructor stores ``id``, ``pid``, ``name`` and ``typ`` unchanged and
    starts ``content`` as an empty list.  ``typ`` is a numeric tag: 0 for a
    class, 1 for a method.
    """

    def __init__(self, id, pid, name, typ):
        self.id, self.pid, self.name = id, pid, name
        # Every instance gets its own list, filled through add_content().
        self.content = []
        self.typ = typ

    def add_content(self, content):
        """Append one fragment to the end of this entity's content list."""
        self.content.append(content)

In [None]:
def custom_replace(text):
    """Return *text* with angle brackets swapped for curly braces.

    The result is used as a file name, where '<' and '>' are not allowed,
    so '<' becomes '{' and '>' becomes '}'.  All other characters are left
    untouched.
    """
    bracket_table = str.maketrans('<>', '{}')
    return text.translate(bracket_table)

In [None]:
def camel_case_split(text):
    """Insert a space at every camel-case boundary in *text*.

    A boundary is either the position between a lowercase letter and an
    uppercase letter ("camelCase" -> "camel Case"), or the position between
    a run of uppercase letters and a following capitalised word
    ("HTTPResponse" -> "HTTP Response").

    Args:
        text: The string to split; may span several lines.

    Returns:
        The input with a single space inserted at each boundary.  Every
        other character, including newlines, is kept as is.
    """
    # Substituting at the zero-width boundaries, instead of matching the
    # segments between them with '.', keeps multi-line input intact: '.'
    # does not match a newline, so segment matching loses the text that
    # sits right before a line break.
    return re.sub(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', text)

In [None]:
# One or more consecutive characters that are neither whitespace nor word
# characters, with the underscore counted as a separator as well.
# Raw string: '\s' and '\w' are regex classes, not string escapes.
pattern = re.compile(r'([^\s\w]|_)+')

def content_pre_processing(text):
    """Normalise raw entity text into a space-separated bag of words.

    Steps, in order:
      1. split camel-case identifiers into separate words;
      2. replace runs of punctuation/underscores with a space;
      3. collapse whitespace and lower-case everything;
      4. delete digit characters (the letters around a digit are joined);
      5. drop English stop words and tokens of fewer than 3 characters.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # Look the stop-word set up once instead of once per token.
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    # Split the tokens based on camel case, underscores, and non-letters
    spaced = pattern.sub(' ', camel_case_split(text))

    # Remove extra space characters and convert all to lower case
    normalised = ' '.join(spaced.split()).lower()

    # Remove numbers
    letters_only = ''.join(ch for ch in normalised if not ch.isdigit())

    # Remove English stop words and tokens shorter than 3 characters
    kept = [
        token for token in letters_only.split()
        if token not in stopwords and len(token) > 2
    ]

    return ' '.join(kept)

In [None]:
def extract_corpus(file, root_folder):
    """Write one pre-processed text file per class/method entity of an XML file.

    The XML is read in two passes: every ``<jent>`` element of type
    ``method``, ``class``, ``interface`` or ``enum`` becomes an ``Entity``
    keyed by its ``id`` attribute, then the text of every ``<jtok>`` element
    is attached to the entity named by its ``eid`` attribute.  Finally the
    collected text of each entity is cleaned with ``content_pre_processing``
    and written to ``<root_folder>/<entity name>.txt`` (UTF-8).

    Entities that end up with the same name share one output path, so the
    one written last overwrites the earlier ones.

    Args:
        file: Path (or file object) handed to ``ET.parse``.
        root_folder: Directory receiving the ``.txt`` files; it is created
            when it does not exist yet.

    Raises:
        KeyError: If a matching ``<jent>`` lacks ``id``/``pid`` or a
            ``<jtok>`` lacks ``eid``.
    """
    class_like_types = ('class', 'interface', 'enum')
    entities_dict = {}
    root = ET.parse(file).getroot()

    # Extract root entities: Classes and Methods identifications
    for entity in root.iter('jent'):
        kind = entity.attrib['type']
        if kind == 'method':
            typ = 1
        elif kind in class_like_types:
            typ = 0
        else:
            continue  # other entity types are not part of the corpus

        entity_id = entity.attrib['id']
        parent_id = entity.attrib['pid']
        # NOTE(review): a <jent> without text passes None to custom_replace
        # and fails there -- confirm the input always carries a name.
        name = custom_replace(entity.text)
        entities_dict[entity_id] = Entity(entity_id, parent_id, name, typ)
        print(entity_id, name)

    # Add the content for each token that references the entities above
    for token in root.iter('jtok'):
        owner = entities_dict.get(token.attrib['eid'])
        # An element without text has text None, which would break the
        # str.join below, so such tokens are skipped.
        if owner is not None and token.text is not None:
            owner.add_content(token.text)

    # Make sure the target folder exists before writing into it.
    if root_folder:
        os.makedirs(root_folder, exist_ok=True)

    # Write the dictionary to the corpus files (txt)
    for item in entities_dict.values():
        full_path = os.path.join(root_folder, item.name + '.txt')
        content = content_pre_processing(' '.join(item.content))
        # The context manager closes the file even if the write fails.
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(content)

    sys.stdout.flush()

In [None]:
# Names of the configuration directories to process.
directories_to_process = [
    'RandomConfig00001',
    'RandomConfig00002',
    'RandomConfig00003',
    'RandomConfig00004',
    'RandomConfig00005',
]
directories_to_process

In [None]:
# For each configured directory, run extract_corpus on every .jdoc file
# found in its JDOC sub-folder, writing the results to its TEXT sub-folder.
for directory in directories_to_process:
    print('Processing Directory: ', directory)
    jdoc_path, text_path = (
        os.path.join(directory, sub_folder) for sub_folder in ('JDOC', 'TEXT')
    )
    # Only entries whose name ends in '.jdoc' are processed.
    doc_labels = [
        entry for entry in os.listdir(jdoc_path) if entry.endswith('.jdoc')
    ]
    for doc in doc_labels:
        file_path = os.path.join(jdoc_path, doc)
        extract_corpus(file_path, text_path)