In [None]:
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import corpora
from gensim import models
from gensim import similarities
import os

In [None]:
def build_lsi(data, num_topics):
    """Build a gensim dictionary, bag-of-words corpus and LSI model.

    Args:
        data: Iterable of tokenised documents (each document is a list of
            string tokens). It is iterated twice.
        num_topics: Number of latent topics requested from the LSI model.

    Returns:
        A ``(dictionary, corpus, lsi)`` tuple where ``corpus`` is the raw
        bag-of-words corpus and ``lsi`` is the model fitted on the
        TF-IDF-weighted version of that corpus.

    NOTE(review): the returned corpus is the unweighted bag-of-words one,
    while the model is trained on TF-IDF vectors. Callers that project the
    returned corpus through ``lsi`` feed it raw counts -- confirm this is
    intended.
    """
    dictionary = corpora.Dictionary(data)
    corpus = list(map(dictionary.doc2bow, data))

    # Weight the raw counts with TF-IDF before fitting the LSI model.
    tfidf_model = models.TfidfModel(corpus)
    weighted_corpus = tfidf_model[corpus]

    lsi = models.LsiModel(
        weighted_corpus,
        id2word=dictionary,
        num_topics=num_topics,
    )

    return dictionary, corpus, lsi

In [None]:
def get_most_similar_documents(query, indexer):
    """Rank every indexed document by its similarity to ``query``.

    Args:
        query: Query vector, expressed in the same space as ``indexer``.
        indexer: Corpus handed to ``similarities.MatrixSimilarity`` to build
            the similarity index.

    Returns:
        A list of ``(document_position, similarity)`` pairs ordered from the
        highest similarity to the lowest.
    """
    # Build the similarity index over the supplied corpus.
    similarity_index = similarities.MatrixSimilarity(indexer)

    # Score the query against every indexed document, then order best-first.
    ranked = list(enumerate(similarity_index[query]))
    ranked.sort(key=lambda pair: -pair[1])

    return ranked

In [None]:
def feature_location(dictionary, corpus, lsi, output_path, rank_sizes):
    """Rank the corpus against each predefined feature query and write the rankings.

    For every query and every entry of ``rank_sizes`` a file
    ``<output_path>/<size>/<NAME>.txt`` is written, where ``<NAME>`` comes
    from the notebook-level ``name_converter`` mapping. Each line of the file
    is one ranked entity, best match first.

    This function reads two notebook-level globals: ``name_converter``
    (query key -> output file stem) and ``docLabels`` (document position ->
    source file name).

    Args:
        dictionary: gensim dictionary used to turn a query into a
            bag-of-words vector.
        corpus: Corpus whose documents are ranked; it is projected through
            ``lsi`` before indexing.
        lsi: Model used to project both the queries and ``corpus``.
        output_path: Root directory for the result files. Missing
            ``<output_path>/<size>`` directories are created.
        rank_sizes: Iterable of cut-offs. ``0`` means "the full ranking".
            Entities containing ``$`` are skipped after the cut-off is
            applied, so a file may hold fewer than ``size`` lines.
    """
    queries = [
        ['state', 'diagram'],
        ['activity', 'diagram'],
        ['use', 'case', 'diagram'],
        ['collaboration', 'diagram'],
        ['deployment', 'diagram'],
        ['sequence', 'diagram'],
        ['cognitive', 'support'],
        ['logging'],
    ]

    # The projected corpus and its similarity index do not depend on the
    # query, so build them once instead of once per query.
    index = similarities.MatrixSimilarity(lsi[corpus])

    for q in queries:
        # Convert the query to LSI space.
        vec_lsi = lsi[dictionary.doc2bow(q)]

        # (document position, similarity) pairs, most similar first.
        sims_rank = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])

        name_result = name_converter['_'.join(q)] + '.txt'

        for size in rank_sizes:
            print("Processing: R: ", size)

            # Create the per-size directory on demand so a fresh run does not
            # fail with FileNotFoundError.
            rank_dir = os.path.join(output_path, str(size))
            os.makedirs(rank_dir, exist_ok=True)
            dir_path = os.path.join(rank_dir, name_result)

            # A size of 0 selects the whole ranking.
            top_ranked = sims_rank if size == 0 else sims_rank[:size]

            # The context manager closes the file even if a lookup below raises.
            with open(dir_path, 'w') as out_file:
                for doc_position, _similarity in top_ranked:
                    entity = docLabels[doc_position].replace('.txt', '')

                    # Remove Refinement of Inner Methods
                    if '$' in entity:
                        continue

                    # Replace because the naming file restrictions
                    entity = entity.replace('{', '<').replace('}', '>')

                    if '(' in entity:
                        # Method result: the last '.' becomes a space. Joining
                        # the split parts also copes with an entity that has
                        # no '.' at all (previously an IndexError).
                        out_file.write(' '.join(entity.rsplit('.', 1)) + '\n')
                    else:
                        # Class result
                        out_file.write(entity + '\n')

In [None]:
# Maps the underscore-joined query terms to the stem of the result file name.
name_converter = {
    'state_diagram': 'STATEDIAGRAM',
    'activity_diagram': 'ACTIVITYDIAGRAM',
    'use_case_diagram': 'USECASEDIAGRAM',
    'collaboration_diagram': 'COLLABORATIONDIAGRAM',
    'deployment_diagram': 'DEPLOYMENTDIAGRAM',
    'sequence_diagram': 'SEQUENCEDIAGRAM',
    'cognitive_support': 'COGNITIVE',
    'logging': 'LOGGING',
}

In [None]:
# Configuration directories to process, in processing order.
directories_to_process = [
    'RandomConfig00001',
    'RandomConfig00002',
    'RandomConfig00003',
    'RandomConfig00004',
    'RandomConfig00005',
]
directories_to_process

In [None]:
# Every configuration is evaluated for each topic count and each rank cut-off.
num_topics_comb = [100,200,300,400,500]
rank_size_comb = [10, 100, 1000, 0] #0 == Full rank

for config in directories_to_process:
    print("Processing:", config)
    text_path = os.path.join(config, 'TEXT')

    # docLabels is read as a global by feature_location to map a document
    # position back to its file name, so it must keep this name and must stay
    # in the same order as `data` below.
    docLabels = [name for name in os.listdir(text_path) if name.endswith('.txt')]
    data = []

    for doc in docLabels:
        full_path = os.path.join(text_path, doc)
        # Use a context manager so each input file is closed right after it
        # is read instead of leaking one handle per document.
        with open(full_path, 'r') as f:
            data.append(f.read())

    # Whitespace tokenisation: one token list per document.
    corp = [d.split() for d in data]

    for n_comb in num_topics_comb:
        print("Processing: N: ", n_comb)
        dictionary, corpus, lsi = build_lsi(corp, n_comb)
        out_path = os.path.join(config, 'RESULTS', 'LSI', str(n_comb))
        feature_location(dictionary, corpus, lsi, out_path, rank_size_comb)