In [None]:
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim import corpora
from gensim import models
from gensim import similarities
import numpy as np
from scipy.stats import entropy

import os
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
def train_lda(data, num_topics, alpha=1e-2, eta=0.5e-2, chunksize=300,
              passes=2, random_state=None):
    """Build a dictionary and bag-of-words corpus from ``data`` and train LDA on it.

    Parameters
    ----------
    data : iterable of list of str
        Tokenized documents; each one is passed to ``Dictionary`` and then
        to ``doc2bow``, so it is iterated twice and must not be a one-shot
        generator.
    num_topics : int
        Number of topics requested from ``LdaModel``.
    alpha, eta : float, optional
        Priors forwarded to ``LdaModel`` (defaults: the values this notebook
        has always used).
    chunksize : int, optional
        Forwarded to ``LdaModel`` as ``chunksize``.
    passes : int, optional
        Forwarded to ``LdaModel`` as ``passes``.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``LdaModel``. Pass a fixed value to make training
        repeatable; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(dictionary, corpus, lda)``: the ``Dictionary``, the list of
        bag-of-words vectors, and the trained ``LdaModel``.
    """
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(doc) for doc in data]
    lda = LdaModel(corpus=corpus,
                   num_topics=num_topics,
                   id2word=dictionary,
                   alpha=alpha,
                   eta=eta,
                   chunksize=chunksize,
                   # Kept at 0.0 so per-document topic lists are as complete
                   # as gensim allows; downstream code compares them as
                   # vectors.
                   minimum_probability=0.0,
                   passes=passes,
                   random_state=random_state)

    return dictionary, corpus, lda

In [None]:
# Jensen-Shannon distance (square root of the Jensen-Shannon divergence)
def jensen_shannon(query, matrix):
    """Return the Jensen-Shannon distance from ``query`` to every row of ``matrix``.

    ``query`` is indexed as a 1-D array and turned into a column so that it
    broadcasts against the transposed ``matrix``. ``entropy`` is called with
    two arguments (relative entropy), once for the query and once for the
    rows, each against their element-wise midpoint. The result holds one
    value per row of ``matrix``: the square root of half the sum of the two
    relative entropies.
    """
    query_column = query[None, :].T
    candidates = matrix.T
    midpoint = 0.5 * (query_column + candidates)

    kl_query = entropy(query_column, midpoint)
    kl_candidates = entropy(candidates, midpoint)
    return np.sqrt(0.5 * (kl_query + kl_candidates))

In [None]:
def get_most_similar_documents(query, matrix, rank_size):
    """Return the indices of the ``rank_size`` rows of ``matrix`` closest to ``query``.

    Closeness is whatever ``jensen_shannon`` returns (smaller is closer);
    the indices come back ordered from closest to farthest.
    """
    distances = jensen_shannon(query, matrix)
    ranking = distances.argsort()
    return ranking[:rank_size]

In [None]:
def _dense_topic_vector(topic_pairs, num_topics):
    """Expand sparse ``(topic_id, probability)`` pairs into a dense vector."""
    dense = np.zeros(num_topics)
    for topic_id, probability in topic_pairs:
        dense[topic_id] = probability
    return dense


def _format_entity(label):
    """Turn a document label into the line written to a result file, or ``None`` to skip it."""
    entity = label.replace('.txt', '')

    # Remove inner methods
    if '$' in entity:
        return None

    # '{' / '}' stand in for '<' / '>' because of file-naming restrictions
    entity = entity.replace('{', '<').replace('}', '>')

    # Method result: the last '.' is replaced by a space. A label with '('
    # but no '.' is returned unchanged instead of raising.
    if '(' in entity:
        return ' '.join(entity.rsplit('.', 1))

    # Class result
    return entity


def feature_location(dictionary, corpus, lda, output_path, rank_sizes):
    """Rank all corpus documents against a fixed set of queries and write the rankings.

    For every built-in query the documents are ordered by
    ``get_most_similar_documents`` and, for every entry of ``rank_sizes``,
    the top entries are written to
    ``<output_path>/<size>/<NAME>.txt`` (one entity per line).

    Parameters
    ----------
    dictionary : object providing ``doc2bow``
        Used to convert each query's tokens to a bag-of-words vector.
    corpus : iterable
        Bag-of-words corpus; ``lda[corpus]`` is evaluated once to obtain the
        per-document topic distributions.
    lda : trained LDA model
        Must provide ``num_topics``, ``get_document_topics`` and indexing by
        a corpus.
    output_path : str
        Root directory for the results; one sub-directory per rank size is
        created if it does not exist yet.
    rank_sizes : iterable of int
        Number of top results to write; ``0`` means the full ranking.

    Notes
    -----
    Reads two module-level names: ``name_converter`` (query key -> output
    file stem) and ``docLabels`` (document index -> label). Both must be
    defined before this function is called.
    """
    queries = [
        ['state', 'diagram'],
        ['activity', 'diagram'],
        ['use', 'case', 'diagram'],
        ['collaboration', 'diagram'],
        ['deployment', 'diagram'],
        ['sequence', 'diagram'],
        ['cognitive', 'support'],
        ['logging'],
    ]

    num_topics = lda.num_topics

    # The document-topic matrix does not depend on the query, so it is
    # inferred once instead of once per query. Vectors are filled by topic id
    # so rows stay aligned even if the model omits a topic from its sparse
    # output for some document.
    doc_topic_dist = np.array(
        [_dense_topic_vector(pairs, num_topics) for pairs in lda[corpus]]
    )
    n_docs = doc_topic_dist.shape[0]

    for q in queries:
        bow_vector = dictionary.doc2bow(q)

        # Topic distribution of the query itself
        new_doc_distribution = _dense_topic_vector(
            lda.get_document_topics(bow=bow_vector), num_topics
        )

        most_sim_ids = get_most_similar_documents(new_doc_distribution, doc_topic_dist, n_docs)

        name_result = name_converter['_'.join(q)] + '.txt'

        for size in rank_sizes:
            print("Processing: R: ", size)

            # A size of 0 requests the full ranking.
            rank_size = n_docs if size == 0 else size

            size_dir = os.path.join(output_path, str(size))
            os.makedirs(size_dir, exist_ok=True)

            # The context manager closes the file even if a write fails.
            with open(os.path.join(size_dir, name_result), 'w') as out_file:
                for doc_index in most_sim_ids[:rank_size]:
                    line = _format_entity(docLabels[doc_index])
                    if line is not None:
                        out_file.write(line + '\n')

In [None]:
# Maps a query (its tokens joined with '_') to the stem of its result file.
name_converter = {
    'state_diagram': 'STATEDIAGRAM',
    'activity_diagram': 'ACTIVITYDIAGRAM',
    'use_case_diagram': 'USECASEDIAGRAM',
    'collaboration_diagram': 'COLLABORATIONDIAGRAM',
    'deployment_diagram': 'DEPLOYMENTDIAGRAM',
    'sequence_diagram': 'SEQUENCEDIAGRAM',
    'cognitive_support': 'COGNITIVE',
    'logging': 'LOGGING',
}

In [None]:
# Configuration directories handled by the driver loop: RandomConfig00001 .. RandomConfig00005
directories_to_process = ['RandomConfig{:05d}'.format(index) for index in range(1, 6)]
directories_to_process

In [None]:
# Topic counts to train with, and how many top results to write per query.
num_topics_comb = [100, 200, 300, 400, 500]
rank_size_comb = [10, 100, 1000, 0]  # 0 == Full rank

for config in directories_to_process:
    text_path = os.path.join(config, 'TEXT')

    # NOTE: feature_location reads this module-level name to map ranked
    # document indices back to their labels, so it must keep this name and
    # stay in the same order as `data` / `corp`.
    docLabels = [f for f in os.listdir(text_path) if f.endswith('.txt')]

    data = []
    for doc in docLabels:
        full_path = os.path.join(text_path, doc)
        # Read inside a context manager so every input file is closed again.
        with open(full_path, 'r') as handle:
            data.append(handle.read())

    # Whitespace tokenization; one token list per document.
    corp = [d.split() for d in data]

    for n_comb in num_topics_comb:
        print("Processing: N: ", n_comb)
        dictionary, corpus, lda = train_lda(corp, n_comb)
        out_path = os.path.join(config, 'RESULTS', 'LDA', str(n_comb))
        feature_location(dictionary, corpus, lda, out_path, rank_size_comb)