In [None]:
import gensim
from gensim.models.doc2vec import TaggedDocument,Doc2Vec, LabeledSentence 
import os
import pandas as pd
import seaborn as sns
import re
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
class DocIterator(object):
    """Re-iterable corpus that pairs each raw document with its label.

    Iterating yields one ``TaggedDocument`` per document. The object can be
    iterated any number of times, which matters because ``train_dv`` passes
    the same instance to both ``build_vocab`` and ``train``.

    Parameters
    ----------
    doc_list : sequence of str
        Raw document texts; each one is tokenised with ``str.split()``
        (whitespace splitting, no further normalisation).
    labels_list : sequence
        Label for each document, looked up by position; the label at index
        ``i`` becomes the single tag of the document at index ``i``.
    """

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            # TaggedDocument replaces the deprecated LabeledSentence alias;
            # the fields (words, tags) are the same.
            yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])

In [None]:
def train_dv(corpus, vec_size, doc_labels):
    """Build and train a Doc2Vec model over ``corpus``.

    Parameters
    ----------
    corpus : sequence of str
        Document texts, handed to ``DocIterator`` together with ``doc_labels``.
    vec_size : int
        Passed to Doc2Vec as ``vector_size``.
    doc_labels : sequence
        One label per document, in the same order as ``corpus``.

    Returns
    -------
    The trained ``gensim.models.Doc2Vec`` instance.
    """
    tagged_corpus = DocIterator(corpus, doc_labels)

    # NOTE(review): alpha and min_alpha are set to the same value, so the
    # start and end learning rates are identical -- confirm this is intended.
    hyperparams = dict(
        vector_size=vec_size,
        window=10,
        min_count=2,
        workers=12,
        alpha=0.025,
        min_alpha=0.025,
        epochs=20,
        negative=5,
    )
    doc2vec = gensim.models.Doc2Vec(**hyperparams)

    # The corpus is traversed once to build the vocabulary and again to train.
    doc2vec.build_vocab(tagged_corpus)
    doc2vec.train(
        tagged_corpus,
        total_examples=doc2vec.corpus_count,
        epochs=doc2vec.epochs,
    )

    return doc2vec

In [None]:
def _format_entity(doc_tag):
    """Turn a ranked document tag into its output line, or None if it is skipped."""
    entity = doc_tag.replace('.txt', '')

    # Remove inner methods: any tag containing '$' is dropped from the output.
    if '$' in entity:
        return None

    # '<' and '>' were stored as '{' and '}' because of file naming restrictions.
    entity = entity.replace('{', '<').replace('}', '>')

    # Method result: separate the part after the last '.' with a space.
    if '(' in entity:
        owner, dot, member = entity.rpartition('.')
        if dot:
            return owner + ' ' + member
        # No '.' to split on: fall through and write the entity unchanged
        # instead of failing with an IndexError.

    # Class result (or an undotted method-like entity).
    return entity


def feature_location(model, output_path, rank_sizes):
    """Rank every document against a fixed set of queries and write the rankings.

    For each built-in query a vector is inferred with ``model.infer_vector``
    and all vectors in ``model.docvecs`` are ranked by similarity to it. For
    each entry of ``rank_sizes`` the head of that ranking is written, one
    entity per line, to ``<output_path>/<size>/<NAME>.txt``, where ``NAME`` is
    looked up in the module-level ``name_converter`` using the query terms
    joined with ``'_'``.

    Parameters
    ----------
    model
        Trained Doc2Vec model exposing ``infer_vector`` and ``docvecs``.
    output_path : str
        Base directory for results. The per-size sub-directories are created
        when missing.
    rank_sizes : iterable of int
        Cut-offs applied to the ranking; ``0`` means the full ranking.

    Notes
    -----
    Entries containing ``'$'`` are filtered out *after* the ranking has been
    cut to ``rank_size``, so a file can contain fewer lines than the cut-off.
    """
    queries = [
        ['state', 'diagram'],
        ['activity', 'diagram'],
        ['use', 'case', 'diagram'],
        ['collaboration', 'diagram'],
        ['deployment', 'diagram'],
        ['sequence', 'diagram'],
        ['cognitive', 'support'],
        ['logging'],
    ]

    total_docs = len(model.docvecs)

    for q in queries:
        # Infer a vector for the query
        query_vector = model.infer_vector(q)

        # Rank all documents by similarity to the inferred vector
        ranking = model.docvecs.most_similar([query_vector], topn=total_docs)

        result_name = name_converter['_'.join(q)] + '.txt'

        for size in rank_sizes:
            print("Processing: R: ", size)

            # A size of 0 selects the full ranking.
            rank_size = total_docs if size == 0 else size

            size_dir = os.path.join(output_path, str(size))
            os.makedirs(size_dir, exist_ok=True)

            # 'with' guarantees the file is closed even if a write fails.
            with open(os.path.join(size_dir, result_name), 'w') as out:
                for hit in ranking[:rank_size]:
                    line = _format_entity(hit[0])
                    if line is not None:
                        out.write(line + '\n')

In [None]:
# Maps a query (its terms joined with '_') to the base name of its result file;
# feature_location appends '.txt' to the value.
name_converter = {
    'state_diagram': 'STATEDIAGRAM',
    'activity_diagram': 'ACTIVITYDIAGRAM',
    'use_case_diagram': 'USECASEDIAGRAM',
    'collaboration_diagram': 'COLLABORATIONDIAGRAM',
    'deployment_diagram': 'DEPLOYMENTDIAGRAM',
    'sequence_diagram': 'SEQUENCEDIAGRAM',
    'cognitive_support': 'COGNITIVE',
    'logging': 'LOGGING',
}

In [None]:
# Configuration directories to process; each is joined with 'TEXT' (inputs)
# and 'RESULTS' (outputs) by the driver loop.
directories_to_process = [
    'RandomConfig00001',
    'RandomConfig00002',
    'RandomConfig00003',
    'RandomConfig00004',
    'RandomConfig00005',
]
directories_to_process

In [None]:
# Values passed to train_dv as the Doc2Vec vector size.
num_topics_comb = [100,200,300,400,500]
rank_size_comb = [10, 100, 1000, 0] #0 == Full rank

for config in directories_to_process:
    text_path = os.path.join(config, 'TEXT')

    # os.listdir returns entries in arbitrary order; sort so the document
    # order (and the label/content pairing below) is the same on every run.
    doc_labels = sorted(f for f in os.listdir(text_path) if f.endswith('.txt'))

    # Read every document once per configuration; data[i] is the content of
    # doc_labels[i].
    data = []
    for doc in doc_labels:
        full_path = os.path.join(text_path, doc)
        # 'with' closes each file as soon as it is read instead of leaking
        # one open handle per document.
        with open(full_path, 'r') as f:
            data.append(f.read())

    # Train one model per vector size and write its rankings under
    # <config>/RESULTS/DV/<size>/.
    for n_comb in num_topics_comb:
        print("Processing: N: ", n_comb)
        dv_model = train_dv(data, n_comb, doc_labels)
        out_path = os.path.join(config, 'RESULTS', 'DV', str(n_comb))
        feature_location(dv_model, out_path, rank_size_comb)