In [None]:
import regex  # the cooler "re"
from typing import *
from biterm.btm import oBTM 
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary # helper functions
from scipy.spatial import distance
import pdb
%run LocalRepo.ipynb

In [None]:
#
# TODO: https://pypi.org/project/biterm/
#

# constants (tuning knobs for text extraction, BTM topic learning and the coupling graph)
MIN_WORD_LENGTH = 1  # normalized words shorter than this are dropped during text extraction
MAX_WORD_LENGTH = 50  # normalized words longer than this are dropped during text extraction
MIN_WORD_USAGES = 2  # intended: any word used less often will be ignored. NOTE(review): in the visible code this only prunes the word-frequency table (reported vocab size), not the documents
MAX_DF = 0.95  # passed as max_df to CountVectorizer: any terms that appear in a bigger proportion of the documents than this will be ignored (corpus-specific stop-words)
MAX_FEATURES = 1500  # passed as max_features to CountVectorizer: the size of the vocabulary - amount of words to consider for topic learning
TOPIC_COUNT = 10 # number of topics the BTM model is created with; previously 40  # 100 according to paper
BTM_ITERATIONS = 20  # iterations passed to the BTM fit call  # 100 according to docs?
# LDA_PASSES = 200  # how often to go through the corpus
LDA_RANDOM_SEED = 42  # NOTE(review): only referenced by commented-out shuffling code in the visible cells - confirm whether it is still needed
DOCUMENT_SIMILARITY_EXP = 8 # exponent applied to the cosine similarity of two documents: higher = lower equality values, lower = equality values are all closer to 1
DOCUMENT_SIMILARITY_CUTOFF = 0.1  # passed to coupling_graph.cutoff_edges(); in range [0 .. 1]: everything below this is dropped

In [None]:
def extract_topic_model_documents(files) -> List[Tuple[RepoTree,List[str]]]:  # List of (RepoTree-Node,wordList) - tuples
    """Build one word-list "document" per class member found in the given files.

    For every file the repo tree node is looked up (files without a node are
    skipped). For each class / interface / enum below that node, every field,
    method and constructor becomes one document: the class name plus the
    member's comment and own text, split into words, stop-word filtered and
    lemmatized.

    Args:
        files: iterable of file objects providing ``get_repo_tree_node()``;
            each file is also handed to ``member.get_comment_and_own_text(file)``.

    Returns:
        A list of ``(member_node, words)`` tuples in traversal order. Members
        whose word list ends up empty are still included.

    Note:
        Words used fewer than ``MIN_WORD_USAGES`` times are only removed from
        the internal frequency table (and therefore from the printed vocab
        size); they are NOT removed from the returned documents.
    """
    # keywords from python, TS and Java
    custom_stop_words = ["abstract", "and", "any", "as", "assert", "async", "await", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "constructor", "continue", "debugger", "declare", "def", "default", "del", "delete", "do", "double", "elif", "else", "enum", "except", "export", "extends", "false", "False", "final", "finally", "float", "for", "from", "function", "get", "global", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "is", "lambda", "let", "long", "module", "new", "None", "nonlocal", "not", "null", "number", "of", "or", "package", "pass", "private", "protected", "public", "raise", "require", "return", "set", "short", "static", "strictfp", "string", "super", "switch", "symbol", "synchronized", "this", "throw", "throws", "transient", "true", "True", "try", "type", "typeof", "var", "void", "volatile", "while", "with", "yield"]
    stop_words = set(list(get_stop_words('en')) + custom_stop_words)  # TODO ignored "list(stopwords.words('english'))" because it had "y" and other weird ones
    # Split on runs of non-word characters / underscores and on camelCase boundaries
    # (the latter are empty matches, see the VERSION1 flag below).
    splitter = r"(?:[\W_]+|(?<![A-Z])(?=[A-Z])|(?<!^)(?=[A-Z][a-z]))"
    lemma = WordNetLemmatizer()
    printable_characters = set(string.printable)

    def _normalize_word(word):
        """Lower-case the word and lemmatize it first as a noun, then as a verb."""
        return lemma.lemmatize(lemma.lemmatize(word.lower(), pos = "n"), pos = "v")

    def _get_text(content_string):
        """Turn raw source text into a list of normalized, filtered words."""
        # https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
        # https://agailloty.rbind.io/en/project/nlp_clean-text/
        content_string = ''.join(c for c in content_string if c in printable_characters)
        words = regex.split(splitter, content_string, flags=regex.VERSION1)  # regex V1 allows splitting on empty matches
        # First stop-word pass works on the raw (case-sensitive) tokens ...
        words = [word for word in words if word not in stop_words]
        words = [_normalize_word(word) for word in words]
        # ... the length filter also removes the empty strings produced by the split ...
        words = [word for word in words if MIN_WORD_LENGTH <= len(word) <= MAX_WORD_LENGTH]
        # ... and the second stop-word pass catches words that only match after normalization.
        words = [word for word in words if word not in stop_words]
        return words


    # see https://docs.python.org/2/library/collections.html#collections.Counter
    freq_dist = FreqDist()

    node_words: List[Tuple[RepoTree,List[str]]]  = []  # List of (RepoTree-Node,wordList) - tuples
    for file in log_progress(files, desc="Extracting language corpus"):
        node = file.get_repo_tree_node()  # TODO unify with structural view code
        if node is None:
            continue  # TODO why / when does this happen?

        # TODO keep in sync with evolutionary and structural view as well as RepoFile class
        classes = node.get_descendants_of_type("class") + node.get_descendants_of_type("interface") + node.get_descendants_of_type("enum")
        for class_node in classes:
            fields = class_node.get_children_of_type("field")
            methods = class_node.get_children_of_type("method") + class_node.get_children_of_type("constructor")

            for member in fields + methods:
                text = member.get_comment_and_own_text(file)
                words = list(_get_text(class_node.name + " " + text))
                for word in words:
                    freq_dist[word] += 1
                # TODO: handle the empty list?!?
                node_words.append((member, words))

    # random.seed(LDA_RANDOM_SEED)
    # random.shuffle(node_words)

    # Collect the rare words first: deleting keys while iterating over the
    # mapping itself raises "RuntimeError: dictionary changed size during iteration".
    rare_words = [word for word, usages in freq_dist.items() if usages < MIN_WORD_USAGES]
    for word in rare_words:
        del freq_dist[word]

    print("Amount of documents: " + str(len(node_words)))
    print("Total Amount of words: " + str(sum([len(b) for a, b in node_words])))
    print("Vocab size: " + str(len(freq_dist)))
    return node_words

In [None]:
def train_topic_model(node_words: List[Tuple[RepoTree,List[str]]]):
    """returns the document - topic matrix, stating how much of each topic is present in each document

    The word lists are joined into texts, vectorized with a CountVectorizer
    (limited by MAX_DF and MAX_FEATURES), converted to biterms and used to fit
    an online BTM model with TOPIC_COUNT topics for BTM_ITERATIONS iterations.
    A per-topic summary is printed as a side effect.

    Args:
        node_words: list of ``(node, words)`` tuples; only the word lists are
            used here, the row order of the result follows this list.

    Returns:
        The result of ``btm.transform(biterms)``: one row of topic weights per
        entry of ``node_words``. NOTE(review): rows for documents without any
        biterm are expected to contain NaN - callers should check for that.
    """
    # https://pypi.org/project/biterm/
    print("Vectorizing words...")
    texts = [" ".join(words) for (node, words) in node_words]
    print("Here are the texts:")
    print("\n".join(texts))
    print("------")
    vec = CountVectorizer(max_df=MAX_DF, max_features=MAX_FEATURES, stop_words=None)
    X = vec.fit_transform(texts).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back to the old name on old installations.
    if hasattr(vec, "get_feature_names_out"):
        vocab = np.array(vec.get_feature_names_out())
    else:
        vocab = np.array(vec.get_feature_names())

    print("Instantiating BTM...")
    btm = oBTM(num_topics=TOPIC_COUNT, V=vocab)
    biterms = vec_to_biterms(X)
    print("Total Amount of biterms: " + str(sum([len(x) for x in biterms])))

    print("Training model...")
    btm.fit(biterms, iterations=BTM_ITERATIONS)
    topics = btm.transform(biterms)

    print("Generating topic output...")
    words_to_show_per_topic = 10
    topic_summuary(btm.phi_wz.T, X, vocab, words_to_show_per_topic)

    return topics

In [None]:
def couple_by_topic_similarity(node_words: List[Tuple[RepoTree,List[str]]], topics, coupling_graph):
    """Fill the coupling graph with pairwise topic similarities of all documents.

    Every unordered pair of documents whose topic rows do not start with NaN
    gets an edge weighted by their similarity. Afterwards each node's support
    is registered with the size of its word list and the graph is trimmed at
    DOCUMENT_SIMILARITY_CUTOFF.

    Args:
        node_words: list of ``(node, words)`` tuples, index-aligned with ``topics``.
        topics: per-document topic weights, indexable by document position.
        coupling_graph: graph object offering ``add``, ``add_support`` and
            ``cutoff_edges``; it is modified in place.

    Returns:
        The same ``coupling_graph`` object that was passed in.
    """

    def array_similarity(a, b):
        """given two arrays of numbers, how equal are they?"""
        cosine_similarity = 1. - distance.cosine(a, b)  # TODO check for alternative distance metrics?
        return math.pow(cosine_similarity, DOCUMENT_SIMILARITY_EXP)

    print("Generating coupling graph...")
    # paths = [n.get_path() for n, w in node_words]  # TODO speed up by using this pths array?
    document_count = len(node_words)
    for first_index in log_progress(range(document_count), desc="Generating coupling graph"):
        # only visit each unordered pair once: the partner index is always the larger one
        for second_index in range(first_index + 1, document_count):
            first_topics = topics[first_index]
            second_topics = topics[second_index]
            # TODO filter out those earlier (happen when a btm-document is empty)
            if not (np.isnan(first_topics[0]) or np.isnan(second_topics[0])):
                similarity = array_similarity(first_topics, second_topics)
                first_path = node_words[first_index][0].get_path()
                second_path = node_words[second_index][0].get_path()
                coupling_graph.add(first_path, second_path, similarity)

    for member, member_words in log_progress(node_words, desc="Generating coupling graph step 2"):
        word_count = len(member_words)
        coupling_graph.add_support(member.get_path(), word_count)

    print("Trimming graph...")
    coupling_graph.cutoff_edges(DOCUMENT_SIMILARITY_CUTOFF)

    return coupling_graph