"""
COMP 614
Homework 5: Bag of Words
"""


In [None]:
import collections
import math
import re
import string

import numpy

In [None]:
def get_title_and_text(filename, root="/content/drive/MyDrive/hw5/"):
    """
    Given the name of an XML file, extracts and returns the strings contained
    between the <title></title> and <text></text> tags.

    filename is appended directly to root (plain string concatenation), so
    root must end with a path separator. Override root to match your system.

    Returns a (title, text) tuple of strings. If either tag is absent from
    the file, the lookup yields None and reading its text raises
    AttributeError.
    """
    from bs4 import BeautifulSoup

    filepath = root + filename

    # Use a context manager so the file handle is closed even if read() fails.
    with open(filepath, encoding="utf-8") as xml_file:
        file_contents = xml_file.read()

    # NOTE(review): no parser is named here, so bs4 picks one itself and the
    # result may depend on which parsers are installed -- consider pinning it.
    soup = BeautifulSoup(file_contents)

    title = soup.find('title').text
    text = soup.find('text').text
    return title, text


In [None]:
def get_words(text):
    """
    Given the full text of an XML file, filters out the non-body text (text that
    is contained within {{}}, [[]], [], <>, etc.) and punctuation and returns a
    list of the remaining words, each of which is converted to lowercase.

    Markup spans are replaced by a space so the words on either side of a
    removed span do not get glued together. Punctuation is deleted outright,
    so "don't" becomes "dont".
    """
    # Matches an innermost {...}, [...] or <...> span (one with no nested
    # opener/closer of the same kind inside it).
    innermost_markup = re.compile(r"\{[^{}]*\}|\[[^\[\]]*\]|<[^<>]*>")

    # Peel markup from the inside out: each pass removes the innermost spans,
    # which exposes the enclosing ones ({{a {{b}} c}}, [[link]], ...) to the
    # next pass. Every substitution shortens the text, so this terminates.
    replaced = 1
    while replaced:
        text, replaced = innermost_markup.subn(" ", text)

    # Drop everything that is neither a word character nor whitespace; this
    # also removes any unmatched brackets left behind.
    without_punctuation = re.sub(r"[^\w\s]", "", text)

    return without_punctuation.lower().split()


In [None]:
def count_words(words):
    """
    Given a list of words, returns the total number of words as well as a
    dictionary mapping each unique word to its frequency of occurrence.

    The total counts every occurrence (duplicates included), so it equals
    the sum of the dictionary's values. An empty list yields (0, {}).
    """
    # Counter tallies in a single pass instead of rescanning the whole list
    # once per unique word; convert back to a plain dict for callers.
    word_to_count = dict(collections.Counter(words))
    return len(words), word_to_count


In [None]:
def count_all_words(filenames):
    """
    Given a list of filenames, returns three things. First, a list of the titles,
    where the i-th title corresponds to the i-th input filename. Second, a
    dictionary mapping each filename to an inner dictionary mapping each unique
    word in that file to its relative frequency of occurrence. Last, a dictionary
    mapping each unique word --- including all words found across all files ---
    to its total frequency of occurrence across all of the input files.

    Relative frequency is a word's count in the file divided by the file's
    total word count. A filename that appears more than once in the input
    contributes one title per appearance but is counted only once in the two
    dictionaries, since both are keyed by filename.
    """
    all_titles = []
    counts_by_file = {}      # filename -> {word: raw count}
    title_to_counter = {}    # filename -> {word: relative frequency}

    for filename in filenames:
        title, text = get_title_and_text(filename)
        all_titles.append(title)

        _, file_counts = count_words(get_words(text))
        counts_by_file[filename] = file_counts

        # Derive the file's total from the counts themselves. A file with no
        # words has an empty dict, so the division below never runs for it.
        total_words = sum(file_counts.values())
        title_to_counter[filename] = {
            word: count / total_words for word, count in file_counts.items()
        }

    # Sum the raw per-file counts in one pass per file rather than rebuilding
    # and re-summing a temporary list for every (word, file) pair.
    total_counts = collections.Counter()
    for file_counts in counts_by_file.values():
        total_counts.update(file_counts)

    return all_titles, title_to_counter, dict(total_counts)

In [None]:
def encode_word_counts(all_titles, title_to_counter, total_counts, num_words):
    """
    Builds the bag-of-words matrix for the num_words most common words.

    Words are ranked by descending total count in total_counts, with ties
    broken alphabetically, and the first num_words of them become the
    columns. Each entry of title_to_counter becomes one row, in that
    dictionary's iteration order, holding the relative frequency of every
    column word in that article (0 when the article lacks the word).

    all_titles is accepted for interface compatibility but is not read here.
    Returns a numpy.matrix with one row per article and one column per
    selected word.
    """
    # Negating the count sorts most-frequent first while keeping the
    # alphabetical tie-break ascending.
    ranked = sorted(total_counts.items(), key=lambda pair: (-pair[1], pair[0]))
    vocabulary = [word for word, _ in ranked[:num_words]]

    encoded = numpy.zeros((len(title_to_counter), len(vocabulary)))

    for row_idx, frequencies in enumerate(title_to_counter.values()):
        row = []
        for word in vocabulary:
            frequency = frequencies.get(word)
            row.append(0 if frequency is None else frequency)
        encoded[row_idx] = row

    return numpy.matrix(encoded)


In [None]:
def nearest_neighbors(matrix, all_titles, title, num_nbrs):
    """
    Given a matrix, a list of all titles whose data is encoded in the matrix, such
    that the i-th title corresponds to the i-th row, a single title whose data is
    encoded in the matrix, and the desired number of neighbors to be found, finds
    and returns the closest neighbors to the article with the given title.

    Closeness is Euclidean distance between rows. The result is a list of at
    most num_nbrs titles ordered nearest first; rows at equal distance keep
    their row order. The queried article itself is never included, even when
    another article has an identical row. Raises ValueError if title is not
    in all_titles.
    """
    title_num = all_titles.index(title)

    # Work on a plain 2-D array so the row subtraction broadcasts the same
    # way whether the caller passed a numpy.matrix or an ndarray.
    rows = numpy.asarray(matrix)
    differences = rows - rows[title_num]
    distances = numpy.sqrt(numpy.sum(numpy.square(differences), axis=1))

    # A stable sort keeps tied rows in a deterministic (row) order.
    order = numpy.argsort(distances, kind="stable")

    # Exclude the query by index rather than assuming it sorted first: a
    # duplicate row also has distance 0 and could otherwise take its place.
    neighbor_titles = [all_titles[idx] for idx in order if idx != title_num]

    return neighbor_titles[:max(num_nbrs, 0)]

In [None]:
# If you're using Colab, uncomment the lines below:
'''
import sys
root = "/content/drive/MyDrive/hw5/"
sys.path.append(root)
import comp614_module5
'''

In [None]:
def run():
    """
    Encodes the wikipedia dataset into a matrix, prompts the user to choose an
    article, and then runs the knn algorithm to find the 5 nearest neighbors
    of the chosen article.

    The prompt repeats until the user enters an integer that is a valid index
    into the list of articles. Results are printed; nothing is returned.
    """

    # Encode the wikipedia dataset in a matrix
    filenames = comp614_module5.ALL_FILES   # change the comp614_module as per your filepath
    all_titles, title_to_counter, total_counts = count_all_words(filenames)
    mat = encode_word_counts(all_titles, title_to_counter, total_counts, 20000)

    # With no articles there is no valid choice, so the prompt below could
    # never be satisfied.
    if not all_titles:
        print("Error: there are no articles to choose from.")
        return

    # Print all articles
    print("Enter the integer corresponding to the article whose nearest" +
          " neighbors you would like to find. Your options are:")
    for idx, article_title in enumerate(all_titles):
        print("\t" + str(idx) + ". " + article_title)

    # Prompt the user to choose an article
    max_choice = len(all_titles) - 1
    while True:
        raw_choice = input("Enter your choice here: ")
        try:
            choice = int(raw_choice)
        except ValueError:
            choice = None

        # Reject out-of-range integers too: a negative value would silently
        # index from the end of the list and a large one would raise
        # IndexError further down.
        if choice is not None and 0 <= choice <= max_choice:
            break
        print("Error: you must enter an integer between 0 and " +
              str(max_choice) + ", inclusive.")

    # Compute and print the results
    nbrs = nearest_neighbors(mat, all_titles, all_titles[choice], 5)
    print("\nThe 5 nearest neighbors of " + all_titles[choice] + " are:")
    for nbr in nbrs:
        print("\t" + nbr)

In [None]:
# Additional Function to answer discussion question

def top_words_in_article(id):
    """
    Returns the most frequent words of the article at position id in
    comp614_module5.ALL_FILES.

    The article is read and tokenized with get_title_and_text and get_words,
    and its words are counted with count_words. The (word, count) pairs are
    ordered by descending count and the first 30 are kept.

    The return value is a list containing a single element: the list of up
    to 30 (word, count) tuples.
    """
    article_file = comp614_module5.ALL_FILES[id]

    # The title is not needed here; only the body text is analysed.
    _, article_text = get_title_and_text(article_file)
    _, counts = count_words(get_words(article_text))

    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    leading = ranked[:30]

    # Callers receive the slice wrapped in an outer list.
    return [leading]