# TF-IDF Matrix Creation

Authors: Brandon Fan, Jordan Seiler
Last Edit Date: 11/29/2017

## Import Necessary Packages

In [5]:
# common imports
import json
import string
import re
import pickle

# important imports
import nltk
import numpy as np

# nltk imports
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Download the NLTK resources this notebook uses: the "punkt" tokenizer data
# (for word_tokenize) and the "stopwords" corpus (for stopwords.words).
# As the recorded output shows, packages already present are reported as up-to-date.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brandonfan1256/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brandonfan1256/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocess Data

In [6]:
# load Bible data
# (the with-block closes the file handle as soon as the JSON has been parsed;
# 'utf-8-sig' transparently drops a leading byte-order mark if one is present)
with open('../../bible-files/english-web-bible.json', 'r', encoding='utf-8-sig') as bible_file:
    BIBLE = json.load(bible_file)

In [7]:
# create corpus of data: one entry per verse holding its reference and raw tokens
# NOTE: iterate over BIBLE, the name assigned by the load cell; a lowercase
# `bible` is never defined and raises NameError on a fresh kernel.
corpus = []
for name in BIBLE:
    chapters = name["data"]
    for chapter in chapters:
        verses = chapter["verses"]
        for verse in verses:
            text = verse["text"]
            # NOTE(review): 'ref' is taken directly from verse['verse'];
            # confirm that this value identifies a verse uniquely across books.
            corpus.append({'ref': verse['verse'], 'tokenized': word_tokenize(text)})
# stored as a numpy array of dicts; later cells iterate over it and take len()
corpus = np.array(corpus)

In [8]:
# translation table that deletes every character in string.punctuation
PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# get stopwords from nltk; kept as a frozenset so the per-token membership test
# is a hash lookup instead of a linear scan over the list nltk returns
STOP_WORDS = frozenset(stopwords.words("english"))
# regex character class matching the em dash and straight single/double quotes
PATTERN = r'[\—\'\"]'

In [9]:
for entry in corpus:
    # raw tokens for this document
    words = entry['tokenized']
    cleaned = []
    for word in words:
        # nltk's English stop words are lowercase, so compare case-insensitively;
        # otherwise capitalised stop words ("The", "And", ...) are kept
        if word.lower() in STOP_WORDS:
            continue
        # delete ASCII punctuation, then the em dash / quote characters in PATTERN
        stripped = re.sub(PATTERN, '', word.translate(PUNCT_TABLE))
        # keep only alphanumeric characters (str.isalnum is Unicode-aware)
        stripped = ''.join(char for char in stripped if char.isalnum())
        # apply the length filter AFTER all stripping, so tokens reduced to ''
        # or a single character never become terms
        if len(stripped) > 1:
            cleaned.append(stripped)
    entry['tokenized'] = np.array(cleaned)

## Construct TF-IDF

### Calculate Term Frequencies

To calculate term frequencies per document we follow the formula:

$$TF(t,d \in D) = \frac{\text{Number of times term } t \text{ appears in document }d}{\text{Total number of terms in document } d}$$

where $D$ is the corpus of documents (in this notebook, each Bible verse is treated as one document).

In [10]:
for entry in corpus:
    tokens = entry['tokenized']
    # distinct terms of this document and how many times each one occurs
    terms, occurrences = np.unique(tokens, return_counts=True)
    entry['unique_words'] = terms
    # TF = (occurrences of term t in the document) / (total terms in the document),
    # evaluated for every term at once with element-wise array division
    entry['term_frequency'] = dict(zip(terms, occurrences / len(tokens)))

### Calculate Inverse Document Frequencies

To calculate inverse document frequencies we follow the formula:

$$ IDF(t, D) = \log_e\left(\frac{\text{Total number of documents in } D}{\text{Number of documents in } D \text{ that contain term } t}\right) $$

In [11]:
# flatten every document's distinct terms into one array; a term therefore
# appears once for each document that contains it
unique_words = np.array([
    term
    for entry in corpus
    for term in entry['unique_words'].tolist()
])

In [12]:
# distinct terms across all documents, plus the number of times each occurs in
# the flattened per-document term array (i.e. its document count)
unique_of_unique_words = np.unique(unique_words, return_counts=True)
words_list, document_counts = unique_of_unique_words
# IDF = ln(total number of documents / number of documents containing term t),
# computed for all terms at once via element-wise division and np.log
idf = np.log(len(corpus) / document_counts)

In [13]:
# idf weights dictionary: term -> IDF value (words_list and idf are index-aligned)
idf_dict = dict(zip(words_list, idf))

### Calculate TF-IDF Weights Per Document

To calculate the TF-IDF weights per document we do the following:

$$ W(t, d \in D) = TF(t, d) \times IDF(t, D) $$

Where $W$ is the TF-IDF weight function

In [14]:
# TF-IDF weight of a term in a document = its term frequency * its IDF
for entry in corpus:
    # per-document dictionary: term -> weight
    entry['weights'] = {
        term: tf * idf_dict[term]
        for term, tf in entry['term_frequency'].items()
    }

## Create Index For Search

In [15]:
# inverted index: term -> list of (ref, weight) tuples, one per document
# whose weights contain the term
indices = {}
for entry in corpus:
    for term, weight in entry['weights'].items():
        # setdefault creates the list the first time a term is encountered
        indices.setdefault(term, []).append((entry['ref'], weight))

In [16]:
# order each term's (ref, weight) list by weight, highest first
for postings in indices.values():
    postings.sort(key=lambda pair: pair[1], reverse=True)

## Save Computed Matrix

In [17]:
# use pickle to output into a .pkl file for production
# (the with-block flushes and closes the file even if pickling raises)
with open('tf-idf-table.pkl', 'wb') as table_file:
    pickle.dump(indices, table_file)