In [224]:
import numpy as np
import nltk
import scipy as sp
from nltk.tokenize import word_tokenize
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/heejinchae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [225]:
corpus=["I AM SAM",
"I LIKE SAM I AM",
"I LIKE GREEN EGGS AND HAM"]

### Problem 1 (a) 
Create a term document matrix using simple word counts (number of times a word appears in a document) for all the documents

In [226]:
def simple_word_counts(courpus)->np.ndarray:
    """ Create a term-document matrix using simple word counts

    Arguments:
    corpus-- list of sentences in the corpus

    Return:
    term_doc_matrix -- term-document matrix where each element is the number of times a word appears in a document. The order of words DOES NOT matter.
    """
    term_doc_matrix=np.empty([3,8])
    #### YOUR CODE HERE (~ 6 lines) ####
    term_doc_matrix = np.zeros(term_doc_matrix.shape)
    token_in_sentence = []
    for sentence in courpus:
        tokens = word_tokenize(sentence)
        token_in_sentence.append(tokens)
    vocab = np.unique(
        np.concatenate(token_in_sentence).ravel()
        )
    vocab.sort()
    vocab_to_idx = {word:i for i,word in enumerate(vocab)}
    
    for i,tokens in enumerate(token_in_sentence):
        for token in tokens:
            term_doc_matrix[i][vocab_to_idx[token]] += 1
            
    #### YOUR CODE HERE ####
    return term_doc_matrix

In [227]:
simple_word_counts(corpus)

array([[1., 0., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 2., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 0.]])

### Problem 1 (b)
Using the vector space model and cosine similarity find the closest document to the user query ”I LIKE EGGS”

In [228]:
def cos_distance(a:np.ndarray, b:np.ndarray)->float:
    return 1 - sp.spatial.distance.cosine(a, b)

def find_the_closest_document(corpus, query: str)->str:
    """ Find the closest document with query from corpus

    Arguments:
    corpus-- list of sentences in the corpus
    query-- a string to compute cosine similarity with

    Return:
    closest_document --the string in the corpus which is the closest to the query
    """
    closest_document=str()
    #### YOUR CODE HERE ####
    unique_token = set() 
    token_list = [] 
    for sentence in corpus:
        tokens = word_tokenize(sentence)
        token_list.append(list(tokens))
        unique_token.update(tokens)
    unique_token_list = sorted(list(unique_token)) # header
    token_to_id = {token:i for i,token in enumerate(unique_token_list)}

    term_doc_matrix = simple_word_counts(corpus)
    query_tokens = word_tokenize(query)
    query_token_ids = [token_to_id[token] for token in query_tokens]
    query_token_vecs = np.zeros([term_doc_matrix.shape[0],term_doc_matrix.shape[1]])
    query_token_vecs[:,query_token_ids] += 1

    term_doc_matrix = list(term_doc_matrix)
    query_token_vecs = list(query_token_vecs)
    score = 0 
    ind = 0
    for i,(cor,qu) in enumerate(zip(term_doc_matrix, query_token_vecs)):
        if cos_distance(cor,qu) > score:
            score = cos_distance(cor,qu)
            ind = i
    closest_document = corpus[ind]
    #### make use of cos_distance function above!
    return closest_document

In [229]:
query="I LIKE EGGS"
find_the_closest_document(corpus, query)

'I LIKE GREEN EGGS AND HAM'

### Problem 2 (a) 
Now instead of using the raw counts, use TF-IDF for each entry in the term-document matrix. Using the vector space model and cosine similarity find the closest document to the user query “I LIKE EGGS” for the new index

In [230]:
def find_the_closest_document_tfidf(corpus, query: str)->str:
    """ Find the closest document with query from corpus using tf-idf

    Arguments:
    corpus-- list of sentences in the corpus
    query-- a string to compute cosine similarity with

    Return:
    closest_document --the string in the corpus which is the closest to the query
    """
    closest_document=str()
    #### YOUR CODE HERE ####
    #### NOTE the base of log is 2, not 10 or e
    docs = corpus[:] + [query]
    vocab = list(set(w for doc in docs for w in word_tokenize(doc)))
    token_in_docs = list(word_tokenize(doc) for doc in docs )
    vocab.sort()
    vocab_to_idx = {vocab:i for i,vocab in enumerate(vocab)}
    tf_ = np.zeros((len(docs), len(vocab)))
    idf_ = tf_.copy() 
    for i, tokens in enumerate(token_in_docs):
        for token in tokens:
            tf_[i][vocab_to_idx[token]] += 1
    df_ = np.sum(tf_>0,axis=0)
    idf_ = np.log2(
        len(docs) / df_
        )
    tf_idf = tf_*idf_

    score = 0
    idx = 0
    corpus_vecs = list(tf_idf[:len(docs)-1])
    query_vec = np.tile(
        tf_idf[len(docs)-1:],(len(docs)-1,1)
    )
    for i, (cor,qu) in enumerate(zip(corpus_vecs,query_vec)):
        if( cos_distance(cor, qu) > score):
            score = cos_distance(cor,qu)
            idx = i
    closest_document = docs[i]
    return closest_document



In [231]:
find_the_closest_document_tfidf(corpus, query)

0.0
0.10794633570596113
0.2983191769122352


'I LIKE GREEN EGGS AND HAM'

### Problem 2 (c)
nstead of using cosine similarity we could also use the L2 distance. Which similarity function (L2 or cosine) would work better here?

In [232]:
def find_the_closest_document_L2(corpus, query: str)->str:
    """ Find the closest document with query from corpus using L2 distance

    Arguments:
    corpus-- list of sentences in the corpus
    query-- a string to compute cosine similarity with

    Return:
    closest_document --the string in the corpus which is the closest to the query
    """
    closest_document=str()
    docs = corpus[:] + [query]
    vocab = list(set(w for doc in docs for w in word_tokenize(doc)))
    token_in_docs = list(word_tokenize(doc) for doc in docs )
    vocab.sort()
    vocab_to_idx = {vocab:i for i,vocab in enumerate(vocab)}
    tf_ = np.zeros((len(docs), len(vocab)))
    idf_ = tf_.copy() 
    for i, tokens in enumerate(token_in_docs):
        for token in tokens:
            tf_[i][vocab_to_idx[token]] += 1
    df_ = np.sum(tf_>0,axis=0)
    idf_ = np.log2(
        len(docs) / df_
        )
    tf_idf = tf_*idf_

    corpus_vecs = tf_idf[:len(docs)-1]
    query_vec = np.tile(
        tf_idf[len(docs)-1:],(len(docs)-1,1)
    )
    scores = np.sqrt(
        np.sum(
            np.power(
                corpus_vecs - query_vec, 2
            ),axis=1
        )
    )
    idx = np.argmax(scores)
    closest_document = corpus[idx]
    #### YOUR CODE HERE ####
    
    return closest_document


In [233]:

find_the_closest_document_L2(corpus, query)


[[1.        0.        0.        0.        0.        0.        0.
  1.       ]
 [1.        0.        0.        0.        0.        0.        0.4150375
  1.       ]
 [0.        2.        1.        2.        2.        0.        0.4150375
  0.       ]]
[1.78108285 1.73205081 3.46410162]


'I LIKE GREEN EGGS AND HAM'