#### Objective

- Implement a basic Tf-Idf search.


#### Workflow

- Load all relevant Python libraries and a spaCy language model.

- Access the tokenized text in your new dataset from the previous milestone. Each document dictionary should now include a new key-value pair with the lemmatized text of the articles.

- Create a corpus vocabulary. It should simply be a list of unique tokens in the provided set of documents. Count how many times each unique token appears in the corpus, you will need these counts for the next step.

- Calculate Tf-Idf vectors for every article in the dataset and add these vectors to the article dictionaries. You should end up the same list of dictionaries as before, but with a new key-value pair containing Tf-Idf vectors:

- title: Title of the Wikipedia article the text is taken from.
- text: Wikipedia article text. (In this dataset we included only the summary.)
- tokenized_text: Tokenized Wikipedia article text.
- url: Link to the Wikipedia article.
- tf_idfs: Tf_Idf vector.

Now we can try to search our list of dictionaries using this Tf-Idf field using existing tools for similarity. We suggest you use scikit-learn library and its cosine_similarity function.

In [1]:
import sys
import spacy
import json
import numpy as np
from itertools import chain
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load a Spacy Language model
sp = spacy.load("en_core_web_sm")

In [3]:
with open("data/result.json", mode = "r") as file:
    file_obj = json.load(file)

----

In [4]:
# Create a corpus vocabulary. It should simply be a list of unique tokens in the provided
# set of documents. Count how many times each unique token appears in the corpus, 
# you will need these counts for the next step.

In [6]:
def create_corpus(docs):
    """
    Create a corpus itering all documents.
    
    :return corpus: Return a list of strings.
    """
    corpus = [doc["tokenized_text"]  for doc in docs]
    corpus = list(chain(*corpus))
    return   list(set(corpus))

In [9]:
corpus = create_corpus(file_obj)
### Contiene las palabras de todos los documentos y su frecuecnia en todos los documentos.
corpus[0:10]

['social',
 'driver',
 'dropping',
 'rafe',
 'donor',
 'previous',
 'fatigue',
 'import',
 'latin',
 'say']

----

In [33]:
def count_word_document(corpus, docs):
    """
    Conteo de aparacion de una palabra en todos los documentos.
    """
    count = 0
    dict_count = {}
    
    for item in corpus:
        ### Conteo de veces que la palabra en el corpus
        ### Aparece en todos los documentos
        for doc in docs:
            if item in doc["tokenized_text"]:
                count+=1
        
        ## Finalizing
        dict_count[item] = count
        count = 0
        
    return dict_count


def compute_tf(tokens):
    """
    Calculate tf for each list of tokens.
    """
    count_obj = Counter(tokens)
    total_terms = len(tokens)
    
    dict_count = {}
    
    for key, value in count_obj.items():
        dict_count[key] = value
   
    return count_obj

    
def compute_idf(docs, dict_count):
    """
    log(N/C)
    
    - N:  Total number of documents.
    - nw: Number of documents containing the word w.
    """
    N = len(docs)
    idf_dict = {}
    
    for key, count in dict_count.items():
        idf_dict[key] = np.log(N/count)
    return idf_dict

In [34]:
with open("data/result.json", mode = "rb") as file:
    docs = json.load(file)
    corpus = create_corpus(docs)
    
    ### Idf
    dict_count_document = count_word_document(corpus, docs)
    idf = compute_idf(docs, dict_count_document)
    
    ### Term Frequency
    for index, doc in enumerate(docs):
        dict_tf = compute_tf(doc["tokenized_text"])

        ### For each term in a document, we calculate the TF-IDF
        tf_idf_list = []
        
        for term in corpus:
            tf_idf_list.append((dict_tf[term]/len(doc["tokenized_text"])) * idf[term])
            

        docs[index]["tf_idfs"] = tf_idf_list
        
    ## At last the result is saved.            
    with open('data/tfidf_result.json', 'w') as output:
        json.dump(docs, output)      

In [35]:
with open('data/corpus.json', 'w') as corpusfile:
    json.dump(corpus, corpusfile)

### search function
- Create a search function to compute cosine similarities 
- between the document Tf-Idf vectors and the query Tf-Idf vector.
- Save this new list of dictionaries as a JSON file.

In [36]:
def tokenizer(content_doc):
    """
    
    """
    # doc contains the text lemmas and their attributes
    # https://spacy.io/api/doc
    doc = sp(content_doc.lower())
    
    # remove stop words
    all_stopwords = sp.Defaults.stop_words
    tokens_without_sw = [word for word in doc if word not  in all_stopwords]
    
    # remove punctuation and stop words
    tokens_without_pct = [token for token in tokens_without_sw 
                          if not token.is_punct and not token.is_stop]
    
    # Get Lemmas
    token_lemmas = [
                        token.lemma_ for token
                            in tokens_without_pct
                               if len(token.dep_.strip())>0
                                and token.lemma_ != "\n"
                                and token.dep_
                        ]
    
    return token_lemmas 


def vectorize(query, docs, corpus):
    query_tokenized = tokenizer(query)
    query_token_counter = Counter(query_tokenized)
    query_vec = []
    
    for token in corpus:
        tf = query_token_counter[token] / len(query_tokenized)
        idf = np.log(len(docs) /  dict_count_document[token])
        tfidf = tf * idf
        query_vec.append(tfidf)
    return query_vec

def search_similarity(query, docs, corpus):
    query_vec = vectorize(query, docs, corpus)
    query_arr = np.array(query_vec)

    rankings = []
    for doc in docs:
        doc_rank = {}
        coefficient = cosine_similarity(
                                     query_arr.reshape(1, -1), 
                                     np.array(doc["tf_idfs"]).reshape(1, -1)
                                    )[0][0]

        if coefficient > 0:
            doc_rank['title'] = doc['title']
            doc_rank['rank'] = coefficient
            rankings.append(doc_rank)

    return sorted(rankings, key=lambda k: k['rank'], reverse=True)

In [47]:
def load_file(name = 'result'):
    """
    Load any file in json format, within the folder data.
    :param name: Nmae of the file.
    :type name: str
    """
    with open(f"data/{name}.json", mode = "rb") as file:
        file_obj = json.load(file)
        return file_obj
    
###############################
###############################
###############################
corpus = load_file('corpus')
docs = load_file('tfidf_result')

# query = "highest pandemic casualties"
query = "flu"
search_similarity(query, docs, corpus)

[{'title': 'Swine influenza', 'rank': 0.3753601130402447},
 {'title': 'Spanish flu', 'rank': 0.2641577193914792},
 {'title': 'Pandemic', 'rank': 0.0828231483433225},
 {'title': 'Unified Victim Identification System',
  'rank': 0.03771210435958485}]