#### Objective

- Implement a basic Tf-Idf search.


#### Workflow

- Load all relevant Python libraries and a spaCy language model.

- Access the tokenized text in your new dataset from the previous milestone. Each document dictionary should now include a new key-value pair with the lemmatized text of the articles.

- Create a corpus vocabulary. It should simply be a list of unique tokens in the provided set of documents. Count how many times each unique token appears in the corpus; you will need these counts for the next step.

- Calculate Tf-Idf vectors for every article in the dataset and add these vectors to the article dictionaries. You should end up with the same list of dictionaries as before, but with a new key-value pair containing Tf-Idf vectors:

- title: Title of the Wikipedia article the text is taken from.
- text: Wikipedia article text. (In this dataset we included only the summary.)
- tokenized_text: Tokenized Wikipedia article text.
- url: Link to the Wikipedia article.
- tf_idfs: Tf-Idf vector.

Now we can try to search our list of dictionaries through this Tf-Idf field with existing similarity tools. We suggest you use the scikit-learn library and its cosine_similarity function.

In [100]:
import spacy
import json
import numpy as np
from itertools import chain
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Load the dataset produced by the previous milestone. The handle is only
# needed while parsing, so nothing else happens inside the `with` block.
with open("data/result.json", mode = "rb") as file:
    file_obj = json.load(file)

# Preview the first document only, as a quick check of the available keys.
# A dedicated name is used so the file handle variable is not rebound.
first_doc = next(iter(file_obj), None)
if first_doc is not None:
    print(first_doc.keys())
    print(first_doc["title"])
    print(first_doc["text"][0:25])
    print(first_doc["tokenized_text"])

dict_keys(['title', 'text', 'url', 'tokenized_text'])
Pandemic
A pandemic (from Greek πᾶ
['pandemic', 'greek', 'πᾶν', 'pan', 'δῆμος', 'demos', 'people', 'epidemic', 'infectious', 'disease', 'spread', 'large', 'region', 'instance', 'multiple', 'continent', 'worldwide', 'affect', 'substantial', 'number', 'people', 'widespread', 'endemic', 'disease', 'stable', 'number', 'infected', 'people', 'pandemic', 'widespread', 'endemic', 'disease', 'stable', 'number', 'infected', 'people', 'recurrence', 'seasonal', 'influenza', 'generally', 'exclude', 'occur', 'simultaneously', 'large', 'region', 'globe', 'spread', 'worldwide', 'human', 'history', 'number', 'pandemic', 'disease', 'smallpox', 'tuberculosis', 'fatal', 'pandemic', 'record', 'history', 'black', 'death', 'know', 'plague', 'kill', 'estimate', '75–200', 'million', 'people', '14th', 'century', 'term', 'later', 'pandemic', 'include', '1918', 'influenza', 'pandemic', 'spanish', 'flu', 'current', 'pandemic', 'include', 'covid-19', 'sar', 'cov

----

In [None]:
# Create a corpus vocabulary. It should simply be a list of unique tokens in the provided
# set of documents. Count how many times each unique token appears in the corpus, 
# you will need these counts for the next step.

In [101]:
def create_corpus(docs):
    """
    Build the corpus vocabulary together with corpus-wide token counts.

    Args:
        docs: Iterable of document dictionaries; each one provides its
            tokens under the "tokenized_text" key.

    Returns:
        OrderedDict mapping every unique token to the number of times it
        occurs across all documents, keyed in order of first appearance.
    """
    # Flatten the per-document token lists into a single stream of tokens.
    all_tokens = chain.from_iterable(doc["tokenized_text"] for doc in docs)
    token_counts = Counter(all_tokens)
    return OrderedDict(token_counts)

In [102]:
corpus = create_corpus(file_obj)
### Holds the words of all documents and their frequency across all documents.
corpus

OrderedDict([('pandemic', 59),
             ('greek', 1),
             ('πᾶν', 1),
             ('pan', 1),
             ('δῆμος', 1),
             ('demos', 1),
             ('people', 29),
             ('epidemic', 11),
             ('infectious', 8),
             ('disease', 39),
             ('spread', 17),
             ('large', 13),
             ('region', 5),
             ('instance', 1),
             ('multiple', 3),
             ('continent', 1),
             ('worldwide', 7),
             ('affect', 15),
             ('substantial', 1),
             ('number', 15),
             ('widespread', 3),
             ('endemic', 4),
             ('stable', 3),
             ('infected', 10),
             ('recurrence', 1),
             ('seasonal', 1),
             ('influenza', 26),
             ('generally', 3),
             ('exclude', 1),
             ('occur', 6),
             ('simultaneously', 1),
             ('globe', 1),
             ('human', 21),
             ('history', 4

----

In [107]:
def compute_tf(tokens):
    """
    Compute the relative term frequency of every token of one document.

    Args:
        tokens: Iterable of hashable tokens belonging to a single document.

    Returns:
        OrderedDict mapping each distinct token to
        (occurrences of the token) / (total number of tokens), keyed in
        order of first appearance. Empty when there are no tokens.
    """
    term_counts = Counter(tokens)
    # Normalise by the document length (all occurrences), not by the number
    # of distinct terms, so the frequencies of a document sum to 1.
    total_terms = sum(term_counts.values())

    return OrderedDict(
        (term, count / total_terms) for term, count in term_counts.items()
    )


def count_word_document(corpus, docs):
    """
    Count, for every corpus term, how many documents contain it.

    Args:
        corpus: Iterable of terms (e.g. the vocabulary keys).
        docs: Iterable of document dictionaries; each one provides its
            tokens under the "tokenized_text" key.

    Returns:
        dict mapping each term of `corpus` to the number of documents whose
        token list contains it (0 for terms that appear in no document).
    """
    # One pass over the documents: de-duplicate each token list so a term is
    # counted at most once per document, then tally. This avoids scanning
    # every token list again for every term of the vocabulary.
    doc_frequency = Counter()
    for doc in docs:
        doc_frequency.update(set(doc["tokenized_text"]))

    # Counter returns 0 for missing keys without inserting them.
    return {term: doc_frequency[term] for term in corpus}

    
def compute_idf(docs, dict_count):
    """
    Compute the inverse document frequency log(N / nw) for every word.

    N:  total number of documents (len(docs))
    nw: number of documents containing the word w (values of dict_count)

    Returns a dict mapping each word of dict_count to its natural-log IDF.
    """
    total_docs = len(docs)
    return {
        word: np.log(total_docs / doc_freq)
        for word, doc_freq in dict_count.items()
    }

In [118]:
# Load the documents; the input handle is released as soon as parsing is done
# instead of staying open during the computation and the final write.
with open("data/result.json", mode = "rb") as file:
    docs = json.load(file)

corpus = create_corpus(docs)

### IDF
# Iterating the corpus mapping yields its keys, i.e. the vocabulary terms.
dict_count_document = count_word_document(corpus, docs)
idf = compute_idf(docs, dict_count_document)

### TF-IDF
for doc in docs:
    term_freqs = compute_tf(doc["tokenized_text"])
    # For each term of the document, weight its term frequency by the IDF.
    # A new mapping is built rather than mutating one while iterating it.
    doc["tf_idfs"] = {
        term: tfreq * idf[term] for term, tfreq in term_freqs.items()
    }

## Finally, the result is saved.
with open('data/tfidf_result.json', 'w') as output:
    json.dump(docs, output)