In [3]:
import spacy
import json

### Explore data

In [95]:
# Load the scraped articles. The handle gets its own name so that nothing
# below rebinds it while the `with` block is still active.
with open("data/data.json", mode="rb") as json_file:
    file_obj = json.load(json_file)

# Peek at the first record only, to see which keys each article carries and
# what the values look like. Nothing is printed if the payload is empty.
first_article = next(iter(file_obj), None)
if first_article is not None:
    print(first_article.keys())
    print(first_article["title"])
    print(first_article["text"][0:25])
    print(first_article["url"])

dict_keys(['title', 'text', 'url'])
Pandemic
A pandemic (from Greek πᾶ
https://en.wikipedia.org/wiki/Pandemic


----

In [53]:
# Task: create a Python function that takes in a text string,
# performs all the operations described in the previous step,
# and outputs a list of tokens (lemmas). The function:
#   - Lowercases the text string.
#   - Creates a spaCy document with the text lemmas and their attributes
#     using a spaCy model of your choice.
#   - Removes stop words, punctuation, and other unclassified lemmas.
#   - Returns a list of tokens (lemmas) found in the text.

## Doc -> Span (a slice of tokens) -> Token (a single item)

In [87]:
# Loaded spaCy pipelines keyed by model name, so the slow model load happens
# once per kernel session instead of once per document.
_PIPELINE_CACHE = {}


def preprocess(content_doc):
    """
    Turn a raw text string into a list of lemmas.

    This function performs the following operations:
    - Lowercases the text string.
    - Creates a spaCy document from it using the ``en_core_web_sm`` model.
    - Removes stop words, punctuation, whitespace tokens, and tokens
        that carry no dependency label.
    - Returns the lemmas of the remaining tokens, in document order.

    :param content_doc: Document content
    :type content_doc: str
    :return: List of tokens (lemmas)
    :rtype: list[str]
    """
    model_name = 'en_core_web_sm'

    # Reuse an already-loaded pipeline; load (and remember) it on first use.
    nlp = _PIPELINE_CACHE.get(model_name)
    if nlp is None:
        nlp = _PIPELINE_CACHE[model_name] = spacy.load(model_name)

    # doc contains the tokens and their attributes
    # https://spacy.io/api/doc
    doc = nlp(content_doc.lower())

    # Filtering relies on the token attributes. Testing a Token for
    # membership in the set of stop-word *strings* never matches, so
    # `is_stop` is the check that actually removes stop words.
    # `is_space` drops every whitespace-only token, not just a lone "\n".
    return [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.is_space
        and token.dep_.strip()
    ]

In [88]:
# Re-read the raw articles so this cell does not depend on earlier state.
with open("data/data.json", mode="rb") as json_file:
    file_obj = json.load(json_file)

# Add a "tokenized_text" entry to every article (in place) and print each
# title as a simple progress indicator.
for article in file_obj:
    print(article["title"])
    article["tokenized_text"] = preprocess(article["text"])

Pandemic
Epidemiology of HIV/AIDS
Antonine Plague
Basic reproduction number
Bills of mortality
Cholera
COVID-19 pandemic
Crimson Contagion
Disease X
Event 201
HIV/AIDS
HIV/AIDS in Yunnan
Pandemic prevention
Pandemic Severity Assessment Framework
Pandemic severity index
Plague of Cyprian
PREDICT (USAID)
1929–1930 psittacosis pandemic
Science diplomacy and pandemics
Spanish flu
Superspreader
Swine influenza
Targeted immunization strategies
Unified Victim Identification System
Viral load
Virus


In [91]:
# Persist the articles, including the "tokenized_text" entries added above,
# as JSON next to the input file.
with open('data/result.json', mode='w') as result_file:
    json.dump(file_obj, fp=result_file)

### Test spaCy
https://spacy.io/usage/spacy-101#annotations-token

In [54]:
# Scratch pipeline for inspecting spaCy objects; the parser and NER
# components are disabled in this one.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

sample_sentence = "It's over Anakin! I have the high ground"
doc = nlp(sample_sentence)

In [55]:
# Calling the pipeline on text returns a Doc.
type(doc)

spacy.tokens.doc.Doc

In [57]:
# Slicing a Doc yields a Span.
type(doc[0:2])

spacy.tokens.span.Span

In [58]:
# Integer indexing into a Doc yields a single Token.
type(doc[0])

spacy.tokens.token.Token

In [56]:
# Check whether the token at index 2 is punctuation.
doc[2].is_punct 

False