# 0.4 Tokenize

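This notebook uses spaCy to tokenize and lemmatize the answer text, keeping only nouns, adjectives and proper nouns that are neither stop words nor URLs.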

In [1]:
import os

import pandas as pd
import engarde.decorators as ed
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH, LEMMA

In [2]:
# Relative path to the parent of the current working directory.
PROJ_ROOT = os.pardir

In [3]:
# Read the filtered data


@ed.is_shape((None, 6))
def load_data():
    """Read the interim statements table from disk.

    The file read is ``data/interim/data_statements.feather`` underneath the
    notebook-level ``PROJ_ROOT``.

    Returns:
        pandas.DataFrame: the contents of the feather file. The result passes
        through ``ed.is_shape((None, 6))`` -- presumably a check for exactly
        6 columns with any number of rows; confirm against engarde's docs.
    """
    # Hand each path component to os.path.join separately so it inserts the
    # separators itself, rather than joining one pre-concatenated string.
    read_path = os.path.join(PROJ_ROOT, "data", "interim", "data_statements.feather")

    return pd.read_feather(read_path)

In [4]:
# Load the interim statements table; the path and shape check live in load_data().
filtered_questions = load_data()

In [5]:
# Show column dtypes and non-null counts of the freshly loaded frame.
filtered_questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197899 entries, 0 to 197898
Data columns (total 6 columns):
site_name           197899 non-null object
documentid          197899 non-null object
customquestionid    197899 non-null int64
questiontext        197899 non-null object
answertext          127714 non-null object
submissiondate      191066 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 9.1+ MB


In [6]:
# Keep only the rows that have both an answer and a submission date.
required_columns = ["answertext", "submissiondate"]
filtered_questions = filtered_questions.dropna(subset=required_columns)

# Show the row and non-null counts again after the drop.
filtered_questions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124089 entries, 0 to 197896
Data columns (total 6 columns):
site_name           124089 non-null object
documentid          124089 non-null object
customquestionid    124089 non-null int64
questiontext        124089 non-null object
answertext          124089 non-null object
submissiondate      124089 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 6.6+ MB


In [7]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Add custom stop words to spaCy's list.
#
# The stop-word flag is stored per vocabulary entry, and entries are looked up
# by their exact string, so flagging only "wiley" would leave "Wiley" and
# "WILEY" unflagged. Flag the lowercase, capitalised and upper-case spelling
# of every custom word.

customize_stop_words = ["wiley", "br", "href", "url", "et", "al"]
for w in customize_stop_words:
    for variant in (w, w.capitalize(), w.upper()):
        nlp.vocab[variant].is_stop = True

In [9]:
# Register a tokenizer special case for the exact string "data" that pins its
# lemma to "data", so it is not changed to "datum".

data_token_attrs = {ORTH: "data", LEMMA: "data"}
case = [data_token_attrs]
nlp.tokenizer.add_special_case("data", case)

In [10]:
def spacy_tokenizer(sentence):
    """Reduce a text to the lemmas of its content words.

    The text is run through the notebook-level ``nlp`` pipeline. A token is
    kept when its part-of-speech tag is NOUN, ADJ or PROPN and it is neither
    flagged as a stop word nor URL-like.

    Args:
        sentence: text to process; passed straight to ``nlp``.

    Returns:
        str: lemmas of the kept tokens, in document order, joined by single
        spaces (an empty string when no token is kept).
    """
    kept_lemmas = []

    for tok in nlp(sentence):
        # Only nouns, adjectives and proper nouns are of interest.
        if tok.pos_ not in ["NOUN", "ADJ", "PROPN"]:
            continue
        # Drop stop words and anything that looks like a URL.
        if tok.is_stop or tok.like_url:
            continue
        kept_lemmas.append(tok.lemma_)

    return " ".join(kept_lemmas)

In [None]:
%time filtered_questions["proc_answers"] = filtered_questions["answertext"].apply(spacy_tokenizer)