# Part 2 - 3: POS Tagging

In [75]:
import pandas as pd
# add util.py to the path
import sys
sys.path.append('../')
from util import clean_text
import spacy
nlp_spacy =spacy.load("en_core_web_sm")

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# word_tokenize() relies on the Punkt sentence-tokenizer models. They were not
# downloaded here before, so a fresh environment failed with a LookupError as
# soon as the NLTK tagging cell ran.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')
# Model used by pos_tag().
nltk.download('averaged_perceptron_tagger')
# Tag mapping needed for pos_tag(..., tagset='universal').
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bleuze3u\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\bleuze3u\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [76]:
# Read the CSV holding the shared sentences into a DataFrame and display it.
shared_sentences_csv = "../../data/part2/part2-shared-sentences.csv"
df = pd.read_csv(shared_sentences_csv)
df

Unnamed: 0,shared_sentence
0,"Values emerge from science, both as product an..."
1,The scientific enterprise is embedded in parti...
2,"This discipline overlaps with metaphysics, ont..."
3,"Yet, although various measures of simplicity h..."
4,The chicken may therefore use inductive reason...
5,One approach is to acknowledge that induction ...
6,Philosophers have tried to make this heuristic...
7,There is no consensus among philosophers about...
8,So the chicken would be right to conclude from...
9,In addition to these general questions about s...


In [77]:
def get_tokens_and_POS_NLTK(text):
    """Tokenize ``text`` with NLTK and tag every token.

    The text is split with ``word_tokenize`` and the resulting tokens are
    passed to ``pos_tag`` using the ``'universal'`` tagset.

    Returns whatever ``pos_tag`` produces for those tokens (a list of
    ``(token, tag)`` pairs).
    """
    words = word_tokenize(text)
    tagged_words = pos_tag(words, tagset='universal')
    return tagged_words

def get_tokens_and_POS_spacy(text):
    """Run ``text`` through the loaded spaCy pipeline and collect its tags.

    Returns a list with one ``(token.text, token.pos_)`` pair per token of the
    spaCy document, in document order.
    """
    tagged_tokens = []
    for tok in nlp_spacy(text):
        tagged_tokens.append((tok.text, tok.pos_))
    return tagged_tokens

In [78]:
# Normalise each shared sentence with the project's clean_text helper.
cleaned_sentences = df["shared_sentence"].apply(clean_text)
df["cleaned_sentence"] = cleaned_sentences

# Tag the cleaned sentences with both libraries, one result column per tagger
# (NLTK first, then spaCy).
for tagged_column, tagger in (
    ("tokens_and_POS_NLTK", get_tokens_and_POS_NLTK),
    ("tokens_and_POS_spacy", get_tokens_and_POS_spacy),
):
    df[tagged_column] = df["cleaned_sentence"].apply(tagger)

In [79]:
# Preview the first five rows, now including the cleaned text and both taggings.
df.head(5)

Unnamed: 0,shared_sentence,cleaned_sentence,tokens_and_POS_NLTK,tokens_and_POS_spacy
0,"Values emerge from science, both as product an...","values emerge from science, both as product an...","[(values, NOUN), (emerge, VERB), (from, ADP), ...","[(values, NOUN), (emerge, VERB), (from, ADP), ..."
1,The scientific enterprise is embedded in parti...,the scientific enterprise is embedded in parti...,"[(the, DET), (scientific, ADJ), (enterprise, N...","[(the, DET), (scientific, ADJ), (enterprise, N..."
2,"This discipline overlaps with metaphysics, ont...","this discipline overlaps with metaphysics, ont...","[(this, DET), (discipline, NOUN), (overlaps, V...","[(this, DET), (discipline, NOUN), (overlaps, V..."
3,"Yet, although various measures of simplicity h...","yet, although various measures of simplicity h...","[(yet, ADV), (,, .), (although, ADP), (various...","[(yet, ADV), (,, PUNCT), (although, SCONJ), (v..."
4,The chicken may therefore use inductive reason...,the chicken may therefore use inductive reason...,"[(the, DET), (chicken, NOUN), (may, VERB), (th...","[(the, DET), (chicken, NOUN), (may, AUX), (the..."


In [80]:
def _token_set(tagged_pairs):
    """Return the set of token strings (first item of each pair), dropping the tags."""
    return {pair[0] for pair in tagged_pairs}


# Tokens that both NLTK and spaCy produced for the same sentence (POS ignored).
df["common_tokens"] = df.apply(
    lambda row: _token_set(row["tokens_and_POS_NLTK"]) & _token_set(row["tokens_and_POS_spacy"]),
    axis=1,
)
# Number of distinct shared tokens per sentence.
df["common_tokens_count"] = df["common_tokens"].apply(len)
df

Unnamed: 0,shared_sentence,cleaned_sentence,tokens_and_POS_NLTK,tokens_and_POS_spacy,common_tokens,common_tokens_count
0,"Values emerge from science, both as product an...","values emerge from science, both as product an...","[(values, NOUN), (emerge, VERB), (from, ADP), ...","[(values, NOUN), (emerge, VERB), (from, ADP), ...","{in, values, be, both, .., as, product, emerge...",20
1,The scientific enterprise is embedded in parti...,the scientific enterprise is embedded in parti...,"[(the, DET), (scientific, ADJ), (enterprise, N...","[(the, DET), (scientific, ADJ), (enterprise, N...","{through, practitioners, in, particular, is, t...",14
2,"This discipline overlaps with metaphysics, ont...","this discipline overlaps with metaphysics, ont...","[(this, DET), (discipline, NOUN), (overlaps, V...","[(this, DET), (discipline, NOUN), (overlaps, V...","{metaphysics, .., discipline, epistemology, ex...",20
3,"Yet, although various measures of simplicity h...","yet, although various measures of simplicity h...","[(yet, ADV), (,, .), (although, ADP), (various...","[(yet, ADV), (,, PUNCT), (although, SCONJ), (v...","{.., yet, accepted, forward, as, thing, that, ...",26
4,The chicken may therefore use inductive reason...,the chicken may therefore use inductive reason...,"[(the, DET), (chicken, NOUN), (may, VERB), (th...","[(the, DET), (chicken, NOUN), (may, AUX), (the...","{every, bring, food, the, inductive, will, may...",17
5,One approach is to acknowledge that induction ...,one approach is to acknowledge that induction ...,"[(one, NUM), (approach, NOUN), (is, VERB), (to...","[(one, NUM), (approach, NOUN), (is, AUX), (to,...","{approach, certainty, .., achieve, general, on...",26
6,Philosophers have tried to make this heuristic...,philosophers have tried to make this heuristic...,"[(philosophers, NOUN), (have, VERB), (tried, V...","[(philosophers, NOUN), (have, AUX), (tried, VE...","{in, other, .., or, philosophers, to, this, ha...",19
7,There is no consensus among philosophers about...,there is no consensus among philosophers about...,"[(there, DET), (is, VERB), (no, DET), (consens...","[(there, PRON), (is, VERB), (no, DET), (consen...","{about, be, .., unobservable, whether, includi...",32
8,So the chicken would be right to conclude from...,so the chicken would be right to conclude from...,"[(so, ADV), (the, DET), (chicken, NOUN), (woul...","[(so, ADV), (the, DET), (chicken, NOUN), (woul...","{those, be, food, .., will, if, farmer, to, th...",31
9,In addition to these general questions about s...,in addition to these general questions about s...,"[(in, ADP), (addition, NOUN), (to, PRT), (thes...","[(in, ADP), (addition, NOUN), (to, ADP), (thes...","{in, particular, about, .., general, (, as, or...",27


In [81]:
# Plain Python lists of the per-sentence (token, POS) taggings.
nltk_list = df["tokens_and_POS_NLTK"].tolist()
spacy_list = df["tokens_and_POS_spacy"].tolist()

common_tokens_list = []
# Placeholder for the shared (token, POS) pairs; nothing is appended to it here.
common_tokens_and_pos_list = []

# Walk both taggings in parallel and keep the token strings present in both.
for nltk_tagged, spacy_tagged in zip(nltk_list, spacy_list):
    nltk_tokens = {pair[0] for pair in nltk_tagged}
    spacy_tokens = {pair[0] for pair in spacy_tagged}
    common_tokens_list.append(nltk_tokens & spacy_tokens)

df["common_tokens"] = common_tokens_list
df["common_tokens_count"] = df["common_tokens"].apply(len)

**TODO — to be finished:** the cell below is an unfinished draft.
- Find a way to retrieve the common `(token, POS)` pairs.

In [None]:
# Unfinished draft, kept commented out on purpose.
# Idea: for every row, keep the (token, POS) pairs from the NLTK tagging whose
# token belongs to that row's "common_tokens" set. Only NLTK's tag is kept;
# the spaCy tag for the same token is not retrieved.
# NOTE(review): the second membership test against the spaCy tokens looks
# redundant, since "common_tokens" is already the NLTK/spaCy intersection -
# confirm before enabling.
# common_list = df["common_tokens"].tolist()
# common_token_and_pos = []

# for i, shared_token in enumerate(common_list):

#     nltk_, spacy_ = nltk_list[i], spacy_list[i]

#     common_token_and_pos.append([(token[0], token[1]) for token in nltk_ if token[0] in shared_token and token[0] in [token[0] for token in spacy_]])

In [82]:
# Preview the first five rows, including the common-token columns.
df.head(5)

Unnamed: 0,shared_sentence,cleaned_sentence,tokens_and_POS_NLTK,tokens_and_POS_spacy,common_tokens,common_tokens_count
0,"Values emerge from science, both as product an...","values emerge from science, both as product an...","[(values, NOUN), (emerge, VERB), (from, ADP), ...","[(values, NOUN), (emerge, VERB), (from, ADP), ...","{in, values, be, both, .., as, product, emerge...",20
1,The scientific enterprise is embedded in parti...,the scientific enterprise is embedded in parti...,"[(the, DET), (scientific, ADJ), (enterprise, N...","[(the, DET), (scientific, ADJ), (enterprise, N...","{through, practitioners, in, particular, is, t...",14
2,"This discipline overlaps with metaphysics, ont...","this discipline overlaps with metaphysics, ont...","[(this, DET), (discipline, NOUN), (overlaps, V...","[(this, DET), (discipline, NOUN), (overlaps, V...","{metaphysics, .., discipline, epistemology, ex...",20
3,"Yet, although various measures of simplicity h...","yet, although various measures of simplicity h...","[(yet, ADV), (,, .), (although, ADP), (various...","[(yet, ADV), (,, PUNCT), (although, SCONJ), (v...","{.., yet, accepted, forward, as, thing, that, ...",26
4,The chicken may therefore use inductive reason...,the chicken may therefore use inductive reason...,"[(the, DET), (chicken, NOUN), (may, VERB), (th...","[(the, DET), (chicken, NOUN), (may, AUX), (the...","{every, bring, food, the, inductive, will, may...",17


In [83]:
# Show the full NLTK and spaCy taggings of the first sentence, separated by a
# blank line, so the two tag inventories can be compared.
first_row_taggings = (df["tokens_and_POS_NLTK"][0], df["tokens_and_POS_spacy"][0])
print(*first_row_taggings, sep="\n\n")

[('values', 'NOUN'), ('emerge', 'VERB'), ('from', 'ADP'), ('science', 'NOUN'), (',', '.'), ('both', 'CONJ'), ('as', 'ADP'), ('product', 'NOUN'), ('and', 'CONJ'), ('process', 'NOUN'), ('and', 'CONJ'), ('can', 'VERB'), ('be', 'VERB'), ('distributed', 'VERB'), ('among', 'ADP'), ('several', 'ADJ'), ('cultures', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('society', 'NOUN'), ('..', 'NOUN')]

[('values', 'NOUN'), ('emerge', 'VERB'), ('from', 'ADP'), ('science', 'NOUN'), (',', 'PUNCT'), ('both', 'CCONJ'), ('as', 'ADP'), ('product', 'NOUN'), ('and', 'CCONJ'), ('process', 'NOUN'), ('and', 'CCONJ'), ('can', 'AUX'), ('be', 'AUX'), ('distributed', 'VERB'), ('among', 'ADP'), ('several', 'ADJ'), ('cultures', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('society', 'NOUN'), ('..', 'PUNCT')]
