In [38]:
import pandas as pd
import spacy
import stanza
import string
import nltk
import ast
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [25]:
df = pd.read_csv('tokenization_after_segmentation.csv')
df.head()

Unnamed: 0,texts,sentences_stanza,sentences_spacy,num_sentences_stanza,num_sentences_spacy,unique_sent_stanza,unique_sent_spacy,tokens_stanza,tokens_spacy,tokens_occurence_stanza,tokens_occurence_spacy
0,Some notable French Huguenots or people with F...,['Some notable French Huguenots or people with...,['Some notable French Huguenots or people with...,1642,1630,"Jean Jacques Favre, pastor.","Antoine Barnave (1761-1783), French revolution...","['notable', 'french', 'huguenot', 'people', 'f...","['notable', 'french', 'huguenot', 'people', 'f...","Counter({'de': 290, 'pastor': 280, 'french': 2...","Counter({'de': 290, 'pastor': 280, 'french': 2..."
1,Abel Boyer (1667? – 16 November 1729) was a Fr...,['Abel Boyer (1667? – 16 November 1729) was a ...,['Abel Boyer (1667? – 16 November 1729) was a ...,54,51,Glen Buxton said he could listen to Barrett's ...,[The psychiatric evaluation of Jesus.,"['abel', 'boyer', 'november', 'french', 'engli...","['abel', 'boyer', 'november', 'french', 'engli...","Counter({'boyer': 27, 'french': 17, 'english':...","Counter({'boyer': 27, 'french': 17, 'english':..."
2,"Abolitionism, or the abolitionist movement, is...","['Abolitionism, or the abolitionist movement, ...","['Abolitionism, or the abolitionist movement, ...",332,302,"Francis Durand, convert from Roman Catholicism...","Faneuil hall and Faneuil Hall Market: or, Pete...","['abolitionism', 'abolitionist', 'movement', '...","['abolitionism', 'abolitionist', 'movement', '...","Counter({'slavery': 144, 'slave': 118, 'state'...","Counter({'slavery': 171, 'slave': 118, 'state'..."
3,"In the United States, abolitionism, the moveme...","['In the United States, abolitionism, the move...","['In the United States, abolitionism, the move...",545,518,"Renaud (1952-), pop-rock singer, anti-military...","Michael Pertwee (1916-1991), playwright and sc...","['united', 'state', 'abolitionism', 'movement'...","['united', 'state', 'abolitionism', 'movement'...","Counter({'slavery': 151, 'slave': 127, 'abolit...","Counter({'slavery': 207, 'slave': 127, 'abolit..."
4,Abraham Bosse (c. 1604 – 14 February 1676) was...,['Abraham Bosse (c.\u20091604 – 14 February 16...,['Abraham Bosse (c.\u20091604 – 14 February 16...,65,75,"Charles Chauvel (1897–1959), Australian film-m...","Ludwig Devrient (1784–1832), German actor.\n","['abraham', 'bosse', 'february', 'french', 'ar...","['abraham', 'bosse', 'february', 'french', 'ar...","Counter({'de': 34, 'la': 16, 'bosse': 14, 'le'...","Counter({'de': 34, 'la': 16, 'bosse': 14, 'le'..."


In [40]:
df['tokens_stanza'] = df['tokens_stanza'].apply(ast.literal_eval)
df['tokens_spacy'] = df['tokens_spacy'].apply(ast.literal_eval)

In [50]:
# creating vocabularies of unique tokens for each library
vocab_stanza = set(token for tokens in df['tokens_stanza'] for token in tokens)
vocab_spacy = set(token for tokens in df['tokens_spacy'] for token in tokens)

# tokens which simalteneously present in both vocabularies
SharedTokenInSentences = vocab_stanza.intersection(vocab_spacy)

# creating dataframe for storing pos tags
df_pos = pd.DataFrame(columns=['token', 'stanza_pos', 'spacy_pos'])
df_pos['token'] = list(SharedTokenInSentences)

In [None]:
# pos tagging using stanza
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,pos')
stanza_doc = df_pos['token'].apply(nlp_stanza)
df_pos['stanza_pos'] = [token.sentences[0].words[0].upos for token in stanza_doc]

In [78]:
# pos tagging using spacy
nlp_spacy = spacy.load('en_core_web_sm')
df_pos['spacy_pos'] = [t.pos_ for token in df_pos['token'].apply(nlp_spacy) for t in token]

In [79]:
df_pos.head()

Unnamed: 0,token,stanza_pos,spacy_pos
0,tower,NOUN,NOUN
1,word,NOUN,NOUN
2,hess,PROPN,NOUN
3,inoculated,VERB,VERB
4,lifeboat,NOUN,NOUN


In [81]:
print(f"Number of tokens in the dataset: {df_pos.shape[0]}")
print(f"Number of times the token is assigned the same UPOS by both libraries: {df_pos[df_pos['stanza_pos'] == df_pos['spacy_pos']].shape[0]}")
print(f"Ratio of the times the token is assigned the same UPOS by both libraries: {df_pos[df_pos['stanza_pos'] == df_pos['spacy_pos']].shape[0] / df_pos.shape[0]}")

Number of tokens in the dataset: 15349
Number of times the token is assigned the same UPOS by both libraries: 9666
Ratio of the times the token is assigned the same UPOS by both libraries: 0.6297478663105088


In [82]:
# list of pos tags in each library's results
upos_spacy = df_pos['spacy_pos'].unique()
upos_stanza = df_pos['stanza_pos'].unique()

In [92]:
def frequency_mapping(lib, upos_list):
    '''
    Map frequency of UPOS tags in one library to the other library
    lib: str, 'spacy' or 'stanza'
    upos_list: list of UPOS tags in the library
    '''
    if lib == 'spacy':
        other_lib = 'stanza'
    else:
        other_lib = 'spacy'

    print(f"Frequencies of {lib} UPOS tags")
    for tag in upos_list:
        print(f"\nFor all tokens labelled {tag} in {lib}:")
        sub_df = df_pos[df_pos[f'{lib}_pos'] == tag]
        other_lib_tag_list = df_pos[df_pos[f'{lib}_pos'] == tag][f'{other_lib}_pos'].unique()

        mapping = {}
        for other_tag in other_lib_tag_list:
            mapping[other_tag] = len(sub_df[sub_df[f'{other_lib}_pos'] == other_tag]) / len(sub_df) * 100
        for key, value in mapping.items():
            print(f"{key}: {value}%", end=", ")

In [93]:
frequency_mapping('spacy', upos_spacy)

Frequencies of spacy UPOS tags

For all tokens labelled NOUN in spacy:
NOUN: 63.20148749154835%, PROPN: 31.11899932386748%, ADJ: 2.603110209601082%, VERB: 2.0453008789722786%, INTJ: 0.4394861392832995%, X: 0.4563894523326572%, SYM: 0.016903313049357674%, ADV: 0.08451656524678837%, NUM: 0.016903313049357674%, PRON: 0.016903313049357674%, 
For all tokens labelled VERB in spacy:
VERB: 53.573085846867755%, NOUN: 23.132250580046403%, ADJ: 5.638051044083527%, PROPN: 17.006960556844547%, X: 0.18561484918793503%, AUX: 0.02320185614849188%, INTJ: 0.37122969837587005%, SYM: 0.02320185614849188%, PUNCT: 0.02320185614849188%, ADV: 0.02320185614849188%, 
For all tokens labelled PROPN in spacy:
PROPN: 74.64377055169894%, X: 0.7307270734380709%, ADJ: 3.1421264157837046%, NOUN: 19.108512970405553%, INTJ: 1.3153087321885275%, VERB: 0.8768724881256851%, PRON: 0.03653635367190354%, ADV: 0.10960906101571063%, PUNCT: 0.03653635367190354%, 
For all tokens labelled ADV in spacy:
ADV: 61.19402985074627%, NOUN

In [94]:
frequency_mapping('stanza', upos_stanza)

Frequencies of stanza UPOS tags

For all tokens labelled NOUN in stanza:
NOUN: 67.92007266121708%, VERB: 18.110808356039964%, PROPN: 9.500454132606722%, ADV: 0.9990917347865577%, ADJ: 2.96094459582198%, ADP: 0.10899182561307902%, X: 0.10899182561307902%, INTJ: 0.1271571298819255%, PRON: 0.05449591280653951%, NUM: 0.018165304268846504%, CCONJ: 0.03633060853769301%, PUNCT: 0.05449591280653951%, 
For all tokens labelled PROPN in stanza:
NOUN: 36.68061366806137%, PROPN: 40.705319784817696%, ADJ: 3.9649332536361825%, ADP: 0.43833432954771867%, INTJ: 0.2988643156007173%, VERB: 14.604502889021717%, ADV: 2.510460251046025%, PUNCT: 0.1394700139470014%, PRON: 0.11954572624028689%, X: 0.35863717872086076%, NUM: 0.019924287706714484%, AUX: 0.11954572624028689%, CCONJ: 0.03984857541342897%, 
For all tokens labelled VERB in stanza:
VERB: 93.17998385794995%, NOUN: 4.882970137207425%, ADJ: 0.8878127522195319%, PROPN: 0.9685230024213075%, X: 0.04035512510088781%, ADV: 0.04035512510088781%, 
For all tok