In [2]:
import requests
from bs4 import BeautifulSoup
import random
import string
import nltk
import zipfile
import wikipedia
import pandas as pd
import spacy
import stanza
import ast
import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# Data Collection

In [3]:
def create_dataset(src, n):
    if "wikipedia.org" in src:
        dataset = data_from_url(src, n)
    elif src[-4:] == '.zip' and src[:4] != 'http':
      dataset = data_from_archive(n)
    else:
        raise ValueError("Invalid source specified. Please choose 'archive' or 'url'.")
    return dataset

In [4]:
def data_from_archive(path_to_zip_file, n):
    '''
    Code to retrieve data from an archive source
    '''
    dataset = []
    count = 0

    with zipfile.ZipFile(path_to_zip_file, 'r') as f:
      for name in f.namelist():
          data = f.read(name)
          if count < n:
            dataset.append(data)
            count += 1
          else:
            break

    return dataset

def data_from_url(path_to_url, n):
    '''
    Code to retrieve data from a URL source
    '''  
    dataset = []
    count = 0

    page = wikipedia.page(path_to_url)
    content = page.content
    dataset.append(content)
    
    all_links = page.links
    for link in all_links:
      if count < n-1:
        try:
          page = wikipedia.page(title=link)
          content = page.content
          dataset.append(content)
          count += 1
        except: pass
      else:
        break
    
    return dataset

In [5]:
src = "https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_15th_and_16th_centuries"
n = 50

dataset = create_dataset(src, n)



  lis = BeautifulSoup(html).find_all('li')


In [11]:
df = pd.DataFrame(dataset, columns=['texts'])
df.to_csv('part2_dataset.csv', index=False)

# POS tagging

In [3]:
df = pd.read_csv('tokenization_after_segmentation.csv')
df.head()

Unnamed: 0,texts,sentences_stanza,sentences_spacy,num_sentences_stanza,num_sentences_spacy,unique_sent_stanza,unique_sent_spacy,tokens_stanza,tokens_spacy,tokens_occurence_stanza,tokens_occurence_spacy
0,Some notable French Huguenots or people with F...,['Some notable French Huguenots or people with...,['Some notable French Huguenots or people with...,1642,1630,"Jean Jacques Favre, pastor.","Antoine Barnave (1761-1783), French revolution...","['notable', 'french', 'huguenot', 'people', 'f...","['notable', 'french', 'huguenot', 'people', 'f...","Counter({'de': 290, 'pastor': 280, 'french': 2...","Counter({'de': 290, 'pastor': 280, 'french': 2..."
1,Abel Boyer (1667? – 16 November 1729) was a Fr...,['Abel Boyer (1667? – 16 November 1729) was a ...,['Abel Boyer (1667? – 16 November 1729) was a ...,54,51,Glen Buxton said he could listen to Barrett's ...,[The psychiatric evaluation of Jesus.,"['abel', 'boyer', 'november', 'french', 'engli...","['abel', 'boyer', 'november', 'french', 'engli...","Counter({'boyer': 27, 'french': 17, 'english':...","Counter({'boyer': 27, 'french': 17, 'english':..."
2,"Abolitionism, or the abolitionist movement, is...","['Abolitionism, or the abolitionist movement, ...","['Abolitionism, or the abolitionist movement, ...",332,302,"Francis Durand, convert from Roman Catholicism...","Faneuil hall and Faneuil Hall Market: or, Pete...","['abolitionism', 'abolitionist', 'movement', '...","['abolitionism', 'abolitionist', 'movement', '...","Counter({'slavery': 144, 'slave': 118, 'state'...","Counter({'slavery': 171, 'slave': 118, 'state'..."
3,"In the United States, abolitionism, the moveme...","['In the United States, abolitionism, the move...","['In the United States, abolitionism, the move...",545,518,"Renaud (1952-), pop-rock singer, anti-military...","Michael Pertwee (1916-1991), playwright and sc...","['united', 'state', 'abolitionism', 'movement'...","['united', 'state', 'abolitionism', 'movement'...","Counter({'slavery': 151, 'slave': 127, 'abolit...","Counter({'slavery': 207, 'slave': 127, 'abolit..."
4,Abraham Bosse (c. 1604 – 14 February 1676) was...,['Abraham Bosse (c.\u20091604 – 14 February 16...,['Abraham Bosse (c.\u20091604 – 14 February 16...,65,75,"Charles Chauvel (1897–1959), Australian film-m...","Ludwig Devrient (1784–1832), German actor.\n","['abraham', 'bosse', 'february', 'french', 'ar...","['abraham', 'bosse', 'february', 'french', 'ar...","Counter({'de': 34, 'la': 16, 'bosse': 14, 'le'...","Counter({'de': 34, 'la': 16, 'bosse': 14, 'le'..."


In [4]:
# to be deleted
df['tokens_stanza'] = df['tokens_stanza'].apply(ast.literal_eval)
df['tokens_spacy'] = df['tokens_spacy'].apply(ast.literal_eval)

In [5]:
# to be deleted
# creating vocabularies of unique tokens for each library
vocab_stanza = set(token for tokens in df['tokens_stanza'] for token in tokens)
vocab_spacy = set(token for tokens in df['tokens_spacy'] for token in tokens)

# tokens which simalteneously present in both vocabularies
SharedTokenInSentences = vocab_stanza.intersection(vocab_spacy)

# creating dataframe for storing pos tags
df_pos = pd.DataFrame(columns=['token', 'stanza_pos', 'spacy_pos'])
df_pos['token'] = list(SharedTokenInSentences)

In [6]:
# pos tagging using stanza
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,pos')
stanza_doc = df_pos['token'].apply(nlp_stanza)
df_pos['stanza_pos'] = [token.sentences[0].words[0].upos for token in stanza_doc]

2023-05-14 14:24:35 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 58.6MB/s]                    
2023-05-14 14:24:36 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2023-05-14 14:24:36 INFO: Using device: cpu
2023-05-14 14:24:36 INFO: Loading: tokenize
2023-05-14 14:24:36 INFO: Loading: pos
2023-05-14 14:24:36 INFO: Done loading processors!


In [None]:
# pos tagging using spacy
nlp_spacy = spacy.load('en_core_web_sm')
df_pos['spacy_pos'] = [t.pos_ for token in df_pos['token'].apply(nlp_spacy) for t in token]

In [None]:
df_pos.head()

In [None]:
print(f"Number of tokens in the dataset: {df_pos.shape[0]}")
print(f"Number of times the token is assigned the same UPOS by both libraries: {df_pos[df_pos['stanza_pos'] == df_pos['spacy_pos']].shape[0]}")
print(f"Ratio of the times the token is assigned the same UPOS by both libraries: {df_pos[df_pos['stanza_pos'] == df_pos['spacy_pos']].shape[0] / df_pos.shape[0]}")

In [None]:
# list of pos tags in each library's results
upos_spacy = df_pos['spacy_pos'].unique()
upos_stanza = df_pos['stanza_pos'].unique()

In [None]:
def frequency_mapping(lib, upos_list):
    '''
    Map frequency of UPOS tags in one library to the other library
    lib: str, 'spacy' or 'stanza'
    upos_list: list of UPOS tags in the library
    return: dict, mapping of frequencies
    '''
    if lib == 'spacy':
        other_lib = 'stanza'
    else:
        other_lib = 'spacy'

    print(f"Frequencies of {lib} UPOS tags")
    mapping = {}
    for tag in upos_list:
        print(f"\nFor all tokens labelled {tag} in {lib}:")

        mapping[tag] = {}
        sub_df = df_pos[df_pos[f'{lib}_pos'] == tag]
        other_lib_tag_list = df_pos[df_pos[f'{lib}_pos'] == tag][f'{other_lib}_pos'].unique()

        for other_tag in other_lib_tag_list:
            mapping[tag][other_tag] = len(sub_df[sub_df[f'{other_lib}_pos'] == other_tag]) / len(sub_df) * 100
        for key, value in mapping[tag].items():
            print(f"{key}: {value:.2f}%", end=", ")

    return mapping

In [None]:
mapping_spacy = frequency_mapping('spacy', upos_spacy)

In [None]:
mapping_stanza = frequency_mapping('stanza', upos_stanza)

## Visualization

spacy2stanza = pd.DataFrame(mapping_spacy)
stanza2spacy = pd.DataFrame(mapping_stanza)

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(spacy2stanza, annot=True, cmap='YlGnBu')
plt.title('Spacy to Stanza POS tags')
plt.xlabel('Spacy POS')
plt.ylabel('Stanza POS')
plt.show()

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(stanza2spacy, annot=True, cmap='YlGnBu')
plt.title('Stanza to Spacy POS tags')
plt.xlabel('Stanza POS')
plt.ylabel('Spacy POS')
plt.show()