# Text Tagging

### Parts of Speech (POS) Tagging

In [None]:
import spacy
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Sentence-tokenizer models used by nltk.sent_tokenize further down.
# Recent NLTK releases look the Punkt data up under 'punkt_tab', while older
# ones use 'punkt'; fetching both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; it supplies the tokens, POS tags
# and named entities read from the Doc objects in the cells below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Our text is the opening of Jane Austen's 'Emma'.
# Punctuation has been removed and the text lowercased, but stop words are kept.
emma_ja = "emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of authority being now long passed away they had been living together as friend and friend very mutually attached and emma doing just what she liked highly esteeming miss taylors judgment but directed chiefly by her own"

In [None]:
# Create a spaCy Doc from our text - this generates the tokens and their
# associated POS tags. The bare name on the last line echoes the Doc.
spacy_doc = nlp(emma_ja)
spacy_doc

emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of auth

In [None]:
# Gather one {'token', 'pos_tag'} record per token in the Doc, then build the
# DataFrame from the complete list in a single call (no row-by-row growth).
pos_records = [
    {'token': tok.text, 'pos_tag': tok.pos_}
    for tok in spacy_doc
]

pos_df = pd.DataFrame(pos_records)

In [None]:
# Preview the first five token / POS-tag rows
pos_df.head()

Unnamed: 0,token,pos_tag
0,emma,PROPN
1,woodhouse,PROPN
2,handsome,ADV
3,clever,ADJ
4,and,CCONJ


In [None]:
# Frequency of every (token, pos_tag) pair, most frequent first
pos_df_counts = (
    pos_df
    .groupby(['token','pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
86,of,ADP,14
48,had,AUX,9
53,her,PRON,9
6,and,CCONJ,8
109,the,DET,8
0,a,DET,6
112,to,PART,5
12,been,AUX,4
60,in,ADP,4
118,very,ADV,4


In [None]:
# Number of distinct tokens seen under each POS tag, largest first.
# pos_df_counts holds one row per (token, pos_tag) pair, so counting rows per
# tag gives distinct tokens, not how many times the tag occurs in the text.
pos_df_poscounts = (
    pos_df_counts
    .groupby(['pos_tag'])['token']
    .count()
    .sort_values(ascending=False)
)
pos_df_poscounts.head()

Unnamed: 0_level_0,token
pos_tag,Unnamed: 1_level_1
NOUN,33
VERB,21
ADJ,18
ADV,18
PRON,9


In [None]:
# Most common nouns: pos_df_counts is already sorted by 'counts' descending,
# so filtering on the tag keeps the most frequent rows at the top.
is_noun = pos_df_counts['pos_tag'] == "NOUN"
nouns = pos_df_counts[is_noun]
nouns.head()

Unnamed: 0,token,pos_tag,counts
47,governess,NOUN,3
45,friend,NOUN,3
101,sisters,NOUN,2
27,daughters,NOUN,2
34,emma,NOUN,2


In [None]:
# Most common verbs: same filter as for nouns, applied to the 'VERB' tag.
is_verb = pos_df_counts['pos_tag'] == "VERB"
verbs = pos_df_counts[is_verb]
verbs.head()

Unnamed: 0,token,pos_tag,counts
91,passed,VERB,1
119,vex,VERB,1
117,unite,VERB,1
115,twentyone,VERB,1
104,supplied,VERB,1


### Named Entity Recognition (NER)

In [None]:
import spacy
from spacy import displacy
from spacy import tokenizer
import re

In [None]:
# NOTE(review): this loads the same 'en_core_web_sm' pipeline as the POS
# section and rebinds `nlp`; presumably repeated so the NER section can be
# run on its own - confirm before removing.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Sample passage for NER; unlike the Emma text it keeps its original casing
# and punctuation.
google_text = "Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."

In [None]:
# Run the pipeline on the raw Google passage.
# Note: this rebinds `spacy_doc`, which held the Emma Doc in the POS section.
spacy_doc = nlp(google_text)
spacy_doc

Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.

In [None]:
# One record per named-entity span spaCy found in the raw text:
# the span's text and its entity label.
ner_records = [
    {'text': ent.text, 'label_': ent.label_}
    for ent in spacy_doc.ents
]

# Build the DataFrame in one call and preview the first ten entities
ner_df = pd.DataFrame(ner_records)
ner_df.head(10)

Unnamed: 0,text,label_
0,Google,ORG
1,"September 4, 1998",DATE
2,Larry Page,PERSON
3,Sergey Brin,PERSON
4,PhD,WORK_OF_ART
5,Stanford University,ORG
6,California,GPE
7,about 14%,PERCENT
8,56%,PERCENT
9,IPO,ORG


In [None]:
# Render the raw-text Doc with its entity spans highlighted inline
displacy.render(spacy_doc,style="ent",jupyter=True)

In [None]:
# Remove punctuation and lowercase: the regex drops every character that is
# neither a word character nor whitespace (so "14%" becomes "14" and
# "super-voting" becomes "supervoting"), then the result is lowercased.
google_text_clean = re.sub(r'[^\w\s]', '', google_text).lower()
# Re-run the pipeline on the cleaned text so its entities can be compared
# with those found in the raw passage.
spacy_doc_clean = nlp(google_text_clean)

In [None]:
# Echo the cleaned Doc to check the text after punctuation removal and lowercasing
spacy_doc_clean

google was founded on september 4 1998 by computer scientists larry page and sergey brin while they were phd students at stanford university in california together they own about 14 of its publicly listed shares and control 56 of its stockholder voting power through supervoting stock the company went public via an initial public offering ipo in 2004 in 2015 google was reorganized as a wholly owned subsidiary of alphabet inc google is alphabets largest subsidiary and is a holding company for alphabets internet properties and interests sundar pichai was appointed ceo of google on october 24 2015 replacing larry page who became the ceo of alphabet on december 3 2019 pichai also became the ceo of alphabet

In [None]:
# Same extraction as for the raw passage, this time on the cleaned Doc.
# NOTE(review): `ner_df` is rebound here, so the raw-text entity table from
# the earlier cell is no longer reachable under that name.
clean_ner_records = [
    {'text': ent.text, 'label_': ent.label_}
    for ent in spacy_doc_clean.ents
]

# Build the DataFrame in one call and preview the first ten entities
ner_df = pd.DataFrame(clean_ner_records)
ner_df.head(10)

Unnamed: 0,text,label_
0,google,ORG
1,september 4 1998,DATE
2,larry,PERSON
3,phd,ORG
4,stanford university,ORG
5,california,GPE
6,about 14,CARDINAL
7,56,CARDINAL
8,2004,DATE
9,2015,DATE


In [None]:
# Render the cleaned Doc with its entity spans highlighted inline
displacy.render(spacy_doc_clean,style="ent",jupyter=True)

### NER Using LLMs

In [None]:
# NER with a pretrained transformer checkpoint from the Hugging Face Hub
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Single place to change the checkpoint used for both tokenizer and model
NER_CHECKPOINT = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# With aggregation_strategy="simple" the pipeline returns grouped entities
# (each result carries an 'entity_group' key) rather than per-token tags.
nlp_LLM = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Quick check of the transformer pipeline on a short sentence before
# running it over the full passage.
example = "My name is Ahmed and I live in Egypt"

ner_results = nlp_LLM(example)
ner_results

[{'entity_group': 'PER',
  'score': 0.9983785,
  'word': 'Ahmed',
  'start': 11,
  'end': 16},
 {'entity_group': 'LOC',
  'score': 0.9998294,
  'word': 'Egypt',
  'start': 31,
  'end': 36}]

In [None]:
# Echo the raw passage that is split into sentences in the next cell
google_text

"Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."

In [None]:
# Split the passage into sentences with NLTK's sentence tokenizer
sentences = nltk.sent_tokenize(google_text)

# Run the NER pipeline sentence by sentence, keeping each sentence next to
# the entities found in it.
results = [
    {"sentence": sentence, "entities": nlp_LLM(sentence)}
    for sentence in sentences
]

In [None]:
# Import rich's print under an alias: a bare `from rich import print` would
# rebind the built-in print for every cell executed after this one.
from rich import print as rprint
from rich.table import Table

# Table with one row per detected entity, alongside the sentence it came from
table = Table(title="Named Entity Recognition Results")

# Add columns
table.add_column("Sentence", style="cyan", justify="left")
table.add_column("Entity", style="magenta")
table.add_column("Label", style="green")
table.add_column("Score", style="yellow")

# Add rows to the table; the sentence is repeated for each of its entities
for result in results:
    for entity in result["entities"]:
        table.add_row(
            result["sentence"],
            entity["word"],
            entity["entity_group"],
            f"{entity['score']:.4f}"  # confidence score, four decimal places
        )

# Display the table
rprint(table)


In [None]:
import spacy
from spacy import displacy

# Convert the pipeline output into the dict format displaCy accepts in
# manual mode: one {"text", "ents", "title"} entry per sentence.
spacy_results = []
for result in results:
    # Character offsets come from running the pipeline on this sentence
    # alone, so they index into result["sentence"], which is used as the text.
    spans = []
    for entity in result["entities"]:
        spans.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"],
        })
    spacy_results.append({"text": result["sentence"], "ents": spans, "title": None})

# Render the visualization in Jupyter
displacy.render(spacy_results, style="ent", manual=True, jupyter=True)