<a href="https://colab.research.google.com/github/Brahmani1237/NLP-2403A52020/blob/main/NLP_LAB-14_2403A52020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
warnings.filterwarnings('ignore') # Suppress warnings related to unauthenticated Hugging Face Hub requests.
!pip install spacy transformers torch pandas # Install necessary libraries: spaCy for NLP, transformers for Hugging Face models, torch for deep learning, and pandas for data manipulation.
!python -m spacy download en_core_web_sm # Download the small English language model for spaCy, which includes pre-trained NER capabilities.

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy                     # Import the spaCy library for Natural Language Processing (NLP).
from spacy import displacy       # Import displacy from spaCy for visualizing named entities.
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification # Import components from the transformers library for building an NER pipeline.
import torch                     # Import PyTorch, a deep learning framework.
import pandas as pd              # Import pandas for data manipulation and creating DataFrames.

In [4]:
nlp = spacy.load("en_core_web_sm") # Load the English language model 'en_core_web_sm' into spaCy for processing text.

In [5]:
sentences = [ # Define a list of sentences to be used for Named Entity Recognition (NER).
    "Apple launched the new iPhone 15 in California.",
    "Elon Musk announced new Tesla updates in Texas.",
    "The Indian cricket team won the World Cup in Australia.",
    "Prime Minister Narendra Modi visited the United States.",
    "Google is investing $5 billion in artificial intelligence research."
]

In [6]:
print("---- spaCy NER Results ----\n") # Print a header for the spaCy NER results.

spacy_results = [] # Initialize an empty list to store spaCy's NER results.

for sentence in sentences: # Iterate through each sentence in the predefined list.
    doc = nlp(sentence) # Process the sentence with the loaded spaCy model to create a Doc object.
    print(f"Sentence: {sentence}") # Print the current sentence.

    for ent in doc.ents: # Iterate through each named entity detected in the Doc object.
        print(f"Entity: {ent.text}, Label: {ent.label_}") # Print the entity text and its assigned label.
        spacy_results.append([sentence, ent.text, ent.label_]) # Append the sentence, entity, and label to the results list.

    print() # Print an empty line for better readability between sentences.

---- spaCy NER Results ----

Sentence: Apple launched the new iPhone 15 in California.
Entity: Apple, Label: ORG
Entity: 15, Label: CARDINAL
Entity: California, Label: GPE

Sentence: Elon Musk announced new Tesla updates in Texas.
Entity: Elon Musk, Label: PERSON
Entity: Tesla, Label: ORG
Entity: Texas, Label: GPE

Sentence: The Indian cricket team won the World Cup in Australia.
Entity: Indian, Label: NORP
Entity: the World Cup, Label: EVENT
Entity: Australia, Label: GPE

Sentence: Prime Minister Narendra Modi visited the United States.
Entity: Narendra Modi, Label: PERSON
Entity: the United States, Label: GPE

Sentence: Google is investing $5 billion in artificial intelligence research.
Entity: Google, Label: ORG
Entity: $5 billion, Label: MONEY



In [7]:
df_spacy = pd.DataFrame(spacy_results, columns=["Sentence", "Entity", "Label"]) # Create a pandas DataFrame from the spaCy NER results with specified column names.
df_spacy # Display the DataFrame containing spaCy's NER output.

Unnamed: 0,Sentence,Entity,Label
0,Apple launched the new iPhone 15 in California.,Apple,ORG
1,Apple launched the new iPhone 15 in California.,15,CARDINAL
2,Apple launched the new iPhone 15 in California.,California,GPE
3,Elon Musk announced new Tesla updates in Texas.,Elon Musk,PERSON
4,Elon Musk announced new Tesla updates in Texas.,Tesla,ORG
5,Elon Musk announced new Tesla updates in Texas.,Texas,GPE
6,The Indian cricket team won the World Cup in A...,Indian,NORP
7,The Indian cricket team won the World Cup in A...,the World Cup,EVENT
8,The Indian cricket team won the World Cup in A...,Australia,GPE
9,Prime Minister Narendra Modi visited the Unite...,Narendra Modi,PERSON


In [8]:
displacy.render(nlp(sentences[0]), style="ent", jupyter=True) # Render the first sentence with its named entities highlighted using spaCy's displacy visualizer.

In [9]:
model_name = "dslim/bert-base-NER" # Specify the name of the pre-trained BERT-based NER model from Hugging Face.

tokenizer = AutoTokenizer.from_pretrained(model_name) # Load the tokenizer associated with the specified pre-trained model.
model = AutoModelForTokenClassification.from_pretrained(model_name) # Load the pre-trained model for token classification (NER).

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer) # Create an NER pipeline using the loaded model and tokenizer.

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]



vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: dslim/bert-base-NER
Key                      | Status     |  | 
-------------------------+------------+--+-
bert.pooler.dense.bias   | UNEXPECTED |  | 
bert.pooler.dense.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [10]:
print("---- Hugging Face NER Results ----\n") # Print a header for the Hugging Face NER results.

hf_results = [] # Initialize an empty list to store Hugging Face's NER results.

for sentence in sentences: # Iterate through each sentence.
    print(f"Sentence: {sentence}") # Print the current sentence.
    entities = ner_pipeline(sentence) # Process the sentence with the Hugging Face NER pipeline to extract entities.

    for ent in entities: # Iterate through each entity detected by the pipeline.
        # Print the word, its predicted label, and the confidence score, rounded to 4 decimal places.
        print(f"Word: {ent['word']}, Label: {ent['entity']}, Score: {round(ent['score'],4)}")
        hf_results.append([sentence, ent['word'], ent['entity'], ent['score']]) # Append the sentence, word, label, and score to the results list.

    print() # Print an empty line for better readability.

---- Hugging Face NER Results ----

Sentence: Apple launched the new iPhone 15 in California.
Word: Apple, Label: B-ORG, Score: 0.9975000023841858
Word: iPhone, Label: B-MISC, Score: 0.9983999729156494
Word: 15, Label: I-MISC, Score: 0.998199999332428
Word: California, Label: B-LOC, Score: 0.9997000098228455

Sentence: Elon Musk announced new Tesla updates in Texas.
Word: El, Label: B-ORG, Score: 0.9986000061035156
Word: ##on, Label: I-ORG, Score: 0.9944999814033508
Word: Mu, Label: I-ORG, Score: 0.9968000054359436
Word: ##sk, Label: I-ORG, Score: 0.9861999750137329
Word: Te, Label: B-MISC, Score: 0.595300018787384
Word: ##sla, Label: I-ORG, Score: 0.6690000295639038
Word: Texas, Label: B-LOC, Score: 0.9995999932289124

Sentence: The Indian cricket team won the World Cup in Australia.
Word: Indian, Label: B-MISC, Score: 0.9995999932289124
Word: World, Label: B-MISC, Score: 0.9936000108718872
Word: Cup, Label: I-MISC, Score: 0.9987999796867371
Word: Australia, Label: B-LOC, Score: 0.999

In [11]:
clean_results = [] # Initialize an empty list to store cleaned Hugging Face NER results.

for sentence in sentences: # Iterate through each sentence.
    entities = ner_pipeline(sentence) # Process the sentence with the Hugging Face NER pipeline.

    merged_word = "" # Initialize an empty string for merging subword tokens (not used in this specific implementation but kept from original).
    current_label = "" # Initialize an empty string for the current label (not used in this specific implementation but kept from original).
    current_score = 0 # Initialize a score (not used in this specific implementation but kept from original).

    for ent in entities: # Iterate through each entity detected.
        word = ent["word"].replace("##", "") # Get the word and remove '##' prefix from subword tokens for cleaner display.
        label = ent["entity"] # Get the entity label.
        score = ent["score"] # Get the confidence score for the entity.

        clean_results.append([sentence, word, label, score]) # Append the sentence, cleaned word, label, and score to the results list.

df_hf = pd.DataFrame(clean_results, # Create a pandas DataFrame from the cleaned Hugging Face NER results.
                     columns=["Sentence", "Entity", "Label", "Confidence Score"]) # Define column names for the DataFrame.

df_hf # Display the DataFrame containing the cleaned Hugging Face NER output.

Unnamed: 0,Sentence,Entity,Label,Confidence Score
0,Apple launched the new iPhone 15 in California.,Apple,B-ORG,0.997462
1,Apple launched the new iPhone 15 in California.,iPhone,B-MISC,0.998376
2,Apple launched the new iPhone 15 in California.,15,I-MISC,0.998245
3,Apple launched the new iPhone 15 in California.,California,B-LOC,0.999744
4,Elon Musk announced new Tesla updates in Texas.,El,B-ORG,0.998633
5,Elon Musk announced new Tesla updates in Texas.,on,I-ORG,0.994543
6,Elon Musk announced new Tesla updates in Texas.,Mu,I-ORG,0.996818
7,Elon Musk announced new Tesla updates in Texas.,sk,I-ORG,0.986198
8,Elon Musk announced new Tesla updates in Texas.,Te,B-MISC,0.595255
9,Elon Musk announced new Tesla updates in Texas.,sla,I-ORG,0.668992
