In [1]:
# CADL3: Named Entity Recognition (NER) with spaCy

!pip install spacy
!python -m spacy download en_core_web_sm

import spacy
import pandas as pd

# Load the small pre-trained English pipeline
nlp = spacy.load("en_core_web_sm")

# Toy unstructured input: one short statement per line
text = """
Elon Musk is the CEO of Tesla and SpaceX.
Sundar Pichai works at Google.
Tim Cook is leading Apple in California.
Satya Nadella joined Microsoft as CEO.
"""

# Run the pipeline over the raw text to get an annotated document
doc = nlp(text)

# Step 1: Collect every recognized entity as a (surface text, label) pair,
# in the order the entities appear in the document
entities = []
for span in doc.ents:
    entities.append((span.text, span.label_))
print("Named Entities:\n", entities)

# Step 2: Create a structured table (Person → Organization)

# Flat lists of every PERSON / ORG mention, in document order.
persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

# Pair entities *within the same sentence* instead of zipping the two flat
# lists by position. Positional pairing silently misaligns as soon as one
# sentence contains more (or fewer) organizations than persons, e.g.
# "Elon Musk is the CEO of Tesla and SpaceX" would push every later person
# onto the previous sentence's company whenever both ORGs are recognized.
# Sentence boundaries come from the pipeline (`doc.sents`), so the pairing
# is only as good as the model's sentence segmentation and NER.
records = []
for sent in doc.sents:
    sent_persons = [ent.text for ent in sent.ents if ent.label_ == "PERSON"]
    sent_orgs = [ent.text for ent in sent.ents if ent.label_ == "ORG"]
    # One row per (person, organization) combination found in this sentence;
    # sentences lacking either kind of entity contribute no rows.
    records.extend(
        {"Person": person, "Organization": org}
        for person in sent_persons
        for org in sent_orgs
    )

# Explicit columns keep the table shape stable even when no pairs are found.
df = pd.DataFrame(records, columns=["Person", "Organization"])

print("\nStructured Information Table:")
print(df)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Named Entities:
 [('Elon Musk', 'PERSON'), ('Tesla', 'ORG'), ('Sundar Pichai', 'PERSON'), ('Google', 'ORG'), ('Tim Cook', 'PERSON'), ('Apple', 'ORG'), ('California', 'GPE'), ('Satya Nadella', 'PERSON'), ('Microsoft', 'ORG')]

Structured Information Table:
          Person Organization
0      Elon Musk        Tesla
1  Sundar Pichai   