In [13]:
!pip install spacy



In [14]:
%pip install -U pip setuptools wheel
%pip install -U 'spacy[apple]'

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [15]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m523.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [16]:
import spacy
import srsly

# Load the pre-trained spaCy model
nlp = spacy.load('en_core_web_sm')

# Load the data from the train.jsonl file
file_path = 'train.jsonl'
data = srsly.read_jsonl(file_path)

# Initialize a blank English model and add an EntityRuler
nlp_blank = spacy.blank("en")
ruler = nlp_blank.add_pipe("entity_ruler")

# Iterate over the data and add patterns for the location mentions
patterns = []
corpus = []
for item in data:
    text = item["text"]
    location_mentions = item["location_mentions"]

    # Add the text to the corpus for later processing
    corpus.append(text)

    # Add patterns for each location mention
    for loc in location_mentions:
        pattern = {"label": "LOC", "pattern": loc["text"]}
        patterns.append(pattern)

ruler.add_patterns(patterns)

# Generate TRAIN_DATA
TRAIN_DATA = []
for sentence in corpus:
    doc = nlp_blank(sentence)
    entities = []

    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))

    if entities:
        TRAIN_DATA.append((sentence, {"entities": entities}))




In [17]:
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        i=0
        for start, end, label in annot["entities"]:
            i+=1
            print(start, end, label, i)
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)

In [18]:
convert(lang="en", TRAIN_DATA=TRAIN_DATA, output_path=Path("./train.spacy"))

118 120 LOC 1
130 140 LOC 2
46 52 LOC 1
87 106 LOC 2
218 228 LOC 1
87 93 LOC 1
95 105 LOC 2
56 75 LOC 1
13 19 LOC 1
66 85 LOC 2
20 30 LOC 1
18 28 LOC 1
91 101 LOC 2
17 27 LOC 1
46 56 LOC 2
11 23 LOC 1
60 79 LOC 1
81 93 LOC 2
63 73 LOC 1
115 121 LOC 2
17 25 LOC 1
26 36 LOC 2
30 40 LOC 1
21 31 LOC 1
0 19 LOC 1
174 193 LOC 1
56 66 LOC 1
59 78 LOC 1
8 18 LOC 1
71 82 LOC 2
130 138 LOC 3
59 70 LOC 1
71 85 LOC 2
94 96 LOC 3
165 171 LOC 4
243 254 LOC 5
60 71 LOC 1
59 69 LOC 1
100 110 LOC 2
28 38 LOC 1
89 97 LOC 1
57 67 LOC 1
76 84 LOC 2
47 57 LOC 1
12 22 LOC 1
29 39 LOC 1
89 95 LOC 2
1 3 LOC 1
123 133 LOC 2
160 168 LOC 3
0 5 LOC 1
10 17 LOC 2
79 81 LOC 3
86 97 LOC 4
56 75 LOC 1
143 149 LOC 2
65 75 LOC 1
59 69 LOC 1
51 61 LOC 1
34 44 LOC 1
179 187 LOC 2
189 191 LOC 3
202 212 LOC 4
224 232 LOC 5
19 29 LOC 1
69 79 LOC 1
0 10 LOC 1
5 13 LOC 1
26 36 LOC 2
71 79 LOC 3
0 8 LOC 1
34 47 LOC 2
90 100 LOC 3
61 69 LOC 1
250 260 LOC 2
109 119 LOC 1
77 85 LOC 1
138 146 LOC 2
148 153 LOC 3
155 163 LOC 4
165 

In [19]:
import spacy
from pathlib import Path

model = None
output_dir = Path("./ner")
n_iter = 10

# Load or create a blank model
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")

# Set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')


Created blank 'en' model


In [20]:
from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [21]:
import random
from tqdm import tqdm
import spacy
from spacy.training import Example

# Assuming TRAIN_DATA is a list of tuples like: (text, {"entities": [(start, end, label)]})
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Disable other pipeline components to only train NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}

        for text, annotations in tqdm(TRAIN_DATA):
            # Convert the training data to spaCy's Example objects
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],
                drop=0.5,
                sgd=optimizer,
                losses=losses
            )
        print(f"Iteration {itn}: {losses}")


100%|██████████| 777/777 [00:07<00:00, 97.76it/s] 


Iteration 0: {'ner': 1148.1300253277197}


100%|██████████| 777/777 [00:07<00:00, 101.72it/s]


Iteration 1: {'ner': 347.59583899660316}


100%|██████████| 777/777 [00:07<00:00, 100.72it/s]


Iteration 2: {'ner': 244.44694606474897}


100%|██████████| 777/777 [00:07<00:00, 101.92it/s]


Iteration 3: {'ner': 175.6617979552906}


100%|██████████| 777/777 [00:07<00:00, 101.96it/s]


Iteration 4: {'ner': 142.9312649094207}


100%|██████████| 777/777 [00:07<00:00, 103.19it/s]


Iteration 5: {'ner': 116.66878678191284}


100%|██████████| 777/777 [00:07<00:00, 99.16it/s] 


Iteration 6: {'ner': 141.99569010865642}


100%|██████████| 777/777 [00:07<00:00, 102.07it/s]


Iteration 7: {'ner': 102.20066659940682}


100%|██████████| 777/777 [00:07<00:00, 101.88it/s]


Iteration 8: {'ner': 134.82856001284762}


100%|██████████| 777/777 [00:09<00:00, 82.27it/s] 

Iteration 9: {'ner': 99.15206014409152}





In [22]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Paradise', 'LOC'), ('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('CA', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('CampFire', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC'), ('Butte County', 'LOC')]
Entities [('California', 'LOC'), ('American', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC'), ('Paradise', 'LOC'), ('American', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC'), ('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('9/11,California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('California', 'LOC')]
Entities [('Butte County', 'LOC')]
Entities [('California', 'LOC')]
Entities [('NYC', 'LOC'), ('California', 'LOC'

In [24]:
import srsly
from spacy import displacy

# Load the data from the test.jsonl file
file_path = 'test.jsonl'
data = srsly.read_jsonl(file_path)

# Extract the text from each tweet and store in a list
texts = [item['text'] for item in data]
options = {"ents": ["LOC"], "colors": {"LOC": "lightgreen"}}
# Print the extracted text
for text in texts:
    doc = nlp(text)
    displacy.render(doc, style="ent", options=options, jupyter=True)
    # print('Entities', [(ent.text, ent.label_) for ent in doc.ents])


