In [8]:
import json
import pickle

def json_to_pkl(json_file_path, pkl_file_path):
    """
    Converts a JSON file to PKL format.

    The JSON file is parsed with ``json.load`` and the resulting Python
    object is written to ``pkl_file_path`` with ``pickle.dump``. The output
    file is only opened after the JSON has been parsed successfully.

    Args:
        json_file_path (str): Path to the input JSON file (read as UTF-8).
        pkl_file_path (str): Path to save the output PKL file.

    Returns:
        None. The outcome is reported on stdout: a success message, or
        "Error occurred: ..." if anything fails. Errors are caught and
        printed, not raised.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding: a different default would mis-decode non-ASCII
        # characters and shift any character offsets stored with the text.
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Save the data to a PKL file
        with open(pkl_file_path, 'wb') as pkl_file:
            pickle.dump(data, pkl_file)

        print(f"Successfully converted {json_file_path} to {pkl_file_path}.")
    except Exception as e:
        # Deliberately best-effort: report the failure and keep the notebook running.
        print(f"Error occurred: {e}")

# Convert the annotated NER dataset from JSON to pickle.
path = "Data/ner.json"
json_to_pkl(json_file_path=path, pkl_file_path="Data/ner.pkl")

Successfully converted Data/ner.json to Data/ner.pkl.


In [1]:
import spacy
import pickle
import random

In [2]:
# Load the pickled training records inside a `with` block so the file
# handle is closed as soon as the data has been read.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a .pkl file that you produced yourself.
with open('Data/ner.pkl', 'rb') as pkl_file:
    train_data = pickle.load(pkl_file)
train_data[0]

{'document': "Data Engineer (all genders) Babbel 10117 Berlin Babbel is driven by a mission: Everyone. Learning. Languages. This means building products that help people connect and communicate across cultures. Babbel, Babbel Travel and Babbel for Business focus on using a new language in the real world, in real situations, with real people. And it works: Studies with Yale University, City University of New York and Michigan State University prove that it gets users talking with confidence.  The key is a blend of humanity and technology. A team of linguistic experts hand-craft each of our courses, to provide content that is constantly adapted to our learners’ needs. Interactive content with videos and podcasts makes understanding a new language easy, because Babbel is for everyone. That’s why our team is as diverse as our content. From headquarters in Berlin and New York, 750 people from more than 60 nationalities represent the backgrounds, characteristics and perspectives that make al

In [3]:
# Reshape every record into a (text, {'entities': [...]}) pair.
# Each entity tuple is the first three values of an annotation dict, in the
# dict's own key order. NOTE(review): this presumably yields
# (start, end, label) -- confirm against the annotation schema.
train_data2 = [
    (
        record['document'],
        {'entities': [tuple(span.values())[:3] for span in record['annotation']]},
    )
    for record in train_data
]
print(train_data2[0][1])


{'entities': [(0, 13, 'SKILLS'), (1717, 1731, 'SKILLS'), (1736, 1747, 'SKILLS'), (1796, 1808, 'SKILLS'), (1852, 1868, 'SKILLS'), (2012, 2023, 'SKILLS'), (2052, 2056, 'SKILLS'), (2085, 2101, 'SKILLS'), (2251, 2255, 'SKILLS'), (2324, 2330, 'SKILLS'), (2335, 2346, 'SKILLS'), (2374, 2390, 'EDUCATION'), (2402, 2413, 'SKILLS'), (2425, 2433, 'EXPERIENCE'), (2461, 2474, 'SKILLS'), (2528, 2534, 'SKILLS'), (2590, 2596, 'SKILLS'), (2674, 2688, 'SKILLS'), (2690, 2694, 'SKILLS'), (2695, 2698, 'SKILLS'), (2703, 2708, 'SKILLS'), (2750, 2754, 'SKILLS'), (2755, 2767, 'SKILLS'), (2769, 2776, 'SKILLS'), (2778, 2786, 'SKILLS'), (2788, 2804, 'SKILLS'), (2806, 2808, 'SKILLS'), (2857, 2861, 'SKILLS'), (2904, 2910, 'SKILLS'), (2940, 2951, 'SKILLS'), (2984, 2994, 'SOFT-SKILLS'), (3037, 3053, 'SKILLS'), (3085, 3089, 'SKILLS'), (3090, 3099, 'SKILLS'), (3111, 3115, 'SKILLS'), (3116, 3130, 'SKILLS'), (3147, 3160, 'SKILLS'), (3173, 3189, 'SKILLS'), (3215, 3217, 'SKILLS'), (3266, 3283, 'SKILLS'), (3284, 3298, 'SKILL

In [None]:
from spacy.training.example import Example
from spacy.util import minibatch, compounding

N_EPOCHS = 15  # number of full passes over the training examples

nlp = spacy.load('en_core_web_lg')

# Reuse the pipeline's NER component if it has one, otherwise add a new one.
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Register every entity label that occurs in the annotations.
for _, annotations in train_data2:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Build the Example objects once, up front; training iterates over this list.
examples = []
for text, annotations in train_data2:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Train with batch strategy
batch_sizes = compounding(4.0, 32.0, 1.001)
# Disable other components in the pipeline during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # Only train NER
    optimizer = nlp.create_optimizer()
    for epoch in range(N_EPOCHS):
        # Shuffle the list the batches are actually drawn from. Shuffling
        # `train_data2` here would not change the batch order, because
        # `examples` was already built from it above.
        random.shuffle(examples)
        print(f"Epoch {epoch + 1}/{N_EPOCHS}")
        losses = {}  # accumulated by nlp.update over this epoch's batches
        batches = minibatch(examples, size=batch_sizes)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
        print(f"Losses: {losses}")
       

Epoch 1/15
Losses: {'ner': 19845.127718293952}
Epoch 2/15
Losses: {'ner': 11518.848632735397}
Epoch 3/15
Losses: {'ner': 9246.412289732489}
Epoch 4/15
Losses: {'ner': 8107.543223173709}
Epoch 5/15
Losses: {'ner': 7418.729802750902}
Epoch 6/15
Losses: {'ner': 6890.216871345054}
Epoch 7/15
Losses: {'ner': 6384.49674207217}
Epoch 8/15
Losses: {'ner': 6101.940094810865}
Epoch 9/15
Losses: {'ner': 5861.615441802797}
Epoch 10/15
Losses: {'ner': 5533.575753756328}
Epoch 11/15
Losses: {'ner': 5310.5133907144855}
Epoch 12/15
Losses: {'ner': 4959.1984590189795}
Epoch 13/15
Losses: {'ner': 4791.42874350153}
Epoch 14/15
Losses: {'ner': 4596.223288790489}
Epoch 15/15
Losses: {'ner': 4398.479946490141}
