In [2]:
# Directory containing the annotation JSON files (the inner "ner-label" folder).
# NOTE(review): a later cell rebinds `data_directory` to the parent folder, so
# this value only holds until that cell runs. The path is absolute and
# machine-specific; confirm it before running on another machine.
data_directory =r'C:\Users\Dell\OneDrive\Desktop\NLP Project\ner-label\ner-label'

In [5]:
import os
import json
import re

# Define the text cleaning function
def clean_text(text):
    """Return a normalised copy of ``text``.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is dropped, the result is lowercased,
    and each run of whitespace is collapsed to a single space with the ends
    trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string (empty if nothing survives the filtering).
    """
    # Drop punctuation/symbols first, then fold case in the same pass.
    lowered = re.sub(r'[^\w\s]', '', text).lower()
    # Newlines and tabs count as whitespace, so they collapse to spaces too.
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

# Define the function to process all JSON files in a folder
def process_json_files(input_folder, output_folder):
    """Clean the top-level ``"text"`` field of every JSON file in a folder.

    Each ``*.json`` file directly inside ``input_folder`` is parsed; when the
    document is a JSON object whose ``"text"`` value is a string, that value
    is replaced by ``clean_text(value)``. The document is then written under
    the same file name to ``output_folder``. Documents without such a field
    (including non-object roots such as lists or ``null``) are re-serialised
    unchanged instead of raising.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        output_folder: Directory that receives the results; created,
            including missing parents, when it does not exist.

    Returns:
        None. One ``Processed <name>`` line is printed per file written.

    Raises:
        OSError: If a file cannot be read or written.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        input_filepath = os.path.join(input_folder, filename)

        # Skip non-JSON entries, and directories whose name ends in ".json".
        if not filename.endswith('.json') or not os.path.isfile(input_filepath):
            continue

        with open(input_filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Only a JSON object can carry a "text" key, and only a string can be
        # cleaned; anything else is passed through untouched.
        # NOTE(review): cleaning removes characters, so any character offsets
        # stored alongside this text would no longer line up - verify against
        # the annotation format before relying on the cleaned files.
        if isinstance(data, dict) and isinstance(data.get('text'), str):
            data['text'] = clean_text(data['text'])

        output_filepath = os.path.join(output_folder, filename)

        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        with open(output_filepath, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)

        print(f"Processed {filename}")

# Example usage: clean every JSON file in the inner "ner-label" folder and write
# the results one level up, into the outer "ner-label" folder. The training cell
# further down reads its JSON files from that same outer folder.
# NOTE(review): both paths are absolute and machine-specific; adjust them before
# running this cell on another machine.
input_folder = r'C:\Users\Dell\OneDrive\Desktop\NLP Project\ner-label\ner-label'
output_folder = r'C:\Users\Dell\OneDrive\Desktop\NLP Project\ner-label'
process_json_files(input_folder, output_folder)


Processed alabelled.json
Processed annotations (1).json
Processed annotations (2).json
Processed annotations (3).json
Processed annotations.json
Processed melabelled.json


In [6]:
import os
import json
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random

# Function to load JSON data from a file
def load_json_data(filepath):
    """Parse the UTF-8 encoded JSON file at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def validate_data(text, entities):
    """Check that every entity span is a usable slice of ``text``.

    Args:
        text: The annotated string.
        entities: Iterable of ``(start, end, label)`` triples.

    Returns:
        True when every span passes (also when ``entities`` is empty).

    Raises:
        ValueError: On the first span whose bounds are not integers, fall
            outside ``0..len(text)``, or are empty/inverted (``start >= end``).
    """
    for span in entities:
        start, end, label = span

        # Type check comes first so the comparisons below are well defined.
        if not (isinstance(start, int) and isinstance(end, int)):
            raise ValueError(f"Entity spans must be integers. Got: start={start}, end={end}")

        in_bounds = start >= 0 and end <= len(text)
        if not in_bounds:
            raise ValueError(f"Entity spans are out of range for the given text. Got: start={start}, end={end}, text length={len(text)}")

        if not start < end:
            raise ValueError(f"Entity start must be less than end. Got: start={start}, end={end}")

    return True

def convert_data(data):
    """Convert one parsed annotation file into ``(text, {"entities": [...]})`` pairs.

    ``data`` is expected to be a JSON object with an ``"annotations"`` list
    whose items look like ``[text, {"entities": [[start, end, label], ...]}]``.
    Items that do not have that shape (``None``, too short, a non-string
    text, or a non-dict annotation payload) are skipped rather than crashing
    the whole load; a non-object root or a missing/``null`` ``"annotations"``
    yields an empty list.

    Args:
        data: The decoded JSON value of one annotation file.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples.

    Raises:
        ValueError: If an entity entry is not a 3-item sequence, or if
            ``validate_data`` rejects one of the spans.
    """
    if not isinstance(data, dict):
        return []

    training_data = []
    # `or []` also covers an explicit `"annotations": null`.
    for item in data.get("annotations") or []:
        if not isinstance(item, (list, tuple)) or len(item) < 2:
            continue
        text, annotations_dict = item[0], item[1]
        if not isinstance(text, str) or not isinstance(annotations_dict, dict):
            continue

        # JSON yields lists; normalise each span to a (start, end, label) tuple.
        # NOTE(review): labels are passed through verbatim. The evaluation
        # output in this notebook lists both "SKILL " and "SKILL" (and
        # "JOB TITLE" / "JOB_TITLE") as separate types, which looks like
        # inconsistent label spelling in the annotation files - consider
        # normalising them consistently for training and evaluation.
        entities = [
            (start, end, label)
            for start, end, label in annotations_dict.get("entities") or []
        ]

        # Fail loudly on impossible spans instead of training on them.
        validate_data(text, entities)
        training_data.append((text, {"entities": entities}))
    return training_data

# Directory containing the annotation JSON files used for training.
data_directory = r'C:\Users\Dell\OneDrive\Desktop\NLP Project\ner-label'

# Tunable training settings, named here instead of being buried in the loop.
RANDOM_SEED = 0
N_ITERATIONS = 500
DROPOUT = 0.5

# Seed Python's RNG (used by random.shuffle below) and spaCy's own RNGs so a
# re-run on the same data shuffles and initialises the same way.
random.seed(RANDOM_SEED)
spacy.util.fix_random_seed(RANDOM_SEED)

# Collect all training data. The listing is sorted because os.listdir returns
# entries in arbitrary order, which would otherwise defeat the seeding above.
all_training_data = []
for filename in sorted(os.listdir(data_directory)):
    if filename.endswith('.json'):
        file_path = os.path.join(data_directory, filename)
        data = load_json_data(file_path)
        training_data = convert_data(data)
        all_training_data.extend(training_data)

# Create a blank spaCy model
nlp = spacy.blank("en")

# Create the NER component and add it to the pipeline
ner = nlp.add_pipe("ner")

# Register every label that occurs in the data with the NER component.
for _, annotations in all_training_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable other pipeline components (if any) so only "ner" is updated.
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Training the NER model.
# select_pipes()/initialize() are the spaCy v3 names; disable_pipes() and
# begin_training() are deprecated aliases that emit warnings.
with nlp.select_pipes(disable=unaffected_pipes):
    optimizer = nlp.initialize()
    for iteration in range(N_ITERATIONS):
        random.shuffle(all_training_data)
        losses = {}
        # Batch size grows from 4 towards 32 within each pass over the data.
        batches = minibatch(all_training_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in batch]
            # Pass the optimizer explicitly so it is clear which one is stepped.
            nlp.update(examples, sgd=optimizer, drop=DROPOUT, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

# Save the trained model (relative to the notebook's working directory).
# NOTE(review): later cells load the model from a different, absolute path -
# confirm that the saved and loaded locations refer to the same model.
nlp.to_disk("trained_model")




Iteration 1, Losses: {'ner': 18977.32207971606}
Iteration 2, Losses: {'ner': 8736.424551547705}
Iteration 3, Losses: {'ner': 6978.453593330172}
Iteration 4, Losses: {'ner': 6457.878148941182}
Iteration 5, Losses: {'ner': 6414.557613779503}
Iteration 6, Losses: {'ner': 5786.9464194288485}
Iteration 7, Losses: {'ner': 5902.413834297227}
Iteration 8, Losses: {'ner': 5734.882802135129}
Iteration 9, Losses: {'ner': 5823.018464014976}
Iteration 10, Losses: {'ner': 5606.038657792981}
Iteration 11, Losses: {'ner': 5484.30115898642}
Iteration 12, Losses: {'ner': 6419.588278037703}
Iteration 13, Losses: {'ner': 5613.095119632855}
Iteration 14, Losses: {'ner': 5302.382082684857}
Iteration 15, Losses: {'ner': 5819.77595680528}
Iteration 16, Losses: {'ner': 6191.510398483316}
Iteration 17, Losses: {'ner': 5244.18663463745}
Iteration 18, Losses: {'ner': 7355.081919263793}
Iteration 19, Losses: {'ner': 5626.0492774500635}
Iteration 20, Losses: {'ner': 5429.793784206751}
Iteration 21, Losses: {'ner': 

In [8]:
# Save the in-memory pipeline `nlp` to the project folder.
# NOTE(review): the training cell already saves to "trained_model", and the
# evaluation/visualisation cells load from r"...\Desktop\trained model";
# confirm which of these locations is the intended one.
nlp.to_disk(r"C:\Users\Dell\OneDrive\Desktop\NLP Project")

In [9]:
import spacy
from spacy.training import Example
import json
import os

# Function to load test data
def load_data(data_directory):
    """Load ``(text, {"entities": [...]})`` pairs from every JSON file in a folder.

    Each ``*.json`` file directly inside ``data_directory`` is expected to be
    a JSON object with an ``"annotations"`` list whose items look like
    ``[text, {"entities": [[start, end, label], ...]}]``. Anything that does
    not have that shape - a non-object root, a ``null``/missing
    ``"annotations"``, a ``None`` or too-short item, a non-string text, or a
    non-dict annotation payload - is skipped instead of aborting the load.

    Args:
        data_directory: Directory scanned (non-recursively) for ``.json`` files.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        in file-name order and, within a file, in annotation order.

    Raises:
        OSError: If the directory or a file cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        ValueError: If an entity entry is not a 3-item sequence.
    """
    all_data = []
    # Sorted so the result does not depend on the OS directory ordering.
    for filename in sorted(os.listdir(data_directory)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(data_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            continue

        # `or []` also covers an explicit `"annotations": null`.
        for item in data.get("annotations") or []:
            # Indexing (rather than 2-way unpacking) tolerates items that
            # carry extra trailing fields.
            if not isinstance(item, (list, tuple)) or len(item) < 2:
                continue
            text, annotations = item[0], item[1]
            if not isinstance(text, str) or not isinstance(annotations, dict):
                continue
            entities = [
                (start, end, label)
                for start, end, label in annotations.get("entities") or []
            ]
            all_data.append((text, {"entities": entities}))
    return all_data

# Function to evaluate the model
def evaluate_model(nlp, test_data):
    """Score ``nlp`` against annotated examples.

    Args:
        nlp: A loaded spaCy pipeline.
        test_data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is the dict handed to ``Example.from_dict``.

    Returns:
        Whatever ``nlp.evaluate`` returns for the built examples.
    """
    # Each example pairs an untagged doc (tokenised only) with its gold annotations.
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in test_data
    ]
    return nlp.evaluate(examples)

# Load the trained model
# NOTE(review): this absolute path differs from both locations the earlier
# cells save to ("trained_model" and the "NLP Project" folder); presumably it is
# a copy of the same model - confirm.
model_path = r"C:\Users\Dell\OneDrive\Desktop\trained model"
nlp = spacy.load(model_path)

# Load test data
# NOTE(review): this is the same directory the training cell reads its data
# from, so the scores below are measured on the training examples, not on
# held-out data. Point this at a separate test split for a meaningful estimate.
test_data_directory = r"C:\Users\Dell\OneDrive\Desktop\NLP Project\ner-label"
test_data = load_data(test_data_directory)

# Evaluate the model
results = evaluate_model(nlp, test_data)

# Print the evaluation results
# The overall scores are only formatted when 'ents_p' is present and not None.
print("NER Evaluation Results:")
if results.get('ents_p') is not None:
    print(f"Precision: {results['ents_p']:.2f}")
    print(f"Recall: {results['ents_r']:.2f}")
    print(f"F1-Score: {results['ents_f']:.2f}")
else:
    print("No entities were predicted. Please check your test data and model.")

# Print scores for individual entity types
# NOTE(review): the captured output of this cell lists "SKILL " and "SKILL", and
# "JOB TITLE" and "JOB_TITLE", as separate types - this looks like inconsistent
# label spelling in the annotation files; verify and normalise the labels.
print("\nScores by entity type:")
if results.get('ents_per_type'):
    for entity_type, scores in results['ents_per_type'].items():
        print(f"{entity_type}:")
        print(f"  Precision: {scores['p']:.2f}")
        print(f"  Recall: {scores['r']:.2f}")
        print(f"  F1-Score: {scores['f']:.2f}")
else:
    print("No entity types were evaluated. Please check your test data and model.")

# # Print some example predictions
# print("\nExample Predictions:")
# for text, _ in test_data[:5]:  # Print predictions for first 5 examples
#     doc = nlp(text)
#     print(f"\nText: {text}")
#     print("Predicted Entities:")
#     for ent in doc.ents:
#         print(f"  {ent.text} - {ent.label_}")


NER Evaluation Results:
Precision: 0.81
Recall: 0.79
F1-Score: 0.80

Scores by entity type:
LOCATION:
  Precision: 0.92
  Recall: 0.94
  F1-Score: 0.93
QUALIFICATION:
  Precision: 0.70
  Recall: 0.67
  F1-Score: 0.68
INDUSTRY:
  Precision: 0.69
  Recall: 0.71
  F1-Score: 0.70
SKILL :
  Precision: 0.85
  Recall: 0.85
  F1-Score: 0.85
EXPERIENCE:
  Precision: 0.93
  Recall: 0.97
  F1-Score: 0.95
SKILL:
  Precision: 0.88
  Recall: 0.85
  F1-Score: 0.87
JOB TITLE:
  Precision: 0.84
  Recall: 0.82
  F1-Score: 0.83
SALARY:
  Precision: 0.99
  Recall: 0.99
  F1-Score: 0.99
JOB_TITLE:
  Precision: 0.92
  Recall: 0.37
  F1-Score: 0.53


In [12]:
#Test the model
import spacy
from spacy import displacy

# Load the trained model
# Rebinds the notebook-global `nlp`, which visualize_ner() below reads.
# NOTE(review): same absolute path as the evaluation cell, which is not one of
# the locations the earlier cells save to - confirm it holds the intended model.
nlp = spacy.load(r"C:\Users\Dell\OneDrive\Desktop\trained model")

# Function to visualize entities in text
def visualize_ner(text):
    """Run the notebook-global ``nlp`` pipeline on ``text`` and render its entities.

    Args:
        text: The raw string to analyse.

    Returns:
        None; the entity view is rendered via ``displacy.render`` with
        ``jupyter=True``.
    """
    displacy.render(nlp(text), style="ent", jupyter=True)

In [13]:
# Specify the path to your text file
# NOTE(review): absolute, machine-specific path that mixes "\" and "/" as
# separators; confirm it resolves on the machine running the notebook.
file_path = r"C:\Users\Dell\OneDrive\Desktop\NLP Project/output_file_3.txt"

# Read the content of the file
# The whole file is read into memory as a single UTF-8 string.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()


# Render the entities found in the file's text.
visualize_ner(text)