<a href="https://colab.research.google.com/github/Chiamakac/TRAININGS/blob/main/Alignment/Projection_Work/SPACY_NER/Spacy_IgboNER_DEV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Developing an IgboNER tagger with spaCy

In [None]:
# Download the Igbo splits of MasakhaNER 2.0 and MasakhaNER 1.0 into the folder that the
# merge step reads ('/content/masakhane'). Both releases use the same file names
# (dev/test/train.txt), so each download gets an explicit, distinct target name via -O;
# otherwise wget would save the second set as dev.txt.1, ... in the working directory.
!mkdir -p /content/masakhane
!wget -O /content/masakhane/masakhaner2_dev.txt https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/MasakhaNER2.0/data/ibo/dev.txt
!wget -O /content/masakhane/masakhaner2_test.txt https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/MasakhaNER2.0/data/ibo/test.txt
!wget -O /content/masakhane/masakhaner2_train.txt https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/MasakhaNER2.0/data/ibo/train.txt
!wget -O /content/masakhane/masakhaner1_dev.txt https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/ibo/dev.txt
!wget -O /content/masakhane/masakhaner1_test.txt https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/ibo/test.txt
!wget -O /content/masakhane/masakhaner1_train.txt https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/ibo/train.txt

In [None]:
import os

# Function to merge text files in a directory
def merge_text_files(directory, output_file):
    """Concatenate every ``.txt`` file found in ``directory`` into ``output_file``.

    Files are merged in sorted file-name order so the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). All files are read and
    written as UTF-8. A newline is appended after any non-empty file that does
    not end with one, so the last line of one file never fuses with the first
    line of the next.

    Args:
        directory: Path of the folder to scan (not recursive).
        output_file: Path of the file to create or overwrite.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
    """
    output_path = os.path.abspath(output_file)
    with open(output_file, 'w', encoding='utf-8') as merged_file:
        for file_name in sorted(os.listdir(directory)):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(directory, file_name)
            # The output is already created (and empty) at this point; if it lives
            # inside `directory`, do not feed it back into itself.
            if os.path.abspath(file_path) == output_path:
                continue
            with open(file_path, 'r', encoding='utf-8') as current_file:
                content = current_file.read()
            merged_file.write(content)
            if content and not content.endswith('\n'):
                merged_file.write('\n')

# Folder that holds the downloaded .txt files
text_files_directory = '/content/masakhane'
# Destination file for the concatenated data
output_file = 'merged_output.txt'

# Build the merged corpus file
merge_text_files(directory=text_files_directory, output_file=output_file)

In [None]:
# Read 'merged_output.txt' and split every line on whitespace.
# Each entry of `lines` is the list of fields of one line; blank lines become empty lists.
# The file is opened as UTF-8 inside a `with` block so the handle is closed afterwards.
with open('merged_output.txt', encoding='utf-8') as merged_file:
    lines = [line.strip().split() for line in merged_file]

# Join the first field of every non-empty line with single spaces
# to obtain one string containing all tokens.
text = " ".join([l[0] for l in lines if l])


In [None]:
def join_spans(tag_list):
    """Group consecutive (token, tag) pairs into entity spans.

    A pair whose tag starts with 'I-' extends the span before it, but only when
    both carry the same entity type; its token is appended with a single space.
    Every other pair (including an 'I-' tag whose type differs from the span
    before it, or one with no span before it) opens a new span.

    Args:
        tag_list: Sequence of two-item (token, tag) entries, where each tag has
            a two-character prefix such as 'B-' or 'I-' followed by the type.

    Returns:
        A list of (span_text, entity_type) tuples in input order, where
        entity_type is the tag with its two-character prefix removed.
    """
    tag_spans = []

    for token, tag in tag_list:
        label = tag[2:]  # drop the 'B-' / 'I-' prefix
        continues_previous = (
            tag.startswith('I-')
            and tag_spans
            and tag_spans[-1][1] == label
        )
        if continues_previous:
            # Same entity type as the open span: extend it.
            previous_text, _ = tag_spans[-1]
            tag_spans[-1] = (previous_text + ' ' + token, label)
        else:
            # Start of a new span (or a continuation tag of a different type).
            tag_spans.append((token, label))

    return tag_spans

# Keep only the annotated rows (non-blank lines whose tag is not 'O'),
# then group them into entity spans with join_spans.
tagged_rows = [row for row in lines if row and row[1] != 'O']
ner_taglist = join_spans(tagged_rows)

# Explanation:
# The join_spans function iterates through the tokens and their corresponding tags, merging tokens with the same NER tag to form spans.
# It uses a while loop to merge contiguous tokens with tags starting with 'I'.
# The merged tokens and their corresponding tags are appended to the tag_spans list.
# Finally, the function returns a list of tuples, each holding a merged token span and its entity type (the NER tag with its two-character 'B-' or 'I-' prefix removed).

In [None]:
# Collect the span texts per entity type (label ending in 'LOC', 'DATE' or 'ORG')
locs = [span for span, label in ner_taglist if label.endswith('LOC')]
dates = [span for span, label in ner_taglist if label.endswith('DATE')]
orgs = [span for span, label in ner_taglist if label.endswith('ORG')]

# Same for 'PER', but drop spans that consist of a lone '-'
pers = [span for span, label in ner_taglist if span != '-' and label.endswith('PER')]

In [None]:
# Number of distinct strings collected in `locs`
len(set(locs))

1294

In [None]:
import spacy
from spacy import displacy

# Start from the small English pipeline with its NER and parser components switched off
# (the English model is a placeholder to be replaced later).
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Append spaCy's rule-based EntityRuler to the pipeline
ruler = nlp.add_pipe('entity_ruler')

# One phrase pattern per distinct span text, in the order LOC, DATE, ORG, PER
patterns = [
    {'label': label, 'pattern': surface_form}
    for label, surface_forms in (
        ('LOC', locs),
        ('DATE', dates),
        ('ORG', orgs),
        ('PER', pers),
    )
    for surface_form in set(surface_forms)
]

# Register all patterns with the ruler
ruler.add_patterns(patterns)

In [None]:
# Sample Igbo text, split into a list with one string per line; used below to try out the entity ruler.
sentences = """Buhari nọ n'isi gọọmentị mpụ na nrụrụaka - Saraki Ọ pụtara na enweghi ihe e mere mekaa ndị Igbo ?
ASUU strike : Kedụ mgbe ọ ga - ebi , ka Willie Obiano jụrụ?
Sineti sị gọọmenti kpọchita Onnoghen Ndị Sịnetị a bụ nke onyeisi ha bụ Bukola Saraki bụ nke agba asatọ n'ime ọchịchị onye kwuo uche ya.
Ndị Sịnetị a bụ nke onyeisi ha bụ Bukola Saraki bụ nke agba asatọ n'ime ọchịchị onye - kwuo - uche - ya
Nkeiruka, Adamu, Willie, Kuryas gara Anambra na Mee afọ 2021.
N'aka ọzọ , ụlọomeiwu Sineti akpọpụla gọọmenti etiti ụlọikpe ka ha were ọsọ were ije kpọchite Onnoghen bụ onye bụbụ onyeisi ndị okaikpe na Naịjirịa .""".splitlines()

In [None]:
# Run the pipeline on one sample sentence (index 5 of `sentences`)
doc = nlp(sentences[5])

# List every detected entity with its character offsets and label
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

# Highlight the entities inline with displacy
displacy.render(doc, style="ent", jupyter=True)

Naịjirịa 139 147 LOC


In [None]:
# Run the entity ruler over every sample sentence
for sentence in sentences:
    doc = nlp(sentence)

    # Nothing recognised: echo the raw sentence and move on
    if not doc.ents:
        print( sentence)
        continue

    # Otherwise list each entity with its character offsets and label,
    # then highlight them inline with displacy
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    displacy.render(doc, style="ent", jupyter=True)


Buhari 0 6 PER


Willie Obiano 40 53 PER


Sineti sị gọọmenti kpọchita Onnoghen Ndị Sịnetị a bụ nke onyeisi ha bụ Bukola Saraki bụ nke agba asatọ n'ime ọchịchị onye kwuo uche ya.
Ndị Sịnetị a bụ nke onyeisi ha bụ Bukola Saraki bụ nke agba asatọ n'ime ọchịchị onye - kwuo - uche - ya
Nkeiruka 0 8 PER
Adamu 10 15 PER
Willie 17 23 PER
Kuryas 25 31 PER
Anambra 37 44 LOC
2021 56 60 DATE


Naịjirịa 139 147 LOC


In [None]:
# Function to save unique tokens to a file
def save_unique_tokens_to_file(tokens, filename):
    """Write the distinct entries of ``tokens`` to ``filename``, one per line.

    Duplicates are removed and the remaining entries are written in sorted
    order, so repeated runs produce an identical file. The file is written as
    UTF-8 regardless of the platform's default encoding.

    Args:
        tokens: Iterable of strings (may contain duplicates).
        filename: Path of the file to create or overwrite.
    """
    unique_tokens = sorted(set(tokens))  # de-duplicate, then fix the order
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(token + '\n' for token in unique_tokens)

# Requires the 'locs', 'dates', 'orgs' and 'pers' lists built earlier.
# Write the distinct entries of each list to its own file (LOC, DATE, ORG, PER in that order).
for tokens, filename in (
    (locs, 'unique_tokens_loc.txt'),
    (dates, 'unique_tokens_date.txt'),
    (orgs, 'unique_tokens_org.txt'),
    (pers, 'unique_tokens_per.txt'),
):
    save_unique_tokens_to_file(tokens, filename)
