# Domain Entity Recogniser

## Import Statements & Data Sources

The dataset can be in .csv or .txt format. In either format, the first line is treated as a header and skipped.

In [18]:
import nltk
from collections import Counter
from IPython.display import clear_output
import timeit
# Input dataset; the path is relative to this script.
datasetPath = "dataset.csv"
# Output file for the extracted entities; the path is relative to this script.
outputPath = "DomainEntity.csv"

## Getting Sentences

In [19]:
with open(datasetPath, "r") as file:
    # Skip the header row. The None default keeps an empty file from raising
    # StopIteration; it simply yields zero lines instead.
    next(file, None)
    lines = file.readlines()
# Raw data: every line trimmed of surrounding whitespace (newline included),
# lower-cased, and joined into one string separated by single spaces.
rawData = " ".join(line.strip().lower() for line in lines)

## Initialisation of Variables

In [20]:
# Result sets filled in by the helper functions below.
freqEntity = set() # Frequent n-grams collected from the whole dataset
common = set() # Frequent entities found inside runs of consecutive nouns

# Scratch state describing the run of consecutive nouns currently being read.
continuityString = "" # The nouns of the run, each followed by a space
continuityLength = 0 # How many nouns the run contains

# Dataset size and the base frequency threshold derived from it.
totalLines = len(lines)
limit = 2 + int(totalLines * 1.6e-05)

## Getting Frequent n-grams

Accepts "n" and adds the frequent n-grams to the set of frequent entities. An n-gram counts as frequent when it occurs more than `(max_n - n + 1) * limit` times.

In [21]:
def getFreqNGram(n, max_n=5):
    """Add every frequent n-gram of the raw data to ``freqEntity``.

    The whitespace-separated words of ``rawData`` are grouped into n-grams
    and counted. An n-gram is kept when its count is strictly greater than
    ``(max_n - n + 1) * limit``, so shorter n-grams need more occurrences
    than longer ones.

    Args:
        n: Number of words per n-gram.
        max_n: Largest n-gram size in use; only affects the threshold.
    """
    # The threshold does not depend on the n-gram, so compute it once.
    threshold = (max_n - n + 1) * limit
    counts = Counter(nltk.ngrams(rawData.split(), n))
    # Only membership matters here, so iterate the counts directly instead
    # of sorting the whole counter with most_common().
    freqEntity.update(
        " ".join(gram) for gram, count in counts.items() if count > threshold
    )

## Matching n-grams

Accepts "n" and checks whether the n-grams in the current string of continuous nouns are frequent entities. Matches are recorded as domain entities and removed from the string.

In [22]:
def matchNGram(n):
    """Pull frequent n-grams out of the current run of consecutive nouns.

    Scans the words of ``continuityString`` from left to right. Whenever
    ``n`` adjacent words, lower-cased and joined by spaces, are found in
    ``freqEntity``, the phrase is added to ``common`` with its original
    casing and those words are removed from the run. Afterwards
    ``continuityString`` and ``continuityLength`` describe the words that
    are left.

    Removal is done on whole words. The previous ``str.replace`` worked on
    characters, so it could cut the phrase out of the middle of other words,
    and it removed every occurrence while the counter dropped by only ``n``.

    Args:
        n: Number of words per n-gram; values below 1 are ignored.
    """
    global continuityLength
    global continuityString
    if n < 1:
        return
    tokens = continuityString.split()
    position = 0
    # Stops by itself once fewer than n words remain after the position.
    while position + n <= len(tokens):
        phrase = " ".join(tokens[position:position + n])
        if phrase.lower() in freqEntity:
            common.add(phrase) # Add entity to the set
            # Consume the matched words. The position is not advanced, so the
            # next n-gram starts right after the removed entity and never
            # reuses a word that was already consumed.
            del tokens[position:position + n]
        else:
            position += 1
    # Rebuild the run in its usual "word word " form and resync the counter.
    continuityString = "".join(token + " " for token in tokens)
    continuityLength = len(tokens)

## Getting Domain Entities

Extracts the domain entities in a continuous string of nouns

In [23]:
def getDomainEntity():
    """Match the current run of nouns against frequent n-grams, longest first.

    Sizes 5 down to 2 are tried only while the run still holds at least that
    many nouns; the length is re-read before every size because matchNGram
    updates it. Single words are always tried last.
    """
    for size in (5, 4, 3, 2):
        if continuityLength >= size:
            matchNGram(size)
    matchNGram(1)

## Main

Stores the frequent n-grams, for n ranging from 1 to 5, in a set.
Then writes the domain entities found in each sentence to the output file.

In [24]:
print("Initializing...")
for n in range(1,6):
    getFreqNGram(n)

# POS tags that count as nouns when building a run of consecutive nouns.
nounTags = {"NN", "NNS", "NNP", "NNPS"}
# Report progress roughly every 0.1% of the dataset. The interval is clamped
# to 1 because int(totalLines / 1000) is 0 for datasets under 1000 lines,
# which made the modulo below raise ZeroDivisionError.
progressStep = max(1, totalLines // 1000)

start = timeit.default_timer() # For indicating status of loop
# File initialization; the with-block closes the output even if tagging fails.
with open(outputPath, "w") as file:
    file.write("SENTENCE ID,NAMED ENTITES\n")
    for sentid, sent in enumerate(lines):
        file.write(str(sentid+1) + ",") # Sentence ID
        # Skip sentences of length 1 (or empty); they still get an empty row
        if len(sent.split()) <= 1:
            file.write("\n")
            continue
        tagged = nltk.pos_tag(nltk.word_tokenize(sent)) # POS tagged sentence
        # Reset the per-sentence state shared with the helper functions
        continuityLength = 0
        continuityString = ""
        common.clear()
        for (word, tag) in tagged:
            if tag in nounTags:
                # Extend the current run of consecutive nouns
                continuityLength += 1
                continuityString = continuityString + word + " "
            elif continuityLength > 0:
                # A non-noun ends the run: extract its entities and start over
                getDomainEntity()
                continuityLength = 0
                continuityString = ""
        # The sentence may end while a run of nouns is still open
        if continuityLength > 0:
            getDomainEntity()
        # Sets iterate in arbitrary order; sorting makes the output file
        # identical from one run to the next.
        for entity in sorted(common):
            file.write(entity + "|")
        file.write("\n")
        # For indicating status of loop
        if sentid % progressStep == 0:
            elapsed = timeit.default_timer() - start
            fraction = sentid / totalLines
            if fraction * 100 < 5:
                # Too early for a meaningful extrapolation
                expected_time = "Calculating..."
            else:
                # Extrapolate the total run time from the share done so far
                expected_time = str(round((elapsed / fraction) / 60, 2)) + " minutes"
            clear_output(wait = True)
            print("Progress:", round(fraction * 100, 2), "%")
            print("Run Time:", round(elapsed / 60, 2), "minutes")
            print("Expected Run Time:", expected_time)
stop = timeit.default_timer()
clear_output(wait = True)
print("Progress: 100 %")
print("Run Time:", round((stop - start) / 60, 2), "minutes")

Progress: 100 %
Run Time: 5.15 minutes
