### NLTK Named Entity Recognition

In [12]:
import nltk
from nltk import word_tokenize, sent_tokenize
import pandas as pd
from nltk.tag import pos_tag
# Fetch the NLTK resources this notebook relies on. nltk.download() reports
# "already up-to-date" for packages that are present, so re-running is cheap.
nltk.download('tagsets')  # tag documentation shown by nltk.help below
nltk.help.upenn_tagset('NNP')
nltk.help.upenn_tagset('NN')
# word_tokenize() and pos_tag() used further down need these two models;
# without them a fresh environment fails with LookupError before chunking.
# NOTE(review): newer NLTK releases may ship these under different package
# names (e.g. 'punkt_tab') - confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Resources required by nltk.ne_chunk().
nltk.download('maxent_ne_chunker')
nltk.download('words')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to /home/kurubal/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/kurubal/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/kurubal/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Corpus file to analyse and the cap on how much of it is loaded.
corpus_path = "/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Pyspark/ReDe/tr.txt"
max_chars = 1000000

# read(n) on a text-mode file returns at most n characters, so only the head
# of the file ends up in `text`.
with open(corpus_path, mode="r", encoding="utf8") as file:
    text = file.read(max_chars)

In [None]:
# Tokenise -> POS-tag -> chunk. With binary=True the chunker only marks
# whether a span is a named entity; it does not assign an entity type.
words = word_tokenize(text)
pos_tags = pos_tag(words)  # same tagger as nltk.pos_tag, via the nltk.tag import
chunks = nltk.ne_chunk(
    pos_tags,
    binary=True,
)

In [None]:
# Walk the top level of the chunk tree and keep the named-entity subtrees.
entities = []
labels = []
for chunk in chunks:
    # Only entity subtrees expose `.label`; ordinary tokens are skipped here.
    if hasattr(chunk, 'label'):
        # Each leaf is indexable with the word at position 0; join the words
        # to rebuild the entity's surface text.
        entities.append(' '.join(leaf[0] for leaf in chunk))
        labels.append(chunk.label())

# De-duplicate, then sort: set iteration order is not stable between kernel
# sessions, and sorting keeps the table (and the exported file) reproducible.
entities_labels = sorted(set(zip(entities, labels)))

# Passing `columns=` up front keeps this working when no entity was found;
# assigning two column names to an empty, column-less frame raises ValueError.
entities_df = pd.DataFrame(entities_labels, columns=["Entities", "Labels"])
entities_df

In [None]:
# Persist the de-duplicated NLTK entity table as a single-sheet workbook;
# the DataFrame index is left out of the file.
entities_df.to_excel(
    "NLTK_NE.xlsx",
    sheet_name="NE",
    index=False,
)

### spaCy Named Entity Recognition

In [None]:
# spaCy must be installed in the kernel's environment; if it is missing, run once: %pip install spacy

In [None]:
from collections import Counter

import spacy
from spacy import displacy

# Multi-language NER model used by the spaCy cells below.
# Download it only when it is not installed yet, so that re-running the
# notebook does not trigger a network install every time.
if not spacy.util.is_package("xx_ent_wiki_sm"):
    spacy.cli.download("xx_ent_wiki_sm")

In [None]:
# "xx_ent_wiki_sm" is the multi-language model. The components named below
# are switched off at load time because only entity extraction is used here.
disabled_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("xx_ent_wiki_sm", disable=disabled_components)

In [None]:
# Sample sentence for a quick check of the spaCy model.
# Note: this rebinds `text`, which earlier held the corpus read for NLTK.
text = (
    "Prime Minister Narendra Modi on Tuesday announced "
    "the 266 billion dollars package for the India "
    "to fight against the coronavirus pandemic."
)

In [None]:
# Run the loaded spaCy pipeline on the sample sentence; the recognised
# entities are read from the result's `.ents` in the next cell.
NER = nlp(text)

In [None]:
# Show each recognised entity span next to its label.
for ent in NER.ents:
    print(ent.text, ent.label_)

In [None]:
# Read the corpus split line by line. The list is created up front so later
# cells always find `text_list`, even when the file cannot be opened.
text_list = []
try:
    with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Pyspark/ReDe/xaa.tr", "r", encoding="utf-8", buffering=1000000) as file:
        # One entry per line; each entry keeps its trailing newline.
        text_list.extend(file)
except OSError as err:
    # Only I/O failures (missing file, wrong path, permissions) are reported
    # here. Other problems, such as a decoding error or a keyboard interrupt,
    # now surface instead of being hidden behind this message.
    print("There is not such a file  or path is incorrect")
    print(err)
else:
    # Print a short preview rather than the whole list: dumping every line of
    # a large file floods the notebook output.
    print(len(text_list), "lines read; first 3:", text_list[:3])

In [None]:
# Run NER over every line of `text_list` and flatten the results into two
# parallel lists (entity text, entity label).
# nlp.pipe() processes the texts as a batched stream and yields the docs in
# input order, so the rows match calling nlp(line) one by one, but faster.
# The loop uses its own variable, so the `NER` doc from the single-sentence
# example above is no longer overwritten.
text_ent = []
label_ent = []
for doc in nlp.pipe(text_list):
    for ent in doc.ents:
        text_ent.append(ent.text)
        label_ent.append(ent.label_)

# One row per entity mention (duplicates are kept), built in a single call.
df_ne_spc = pd.DataFrame({"text": text_ent, "label": label_ent})
df_ne_spc

In [None]:
# Export the spaCy entity table to a single-sheet workbook, without the index.
# NOTE(review): an .xlsx sheet holds at most 1,048,576 rows - confirm the
# number of entity rows stays below that limit for the full corpus.
df_ne_spc.to_excel(
    "Spacy_NE_50MB.xlsx",
    sheet_name="NE",
    index=False,
)

### Spark NLP Named Entity Recognition

In [4]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Create the Spark session through sparknlp.start(), then report the library
# versions so the run can be reproduced later.
spark = sparknlp.start()

for template, version in (
    ("Spark NLP version: {}", sparknlp.version()),
    ("Apache Spark version: {}", spark.version),
):
    print(template.format(version))

Spark NLP version: 3.3.1
Apache Spark version: 3.1.2


In [5]:
# Name of the pretrained Spark NLP pipeline; it is downloaded on first use
# (see the progress messages in the cell output).
pipeline_name = 'onto_recognize_entities_bert_tiny'
pipeline = PretrainedPipeline(pipeline_name)

onto_recognize_entities_bert_tiny download started this may take some time.
Approx size to download 30.2 MB
[ | ]onto_recognize_entities_bert_tiny download started this may take some time.
Approximate size to download 30.2 MB
[ / ]Download done! Loading the resource.
[ — ]

2021-11-28 17:53:03.838984: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-28 17:53:03.916191: I external/org_tensorflow/tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3599500000 Hz
2021-11-28 17:53:05.551114: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1139] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the f

[OK!]


In [6]:
# Annotate one sample sentence. The result is indexed by annotation name;
# the cells below read its 'ner' and 'token' entries.
sample_sentence = 'Donald John Trump (born June 14, 1946) is the 45th and current president of the United States.'
result = pipeline.annotate(sample_sentence)

In [8]:
# Show the per-token entity tags (B-/I-/O scheme, as seen in the output below).
print(result['ner'])

['B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'B-ORDINAL', 'O', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'I-GPE']


In [10]:
# `result` holds the annotations of ONE sentence. Iterating over it walks its
# keys, so the earlier comprehension only repeated the same list once per key
# and never used the loop variable. Wrap each list exactly once instead;
# `ner[0]` / `token[0]` keep working for the cells that follow.
ner = [result['ner']]
token = [result['token']]

# Pair every token with its entity tag.
list(zip(token[0], ner[0]))

[('Donald', 'B-PERSON'),
 ('John', 'I-PERSON'),
 ('Trump', 'I-PERSON'),
 ('(born', 'O'),
 ('June', 'B-DATE'),
 ('14', 'I-DATE'),
 (',', 'I-DATE'),
 ('1946', 'I-DATE'),
 (')', 'O'),
 ('is', 'O'),
 ('the', 'O'),
 ('45th', 'B-ORDINAL'),
 ('and', 'O'),
 ('current', 'O'),
 ('president', 'O'),
 ('of', 'O'),
 ('the', 'B-GPE'),
 ('United', 'I-GPE'),
 ('States.', 'I-GPE')]

In [13]:
# Tabulate the (token, tag) pairs; the columns keep pandas' default integer
# names (0 and 1).
token_tag_pairs = list(zip(token[0], ner[0]))
pd.DataFrame(token_tag_pairs)

Unnamed: 0,0,1
0,Donald,B-PERSON
1,John,I-PERSON
2,Trump,I-PERSON
3,(born,O
4,June,B-DATE
5,14,I-DATE
6,",",I-DATE
7,1946,I-DATE
8,),O
9,is,O
