Imports

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Sentence, Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
import pandas as pd

In [2]:
data_json = pd.read_json('../../data/YungJunData/NLPoutput.json', lines = True)

data_json["content"] = data_json["content"].apply(lambda x: x.replace("\n", " "))
data_json["content"] = data_json["content"].apply(lambda x: x.lower())

data_json["content"].head(5)

0    contact www.linkedin.com/in/hongong (linkedin)...
1    contact www.linkedin.com/in/jun-wai- chin-ob80...
2    contact www.linkedin.com/in/mark-tan-bc (linke...
3    contact www.linkedin.com/in/lohys68 (linkedin)...
4    contact www.linkedin.com/in/adi-zafri- boba6a1...
Name: content, dtype: object

In [3]:
#NAM - Name
#COL - CollegeName
#GRA - GraduationYear
#YOE - YearsofExperience
#COM - CompaniesWorkAt
#EMA - Email
#LOC - Location
#SKI - Skills
data_json.iloc[0].content[2510]

'1'

In [4]:
data_json.iloc[69].annotation

[{'label': ['Name'],
  'points': [{'start': 115, 'end': 124, 'text': 'Travis Ng'}]},
 {'label': ['CollegeName'],
  'points': [{'start': 1593,
    'end': 1630,
    'text': 'Tunku Abdul Rahman University College'}]},
 {'label': ['GraduationYear'],
  'points': [{'start': 1456, 'end': 1460, 'text': '2014'}]},
 {'label': ['YearsofExperience'],
  'points': [{'start': 352, 'end': 368, 'text': '3 years 6 months'}]},
 {'label': ['CompaniesWorkAt'],
  'points': [{'start': 150,
    'end': 177,
    'text': 'Oriental Steel Pipe Sdn Bhd'}]},
 {'label': ['Email'],
  'points': [{'start': 8,
    'end': 48,
    'text': 'www.linkedin.com/in/travis- ng-9a391213b'}]},
 {'label': ['Skills'],
  'points': [{'start': 71, 'end': 81, 'text': 'Accounting'}]},
 {'label': ['Skills'],
  'points': [{'start': 82, 'end': 98, 'text': 'Microsoft Office'}]},
 {'label': ['Skills'],
  'points': [{'start': 99, 'end': 114, 'text': 'Microsoft Excel'}]}]

## Preprocessing

In [5]:
def json_to_BIO(row):
    #{8:B-EMA}
    tags = {'Name':'NAM',
            'CollegeName':'COL',
            'GraduationYear':'GRA',
            'YearsofExperience':'YOE',
            'CompaniesWorkAt':'COM',
            'Email':'EMA',
            'Location':'LOC',
            'Skills':'SKI'}
    annot = dict()
    start_l = list()
    
    for label in row['annotation']:
        start = label['points'][0]['start']
        end = label['points'][0]['end']
        start_l.append(start)
        for i in range(start,end):
            annot[i] = tags[label['label'][0]]
        
    
    text = row['content']
    tokens = row['content'].split(" ")
    
    tagging = list()
    idx = 0
    buffer = ""
    entity = ""
    pos = ""
    wasO = False
    ignore = ['(',')','*',',','.']
    for idx,char in enumerate(text):
        if char != ' ':
            if char not in ignore:
                if buffer == "":
                    if idx in annot.keys():
                        if wasO or idx in start_l:
                            pos = 'B'
                        else:
                            pos = 'I'
                        entity = annot[idx]
                    else:
                        pos = 'O'
                buffer += char
        else:
            if entity != "":
                wasO = False
                tagging.append(buffer + " " + pos + "-" + entity)
            else:
                wasO = True
                tagging.append(buffer + " " + pos)
            buffer = ""
            entity = ""
            pos = ""
    return tagging

data = data_json.apply(json_to_BIO, axis=1)
#print(data[0])

In [6]:
data_new = list()
for i in data:
    data_new.append([j for j in i if j != " "])

In [7]:
train_data = ""
test_data = ""
dev_data = ""
for i in data_new[:70]:
    train_data += "\n".join(i)
    train_data += "\n\n"

for i in data_new[70:85]:
    test_data += "\n".join(i)
    test_data += "\n\n"
    
for i in data_new[85:]:
    dev_data += "\n".join(i)
    dev_data += "\n\n"

with open("processed/train_data.txt", "w") as text_file:
    text_file.write(train_data)

with open("processed/test_data.txt", "w") as text_file:
    text_file.write(test_data)

with open("processed/dev_data.txt", "w") as text_file:
    text_file.write(dev_data)


In [8]:
# define columns
columns = {0: 'text', 1:'ner'}

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus('processed', columns,
                              train_file='train_data.txt',
                              test_file='test_data.txt',
                              dev_file='dev_data.txt')

2021-12-12 14:59:27,996 Reading data from processed
2021-12-12 14:59:27,998 Train: processed/train_data.txt
2021-12-12 14:59:27,998 Dev: processed/dev_data.txt
2021-12-12 14:59:27,999 Test: processed/test_data.txt


In [9]:
label_type = 'ner'

#make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

2021-12-12 14:59:28,499 Computing label dictionary. Progress:


100%|█████████████████████████████████████████| 70/70 [00:00<00:00, 1333.58it/s]

2021-12-12 14:59:28,575 Corpus contains the labels: ner (#22313)
2021-12-12 14:59:28,575 Created (for label 'ner') Dictionary with 17 tags: <unk>, O, B-EMA, B-SKI, I-SKI, B-NAM, I-NAM, B-COM, I-COM, B-YOE, I-YOE, B-LOC, I-LOC, B-COL, I-COL, B-GRA, I-EMA
Dictionary with 17 tags: <unk>, O, B-EMA, B-SKI, I-SKI, B-NAM, I-NAM, B-COM, I-COM, B-YOE, I-YOE, B-LOC, I-LOC, B-COL, I-COL, B-GRA, I-EMA





In [10]:
#initialize embedding stack with Flair and GloVe
embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)


In [11]:
#initialize sequence tagger
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

In [12]:
#initialize trainer
trainer = ModelTrainer(tagger, corpus)


In [13]:
#training
trainer.train('resources/taggers/sota-ner-flair',
              learning_rate=0.1,
              mini_batch_size=1,
              max_epochs=60)

2021-12-12 14:59:35,375 ----------------------------------------------------------------------------------------------------
2021-12-12 14:59:35,376 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, b

{'test_score': 0.5936073059360729,
 'dev_score_history': [0.0,
  0.0,
  0.2368421052631579,
  0.22580645161290322,
  0.3426294820717131,
  0.3333333333333333,
  0.4000000000000001,
  0.46009389671361506,
  0.4824561403508772,
  0.3724137931034483,
  0.5447761194029851,
  0.46511627906976744,
  0.49659863945578236,
  0.5088339222614842,
  0.570281124497992,
  0.5916666666666667,
  0.6042553191489363,
  0.5909090909090908,
  0.6178861788617886,
  0.6008583690987124,
  0.5321100917431193,
  0.5941422594142258,
  0.6224066390041493,
  0.5689655172413793,
  0.6264150943396226,
  0.6007604562737644,
  0.5514018691588785,
  0.616600790513834,
  0.6173913043478261,
  0.6346863468634687,
  0.6610169491525424,
  0.6446886446886447,
  0.6206896551724138,
  0.6255144032921811,
  0.6561264822134387,
  0.644194756554307,
  0.6220472440944882,
  0.631578947368421,
  0.6459143968871595,
  0.6339622641509434,
  0.6356589147286822,
  0.6468401486988848,
  0.6307692307692307,
  0.6307692307692307,
  0.63

In [17]:
import torch
import gc
del trainer
gc.collect()

172

In [18]:
torch.cuda.empty_cache()

In [14]:
# load the model
model = SequenceTagger.load('resources/taggers/sota-ner-flair/final-model.pt')


2021-12-12 15:37:33,959 loading file resources/taggers/sota-ner-flair/final-model.pt


In [26]:
test = """Contact

www.linkedin.com/in/john-ooi- a593b145 (LinkedIn)

Top Skills Firewalls

Security TCP/IP

Certifications

Tellabs - Certificate of Achievement Tellabs Managed Edge System

Tellabs 8600 Hardware Interface Module Installation and Replacement Certification

Certified Information System Security Professional (CISSP)

Certified Penetration Testing Engineer

AlienVault Certified Security Engineer
John Ooi

Principal Security Consultant Seiangoi

Experience

SysArmy Sdn Bhd Principal Security Consultant November 2017 - Present (4 years 2 months)

PKF Malaysia Senior Security Consultant November 2016 - October 2017 (1 year)

e-Cop Surveillance Sdn Bhd Security Consultant November 2012 - October 2016 (4 years)

* Supporting various of Managed Security Service clients.
* Work closely with SOC team to provide 2nd level of support.
* Involved in pre-sales activities for market leading ICT security solutions such as IFS, APT, SIEM etc.
* Provision of Technical Pre-sales and Post-Sales support to assigned

accounts (Financial Service Sector) * Preparation of Technical Proposals, Tender Documentation, Presentations, Technical Briefings and follow-up discussion, implementation and review * Preparation of Monthly Security Incident Reports and Presentation * Liaising with clients/users with regards to security incident reports and

handling * Provision of Advisories and Technical Recommendations to clients to enhance security network posture, controls, policies, processes and practices.
* Propose technical solutions and functional consultation and support in project implementation with excellent customer service skills.
* Provide guidance to Security Engineer in project implementation & troubleshooting.
* Involve in operational services: security compliance, gap analysis, risk management, resource violation and external gateways perimeter review.

AT&T Communication Services Project Validation & Guidance Coordinator April 2011 - November 2012 (1 year 8 months)

Page 1 of 2
 
Diversified Gateway Berhard Network Support Engineer March 2009 - April 2011 (2 years 2 months)

Satyam Computer Services Ltd Network & System Support Engineer July 2007 - March 2009 (1 year 9 months)

Education

Universiti Putra Malaysia Computer Science, Networking

Page 2 of 2"""


# create example sentence
sentence = Sentence(test)

# predict tags and print
model.predict(sentence)

for entity in sentence.get_spans('ner'):
    print(entity)

Span [14,15]: "Firewalls Security"   [− Labels: SKI (0.7791)]
Span [16]: "TCP"   [− Labels: SKI (0.985)]
Span [54,55]: "John Ooi"   [− Labels: NAM (0.9992)]
Span [61,62,63]: "SysArmy Sdn Bhd"   [− Labels: COM (0.9715)]
Span [72,73,74,75]: "4 years 2 months"   [− Labels: YOE (0.9453)]
Span [346,347,348]: "Universiti Putra Malaysia"   [− Labels: COL (0.9492)]
