# Generating training data for multiple entities

|| ================================================================ ||

# Cleaning the data and automatically creating JSON files

In [1]:
import generatedata as gd
import pandas as pd

In [2]:
# Load the labelled entity table. `Category` is the label column; the same column
# name is later passed as `feature_name` / `feature` to the helper modules.
df = pd.read_csv('./ner.csv')

# Normalise the labels in a single pass: lowercase, then remove spaces.
# The vectorised `.str` accessors propagate missing values instead of raising,
# which a Python-level `value.replace(...)` loop would do on a NaN category.
df['Category'] = df['Category'].str.lower().str.replace(' ', '', regex=False)

# Distinct labels in order of first appearance; rows without a category are skipped.
entity_list = df['Category'].dropna().unique().tolist()


In [3]:
# Display the distinct, normalised category labels (still in order of first appearance).
entity_list

['product',
 'work_of_art',
 'ordinal',
 'fundhouse',
 'media',
 'date',
 'gpe',
 'cardinal',
 'norp',
 'performance',
 'financialproduct',
 'regulator',
 'measure',
 'law',
 'mfconcept',
 'money',
 'products',
 'expenseratio',
 'productaddon',
 'customeridentifier',
 'portfolio',
 'kyc',
 'org',
 'ratingagency',
 'productinsight',
 'startegy',
 'strategy',
 'benchmark',
 'return',
 'investortype',
 'taxation',
 'banking',
 'finacialproductidentifier',
 'payoutmethod',
 'b2c',
 'distribution',
 'transactionmethod',
 'time',
 'trxnplatform',
 'bankingidentifier',
 'sid_kim_sai',
 'customerservice',
 'r&t',
 'frequency',
 'activeproduct',
 'passiveproduct',
 'consulting',
 'person',
 'ageband',
 'financialorg',
 'loc',
 'investing',
 'quantity',
 'fac',
 'digitalplatforms',
 'rta',
 'bank',
 'nonfinancialproduct',
 'event',
 'rtaexpert',
 'socialmedia',
 'businessfunction',
 'assetallocation',
 'gov',
 'financialinstrument',
 'ratings',
 'entity',
 'mutualfund',
 'process',
 'charges',
 

In [4]:
# Alphabetise the labels in place; `list.sort()` returns None, so this cell shows no output.
entity_list.sort()

In [5]:
# Display the labels again to confirm they are now in alphabetical order.
entity_list

['activeproduct',
 'ageband',
 'assetallocation',
 'b2c',
 'bank',
 'banking',
 'bankingidentifier',
 'benchmark',
 'businessfunction',
 'cardinal',
 'charges',
 'consulting',
 'customeridentifier',
 'customerservice',
 'date',
 'digitalplatforms',
 'distribution',
 'entity',
 'event',
 'expenseratio',
 'fac',
 'famousinvestor',
 'finacialproductidentifier',
 'financialinstrument',
 'financialorg',
 'financialplanning',
 'financialproduct',
 'frequency',
 'fundhouse',
 'gov',
 'gpe',
 'investing',
 'investortype',
 'kyc',
 'law',
 'loc',
 'measure',
 'media',
 'mfconcept',
 'money',
 'mutualfund',
 'nonfinancialproduct',
 'norp',
 'ordinal',
 'org',
 'passiveproduct',
 'payoutmethod',
 'performance',
 'person',
 'portfolio',
 'process',
 'product',
 'productaddon',
 'productinsight',
 'products',
 'quantity',
 'r&t',
 'ratingagency',
 'ratings',
 'regulator',
 'return',
 'rta',
 'rtaexpert',
 'sid_kim_sai',
 'socialmedia',
 'startegy',
 'strategy',
 'taxation',
 'time',
 'transactionme

# Storing the JSON files in a directory

In [5]:
# Write the JSON files for the entities in `entity_list` into the target folder.
# NOTE(review): presumably one `<label>.json` per entry of `list_name` — confirm in generatedata.
gd.createjson(
    path='E:/CustomNER/folder/',
    feature_name='Category',
    name='Name',
    list_name=entity_list,
    dataset=df,
)

In [6]:
from nercustom_automate import AutoCreatearr, GenerateRules

# Helper object pointed at the same CSV file that was read into `df`.
arr = AutoCreatearr(file='./ner.csv')

# Generating the training corpus

In [7]:
# Build the training corpus from the JSON folder, using the `Category` feature.
training_corp = arr.get_values(
    dir='E:/CustomNER/folder',
    feature='Category',
)

In [8]:
# Inspect the result: a list of (json_file_path, label) tuples.
training_corp

[('E:/CustomNER/folder/activeproduct.json', 'active product'),
 ('E:/CustomNER/folder/ageband.json', 'age band'),
 ('E:/CustomNER/folder/assetallocation.json', 'asset allocation'),
 ('E:/CustomNER/folder/b2c.json', 'b2c'),
 ('E:/CustomNER/folder/bank.json', 'bank'),
 ('E:/CustomNER/folder/banking.json', 'banking'),
 ('E:/CustomNER/folder/bankingidentifier.json', 'banking identifier'),
 ('E:/CustomNER/folder/benchmark.json', 'benchmark'),
 ('E:/CustomNER/folder/businessfunction.json', 'business function'),
 ('E:/CustomNER/folder/cardinal.json', 'cardinal'),
 ('E:/CustomNER/folder/charges.json', 'charges'),
 ('E:/CustomNER/folder/consulting.json', 'consulting'),
 ('E:/CustomNER/folder/customeridentifier.json', 'customer identifier'),
 ('E:/CustomNER/folder/customerservice.json', 'customer service'),
 ('E:/CustomNER/folder/date.json', 'date'),
 ('E:/CustomNER/folder/digitalplatforms.json', 'digital platforms'),
 ('E:/CustomNER/folder/distribution.json', 'distribution'),
 ('E:/CustomNER/fo

In [9]:
# Number of (json_file_path, label) pairs in the corpus.
len(training_corp)

72

# Generating patterns data (a set of rules)

In [10]:
# Rule generator over the (json_file_path, label) pairs collected in `training_corp`.
# NOTE(review): `generatefilejson` looks like the output file for the patterns — confirm.
generate = GenerateRules(
    data_pair=training_corp,
    generatefilejson='MF_patternsq2.json',
    prefix='no',
    split_words='no',
)

In [11]:
# Produce the patterns data from the rule generator configured in the previous cell.
patterns_data = generate.traindata()

# Displaying the patterns data for all entities

We will use this patterns data for our custom model.

In [12]:
# Total number of generated patterns across all entities.
len(patterns_data)

10296

# Generating the custom model

  This model can now classify multiple entities and can be used to create training data for our own custom NER model.
  

In [13]:
# Build the custom model under the name "mutual_funds_custom_model1".
# NOTE(review): presumably persisted to disk under that name — confirm in nercustom_automate.
generate.generate_model("mutual_funds_custom_model1")

In [14]:
from nercustom_automate import GenerateTrainingData , ModelTraining

In [15]:
# Load the custom model built by `generate.generate_model("mutual_funds_custom_model1")`.
# The name has to match the one used there: the previous value lacked the trailing "1",
# so a fresh Restart & Run All would look for a model this notebook never creates.
# NOTE(review): assumes `generate_model` saves under the given name — confirm.
training = GenerateTrainingData(model='mutual_funds_custom_model1')



In [16]:
# Annotate both raw text files with the custom model, in the same order as before:
# the main corpus first, then the additional data.
training_arr_1, training_arr_2 = (
    training.trainingcorpus(file=corpus_path)
    for corpus_path in (
        'E:/CustomNER/training.txt',
        'E:/CustomNER/additional Data.txt',
    )
)

In [17]:
# Concatenate the two annotated corpora into one new list (inputs are left untouched).
main_training = [*training_arr_1, *training_arr_2]

In [18]:
# Total number of annotated training samples after merging both corpora.
len(main_training)

1725

In [19]:
# Preview only the first few samples: each item is [text, {'entities': [(start, end, label), ...]}].
# Dumping all of `main_training` floods the notebook output and bloats the saved file.
main_training[:10]

[['mutual fund’s', {'entities': [(0, 11, 'product')]}],
 ['mutual fund’s', {'entities': [(0, 11, 'product')]}],
 ['contents 01 introduction', {'entities': [(12, 24, 'work_of_art')]}],
 ['02 working of mutual funds',
  {'entities': [(3, 10, 'org'), (14, 26, 'product')]}],
 ['03 mutual funds for your need', {'entities': [(3, 15, 'product')]}],
 ['04 how to buy a mutual fund', {'entities': [(16, 27, 'fund house')]}],
 ['06 your first fund', {'entities': [(8, 13, 'ordinal')]}],
 ['disclaimer: any information contained herein is for informational purpose only and does not constitute advice or offer to sell/purchase units of the schemes of taurus mutual fund. information gathered and provided in this booklet has been provided by value research and the fund does not warrant the accuracy and/or completeness of any information. taurus amc/taurus mf disclaims any liability for action taken by anyone on the basis of the opinions contained herein.',
  {'entities': [(160, 178, 'fund house'),
    (2

In [9]:
# Persist the merged training samples to main_training_data.json.
training.save_data(
    file='main_training_data.json',
    data=main_training,
)

In [6]:
# Trainer helper configured for the 'ner' pipe.
m = ModelTraining(pipe_name='ner')

In [7]:
# Train on the merged samples for 50 iterations; the helper prints the loss per iteration.
model = m.trainmodel(
    data=main_training,
    iterations=50,
)

Statring iteration 0
{'ner': 6820.080517436479}
Statring iteration 1
{'ner': 3444.6405959844146}
Statring iteration 2
{'ner': 3029.1933275781885}
Statring iteration 3
{'ner': 2644.9591092335695}
Statring iteration 4
{'ner': 2311.421634233313}
Statring iteration 5
{'ner': 2101.0859405366973}
Statring iteration 6
{'ner': 1849.355904844937}
Statring iteration 7
{'ner': 1825.8036259928745}
Statring iteration 8
{'ner': 1816.0826843268082}
Statring iteration 9
{'ner': 1565.821346992256}
Statring iteration 10
{'ner': 1411.1260534143419}
Statring iteration 11
{'ner': 1412.2524350066974}
Statring iteration 12
{'ner': 1368.1727632319175}
Statring iteration 13
{'ner': 1280.4274603203949}
Statring iteration 14
{'ner': 1285.4346139953582}
Statring iteration 15
{'ner': 1263.058739104711}
Statring iteration 16


KeyboardInterrupt: 