In [3]:
import spacy
import json
import pickle
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, protocol=3)

In [4]:
with open('insider_rule_dump.json') as f:
    data = json.load(f)
with open('project_2_labels.json') as f:
    label_file = json.load(f)
labels = []
for k in label_file:
    labels.append(k['text'])
labels.append('Action')
labels

['Person In charge',
 'Place',
 'Assets',
 'Person - General',
 'Object',
 'Company',
 'Penalty',
 'Investigation',
 'Legal Doc',
 'time',
 'Action']

In [3]:
#Extracting rules as a dictionary to be given to rule objects.
def extract_attributes(line):
    output_dir = 'my_mixed_random/'
    nlp2 = spacy.load(output_dir)
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(line)
    doc2 = nlp2(line)
    output_dict = {k:[] for k in labels}
    for token in doc:
        if token.pos_ == 'VERB':
            output_dict['Action'].append(token.text)
    for ent in doc2.ents:
        output_dict[ent.label_].append(ent.text)
    return output_dict
#Cleaning objects for repetitions and removing stop words in verbs/actions.
def clean_attributes(output_dict):
    clean_dir = {}
    for k in output_dict.keys():
        clean_dir[k]=list(set(output_dict[k]))
    try:
        clean_dir['Action'].remove('shall')
    except ValueError:
        pass
    try:
        clean_dir['Action'].remove('may')
    except ValueError:
        pass
    try:
        clean_dir['Action'].remove('would')
    except ValueError:
        pass
    return clean_dir

In [4]:
line = data[11]
print(extract_attributes(line))
print(clean_attributes(extract_attributes(line)))

{'Person In charge': [], 'Place': [], 'Assets': [], 'Person - General': ['Any person', 'insider', 'persons'], 'Object': ['unpublished price sensitive information', 'legitimate purpose', 'unpublished price sensitive information'], 'Company': [], 'Penalty': [], 'Investigation': [], 'Legal Doc': ['regulations', 'notice', 'regulations'], 'time': [], 'Action': ['shall', 'considered', 'shall', 'given', 'maintain']}
{'Person In charge': [], 'Place': [], 'Assets': [], 'Person - General': ['insider', 'Any person', 'persons'], 'Object': ['legitimate purpose', 'unpublished price sensitive information'], 'Company': [], 'Penalty': [], 'Investigation': [], 'Legal Doc': ['regulations', 'notice'], 'time': [], 'Action': ['given', 'considered', 'maintain']}


In [6]:
import re
#The following functions are used to index the documents is order and combine them chapter wise.
p = re.compile(r'^(\(\d\)|\(\d\d\))')
new_data = []
int_list = []
for k in range(len(data)):
    int_list.append(data[k])
    if k+1<len(data):
        if not p.findall(data[k+1]):
            pass
        else:
            new_data.append(''.join(map(str, int_list)))
            int_list = []

for line in new_data:
    p = re.compile(r'^(\(\d\)|\(\d\d\))')

reg_data = []
chap_list = []
for k in range(len(new_data)):
    chap_list.append(new_data[k])
    if k+1<len(new_data):
        if p.findall(new_data[k+1])[0][1]=='1':
            reg_data.append(chap_list)
            chap_list = []
reg_dict = {}
#reg_dict is a dictionary of rules with keys as chapter numbers and values as list of ordered rules(strings).
count = 0
for k in reg_data:
    rules_dict = {}
    for i in k:
        rules_dict[p.findall(i)[0][1]] = i 
    count +=1
    reg_dict[count] = rules_dict

# print(reg_dict)

raw_reg_dict = reg_dict
raw_reg_dict

{1: {'1': '(1) These regulations may be called the SEBI (Prohibition of Insider Trading) Regulations, 2015.\n',
  '2': '(2)These regulations shall come into force on the one hundred and twentieth day from the date of its publication in the Official Gazette.\n'},
 2: {'1': '(1) In these regulations, unless the context otherwise requires, the following words, expressions and derivations therefrom shall have the meanings assigned to them as under:–\n(a)“Act” means the Securities and Exchange Board of India Act, 1992 (15 of 1992);\n(b)“Board” means the Securities and Exchange Board of India;\n(c)“compliance officer” means any senior officer, designated so and reporting to the board of directors or head of the organization in case board is not there, who is financially literate and is capable of appreciating requirements for legal and regulatory compliance under these regulations and who shall be responsible for compliance of policies, procedures, maintenance of records, monitoring adherenc

In [63]:
from Rule import Rule
regulation_doc = {}
for chapter_num in raw_reg_dict:
    regulation_doc[chapter_num] = {}
    for rule_index in raw_reg_dict[chapter_num]:
        d = clean_attributes(extract_attributes(raw_reg_dict[chapter_num][rule_index]))
        #In the above line of code, a rule text present in the indexed rules and regulations is
        #sent to a ner model and its 11 attributes are collected and sent back in a dictionary.
        #This dictionary is sent to the import module of a rule to create and fill a rule object.
        r = Rule('Insider Trading', int(chapter_num)+1, int(rule_index)+1, raw_reg_dict[chapter_num][rule_index])
        r.fill_attributes(d)
        regulation_doc[chapter_num][rule_index] = r
regulation_doc        

{1: {'1': <Rule.Rule at 0x7f6906971400>, '2': <Rule.Rule at 0x7f6902a295b0>},
 2: {'1': <Rule.Rule at 0x7f6902beca30>},
 3: {'1': <Rule.Rule at 0x7f6902bbcf40>,
  '2': <Rule.Rule at 0x7f69051980a0>,
  '3': <Rule.Rule at 0x7f6902e98310>,
  '4': <Rule.Rule at 0x7f6905316f40>},
 4: {'1': <Rule.Rule at 0x7f690532e790>},
 5: {'1': <Rule.Rule at 0x7f69027e0580>,
  '2': <Rule.Rule at 0x7f690531f160>,
  '3': <Rule.Rule at 0x7f6902ee0760>,
  '4': <Rule.Rule at 0x7f6902ab1730>},
 6: {'1': <Rule.Rule at 0x7f6901ea5430>,
  '2': <Rule.Rule at 0x7f69052ff8b0>,
  '3': <Rule.Rule at 0x7f69028a4370>,
  '4': <Rule.Rule at 0x7f6902b38cd0>},
 7: {'1': <Rule.Rule at 0x7f6902a3fb80>, '2': <Rule.Rule at 0x7f690315af70>},
 8: {'1': <Rule.Rule at 0x7f6902e8d670>},
 9: {'1': <Rule.Rule at 0x7f69027e0d60>, '2': <Rule.Rule at 0x7f6903117a90>},
 10: {'1': <Rule.Rule at 0x7f6906a9d8e0>,
  '2': <Rule.Rule at 0x7f6902ee5310>,
  '3': <Rule.Rule at 0x7f6902bbf070>},
 11: {'1': <Rule.Rule at 0x7f6906965b50>,
  '2': <Rul

In [66]:
from Regulation import Regulation
insider_regulation_object = Regulation('Insider Trading', regulation_doc) 
insider_regulation_object


<Regulation.Regulation at 0x7f69031d4820>

In [73]:
save_object(insider_regulation_object, 'insider_trading_regulation_object.pkl')

In [6]:
import os
#Already trained 1571225904231,1592212424293 
# entries = os.listdir('data/')
# for file in entries:
#     if((file!= '1592212424293.txt') and (file!= '1571225904231.txt')):
output_dir = 'my_mixed_random/'
nlp2 = spacy.load(output_dir)
def model_output(file):
    k = open('data-reg/'+file, 'r')
    s = k.readlines()
    for l in s:
        doc2 = nlp2(l)
        print(l)
        for ent in doc2.ents:
            print(ent.label_, ent.start_char, ent.end_char, ent.text)
        print('********************')

In [1]:
# model_output('1571225904231.txt')

In [3]:
import json
import spacy
def make_file_json(file_name):
    output_dir = 'best_trained/'
    nlp2 = spacy.load(output_dir)
    data_dict = {}
    data_dict['file_name'] = file_name
    data_dict['fields'] = []
    k = open('data-reg/'+file_name, 'r')
    s = k.read()
    doc2 = nlp2(s)
    count_id = 0
    list_ents = []
    for ent in doc2.ents:
        new_dict = {}
        new_dict['id'] = count_id
        new_dict['span'] = {'start':ent.start_char, 'end':ent.end_char}
        new_dict['text'] = ent.text
        new_dict['tags'] = [{'tag_name': ent.label_, "supplementary_Info":{'start':ent.start_char, 'end':ent.end_char, 'text':ent.text}}]
#         new_dict['tags'] = [{'tag_name': ent.label_}]
        count_id += 1
        list_ents.append({'label':ent.label_, 'start':ent.start_char, 'end':ent.end_char})
#         print(list_ents)
#         print(ent.label_, ent.start_char, ent.end_char, ent.text)
        data_dict['fields'].append(new_dict)
#     json_object = json.dumps(new_dict)   
#     print(json_object)
#     return data_dict
    return list_ents
    with open(file_name+".json","w") as f:
        json.dump(data_dict,f)

In [4]:
# object_json = make_file_json('1571225904231.htmlTxt')
# object_json
output_my = make_file_json('1571225904231.htmlTxt')
# output_my

In [2]:
# import json
# f = open('data_rules/data/1571225904231.json')
# object_json2 = json.load(f)
# object_json2[0]

In [11]:
#testing ujwal's tags
file_name = '1571225904231.htmlTxt'
k = open('data-reg/'+file_name, 'r')
s = k.read()
list_ents = []
for k in object_json2:
#     print(s[k['span']['start']:k['span']['end']],'---------', k['tags']['tag_name'])
    list_ents.append({'label':k['tags']['tag_name'], 'start':k['span']['start'], 'end':k['span']['end']})
list_ents.extend(output_my)
# print(output_my)
list_ents[-3]

{'label': 'Person - General', 'start': 103322, 'end': 103328}

In [157]:
def merge_object(json_object,json_object2):
#     list_ids = []
    temp = json_object2
#     for k in json_object2['fields']:
#         list_ids.append(k['id'])
    for j in json_object2:
        for k in json_object['fields']:
#             print((k['span'],j['span']))
            if (k['span']['end']> j['span']['end']) and (k['span']['start']<= j['span']['start']):
#                 print('yes')
#                 k['tags']['supplementary_Info'].append({'start':j['span']['start'], 'end':j['span']['end'], 'text':j['text']})
#                 print(j['tags'])
#                 print(k['tags'])
                j['tags']['supplementary_Info']={'start':j['span']['start'], 'end':j['span']['end'], 'text':j['text']}
                k['tags'].append(j['tags'])
#                 print(((k['tags']),(j['tags'])))
#                 z = {**k['tags'], **j['tags']}
#                 print(z)
#                 k['tags'] = z
        temp.remove(j)
    last_id = len(json_object['fields'])
    for k in temp:
        k['id'] = last_id+1
        last_id += 1
        json_object['fields'].append(k)
    return json_object
                

In [3]:
# o = merge_object(object_json,object_json2)
# o

In [160]:
# ex = [{"text": "But Google is starting from behind.",
#        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
#        "title": 'Insider Trading'}]
# file_name = o['file_name']
file_name = '1571225904231.htmlTxt'
k = open('data-reg/'+file_name, 'r')
s = k.read()
make_dict = {'text':s}
list_ents = []
for k in o['fields']:
    for j in k['tags']:
        if isinstance(j,dict):
            list_ents.append({'label':j['tag_name'], 'start':j['supplementary_Info']['start'], 'end':j['supplementary_Info']['end']})
#getting pos tags


In [12]:
#pos tagging
nlp = spacy.load("en_core_web_sm")
doc = nlp(s)
for token in doc:
    if token.pos_ =='VERB':
        list_ents.append({'label':token.pos_, 'start':token.idx, 'end': token.idx+len(token.text)})
# s[3320:3340]
file_name = '1571225904231.htmlTxt'
k = open('data-reg/'+file_name, 'r')
s = k.read()
make_dict = {'text':s}
#sorting list_ents.
list_ents = sorted(list_ents,
                          key=lambda k: (k['start']))
make_dict['ents'] = list_ents
make_dict["title"]= 'Insider Trading'
ex = [make_dict]
entities = []
for k in list_ents:
    entities.append(k['label'])
entities = list(set(entities))
len(entities)

14

In [14]:
import random 

def get_entity_options(entities):
    """
    generating color options for visualizing the named entities
    """
    def color_generator(number_of_colors):
        color = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
                 for i in range(number_of_colors)]
        return color

#     entities = ["GGP", "SO", "TAXON", "CHEBI", "GO", "CL", 
#                 "DNA", "CELL_TYPE", "CELL_LINE", "RNA", "PROTEIN",
#                 "DISEASE", "CHEMICAL",
#                 "CANCER", "ORGAN", "TISSUE", "ORGANISM", "CELL", "AMINO_ACID", "GENE_OR_GENE_PRODUCT", "SIMPLE_CHEMICAL", "ANATOMICAL_SYSTEM", "IMMATERIAL_ANATOMICAL_ENTITY", "MULTI-TISSUE_STRUCTURE", "DEVELOPING_ANATOMICAL_STRUCTURE", "ORGANISM_SUBDIVISION", "CELLULAR_COMPONENT"]
    
    colors = {"ENT":"#E8DAEF"}
    
#     if random_colors:
    color = color_generator(len(entities))
    for i in range(len(entities)):
        colors[entities[i]] = color[i]
#     print(colors)
#     else:
#         entities_cat_1 = {"GGP":"#F9E79F", "SO":"#F7DC6F", "TAXON":"#F4D03F", "CHEBI":"#FAD7A0", "GO":"#F8C471", "CL":"#F5B041"}
#         entities_cat_2 = {"DNA":"#82E0AA", "CELL_TYPE":"#AED6F1", "CELL_LINE":"#E8DAEF", "RNA":"#82E0AA", "PROTEIN":"#82E0AA"}
#         entities_cat_3 = {"DISEASE":"#D7BDE2", "CHEMICAL":"#D2B4DE"}
#         entities_cat_4 = {"CANCER":"#ABEBC6", "ORGAN":"#82E0AA", "TISSUE":"#A9DFBF", "ORGANISM":"#A2D9CE", "CELL":"#76D7C4", "AMINO_ACID":"#85C1E9", "GENE_OR_GENE_PRODUCT":"#AED6F1", "SIMPLE_CHEMICAL":"#76D7C4", "ANATOMICAL_SYSTEM":"#82E0AA", "IMMATERIAL_ANATOMICAL_ENTITY":"#A2D9CE", "MULTI-TISSUE_STRUCTURE":"#85C1E9", "DEVELOPING_ANATOMICAL_STRUCTURE":"#A9DFBF", "ORGANISM_SUBDIVISION":"#58D68D", "CELLULAR_COMPONENT":"#7FB3D5"}

#         entities_cats = [entities_cat_1, entities_cat_2, entities_cat_3, entities_cat_4]
#         for item in entities_cats:
#             colors = {**colors, **item}
    
    options = {"ents": entities, "colors": colors}
    
    return options

In [15]:
# print(type(get_entity_options(entitites)))
from spacy import displacy
k = get_entity_options(entities)
print(k)
html = displacy.render(ex, style="ent", manual=True, page=True)


{'ents': ['Person In charge', 'coref', 'entitty', 'Person - General', 'VERB', 'Penalty', 'temporal', 'Company', 'Place', 'Legal Doc', 'Object', 'time', 'Investigation', 'Assets'], 'colors': {'ENT': '#E8DAEF', 'Person In charge': '#811F80', 'coref': '#6EF077', 'entitty': '#7D7374', 'Person - General': '#CA5D31', 'VERB': '#59529D', 'Penalty': '#C85264', 'temporal': '#E63093', 'Company': '#93132D', 'Place': '#96DF01', 'Legal Doc': '#5B87AF', 'Object': '#2D7932', 'time': '#9CFF28', 'Investigation': '#945B9A', 'Assets': '#35A317'}}


In [16]:
from IPython.core.display import HTML
print(type(html))
# html = HTML(html)
with open('display_color.html', 'w') as f:
    f.write(html)

<class 'NoneType'>


TypeError: write() argument must be str, not None

In [4]:
# #convert to jsonl
# import json
# data = []
# f = open('file.json1', 'r')
# # print(type(f))
# for line in f:
# #     print(type(line))
#     data.append(json.loads(line))
# id_list = []
# for k in data:
# #     print(k)
#     id_list.append(k['id'])
# #     print(k['id'])
# #     print(k['text'])
# #     print('*******')
# # print(min(id_list))
# # print(max(id_list))
# ordered_json = []
# for k in range(min(id_list), max(id_list)+1):
#     for j in data:
#         if j['id']==k:
#             ordered_json.append(j)
# start_point = 0
# for k in ordered_json:
#     for j in k['labels']:
#         k['org'] = (j[0],j[1])
#         j[0] += start_point
#         j[1] += start_point
#     start_point += len(k['text'])
# #     print(k['labels'])
# ordered_json

In [4]:
import json
import spacy
def make_file_json(file_name, ordered_json):
#     output_dir = 'my_mixed_random/'
#     nlp2 = spacy.load(output_dir)
    data_dict = {}
    data_dict['file_name'] = file_name
    data_dict['fields'] = []
#     k = open('data/'+file_name, 'r')
#     s = k.read()
#     doc2 = nlp2(s)
#     for ent in doc2.ents:
    for data in ordered_json:
#         print(data['text'])
        for h in data['labels']:
            new_dict = {}
#             print(data['text'])
            new_dict['span'] = (h[0],h[1])
            new_dict['text'] = data['text'][k['org'][0]:k['org'][1]+1]
            new_dict['tag'] = h[2]
    #         print(ent.label_, ent.start_char, ent.end_char, ent.text)
            data_dict['fields'].append(new_dict)
#     json_object = json.dumps(new_dict)   
#     print(json_object)
    print(data_dict)
    with open(file_name+".json","w") as f:
        json.dump(data_dict,f)

In [5]:
make_file_json('insider', ordered_json)

NameError: name 'ordered_json' is not defined

In [31]:
del data[1]['meta']
del data[1]['annotation_approver']
del data[1]['id']


In [5]:
# import json
# f = json.dumps(data[1])
# print(f)
# k = [f]
# with open('hi.jsonl', 'w') as outfile:
#     for entry in k:
#         json.dump(entry, outfile)
#         outfile.write('\n')

In [15]:
ex = [{"text": "But Google is starting from behind.",
       "ents": [{"start": 4, "end": 10, "label": "ORG"}],
       "title": 'Insider Trading'}]

html = displacy.render(ex, style="ent", manual=True)


In [6]:
# import spacy

# output_dir = 'my_mixed_random/'
# nlp2 = spacy.load(output_dir)
# # text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
# file_name = '1571225904231.txt'
# k = open('data-reg/'+file_name, 'r')
# s = k.read()
# doc2 = nlp2(s)
# # nlp = spacy.load("en_core_web_sm")
# # doc = nlp(text)
# displacy.serve(doc2, style="ent")

In [4]:
import json
import spacy
import os
file_array = os.listdir('HTML_TEMPLATING_TEXT/')
for file_name in file_array:
    if file_name.endswith(".json"):
        f = open('HTML_TEMPLATING_TEXT/'+file_name)
        output_dir = 'my_mixed_random/'
        nlp2 = spacy.load(output_dir)
        object_json2 = json.load(f)
        list_ents_ordered_rules = {}
        for k in object_json2:
            doc2 = nlp2(k)
            temp = []
            for ent in doc2.ents:
                temp.append((ent.start_char, ent.end_char,ent.label_, ent.text))
            list_ents_ordered_rules[k] = temp
        list_ents_ordered_rules
        save_object(list_ents_ordered_rules, 'filled_html_sathvik/'+file_name+'_tagged.pkl')

In [5]:
# pickle.dump(list_ents_ordered_rules,'1538990677517_tagged.pkl', protocol=3 )


### Ujwal's json objects divided by rules

In [7]:
# import json
# import spacy
# rule_tag_dict = {}
# rule_without_pos = {}
# #to store tags rule wise.
# nlp = spacy.load("en_core_web_sm")
# output_dir = 'best_trained/'
# nlp2 = spacy.load(output_dir)
# ent_list = []
# f = open('data_rules/data/1571225904231.json')
# object_json2 = json.load(f)
# for rule in object_json2:
# #     print((rule['rule'][rule['span']['start']:rule['span']['end']], rule['text']))
#     ent_list.append({'tag_name':rule['tags']['tag_name'], 'start':rule['span']['start'], 'end':rule['span']['end'], 'text':rule['text']})
#     doc2 = nlp2(rule['rule'])
#     for ent in doc2.ents:
#         ent_list.append({'start':ent.start_char, 'end':ent.end_char,'tag_name': ent.label_, 'text': ent.text})
#     #appending entries without pos tag.
# #     ent_list = sorted(ent_list,
# #                           key=lambda k: (k['start']))
# #     rule_without_pos[rule['rule']] = ent_list
#     doc = nlp(rule['rule'])
# #     print(rule['rule'])
#     for token in doc:
#         ent_list.append({'tag_name':token.pos_, 'start':token.idx, 'end': token.idx+len(token.text), 'text':token.text})
#     #sorting spans.
#     ent_list = sorted(ent_list,
#                           key=lambda k: (k['start']))
#     rule_tag_dict[rule['rule']] = ent_list
# rule_tag_dict

In [13]:
for rule in object_json2:
    doc = nlp(rule['rule'])
    print(rule['rule'])
    for token in doc:
        print({'tag_name':token.pos_, 'start':token.idx, 'end': token.idx+len(token.text), 'text':token.text})
    break
        

SECURITIES AND EXCHANGE BOARD OF INDIA
{'tag_name': 'PROPN', 'start': 0, 'end': 10, 'text': 'SECURITIES'}
{'tag_name': 'CCONJ', 'start': 11, 'end': 14, 'text': 'AND'}
{'tag_name': 'PROPN', 'start': 15, 'end': 23, 'text': 'EXCHANGE'}
{'tag_name': 'PROPN', 'start': 24, 'end': 29, 'text': 'BOARD'}
{'tag_name': 'ADP', 'start': 30, 'end': 32, 'text': 'OF'}
{'tag_name': 'PROPN', 'start': 33, 'end': 38, 'text': 'INDIA'}


In [9]:
save_object(rule_tag_dict, 'insider_nested_tags_indictformat.pkl')

In [8]:
# rule_tag_dict.keys()

In [1]:
import pickle
file = open('insider_nested_tags_indictformat.pkl', 'rb')

# dump information to that file
rule_tag_dict = pickle.load(file)

# close the file
file.close()

In [9]:
# for k in rule_tag_dict.keys():
#     for j in rule_tag_dict[k]:
#         print(j)
#     print('over')
#     break

In [8]:
rule_signature = []
for k in rule_without_pos:
    capture_sign = []
#     print(rule_tag_dict[k])
    for j in rule_without_pos[k]:
#         print(j)
        capture_sign.append(j['tag_name'])
    rule_signature.append(tuple(capture_sign))

In [6]:
print(len(rule_signature))
print(len(list(set(rule_signature))))

411
411


In [9]:
print(len(rule_signature))
print(len(list(set(rule_signature))))

411
411


In [10]:
# import csv
# output_dir = 'my_mixed_random/'
# nlp2 = spacy.load(output_dir)
# output_dict = {}
# with open('reg2-dynamic.csv', 'r') as file:
#     reader = csv.reader(file)
#     for row in reader:
#         print(row[0])
#         ent_list = []
#         doc2 = nlp2(row[0])
#         for ent in doc2.ents:
#             ent_list.append({'start':ent.start_char, 'end':ent.end_char,'tag_name': ent.label_, 'text': ent.text})
#         output_dict[row[0]] = ent_list
# save_object(output_dict, 'reg2.pkl')