In [2]:
import spacy
import json
import pickle
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

In [3]:
with open('insider_rule_dump.json') as f:
    data = json.load(f)
with open('project_2_labels.json') as f:
    label_file = json.load(f)
labels = []
for k in label_file:
    labels.append(k['text'])
labels.append('Action')
labels

['Person In charge',
 'Place',
 'Assets',
 'Person - General',
 'Object',
 'Company',
 'Penalty',
 'Investigation',
 'Legal Doc',
 'time',
 'Action']

In [3]:
#Extracting rules as a dictionary to be given to rule objects.
def extract_attributes(line):
    output_dir = 'my_mixed_random/'
    nlp2 = spacy.load(output_dir)
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(line)
    doc2 = nlp2(line)
    output_dict = {k:[] for k in labels}
    for token in doc:
        if token.pos_ == 'VERB':
            output_dict['Action'].append(token.text)
    for ent in doc2.ents:
        output_dict[ent.label_].append(ent.text)
    return output_dict
#Cleaning objects for repetitions and removing stop words in verbs/actions.
def clean_attributes(output_dict):
    clean_dir = {}
    for k in output_dict.keys():
        clean_dir[k]=list(set(output_dict[k]))
    try:
        clean_dir['Action'].remove('shall')
    except ValueError:
        pass
    try:
        clean_dir['Action'].remove('may')
    except ValueError:
        pass
    try:
        clean_dir['Action'].remove('would')
    except ValueError:
        pass
    return clean_dir

In [4]:
line = data[11]
print(extract_attributes(line))
print(clean_attributes(extract_attributes(line)))

{'Person In charge': [], 'Place': [], 'Assets': [], 'Person - General': ['Any person', 'insider', 'persons'], 'Object': ['unpublished price sensitive information', 'legitimate purpose', 'unpublished price sensitive information'], 'Company': [], 'Penalty': [], 'Investigation': [], 'Legal Doc': ['regulations', 'notice', 'regulations'], 'time': [], 'Action': ['shall', 'considered', 'shall', 'given', 'maintain']}
{'Person In charge': [], 'Place': [], 'Assets': [], 'Person - General': ['insider', 'Any person', 'persons'], 'Object': ['legitimate purpose', 'unpublished price sensitive information'], 'Company': [], 'Penalty': [], 'Investigation': [], 'Legal Doc': ['regulations', 'notice'], 'time': [], 'Action': ['given', 'considered', 'maintain']}


In [74]:
import re
#The following functions are used to index the documents is order and combine them chapter wise.
p = re.compile(r'^(\(\d\)|\(\d\d\))')
new_data = []
int_list = []
for k in range(len(data)):
    int_list.append(data[k])
    if k+1<len(data):
        if not p.findall(data[k+1]):
            pass
        else:
            new_data.append(''.join(map(str, int_list)))
            int_list = []

for line in new_data:
    p = re.compile(r'^(\(\d\)|\(\d\d\))')

reg_data = []
chap_list = []
for k in range(len(new_data)):
    chap_list.append(new_data[k])
    if k+1<len(new_data):
        if p.findall(new_data[k+1])[0][1]=='1':
            reg_data.append(chap_list)
            chap_list = []
reg_dict = {}
#reg_dict is a dictionary of rules with keys as chapter numbers and values as list of ordered rules(strings).
count = 0
for k in reg_data:
    rules_dict = {}
    for i in k:
        rules_dict[p.findall(i)[0][1]] = i 
    count +=1
    reg_dict[count] = rules_dict

# print(reg_dict)

raw_reg_dict = reg_dict
    

In [63]:
from Rule import Rule
regulation_doc = {}
for chapter_num in raw_reg_dict:
    regulation_doc[chapter_num] = {}
    for rule_index in raw_reg_dict[chapter_num]:
        d = clean_attributes(extract_attributes(raw_reg_dict[chapter_num][rule_index]))
        #In the above line of code, a rule text present in the indexed rules and regulations is
        #sent to a ner model and its 11 attributes are collected and sent back in a dictionary.
        #This dictionary is sent to the import module of a rule to create and fill a rule object.
        r = Rule('Insider Trading', int(chapter_num)+1, int(rule_index)+1, raw_reg_dict[chapter_num][rule_index])
        r.fill_attributes(d)
        regulation_doc[chapter_num][rule_index] = r
regulation_doc        

{1: {'1': <Rule.Rule at 0x7f6906971400>, '2': <Rule.Rule at 0x7f6902a295b0>},
 2: {'1': <Rule.Rule at 0x7f6902beca30>},
 3: {'1': <Rule.Rule at 0x7f6902bbcf40>,
  '2': <Rule.Rule at 0x7f69051980a0>,
  '3': <Rule.Rule at 0x7f6902e98310>,
  '4': <Rule.Rule at 0x7f6905316f40>},
 4: {'1': <Rule.Rule at 0x7f690532e790>},
 5: {'1': <Rule.Rule at 0x7f69027e0580>,
  '2': <Rule.Rule at 0x7f690531f160>,
  '3': <Rule.Rule at 0x7f6902ee0760>,
  '4': <Rule.Rule at 0x7f6902ab1730>},
 6: {'1': <Rule.Rule at 0x7f6901ea5430>,
  '2': <Rule.Rule at 0x7f69052ff8b0>,
  '3': <Rule.Rule at 0x7f69028a4370>,
  '4': <Rule.Rule at 0x7f6902b38cd0>},
 7: {'1': <Rule.Rule at 0x7f6902a3fb80>, '2': <Rule.Rule at 0x7f690315af70>},
 8: {'1': <Rule.Rule at 0x7f6902e8d670>},
 9: {'1': <Rule.Rule at 0x7f69027e0d60>, '2': <Rule.Rule at 0x7f6903117a90>},
 10: {'1': <Rule.Rule at 0x7f6906a9d8e0>,
  '2': <Rule.Rule at 0x7f6902ee5310>,
  '3': <Rule.Rule at 0x7f6902bbf070>},
 11: {'1': <Rule.Rule at 0x7f6906965b50>,
  '2': <Rul

In [66]:
from Regulation import Regulation
insider_regulation_object = Regulation('Insider Trading', regulation_doc) 
insider_regulation_object


<Regulation.Regulation at 0x7f69031d4820>

In [73]:
save_object(insider_regulation_object, 'insider_trading_regulation_object.pkl')