In [1]:
import numpy as np
import warnings
import pandas as pd

In [2]:
from load_contracts import read_contract, preprocess_text
from load_info import get_security_names, load_info_xlsx

In [3]:
# Path of the sample contract PDF that the exploratory cells below hand to read_contract.
filename = "contracts/135_ActelisNetworks_COI_01072005.pdf"

In [49]:
# Token sequence passed to find_target to locate where the Article IV intro ends.
# The commented-out list is an alternative, longer marker sequence that is currently disabled.
# END_OF_IV_INTRO_TEXT = ['rel','right','prefer','privileg','and','restrict','grant','to','or','impos','upon','the','common','stock','and','the','prefer','stock','and','the','holder']
END_OF_IV_INTRO_TEXT = ['vote']

In [5]:
# Tokenise the contract: split on single spaces and discard the empty strings
# that runs of consecutive spaces leave behind.
text = [token for token in read_contract(filename).split(" ") if token != '']

### Utils

In [6]:
def is_num(x):
    """Return True when ``int(x)`` succeeds, False when it raises ValueError.

    Other exception types raised by ``int`` (e.g. TypeError) are not caught.
    """
    try:
        int(x)
    except ValueError:
        parsed = False
    else:
        parsed = True
    return parsed

In [7]:
def revert_secuirty_names(df):
    """Rename "common stock" entries of the "Security Name" column to "common".

    The frame is modified in place and the same object is returned.
    """
    is_common_stock = df["Security Name"] == "common stock"
    df.loc[is_common_stock, "Security Name"] = "common"
    return df

In [8]:
def get_closest_string(values, target, less, any=False):
    """Pick the element of ``values`` that lies closest to ``target``.

    Parameters
    ----------
    values : iterable of numbers (e.g. token positions)
    target : number the candidates are compared against
    less : bool
        When True only values ``<= target`` are considered, otherwise only
        values ``>= target``. Ignored when ``any`` is True.
    any : bool
        When True the nearest value on either side is returned.

    Returns
    -------
    The closest qualifying element of ``values``.

    Raises
    ------
    ValueError
        If ``values`` is empty or holds no element on the requested side.
    """
    if any:
        return min(values, key=lambda x: abs(target - x))
    # Restrict to the requested side first. Minimising the signed difference
    # over all values (the previous approach) selected the element furthest
    # on the *wrong* side of the target whenever one existed.
    if less:
        candidates = [x for x in values if x <= target]
    else:
        candidates = [x for x in values if x >= target]
    if not candidates:
        side = "at or below" if less else "at or above"
        raise ValueError("no value %s target %r" % (side, target))
    return min(candidates, key=lambda x: abs(target - x))

In [9]:
def find_target(text, target):
    """Return the start index of every non-overlapping occurrence of ``target``.

    Parameters
    ----------
    text : sequence of tokens to search
    target : sequence of tokens to look for, in order

    Returns
    -------
    list of int
        Positions in ``text`` where a full copy of ``target`` begins, in
        ascending order. After a match the scan resumes behind the matched
        span, so occurrences never overlap. An empty ``target`` yields ``[]``.
    """
    target = list(target)
    width = len(target)
    indicies = []
    if width == 0:
        return indicies
    pos = 0
    last_start = len(text) - width
    while pos <= last_start:
        # Comparing the whole window at each position avoids the pitfalls of a
        # hand-rolled state machine: a restart in the middle of a partial match
        # always records the right start, and no candidate start is skipped.
        if list(text[pos:pos + width]) == target:
            indicies.append(pos)
            pos += width
        else:
            pos += 1
    return indicies

### Search Functions

In [10]:
def get_closest_num(values, target, less, min=-1, max=1e10000, any=False):
    """Return the integer token nearest to ``target`` inside ``values``.

    Parameters
    ----------
    values : list of str
        Token list to search.
    target : int or str
        Either a position in ``values`` (anything ``int()`` accepts) or a
        space-separated phrase that is first located with ``find_target``.
    less : bool
        True scans backwards from the position just before ``target``;
        False scans forwards starting at the target position itself.
    min, max :
        Accepted and forwarded to recursive calls, but not otherwise used.
    any : bool
        When True, ``values`` is scanned from its start regardless of
        ``target``'s position.

    Returns
    -------
    int or None
        The first token in scan order that ``is_num`` accepts, converted with
        ``int``; None (after printing a notice) when the scan finds nothing.

    Raises
    ------
    AssertionError
        If ``target`` is a phrase that does not occur in ``values``.
    """
    try:
        index = int(target)
    except ValueError:
        indicies = find_target(values, target.split(" "))
        if len(indicies) > 1:
            warnings.warn("Indicies length greath than one", RuntimeWarning)
            # Try each occurrence in scan direction: first-to-last when
            # looking backwards, last-to-first when looking forwards.
            ordered = indicies if less else indicies[::-1]
            for cur_target in ordered:
                cur_val = get_closest_num(values, cur_target, less, min=min, max=max, any=any)
                if cur_val is not None:
                    return cur_val
            warnings.warn("No number found", RuntimeWarning)
            print("No num found target:", target, "indicies:", indicies)
            return None
        # Only a phrase with no occurrence at all is invalid. The previous
        # check demanded more than one match here, which can never hold on
        # this path and so rejected every phrase occurring exactly once.
        if not indicies:
            raise AssertionError("Invalid target")
        index = indicies[-1]
    if any:  # doesn't get closest
        iters = range(0, len(values))
    elif less:
        iters = range(index - 1, -1, -1)  # walks back towards position 0
    else:
        iters = range(index, len(values))
    for i in iters:
        if is_num(values[i]):
            return int(values[i])
    print("No num", target, "index", index)
    return None
    

In [11]:
def get_names(text):
    """Find which names from ``get_security_names()`` occur in ``text``.

    Each name is split on spaces; a single-word name is searched for with the
    token "stock" appended. Names with at least one occurrence are reported.

    Parameters
    ----------
    text : list of str
        Token list to search with ``find_target``.

    Returns
    -------
    (indicies, types)
        ``types`` holds the matched names re-joined with spaces and
        ``indicies`` holds, in the same order, the list of start positions
        found for each of them.
    """
    candidates = [name.split(" ") for name in get_security_names()]
    indicies = []
    types = []
    for name in candidates:
        if len(name) == 1:
            name = name + ["stock"]
        cur_indicies = find_target(text, name)
        if cur_indicies:
            types.append(" ".join(name))
            indicies.append(cur_indicies)
    # Drop the generic "prefer stock" entry only when a more specific name
    # ending in "prefer" was found as well. The flag used to start out True,
    # which made this check a no-op and removed the entry unconditionally.
    if "prefer stock" in types:
        has_extra_prefered = any(found.endswith("prefer") for found in types)
        if has_extra_prefered:
            indicies.pop(types.index("prefer stock"))
            types.remove("prefer stock")
    return indicies, types

In [12]:
def get_num_of_shares(text, names):
    """Look up, for every name, the number found by a backward search in ``text``.

    Returns a list aligned with ``names``; each entry is whatever
    ``get_closest_num(text, name, less=True)`` returns.
    """
    return [get_closest_num(text, name, less=True) for name in names]
        

In [13]:
def get_types(names):
    """Map each security name to "common", "prefered" or "No type found".

    "common" takes precedence when a name contains both substrings. Names
    containing neither are reported on stdout.
    """
    def classify(name):
        if "common" in name:
            return "common"
        if "prefer" in name:
            return "prefered"
        print("name doesn't contain common or prefer:", "".join(name))
        return "No type found"

    return [classify(name) for name in names]

In [14]:
def generate_stats(IV_intro_text):
    """Build a table of the securities detected in the Article IV intro tokens.

    Columns are "Security Name", "Security Type" and "Number"; the result is
    passed through ``revert_secuirty_names`` before being returned.
    """
    security_names = get_names(IV_intro_text)[1]
    table = pd.DataFrame({
        "Security Name": security_names,
        "Security Type": get_types(security_names),
        "Number": get_num_of_shares(IV_intro_text, security_names),
    })
    return revert_secuirty_names(table)

In [15]:
# Security names as returned by get_security_names; each one is split on spaces in the next cell.
unsplit_names = get_security_names()

In [16]:
# Split every security name on single spaces into a word list, the form find_target takes as its target.
all_names = [name.split(" ") for name in unsplit_names]

In [18]:
# One list of match positions per security name, in the same order as all_names.
indicies = [find_target(text, name) for name in all_names]


In [149]:
# Positions that find_target reports for the token pair "articl", "fourth" in the contract text.
index_of_article_IV = find_target(text, ["articl", "fourth"])

In [150]:
# Sanity check: display the two tokens at the first reported position.
text[index_of_article_IV[0]: index_of_article_IV[0] + 2]

['articl', 'fourth']

### Search Through IV Intro

In [19]:
# Positions at which the END_OF_IV_INTRO_TEXT marker sequence is found in the contract text.
end_of_IV_intro = find_target(text, END_OF_IV_INTRO_TEXT)

In [20]:
# The following cells take element 0, so require exactly one marker match here.
assert(len(end_of_IV_intro) == 1)

In [21]:
# Unwrap the single match: from here on the name holds a token index rather than a list.
end_of_IV_intro = end_of_IV_intro[0]

In [22]:
# All positions find_target reports for the "articl fourth" token pair.
articl_forth_occur = find_target(text, ["articl", "fourth"])

In [23]:
# Pick the "articl fourth" position to use as the intro start, selected relative to the end marker with less=True.
begining_of_IV = get_closest_string(values = articl_forth_occur, target=end_of_IV_intro, less=True)

In [24]:
# Report the token indices that bound the Article IV intro.
print("IV Intro starts", begining_of_IV, "IV Intro ends", end_of_IV_intro)

IV Intro starts 203 IV Intro ends 363


In [25]:
# Display the intro span as a single string for visual inspection.
" ".join(text[begining_of_IV:end_of_IV_intro])

'articl fourth compani is author to issu two class of share design respect common stock and prefer stock the compani is author to issu 43000000 share of common stock with a par valu of $0.01 per share the compani is author to issu 26007500 share of prefer stock 800000 of which are design seri a prefer stock with a par valu of $0.01 per share seri a prefer 700000 of which are design seri b prefer stock with a par valu of $0.01 per share seri b prefer 1507500 of which are design seri c prefer stock with a par valu of $0.01 per share seri c prefer and 23000000 of which are design seri d prefer stock with a par valu of $0.01 per share seri d prefer for the purpos of thi amend and restat certif of incorpor the term prefer stock shall mean seri a prefer seri b prefer seri c prefer and seri d prefer collectively.th'

In [26]:
# Token slice from the chosen Article IV start up to, but not including, the end marker.
IV_intro_text = text[begining_of_IV:end_of_IV_intro]

In [27]:
# Security names found in the intro, together with the positions of each name's occurrences.
indicies, names = get_names(IV_intro_text)

In [28]:
# Label each detected name as "common", "prefered" or "No type found".
types = get_types(names)

In [29]:
# Forward search (less=False) for a number relative to the first detected security name.
get_closest_num(IV_intro_text, names[0], False)



26007500

In [30]:
# Same call as the previous cell, repeated.
get_closest_num(IV_intro_text, names[0], False)



26007500

In [31]:
# Share count for each detected name, found by searching backwards (less=True) from the name.
nums = get_num_of_shares(IV_intro_text, names)

No num 13 index 13




In [32]:
# Print the three parallel result lists, one per line.
for result_list in (names, types, nums):
    print(result_list)

['common stock', 'seri a prefer', 'seri b prefer', 'seri c prefer', 'seri d prefer']
['common', 'prefered', 'prefered', 'prefered', 'prefered']
[43000000, 800000, 700000, 1507500, 23000000]


In [33]:
# Run the whole pipeline (names, types, share counts) on the intro tokens and display the resulting table.
generate_stats(IV_intro_text)

No num 13 index 13




Unnamed: 0,Security Name,Security Type,Number
0,common,common,43000000
1,seri a prefer,prefered,800000
2,seri b prefer,prefered,700000
3,seri c prefer,prefered,1507500
4,seri d prefer,prefered,23000000


### Evaluate Search Methods

In [53]:
def get_IV_intro_text(filename):
    """Return the token slice covering the Article IV intro of one contract.

    The contract is read with ``read_contract``, split on single spaces with
    empty tokens dropped, and echoed to stdout. The slice ends at the first
    occurrence of END_OF_IV_INTRO_TEXT and starts at the "articl fourth"
    position that ``get_closest_string`` selects with ``less=True``.

    Raises AssertionError when either marker sequence is not found.
    """
    tokens = [tok for tok in read_contract(filename).split(" ") if tok != '']
    print(" ".join(tokens))

    end_hits = find_target(tokens, END_OF_IV_INTRO_TEXT)
    assert end_hits, "END Sequence not found"
    intro_end = end_hits[0]

    article_hits = find_target(tokens, ["articl", "fourth"])
    assert article_hits, "Article fourth Sequence not found"
    intro_start = get_closest_string(values=article_hits, target=intro_end, less=True)

    return tokens[intro_start:intro_end]

In [44]:
def test_methods(filename, n=-1):
    """Run ``get_IV_intro_text`` over contracts listed in "rs1 database AN.xlsx".

    Parameters
    ----------
    filename :
        Not used; the contracts to process come from the spreadsheet's
        "File Name" column. Kept so existing calls keep working.
    n : int
        Maximum number of contracts to process. Values of 0 or below mean
        "no limit".

    Each contract name is printed before it is processed. Any exception
    raised by ``get_IV_intro_text`` propagates and stops the run.
    """
    y = load_info_xlsx("rs1 database AN.xlsx", np_array=False)
    # A dedicated loop variable keeps the ``filename`` argument intact, and
    # checking the count *before* processing means exactly ``n`` contracts are
    # handled (the earlier counter stopped one short).
    for processed, contract_name in enumerate(y["File Name"]):
        if n > 0 and processed >= n:
            break
        print(contract_name)
        get_IV_intro_text("./contracts/" + contract_name)
        
#     text = get_IV_intro_text(filename)
#     stats = generate_stats(text)
    
    

In [54]:
# Smoke-run the intro extraction over contracts listed in the spreadsheet, with n=10.
test_methods(filename, n=10)

17445_955DREAMS_COI_01232012.pdf
955 dream inc s t a t s o f d elaw are s e c r e ta r y o f s ta te d e liv e r e d 1 2 1 0 pm 0 1 2 3 2 0 1 2 d iv is io n o f c o rp o ra tio n s file 1 2 0 3 pm 0 1 2 3 2 0 1 2 120073329 4886034 filerest certif of incorpor pursuant to section 242 and 245 of the gener corpor law of the state of delawar 955 dream inc. a corpor organ and exist under and by virtu of the provis of the gener corpor law of the state of delawar the gener corpor law doe herebi certifi a follows.1 the name of thi corpor is 955 dream inc thi corpor wa origin incorpor pursuant to the gener corpor law on octob 26 2010 under the name 955 dream inc.2 the board of director of thi corpor duli adopt resolut propos to amend and restat the certif of incorpor of thi corpor declar said amend and restat to be advis and in the best interest of thi corpor and it stockhold and author the appropri offic of thi corpor to solicit the consent of the stockhold therefor which resolut set forth the 

AssertionError: Article fourth Sequence not found