In [1]:
#2018 HMDA Edit Testing File Generator
from collections import OrderedDict
from io import StringIO
import json
import os
import pandas as pd
import random

#custom imports
import lar_constraints
import lar_generator
from rules_engine import rules_engine


In [2]:
#2018 Filing Instruction Guide: https://www.consumerfinance.gov/data-research/hmda/static/for-filers/2018/2018-HMDA-FIG.pdf

# Names assigned to every field of the pipe-delimited tract-to-CBSA file,
# which is read without a header row.
cbsa_cols = [
    'name', 'metDivName', 'state', 'countyFips', 'county', 'tracts',
    'geoIdMsa', 'metDivFp', 'smallCounty', 'stateCode', 'tractDecimal',
]
# Subset of the named fields that is actually loaded.
use_cols = ['name', 'metDivName', 'countyFips', 'geoIdMsa', 'metDivFp', 'smallCounty', 'tracts']

# Load tract to CBSA data from platform file; all loaded columns are read as strings.
cbsas = pd.read_csv(
    '../dependancies/tract_to_cbsa_2015.txt',
    delimiter='|',
    header=None,
    names=cbsa_cols,
    usecols=use_cols,
    dtype=str,
)

# Tract identifier = county FIPS string followed by the tract string.
cbsas["tractFips"] = cbsas["countyFips"] + cbsas["tracts"]
counties = cbsas["countyFips"].tolist()
tracts = cbsas["tractFips"].tolist()

In [3]:
#load schemas for LAR and transmittal sheet
# Each schema file is opened in a context manager so its handle is closed as
# soon as the JSON has been parsed, rather than being left open.
with open("../schemas/lar_schema.json", "r") as lar_schema_file:
    lar_schema_df = pd.DataFrame(json.load(lar_schema_file))
with open("../schemas/ts_schema.json", "r") as ts_schema_file:
    ts_schema_df = pd.DataFrame(json.load(ts_schema_file))

In [4]:
# Instantiate the LAR row generator from both schemas plus the county and tract lists.
lar_gen = lar_generator.lar_gen(
    lar_schema_df,
    ts_schema_df,
    counties=counties,
    tracts=tracts,
)
# Instantiate the constraints object with the same county and tract lists.
lar_const = lar_constraints.lar_constraints(
    counties=counties,
    tracts=tracts,
)

In [5]:
#run constraint functions on row
file_length = 500 #set number of rows in test file
max_constraint_passes = 100 #safety cap so a row that never stabilizes raises instead of looping forever
lei = None #LEI taken from the first generated row and passed to make_row for all later rows

#create list of constraints in lar_constraints object
#a constraint is any attribute named "s" or "v" followed by three digits (e.g. s300, v601_1)
constraints = [func for func in dir(lar_const)
               if func[:1] in ("s", "v") and func[1:4].isdigit()]

#single-row dataframes are collected here and concatenated once after the loop;
#concatenating inside the loop re-copies every earlier row on each iteration
row_frames = []
for i in range(0, file_length): #loop over file length
    print("making new row\n\n")
    if lei:
        row = lar_gen.make_row(lei=lei)
    else:
        row = lar_gen.make_row() #create new row
    lei = row["lei"]
    diff = [1] #initialize diff for loop
    iters = 1
    #re-apply all constraints until a full pass leaves the row unchanged
    while len(diff) > 0:
        if iters > max_constraint_passes:
            raise RuntimeError(
                "row {num} did not stabilize after {passes} constraint passes".format(
                    num=i, passes=max_constraint_passes))
        row_base = row.copy() #copy row to enable diff
        #apply constraint functions to LAR row
        print("\nstarting constraints iteration {iter}".format(iter=iters))
        for func in constraints: #loop over all constraints
            row = getattr(lar_const, func)(row) #apply constraint to row

        #convert initial and copied rows to sets for diff
        initial_row = set(row_base.items())
        changed_row = set(row.items())
        #(field, value) pairs present before the pass but not after it, i.e. changed or removed fields
        diff = (initial_row - changed_row)
        print(len(diff), "difference")
        iters += 1

    new_lar = pd.DataFrame(row, index=[1])
    if not row_frames:
        #first finished row: also show the column names
        print("finished row\n", new_lar.columns)
    else:
        print("finished row\n")
    row_frames.append(new_lar)

#build the full frame in one concat; ignore_index gives a fresh 0..n-1 index
lar_frame = pd.concat(row_frames, axis=0, ignore_index=True)
print(iters)

making new row



starting constraints iteration 1
39 difference

starting constraints iteration 2
19 difference

starting constraints iteration 3
0 difference
finished row
 Index(['record_id', 'lei', 'uli', 'app_date', 'loan_type', 'loan_purpose',
       'preapproval', 'const_method', 'occ_type', 'loan_amount',
       ...
       'aus_code_5', 'aus_result_1', 'aus_result_2', 'aus_result_3',
       'aus_result_4', 'aus_result_5', 'aus_code_16', 'reverse_mortgage',
       'open_end_credit', 'business_purpose'],
      dtype='object', length=110)
making new row



starting constraints iteration 1
32 difference

starting constraints iteration 2
8 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
29 difference

starting constraints iteration 2
4 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
29 difference

starting constraints iteration 2
10 dif

11 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
48 difference

starting constraints iteration 2
13 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
37 difference

starting constraints iteration 2
8 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
39 difference

starting constraints iteration 2
8 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
39 difference

starting constraints iteration 2
22 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
2 difference

starting constraints iteration 5
0 difference
finished r


starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
46 difference

starting constraints iteration 2
12 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
34 difference

starting constraints iteration 2
5 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
44 difference

starting constraints iteration 2
15 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
45 difference

starting constraints iteration 2
16 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
46 di

1 difference

starting constraints iteration 8
0 difference
finished row

making new row



starting constraints iteration 1
43 difference

starting constraints iteration 2
13 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
37 difference

starting constraints iteration 2
15 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
52 difference

starting constraints iteration 2
15 difference

starting constraints iteration 3
2 difference

starting constraints iteration 4
2 difference

starting constraints iteration 5
1 difference

starting constraints iteration 6
1 difference

starting constraints iteration 7
0 difference
finished row

making new row



starting constraints iteration 1
49 difference

starting constraints iteration 2
11 difference

starting constraints iteration 3
3 difference

starting

51 difference

starting constraints iteration 2
16 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
1 difference

starting constraints iteration 6
0 difference
finished row

making new row



starting constraints iteration 1
52 difference

starting constraints iteration 2
17 difference

starting constraints iteration 3
2 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
44 difference

starting constraints iteration 2
19 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
43 difference

starting constraints iteration 2
5 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 diffe

56 difference

starting constraints iteration 2
18 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
37 difference

starting constraints iteration 2
23 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
1 difference

starting constraints iteration 6
0 difference
finished row

making new row



starting constraints iteration 1
41 difference

starting constraints iteration 2
7 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
36 difference

starting constraints iteration 2
9 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
45 difference

starting constraints iteration 2
6 difference

starting 

10 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
43 difference

starting constraints iteration 2
8 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
47 difference

starting constraints iteration 2
12 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
47 difference

starting constraints iteration 2
20 difference

starting constraints iteration 3
4 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
41 difference

starting constraints iteration 2
10 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
46 difference

starting constraints iteration 2
6 difference

starting constraints it


starting constraints iteration 1
37 difference

starting constraints iteration 2
20 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
38 difference

starting constraints iteration 2
8 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
39 difference

starting constraints iteration 2
1 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
41 difference

starting constraints iteration 2
6 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
44 difference

starting constraints iteration 2
23 difference

starting constraints iteration 3
0 difference
finished row

making ne


starting constraints iteration 1
53 difference

starting constraints iteration 2
13 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
35 difference

starting constraints iteration 2
5 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
46 difference

starting constraints iteration 2
27 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
45 difference

starting constraints iteration 2
8 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
44 difference

starting constraints iteration 2
17 difference

starting constraints iteration 3
4 difference

starting constraints 

50 difference

starting constraints iteration 2
18 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
43 difference

starting constraints iteration 2
12 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
45 difference

starting constraints iteration 2
16 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
45 difference

starting constraints iteration 2
12 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
45 difference

starting constraints iteration 2
12 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
0 difference
finished row

making new row



starting constraints iteration 1
49 difference

starting constraints 

50 difference

starting constraints iteration 2
22 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
44 difference

starting constraints iteration 2
14 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
43 difference

starting constraints iteration 2
12 difference

starting constraints iteration 3
3 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
0 difference
finished row

making new row



starting constraints iteration 1
55 difference

starting constraints iteration 2
20 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
46 difference

starting constraints iteration 2
9 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
44 difference

starting constraints i

0 difference
finished row

making new row



starting constraints iteration 1
42 difference

starting constraints iteration 2
9 difference

starting constraints iteration 3
0 difference
finished row

making new row



starting constraints iteration 1
52 difference

starting constraints iteration 2
14 difference

starting constraints iteration 3
1 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
1 difference

starting constraints iteration 6
1 difference

starting constraints iteration 7
1 difference

starting constraints iteration 8
1 difference

starting constraints iteration 9
0 difference
finished row

making new row



starting constraints iteration 1
42 difference

starting constraints iteration 2
11 difference

starting constraints iteration 3
2 difference

starting constraints iteration 4
1 difference

starting constraints iteration 5
1 difference

starting constraints iteration 6
1 difference

starting constraints iteration 7
0 differe

In [6]:
#Quality and Macro field interrelationship constraints:

In [7]:
#Create a sample TS row
#Note: this will need to be more robust to include other federal agencies
# mlo_id needs NA option
#set dummy values for TS row
#An OrderedDict is used so the fields keep this insertion order when the
#values are later joined into the pipe-delimited TS line.
ts_row_small = OrderedDict()
ts_row_small["record_id"]="1"
ts_row_small["inst_name"]="Ficus Bank"
ts_row_small["calendar_year"]=str(2018)
ts_row_small["calendar_quarter"]="4"
ts_row_small["contact_name"]="Mr. Smug Pockets"
ts_row_small["contact_tel"]="555-555-5555"
ts_row_small["contact_email"]="pockets@ficus.com"
ts_row_small["contact_street_address"]="1234 Ficus Lane"
ts_row_small["office_city"]="Ficusville"
ts_row_small["office_state"]="UT"
ts_row_small["office_zip"]="84096"
ts_row_small["federal_agency"]="9"
#number of LAR rows generated above
ts_row_small["lar_entries"]= str(len(lar_frame))
ts_row_small["tax_id"]="01-0123456"
#LEI of the first LAR row; .at replaces DataFrame.get_value, which was
#deprecated and then removed from pandas
ts_row_small["lei"]=lar_frame.at[0, "lei"]

In [8]:
#join LAR and TS rows to make an output file
def write_file(ts_input=None, lar_input=None, directory="../edits_files/", name="passes_all.txt"):
    """Write a combined TS + LAR pipe-delimited file.

    The LAR dataframe is first written to <directory>/file_parts/lar_data.txt,
    then read back as text lines and appended after the TS line in
    <directory>/<name>.

    Parameters:
        ts_input: mapping of TS field name to value; the values are joined
            with "|" in iteration order to form the first line of the file.
        lar_input: pandas DataFrame of LAR rows; written without header or index.
        directory: output directory, with or without a trailing separator.
            It is created (along with its file_parts subdirectory) if missing.
        name: file name of the combined output file inside directory.

    Returns:
        None. The function's effect is the two files written to disk.
    """
    #os.path.join is used so that directory works with or without a trailing slash
    parts_dir = os.path.join(directory, "file_parts")
    #creates directory and parts_dir together; exist_ok avoids a check-then-create race
    os.makedirs(parts_dir, exist_ok=True)

    #write LAR dataframe to file
    lar_path = os.path.join(parts_dir, "lar_data.txt")
    lar_input.to_csv(lar_path, sep="|", header=False, index=False, index_label=False)
    #load LAR data as file rows
    with open(lar_path, 'r') as lar_data:
        lar = lar_data.readlines()

    with open(os.path.join(directory, name), 'w') as final_file:
        #str() lets non-string TS values (e.g. an int year) be written instead of raising
        final_file.write("|".join(str(value) for value in ts_input.values()) + "\n")
        final_file.writelines(lar)

In [9]:
#modify file for testing


In [10]:
#write sample file to disk
write_file(
    ts_input=ts_row_small,
    lar_input=lar_frame,
)
#validator engine uses the default: path="../edits_files/", data_file="passes_all.txt" for data files
#instantiate edits rules engine
validator = rules_engine(
    lar_schema=lar_schema_df,
    ts_schema=ts_schema_df,
    tracts=tracts,
    counties=counties,
)

In [11]:
#split TS and LAR using validator function
#validator creates class objects of each of these internally as well
ts_df, lar_df = validator.split_ts_row(
    path="../edits_files/",
    data_file="passes_all.txt",
)


In [12]:
# Collect every validator attribute named "s" or "v" followed by three digits
# (e.g. s300_1, v601_2), then call each one with no arguments.
edit_names = [
    attr for attr in dir(validator)
    if attr[:1] in ("s", "v") and attr[1:4].isdigit()
]
for edit_name in edit_names:
    getattr(validator, edit_name)()


In [13]:
# Display the validator's results mapping (edit name -> recorded outcome).
validator.results

{'s300_1': OrderedDict([('row_type', 'TS'), ('record_id', 'passed')]),
 's300_2': OrderedDict([('row_type', 'LAR'), ('record_id', 'passed')]),
 's301': OrderedDict([('row_type', 'TS'), ('LEI', 'passed')]),
 's302': OrderedDict([('row_type', 'TS'), ('calendar_year', 'passed')]),
 's304': OrderedDict([('row_type', 'TS/LAR'), ('lar_entries', 'passed')]),
 's305': OrderedDict([('row_type', 'LAR'), ('all', 'passed')]),
 'v600': OrderedDict([('row_type', 'LAR'), ('LEI', 'passed')]),
 'v601_1': OrderedDict([('row_type', 'TS'), ('inst_name', 'passed')]),
 'v601_2': OrderedDict([('row_type', 'TS'), ('contact_name', 'passed')]),
 'v601_3': OrderedDict([('row_type', 'TS'), ('contact_email', 'passed')]),
 'v601_4': OrderedDict([('row_type', 'TS'),
              ('contact_street_address', 'passed')]),
 'v601_5': OrderedDict([('row_type', 'TS'), ('office_city', 'passed')]),
 'v602': OrderedDict([('row_type', 'TS'), ('calendar_quarter', 'passed')]),
 'v603': OrderedDict([('row_type', 'TS'), ('contact

In [22]:
# Show loan purpose, preapproval, action taken and applicant race fields for a
# hand-picked set of ULIs.
# NOTE(review): these ULIs are hard-coded and presumably came from one
# particular generated file; a freshly generated file may not contain them — confirm.
ulis = [
    'T387ZBFFXOJ57E7HJ5QAS7EFIWIXAIRXGLLQ6L3W3M658',
    'T387ZBFFXOJ57E7HJ5QAVLYZNCWLBCFDSJOTM6NW9V863',
    'T387ZBFFXOJ57E7HJ5QAEZ6BE7Z00KR91B6MH5EIVDI61',
    'T387ZBFFXOJ57E7HJ5QAV3V0O0FPK3VTWY9S89M6LX496',
    'T387ZBFFXOJ57E7HJ5QA4A4GKL9JU4LBZXMRQGQFTEX59',
    'T387ZBFFXOJ57E7HJ5QA3VR078FQT57FSQIJ76YSV9236',
]
race_check_cols = [
    "loan_purpose", "preapproval", "action_taken",
    "app_race_1", "app_race_2", "app_race_3", "app_race_4", "app_race_5",
    "uli",
]
# Row filter and column selection done in a single .loc call.
validator.lar_df.loc[validator.lar_df.uli.isin(ulis), race_check_cols]

Unnamed: 0,loan_purpose,preapproval,action_taken,app_race_1,app_race_2,app_race_3,app_race_4,app_race_5,uli
475,1,2,8,7,,,,,T387ZBFFXOJ57E7HJ5QAS7EFIWIXAIRXGLLQ6L3W3M658
483,1,2,8,2,5.0,4.0,3.0,1.0,T387ZBFFXOJ57E7HJ5QAVLYZNCWLBCFDSJOTM6NW9V863
484,32,2,7,5,1.0,3.0,2.0,4.0,T387ZBFFXOJ57E7HJ5QAEZ6BE7Z00KR91B6MH5EIVDI61
487,31,2,8,2,3.0,5.0,4.0,1.0,T387ZBFFXOJ57E7HJ5QAV3V0O0FPK3VTWY9S89M6LX496
494,2,2,7,41,43.0,21.0,1.0,2.0,T387ZBFFXOJ57E7HJ5QA4A4GKL9JU4LBZXMRQGQFTEX59
495,31,2,8,5,2.0,1.0,3.0,4.0,T387ZBFFXOJ57E7HJ5QA3VR078FQT57FSQIJ76YSV9236


In [15]:
# Display the uli, aus_code_5, mlo_id and aus_code_16 columns of the validator's LAR dataframe.
validator.lar_df[["uli", "aus_code_5", "mlo_id", "aus_code_16"]]

Unnamed: 0,uli,aus_code_5,mlo_id,aus_code_16
0,T387ZBFFXOJ57E7HJ5QA0A7L2IJ34U5Y9W3G0KY1A3P46,,BFI,
1,T387ZBFFXOJ57E7HJ5QAWU6WI70KL7RJ9EYQPAEGN3G58,W8CEVHBQRSA4CM9QGRX6KCDFSWBZSNO7L8HR0DUSUBQ5PD...,98,
2,T387ZBFFXOJ57E7HJ5QADJ2IS9Y5C32N17G0QBTXNTL96,FI3M3D1KTJWGPA5WNMK3SZ2Z2GQJX8MUU4RU4DPT161L1P...,QXMFHE2EPPHQI,
3,T387ZBFFXOJ57E7HJ5QAOTMPQ6YX54NGDX6X9215P4Q20,,JG18UQDP5OVZYSCCWD8GS,
4,T387ZBFFXOJ57E7HJ5QAPMAFJP31ZUM5Y2O0T8UXBUZ72,U7WH214A555RVL8TNTNVNKBBNUJYC0BZPZTWDX5SXW7TQO...,YI41GEKVGT3E24AYZE8C,
5,T387ZBFFXOJ57E7HJ5QARUO6PVIHXJPNZLK22ILAD4J18,,PGXFUW67L5HWO,
6,T387ZBFFXOJ57E7HJ5QA3LUPRXB6H9BOPKP7AXQYWZ971,,C,
7,T387ZBFFXOJ57E7HJ5QA5NSEBCH7PFBJLUZIVCH2Z7Y92,,5PF61788VN52A5,
8,T387ZBFFXOJ57E7HJ5QAEU9LIH0QBW45F9D6I73QOBF77,,I9S1DS,
9,T387ZBFFXOJ57E7HJ5QADZ4VFRTHCOQB5Q0NKANJK1L90,X4USQEO75VZLGK03F2MIPE7KABYTWCEIWZRAOR69AWKZSO...,NDFKHG,


In [16]:
# Display the docstring of the validator's s300_1 edit check.
validator.s300_1.__doc__

'1) The first row of your file must begin with a 1;.'

In [17]:
# Display LAR rows whose app_age value is not made up solely of digits.
# Assumes every app_age value is a string, since x.isdigit() is called on each one.
validator.lar_df[validator.lar_df.app_age.map(lambda x: x.isdigit())==False]

Unnamed: 0,record_id,lei,uli,app_date,loan_type,loan_purpose,preapproval,const_method,occ_type,loan_amount,...,aus_code_5,aus_result_1,aus_result_2,aus_result_3,aus_result_4,aus_result_5,aus_code_16,reverse_mortgage,open_end_credit,business_purpose


In [18]:
# Display the max_units attribute of the LAR generator instance.
lar_gen.max_units

30