In [8]:
import pandas as pd
# Sample award-id prefix; later cells search the loaded tables for this text.
award_string = 'SBAHQ-'
# Let pandas wrap the repr of wide DataFrames across multiple lines.
pd.set_option('expand_frame_repr', True)

## Find Files and Columns
Using a sample award id string, look through JAAMS and Prism files to see which ones contain award ids.
For files that contain award id data, find out which column(s) the award id is in.

In [9]:
#assuming that award ids for this data subset are start with 'SBAHQ-', 
#find the JAAMS and Prism tables that contain award is somewhere
#in their data
awards_files = !grep --include=\*.txt -rnl -e 'SBAHQ-'  data
jaams_awards_files = [f.split('\n')[0] for f in awards_files if f.find('jaams') > -1]
prism_awards_files = [f.split('\n')[0] for f in awards_files if f.find('prism') > -1]
prism = {}
#raw prism files aren't quoted, so many lines won't parse (skip them for now)
for fi in prism_awards_files:
    key = fi.split('/')[-1][:-4].lower()
    prism[key] = pd.read_csv(fi, error_bad_lines = False, warn_bad_lines = False, encoding="latin_1")
    prism[key].rename(columns=lambda x: '{}.'.format(key) + x.lower(), inplace = True)
prism_csv = pd.read_csv('data/data_act_prism_grants_fy14.csv', encoding="latin_1")
jaams = {}
for fi in jaams_awards_files:
    key = fi.split('/')[-1][:-4].lower()
    jaams[key] = pd.read_csv(fi)
    jaams[key].rename(columns=lambda x: '{}.'.format(key) + x.lower(), inplace = True)

### JAAMS

In [12]:
print('looking for JAAMS colums that have an award id\n')
# Report every JAAMS column in which at least one value contains award_string.
for k, df in jaams.items():
    for col in df:
        # Test each value as literal text (regex=False) instead of searching
        # the rendered to_string() dump of the whole column, so the result
        # does not depend on how pandas formats the column for display.
        if df[col].astype(str).str.contains(award_string, regex=False).any():
            print ('{}: {}'.format(k, col))

looking for JAAMS colums that have an award id

po_lines_all: po_lines_all.attribute1
ap_invoice_distributions_all: ap_invoice_distributions_all.attribute3
po_requisition_headers_all: po_requisition_headers_all.attribute2
ap_invoices_all: ap_invoices_all.invoice_num
ap_invoices_all: ap_invoices_all.attribute11
po_headers_all: po_headers_all.segment1


### Prism

In [14]:
print('looking for Prism colums that have an award id\n')
# Report every Prism column in which at least one value contains award_string.
# Each value is tested as literal text (regex=False) instead of searching the
# rendered to_string() dump of the whole column, so the result does not depend
# on how pandas formats the column for display.
for k, df in prism.items():
    for col in df:
        if df[col].astype(str).str.contains(award_string, regex=False).any():
            print ('{}: {}'.format(k, col))
# The standalone Prism csv extract is checked the same way.
for col in prism_csv:
    if prism_csv[col].astype(str).str.contains(award_string, regex=False).any():
        print ('prism csv: {}'.format(col))

looking for Prism colums that have an award id

grantheader: grantheader.sba1222progcode
grantheader: grantheader.sba1222budgetcode
grantheader: grantheader.sba1222documentno
faadsciv: faadsciv.docnum
header: header.docnum
prism csv: Grant Number
prism csv: Award ID
prism csv: Unique Identifer


## Relationships

Very, very preliminary sketch of relationship between Prism and JAAMS.
* According to the SBA mapping doc, the Prism award_id = header.docnum + header.versionnum
* Several JAAMS tables have columns that contain what look like Prism docnums. These are shown below, but we're not sure which is the best join.

![Prism/JAAMS relationship](data/prism_jaams_relationship.png 'Prism/JAAMS relationship')

In [15]:
# Short names for the three tables used in the merges below.
# These are references to the frames held in the jaams / prism dicts, not
# copies, so columns added through these names also appear in the dicts.
ap_invoices_all = jaams['ap_invoices_all']
po_lines_all = jaams['po_lines_all']
header = prism['header']

In [16]:
# Take a look at the distinct DOCNUMs in the Prism header file.
# Series.unique() returns the values in order of first appearance, the same
# as pd.unique(...ravel()), without the deprecated Series.ravel() call.
header['header.docnum'].unique()

array(['SBAHQ-15-B-0009', 'SBAHQ-12-IT-0024', 'SBAHQ-12-IT-0058',
       'SBAHQ-12-Y-0079', 'SBAHQ-13-Y-0005', 'SBAHQ-12-Y-0010',
       'SBAHQ-15-B-0001', 'SBAHQ-15-B-0013', 'SBAHQ-15-B-0014',
       'SBAHQ-15-B-0015', 'SBAHQ-15-B-0016', 'SBAHQ-15-B-0018',
       'SBAHQ-11-IT-0011', 'SBAHQ-15-B-0005', 'SBAHQ-15-B-0019',
       'SBAHQ-15-B-0020', 'SBAHQ-13-Y-0091', 'SBAHQ-13-Y-0016',
       'SBAHQ-10-V-0008', 'SBAHQ-15-B-0002', 'SBAHQ-15-B-0003',
       'SBAHQ-15-B-0004', 'SBAHQ-14-S-0001', 'SBAHQ-15-B-0021',
       'SBAHQ-15-B-0022', 'SBAHQ-15-B-0023', 'SBAHQ-15-B-0024',
       'SBAHQ-11-G-0015', 'SBAHQ-15-B-0007', 'SBAHQ-15-B-0008',
       'SBAHQ-15-B-0010', 'SBAHQ-15-B-0011', 'SBAHQ-15-B-0012',
       'SBAHQ-13-Y-0011', 'SBAHQ-15-B-0029', 'SBAHQ-15-B-0031',
       'SBAHQ-15-B-0026', 'SBAHQ-15-B-0034', 'SBAHQ-13-Y-0146',
       'SBAHQ-15-B-0033', 'SBAHQ-15-B-0032', 'SBAHQ-15-B-0025',
       'SBAHQ-15-B-0028', 'SBAHQ-13-Y-0105', 'SBAHQ-13-Y-0048',
       'SBAHQ-11-J-0027', 'SBAHQ-13-Y

In [17]:
# Take a look at the distinct VERSIONNUMs in the Prism header file.
# Series.unique() returns the values in order of first appearance, the same
# as pd.unique(...ravel()), without the deprecated Series.ravel() call.
header['header.versionnum'].unique()

array(['0001', '0003', '0006', '0004', 'ORIG', '0002', '0017', '0009',
       '0008', '0005', '0007', '0010', nan, '398554'], dtype=object)

### Header to PO_LINES_ALL

In [18]:
# Build hyphen-free join keys on both tables. The new columns are added to
# the header / po_lines_all frames themselves.
docnum_no_hyphens = header['header.docnum'].replace('-', '', regex = True)
attribute1_no_hyphens = po_lines_all['po_lines_all.attribute1'].replace('-', '', regex = True)
header['header.docnum_stripped'] = docnum_no_hyphens
po_lines_all['po_lines_all.attribute1_stripped'] = attribute1_no_hyphens

# Row counts and non-null key counts before merging.
header_row_count = len(header.index)
header_docnum_count = header['header.docnum'].count()
po_lines_row_count = len(po_lines_all.index)
po_lines_attribute1_count = po_lines_all['po_lines_all.attribute1'].count()
print ('header rows: {}'.format(header_row_count))
print ('header DOCNUM non-null: {}'.format(header_docnum_count))
print ('po_lines_all rows: {}'.format(po_lines_row_count))
print ('po_lines_all ATTRIBUTE1 non-null: {}'.format(po_lines_attribute1_count))

# Merge (pandas default: inner join) on the stripped keys; header rows with
# no docnum are dropped first.
header_with_docnum = header.dropna(subset = ['header.docnum_stripped'])
header_po_lines_all = header_with_docnum.merge(
    po_lines_all,
    left_on = 'header.docnum_stripped',
    right_on = 'po_lines_all.attribute1_stripped'
)
header_po_lines_all.to_csv('data/header_po_lines_all.csv', index=False)
print ('merged rows: {}'.format(len(header_po_lines_all.index)))

header rows: 132
header DOCNUM non-null: 131
po_lines_all rows: 579
po_lines_all ATTRIBUTE1 non-null: 510
merged rows: 18


### Header to AP_INVOICES_ALL

This merge seems the most promising: was able to match 42 records using the docnum alone. 

**TODO:** Incorporate versionnum into this merge

In [19]:
# Derive a docnum-like key from each invoice number in two steps:
#   1. drop everything from the last hyphen onward
#   2. remove the remaining hyphens
# The key is stored as a new column on ap_invoices_all itself.
invoice_num_without_suffix = ap_invoices_all['ap_invoices_all.invoice_num'].replace('-(?!.*-).*', '', regex = True)
ap_invoices_all['ap_invoices_all.invoice_num_stripped'] = invoice_num_without_suffix.replace('-', '', regex = True)

# Row counts and non-null key counts before merging.
header_row_count = len(header.index)
header_docnum_stripped_count = header['header.docnum_stripped'].count()
invoice_row_count = len(ap_invoices_all.index)
invoice_key_count = ap_invoices_all['ap_invoices_all.invoice_num_stripped'].count()
print ('header rows: {}'.format(header_row_count))
print ('header DOCNUM (stripped) non-null: {}'.format(header_docnum_stripped_count))
print ('ap_invoices_all rows: {}'.format(invoice_row_count))
print ('ap_invoices_all DOC_NUM (created) non-null: {}'.format(invoice_key_count))

# Merge (pandas default: inner join) on the stripped keys; header rows with
# no stripped docnum are dropped first.
header_with_docnum = header.dropna(subset = ['header.docnum_stripped'])
header_ap_invoices_all = header_with_docnum.merge(
    ap_invoices_all,
    left_on = 'header.docnum_stripped',
    right_on = 'ap_invoices_all.invoice_num_stripped'
)
print ('merged rows: {}'.format(len(header_ap_invoices_all.index)))
header_ap_invoices_all.to_csv('data/header_ap_invoices_all.csv', index=False)

header rows: 132
header DOCNUM (stripped) non-null: 131
ap_invoices_all rows: 1209
ap_invoices_all DOC_NUM (created) non-null: 1209
merged rows: 42


In [20]:
# Display a subset of the header/invoices merge (columns chosen to show no PII).
# Columns that came from the Prism header table.
header_columns = [
    'header.dockey', 'header.verkey', 'header.doctype', 'header.docnum',
    'header.ordernum', 'header.versionnum', 'header.status',
    'header.shortdescr', 'header.awardtype', 'header.awarddate',
    'header.amount', 'header.obligatedamt', 'header.purpose',
]
# Columns that came from the JAAMS ap_invoices_all table.
invoice_columns = [
    'ap_invoices_all.invoice_id', 'ap_invoices_all.invoice_num',
    'ap_invoices_all.vendor_id', 'ap_invoices_all.invoice_amount',
    'ap_invoices_all.invoice_date', 'ap_invoices_all.description',
    'ap_invoices_all.source', 'ap_invoices_all.po_header_id',
]
header_ap_invoice_merge_subset = header_ap_invoices_all[header_columns + invoice_columns]
header_ap_invoice_merge_subset

Unnamed: 0,header.dockey,header.verkey,header.doctype,header.docnum,header.ordernum,header.versionnum,header.status,header.shortdescr,header.awardtype,header.awarddate,...,header.obligatedamt,header.purpose,ap_invoices_all.invoice_id,ap_invoices_all.invoice_num,ap_invoices_all.vendor_id,ap_invoices_all.invoice_amount,ap_invoices_all.invoice_date,ap_invoices_all.description,ap_invoices_all.source,ap_invoices_all.po_header_id
0,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3125721,SBAHQ14S0001-1(101713),3872,0,10/14/2013,10/01/13 - 01/15/14,Manual Invoice Entry,
1,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3125721,SBAHQ14S0001-1(101713),3872,0,10/14/2013,10/01/13 - 01/15/14,Manual Invoice Entry,
2,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3230577,SBAHQ14S0001-2(012414),3872,3055000,1/24/2014,01/16/14 - 06/30/14,Manual Invoice Entry,
3,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3230577,SBAHQ14S0001-2(012414),3872,3055000,1/24/2014,01/16/14 - 06/30/14,Manual Invoice Entry,
4,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3240680,SBAHQ14S0001-1(101713)A,3872,2000000,10/14/2013,10/01/13 - 01/15/14,Manual Invoice Entry,
5,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3336275,SBAHQ14S0001-3(062314),3872,1900000,6/23/2014,07/01/14 - 09/28/14,Manual Invoice Entry,
6,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3336275,SBAHQ14S0001-3(062314),3872,1900000,6/23/2014,07/01/14 - 09/28/14,Manual Invoice Entry,
7,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3382636,SBAHQ14S0001-4(092614),3872,7000000,9/26/2014,09/29/14 - 09/30/14,Manual Invoice Entry,
8,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3382636,SBAHQ14S0001-4(092614),3872,7000000,9/26/2014,09/29/14 - 09/30/14,Manual Invoice Entry,
9,67442,37840,500,SBAHQ-14-S-0001,,9,5,SCORE GRANT,601,11/4/2014,...,1345000,,3382636,SBAHQ14S0001-4(092614),3872,7000000,9/26/2014,09/29/14 - 09/30/14,Manual Invoice Entry,
