In [1]:
import pandas as pd
import numpy as np
from ambry import library
# Handle on the Ambry library. This module-level `l` is also read by later
# cells (the `ts` load and get_lar()), so it must be defined before they run.
l = library()

# Look up the 'ffiec.gov-hmda-2010e-lar-2010-ca' bundle, take its partition and,
# as the cell's last expression, display that partition's fully-qualified name.
p = l.get('ffiec.gov-hmda-2010e-lar-2010-ca').partition
str(p.identity.fqname)

'ffiec.gov-hmda-2010e-lar-2010-ca-0.0.6~p02X00V006'

In [2]:
# Load the partition of the 'ffiec.gov-hmda-2010e-ts' bundle via its `.pandas`
# attribute. get_ts() reads this module-level `ts` and filters it by activity_year.
# NOTE(review): presumably the HMDA transmittal-sheet (lender registry) table -- confirm.
ts = l.get('ffiec.gov-hmda-2010e-ts').partition.pandas

In [3]:
def get_ts(year):
    """Build a respondent_id -> respondent_name lookup for one activity year.

    Reads the module-level ``ts`` frame, keeps the rows whose
    ``activity_year`` equals ``year``, casts ``respondent_id`` to ``str``,
    keeps only the first row seen for each distinct ``respondent_name`` and
    returns the result indexed by ``respondent_id``.

    Note that de-duplication is on the name, not the id, so the returned
    index is not guaranteed to be unique.
    """
    wanted_columns = ['respondent_id', 'respondent_name']
    in_year = ts.activity_year == year

    # Work on a copy so the dtype change below never touches `ts` itself.
    names = ts[in_year][wanted_columns].copy()
    names['respondent_id'] = names['respondent_id'].astype(str)

    unique_names = names.drop_duplicates(subset=['respondent_name'])
    return unique_names.set_index(['respondent_id'])

def get_lar(year, county_code=73):
    """Load one county's rows of the ``lar`` table for a given year.

    Looks up the bundle named 'ffiec.gov-hmda-2010e-lar-<year>-ca' in the
    module-level Ambry library ``l`` and runs a SELECT on its partition.

    Args:
        year: Year substituted into the bundle name, e.g. 2012 gives
            'ffiec.gov-hmda-2010e-lar-2012-ca'.
        county_code: Value matched against the ``county_code`` column.
            Defaults to 73, the value that used to be hard-coded here
            (presumably San Diego County, judging by the CSV file names this
            notebook writes -- confirm).

    Returns:
        Whatever the selection's ``.pandas`` attribute yields; callers treat
        it as a DataFrame.

    Raises:
        TypeError, ValueError: if ``county_code`` cannot be converted to int.
    """
    bundle_name = 'ffiec.gov-hmda-2010e-lar-{}-ca'.format(year)
    # int() guarantees that only a numeric literal is interpolated into the SQL.
    sql = "SELECT * FROM lar WHERE county_code = {:d}".format(int(county_code))
    return l.get(bundle_name).partition.select(sql).pandas

In [4]:
def generate_share_frame(df, ts_name, group):
    """Summarise each lender's share of the loans in ``df``.

    Args:
        df: Loan-level frame with at least the columns ``respondent_id``,
            ``id`` and ``loan_amount``.
        ts_name: Frame indexed by ``respondent_id`` with a
            ``respondent_name`` column (the shape get_ts() returns). Its index
            values must be comparable with ``df.respondent_id``.
        group: Label used as the first level of the returned column
            MultiIndex, so several share frames can be joined side by side.

    Returns:
        A frame indexed by ``respondent_name``, sorted by ``count_pct``
        descending, with columns ``(group, stat)`` for the stats ``count``,
        ``count_pct``, ``amount``, ``amount_pct``, ``amount_avg`` and
        ``amount_median``. All stats are floats. Percentages are computed
        against every row of ``df``, i.e. before lenders without a name in
        ``ts_name`` are dropped by the inner merge.
    """
    by_lender = df.groupby('respondent_id')
    # Select the column before aggregating: summing/averaging every column of
    # the frame is wasted work and fails on non-numeric columns in current pandas.
    loan_amounts = by_lender['loan_amount']

    stats = by_lender['id'].count().to_frame('count')
    stats['count_pct'] = np.round(stats['count'] / stats['count'].sum() * 100.0, 2)

    stats['amount'] = loan_amounts.sum()
    stats['amount_pct'] = np.round(stats['amount'] / stats['amount'].sum() * 100.0, 2)

    stats['amount_avg'] = np.round(loan_amounts.mean(), 0)

    stats['amount_median'] = np.round(loan_amounts.median(), 0)

    # The previous transpose-based implementation turned every stat into a
    # float as a side effect; keep that so downstream CSV output is unchanged.
    stats = stats.astype(float)

    shares = (stats.merge(ts_name, left_index=True, right_index=True)
                   .set_index(['respondent_name']))

    # DataFrame.sort() is gone from current pandas; fall back to it only on
    # releases that predate sort_values().
    sort_by = getattr(shares, 'sort_values', None) or shares.sort
    shares = sort_by('count_pct', ascending=False)

    # Put `group` on top of the existing column labels directly, instead of
    # transposing and adding a temporary 'group' column (which clobbered any
    # lender that happened to be named 'group').
    shares.columns = pd.MultiIndex.from_product([[group], shares.columns],
                                                names=['group', None])

    return shares
    

In [7]:
def build_market_share_report(year):
    """Assemble the per-lender market-share table for one year.

    Loads the year's loans (get_lar) and lender names (get_ts), then builds
    two share frames with generate_share_frame: one over all loans, labelled
    'all', and one over loans whose ``loan_purpose`` is not 2, labelled
    'no_imp'. The second is joined onto the first by index and the combined
    frame is returned.

    NOTE(review): loan_purpose code 2 is presumably "home improvement",
    going by the 'no_imp' label -- confirm against the HMDA code sheet.
    """
    loans = get_lar(year)
    lender_names = get_ts(year)

    not_purpose_2 = loans.loan_purpose != 2

    overall_share = generate_share_frame(loans, lender_names, 'all')
    no_imp_share = generate_share_frame(loans[not_purpose_2], lender_names, 'no_imp')

    return overall_share.join(no_imp_share)

In [8]:
# Build the report for each year, print "<year> <row count>" as a progress
# line, and write the table to market-share-sandiego-county-<year>.csv in the
# current working directory.
# NOTE(review): the recorded output shows the same row count (722) for 2012
# and 2013 -- worth confirming the 2013 bundle resolves to distinct data.
for year in (2010, 2011, 2012, 2013):
    df = build_market_share_report(year)
    # Call form with one pre-formatted string: prints the same line as the old
    # Python 2 `print year, len(df)` statement and also parses on Python 3.
    print('{} {}'.format(year, len(df)))
    df.to_csv('market-share-sandiego-county-{}.csv'.format(year))

2010 519
2011 677
2012 722
2013 722
