In [1]:
import numpy as np
import pandas as pd
import sqlalchemy
import pyodbc
import urllib
import os


# --- Connection settings for the HimsLoan SQL Server database ---------------
server = '10.43.20.148'
database = 'HimsLoan'
driver = '{ODBC Driver 17 for SQL Server}'

# Credentials are read from the environment so they never appear in the
# notebook. Fail fast with a readable message when one is unset: otherwise
# os.environ.get() returns None and the string concatenation below raises an
# opaque "can only concatenate str" TypeError.
_missing_vars = [
    name for name in ('HIMS_USERNAME', 'HIMS_PASSWORD')
    if os.environ.get(name) is None
]
if _missing_vars:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(_missing_vars)
    )
username = os.environ['HIMS_USERNAME']
password = os.environ['HIMS_PASSWORD']

# NOTE(review): SQL Server's default TCP port is 1433, and the Microsoft ODBC
# driver normally takes the port as "SERVER=host,port" rather than a separate
# PORT keyword. PORT=1443 is kept exactly as it was -- confirm it is intended.
odbc_connection_string = (
    f'DRIVER={driver};SERVER={server};PORT=1443;DATABASE={database};'
    f'UID={username};PWD={password}'
)

# The raw ODBC string is URL-encoded and handed to SQLAlchemy via odbc_connect.
params = urllib.parse.quote_plus(odbc_connection_string)
engine = sqlalchemy.create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)

In [2]:
# SQL for the "sample master" extract: one row per distinct combination of the
# selected columns, combining himsloan project / loan / funding / IDIS /
# milestone tables with fund names looked up in fmis_new.
# - Every join is a plain (inner) join, so a project lacking a matching row in
#   any joined table does not appear in the result.
# - Loans, IDIS numbers and milestones are each joined on ProjUniqueID, so a
#   project with several of each produces a row per combination.
# - md.DateReceived is returned under the alias MilestoneReachedDate; that
#   alias is the column name the resulting DataFrame carries.
query = """
    select distinct
    p.ProjectNo,
    p.ProjName,
    p.FullProjectNumber,
    p.calendaryear,
    sf.[Description] FundingSource,
    ids.IDISNo,
    stt.[Description] ProjectStatus,
    mn.[Description] MilestoneName,
    md.DateReceived MilestoneReachedDate,
    f.descrip FundName,
    fi.FundingAmt
    from
    himsloan.dbo.project p join
    himsloan.dbo.LutProjectStatus stt on p.LutProjectStatusCd=stt.LutProjectStatusCd join
    himsloan.dbo.loan l on p.ProjUniqueID=l.ProjUniqueID join
    himsloan.dbo.loanfund lf on l.LoanID=lf.LoanID join
    himsloan.dbo.FundingInfo fi on lf.FundingInfoID=fi.FundingInfoID join
    himsloan.dbo.LutSourceofFunds sf on fi.LutSourceofFundsCD=sf.LutSourceofFundsCD join
    himsloan.dbo.AssnProjectIDISNo ids on p.ProjUniqueID=ids.Projuniqueid join
    himsloan.dbo.MilestoneDate md on p.ProjUniqueID=md.ProjUniqueID join
    himsloan.dbo.lutmilestonename mn on md.LutMilestoneNameCD=mn.LutMilestoneNameCD join
    fmis_new.dbo.vwFundLine_new fl on fi.FundRevSrcID=fl.FundRevSrcID join
    fmis_new.dbo.fund f on fl.FundCD=f.FundCD
    """

In [3]:
# Run the extract against SQL Server and load the full result into memory.
df = pd.read_sql(sql=query, con=engine)

In [4]:
# Preview the first rows of the freshly queried frame.
df.head()

Unnamed: 0,ProjectNo,ProjName,FullProjectNumber,calendaryear,FundingSource,IDISNo,ProjectStatus,MilestoneName,MilestoneReachedDate,FundName,FundingAmt
0,100008,ESTELA HERNANDEZ,1,2000,New Funds,4848,Open,Purchase Loan Check Date,2000-01-11,HOME INVESTMENT PARTSHIP PROGR,35000.0
1,100011,HIGHLAND VILLAGE,2,2000,New Funds,4850,Closed,Construction Contract Completion,2000-12-29,COMMUNITY DEVELOPMENT TRUST,1800000.0
2,100017,MARIA T. BLANCO,12,2000,New Funds,4862,Open,Purchase Loan Check Date,2000-01-26,HOME INVESTMENT PARTSHIP PROGR,3180.0
3,100018,"PINEDA, CYNTHIA A. - TRUSTEE",38,2000,New Funds,4887,Closed,Construction Completion(Contract vs Actual),2002-05-29,HOME INVESTMENT PARTSHIP PROGR,35000.0
4,100030,"TOWNE SQUARE APTS., A CALIF. LTD. PARTNERSHIP",44,2000,New Funds,4927,Closed,Construction Contract Completion,2001-02-20,HOME INVESTMENT PARTSHIP PROGR,500000.0


In [5]:
# List the column names as returned by the query (aliases included).
df.columns

Index(['ProjectNo', 'ProjName', 'FullProjectNumber', 'calendaryear',
       'FundingSource', 'IDISNo', 'ProjectStatus', 'MilestoneName',
       'MilestoneReachedDate', 'FundName', 'FundingAmt'],
      dtype='object')

In [6]:
# Frequency table for each of the two funding descriptor columns.
for column_name in ('FundingSource', 'FundName'):
    frequencies = df[column_name].value_counts()
    print(frequencies)

New Funds                      25220
Rollover                        1092
Reclass Grant to Loan            372
Reclass Other Asset to Loan      146
Other                             43
Name: FundingSource, dtype: int64
HOME INVESTMENT PARTSHIP PROGR                            15498
COMMUNITY DEVELOPMENT TRUST                                3917
CITY OF LA AFFORDABLE HOUSING                              2859
CALHOME TRUST FUND                                         1230
LAHD SMALL GRANTS&AWARDS                                    794
BEGIN REUSE PROGRAM FUND                                    372
NSP2-ARRA                                                   344
LOCAL HOUSING TRUST FUND                                    312
HOUSING OPPOR FOR PERSON W/AID                              249
RENTAL REHAB PROGRAM FUND                                   188
CRA Initial Transfer 5/2013                                 131
NSP1-HERA                                                   130
MUNICIPAL HOU

In [7]:
# Persist the query result; the "Sample master" section reloads this file
# instead of querying the database again.
df.to_parquet(path='../data/HIMS/sample_master.parquet')

In [9]:
# Stata export. MilestoneReachedDate is written as text -- presumably because
# to_stata cannot serialise the column's native dtype (TODO confirm).
# The conversion is applied to a derived copy rather than to `df` itself, so
# `df` keeps its original date column no matter which cells are (re-)run
# afterwards; in particular, re-running the parquet export can no longer
# persist stringified dates.
(
    df
    .assign(MilestoneReachedDate=lambda frame: frame['MilestoneReachedDate'].astype('str'))
    .to_stata('../data/HIMS/sample_master.dta')
)

## Sample master

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the saved extract from disk so this section does not need database access.
df = pd.read_parquet(path='../data/HIMS/sample_master.parquet')

In [None]:
# Preview the first rows of the frame reloaded from parquet.
df.head()

In [None]:
# Report how many distinct values each project identifier column holds.
for id_column in ('ProjectNo', 'FullProjectNumber', 'IDISNo'):
    distinct_count = df[id_column].nunique()
    print(f'# unique {id_column}: {distinct_count}')

In [None]:
# Inspect the column dtypes of the frame as read back from parquet.
df.dtypes

In [None]:
# Month bucket (pandas Period, monthly frequency) of each milestone date.
# The query returns md.DateReceived under the alias MilestoneReachedDate, so
# that is the column present in the frame -- there is no `DateReceived`
# column. pd.to_datetime makes the `.dt` accessor work even if the dates were
# stored as plain date objects or strings.
df['Date'] = pd.to_datetime(df['MilestoneReachedDate']).dt.to_period('M')

In [None]:
# One row per distinct (project, milestone, milestone date, month) combination.
# The milestone date column is named MilestoneReachedDate in this frame (the
# query's alias for md.DateReceived); selecting 'DateReceived' raises KeyError.
# `obs` is a 1-based running counter of rows within each (ProjectNo, Date)
# group; it is added via assign so no column is set on a derived frame.
df2 = (
    df[['ProjectNo', 'MilestoneName', 'MilestoneReachedDate', 'Date']]
    .drop_duplicates()
    .assign(obs=lambda frame: frame.groupby(['ProjectNo', 'Date']).cumcount() + 1)
)

In [None]:
# Renumber rows 0..n-1; the previous row labels are kept as a new column
# (drop=False is the default, spelled out here for clarity).
df2 = df2.reset_index(drop=False)

In [None]:
# Largest `obs` counter within each (ProjectNo, Date) group, repeated on every
# row of that group. transform() returns a result indexed like df2 itself;
# groupby(...).max()['obs'] is indexed by (ProjectNo, Date) instead, so
# assigning it as a column cannot line up with df2's row labels.
df2['obs_max'] = df2.groupby(['ProjectNo', 'Date'])['obs'].transform('max')

In [None]:
# Show the first 20 rows to eyeball the obs / obs_max counters.
df2.head(20)