In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import time
import imp
import os
import os, os.path
import datetime as dt
import dateutil.parser as dp
import dill

%matplotlib inline

In [2]:
# Create lists of data files (3 groups: regular attributes, GEO, and Seller composition)

data_directory = "../data/fnm/30yr fixed"


def find_files_by_marker(directory, markers):
    """Walk `directory` once and bucket file paths by the marker they contain.

    Parameters
    ----------
    directory : str
        Root folder searched recursively with os.walk.
    markers : iterable of str
        Substrings looked for in the full path (root joined with file name).

    Returns
    -------
    dict
        Maps every marker to the list of matching paths, in os.walk order.
        A path that contains several markers is listed under each of them.
    """
    matches = {marker: [] for marker in markers}
    for root, dirs, files in os.walk(directory):
        for name in files:
            file_name = os.path.join(root, name)
            for marker, bucket in matches.items():
                # The test is on the whole path, so a marker in a folder name also counts.
                if marker in file_name:
                    bucket.append(file_name)
    return matches


# A single traversal serves all three groups instead of walking the tree three times.
files_by_marker = find_files_by_marker(
    data_directory, ('pools attributes', 'geo pct', 'seller pct'))

# pools attributes
pools_attributes_files = files_by_marker['pools attributes']

# pools geographical composition
pools_geo_pct_files = files_by_marker['geo pct']

# pools seller composition
pools_seller_pct_files = files_by_marker['seller pct']

print("# of attr files: ",len(pools_attributes_files))
print("# of geo pct files: ",len(pools_geo_pct_files))
print("# of seller pct files: ",len(pools_seller_pct_files))

# of attr files:  262
# of geo pct files:  262
# of seller pct files:  262


In [4]:
# Read in all files with regular pool attributes
#
# Frames are gathered in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
attr_frames = []
attr_rows_loaded = 0
for file in pools_attributes_files:
    one_month_issue_pools = pd.read_csv(file)
    if attr_rows_loaded == 0:
        # Nothing with rows has been kept so far: this file sets the reference layout.
        attr_frames = [one_month_issue_pools]
        attr_rows_loaded = len(one_month_issue_pools)
    elif attr_frames[0].columns.equals(one_month_issue_pools.columns):
        attr_frames.append(one_month_issue_pools)
        attr_rows_loaded += len(one_month_issue_pools)
    else:
        # Index.equals also copes with a different number of columns, where an
        # element-wise `==` comparison would raise instead of reporting the file.
        print(f'columns missmatch for {file}')

# No input files leaves an empty frame, as before.
attr_df = pd.concat(attr_frames) if attr_frames else pd.DataFrame()

In [5]:
# Display the column labels of the combined pool-attribute frame.
attr_df.columns

Index(['poolno', 'asofdate', 'cusip', 'Prefix', 'spread', 'Cpr1', 'Cpr3',
       'Cpr6', 'Cpr12', 'SMM', 'DayCount', 'OBal', 'CBal', 'pbal', 'paydown',
       'Prepay', 'factor', 'OCoupon', 'Coupon', 'OWac', 'Wac', 'Wam', 'Age',
       'aols', 'waols', 'wtclnsz', 'wtolnsz', 'CLnsz', 'OLnsz', 'ONLoans',
       'cnloans', 'PCNLoans', 'PPNLoans', 'OSato', 'CSato', 'oltv', 'cltv',
       'ocltv', 'ccltv', 'fico', 'dti', 'codti', '%CashWindow', '%Majors',
       'PurpPct_purchase', 'PurpPct_refi', 'PctChannel_Broker',
       'PctChannel_Corr', 'PctChannel_Retail', 'OccPct_owner',
       'OccPct_2ndHome', 'OccPct_investor', 'PropUnitsPct_2-4', 'burnout',
       'wac_min', 'wac_qtl1', 'wac_qtl3', 'wac_max', 'ofico_min', 'ofico_qtl1',
       'ofico_qtl3', 'ofico_max', 'oltv_min', 'oltv_qtl1', 'oltv_qtl3',
       'oltv_max', 'aols_min', 'aols_qtl1', 'aols_qtl3', 'aols_max', 'hpa3m',
       'hpa1', 'hpa5', 'hpalife', 'hpa3m_po', 'hpa1_po', 'hpa5_po',
       'hpalife_po', 'Label'],
      dtype='object')

In [6]:
# Report (rows, columns) of the combined pool-attribute frame.
print('attr_df.shape = {}'.format(attr_df.shape))

attr_df.shape = (12070738, 79)


In [7]:
# Read in all files with pools GEO info
#
# Frames are gathered in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
geo_frames = []
geo_rows_loaded = 0
for file in pools_geo_pct_files:
    one_month_issue_pools = pd.read_csv(file)
    if geo_rows_loaded == 0:
        # Nothing with rows has been kept so far: this file sets the reference layout.
        geo_frames = [one_month_issue_pools]
        geo_rows_loaded = len(one_month_issue_pools)
    elif geo_frames[0].columns.equals(one_month_issue_pools.columns):
        geo_frames.append(one_month_issue_pools)
        geo_rows_loaded += len(one_month_issue_pools)
    else:
        # Index.equals also copes with a different number of columns, where an
        # element-wise `==` comparison would raise instead of reporting the file.
        print(f'columns missmatch for {file}')

# No input files leaves an empty frame, as before.
geo_df = pd.concat(geo_frames) if geo_frames else pd.DataFrame()

In [8]:
# Display the column labels of the combined geographic-composition frame.
geo_df.columns

Index(['poolno', 'asofdate', 'cusip', 'StatePct_AK', 'StatePct_AL',
       'StatePct_AR', 'StatePct_AZ', 'StatePct_CA', 'StatePct_CO',
       'StatePct_CT', 'StatePct_DC', 'StatePct_DE', 'StatePct_FL',
       'StatePct_GA', 'StatePct_GU', 'StatePct_HI', 'StatePct_IA',
       'StatePct_ID', 'StatePct_IL', 'StatePct_IN', 'StatePct_KS',
       'StatePct_KY', 'StatePct_LA', 'StatePct_MA', 'StatePct_MD',
       'StatePct_ME', 'StatePct_MI', 'StatePct_MN', 'StatePct_MO',
       'StatePct_MS', 'StatePct_MT', 'StatePct_NC', 'StatePct_ND',
       'StatePct_NE', 'StatePct_NH', 'StatePct_NJ', 'StatePct_NM',
       'StatePct_NV', 'StatePct_NY', 'StatePct_OH', 'StatePct_OK',
       'StatePct_OR', 'StatePct_PA', 'StatePct_PR', 'StatePct_RI',
       'StatePct_SC', 'StatePct_SD', 'StatePct_TN', 'StatePct_TX',
       'StatePct_UT', 'StatePct_VA', 'StatePct_VI', 'StatePct_VT',
       'StatePct_WA', 'StatePct_WI', 'StatePct_WV', 'StatePct_WY', 'Label'],
      dtype='object')

In [9]:
# Report (rows, columns) of the combined geographic-composition frame.
print('geo_df.shape = {}'.format(geo_df.shape))

geo_df.shape = (11957560, 58)


In [10]:
# Read in all files with pools Seller info
#
# Frames are gathered in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
seller_frames = []
seller_rows_loaded = 0
for file in pools_seller_pct_files:
    one_month_issue_pools = pd.read_csv(file)
    if seller_rows_loaded == 0:
        # Nothing with rows has been kept so far: this file sets the reference layout.
        seller_frames = [one_month_issue_pools]
        seller_rows_loaded = len(one_month_issue_pools)
    elif seller_frames[0].columns.equals(one_month_issue_pools.columns):
        seller_frames.append(one_month_issue_pools)
        seller_rows_loaded += len(one_month_issue_pools)
    else:
        # Index.equals also copes with a different number of columns, where an
        # element-wise `==` comparison would raise instead of reporting the file.
        print(f'columns missmatch for {file}')

# No input files leaves an empty frame, as before.
seller_df = pd.concat(seller_frames) if seller_frames else pd.DataFrame()

In [11]:
# Display the column labels of the combined seller-composition frame.
seller_df.columns

Index(['poolno', 'asofdate', 'SellerPct_AMRHT', 'SellerPct_ALS',
       'SellerPct_CAFULL', 'SellerPct_CNTL', 'SellerPct_CITIZ', 'SellerPct_53',
       'SellerPct_FIR', 'SellerPct_FRDOM', 'SellerPct_GUILD',
       'SellerPct_CHASE', 'SellerPct_LLSL', 'SellerPct_MATRX', 'SellerPct_NCM',
       'SellerPct_NATIONST', 'SellerPct_NRESM', 'SellerPct_PNYMAC',
       'SellerPct_PILOSI', 'SellerPct_QUICK', 'SellerPct_REG',
       'SellerPct_RMSC', 'SellerPct_UNSHFI', 'SellerPct_WFHM', 'cusip',
       'Prefix', 'Label'],
      dtype='object')

In [12]:
# Report (rows, columns) of the combined seller-composition frame.
print('seller_df.shape = {}'.format(seller_df.shape))

seller_df.shape = (7443443, 27)


# Clean data and fill NAs

### pool attributes

In [15]:
# Print dtype and non-null count for every column of attr_df.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
attr_df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12070738 entries, 0 to 2919
Data columns (total 79 columns):
 #   Column             Non-Null Count     Dtype  
---  ------             --------------     -----  
 0   poolno             12070738 non-null  object 
 1   asofdate           12070738 non-null  int64  
 2   cusip              12070738 non-null  object 
 3   Prefix             12070738 non-null  object 
 4   spread             12066040 non-null  float64
 5   Cpr1               12066612 non-null  float64
 6   Cpr3               11709965 non-null  float64
 7   Cpr6               11182949 non-null  float64
 8   Cpr12              10171574 non-null  float64
 9   SMM                12069813 non-null  float64
 10  DayCount           12070738 non-null  int64  
 11  OBal               12070738 non-null  int64  
 12  CBal               12066040 non-null  float64
 13  pbal               12070738 non-null  float64
 14  paydown            12070738 non-null  float64
 15  Prepay           

In [16]:
# Display the attribute column labels again, as a reference for the clean-up steps that follow.
attr_df.columns

Index(['poolno', 'asofdate', 'cusip', 'Prefix', 'spread', 'Cpr1', 'Cpr3',
       'Cpr6', 'Cpr12', 'SMM', 'DayCount', 'OBal', 'CBal', 'pbal', 'paydown',
       'Prepay', 'factor', 'OCoupon', 'Coupon', 'OWac', 'Wac', 'Wam', 'Age',
       'aols', 'waols', 'wtclnsz', 'wtolnsz', 'CLnsz', 'OLnsz', 'ONLoans',
       'cnloans', 'PCNLoans', 'PPNLoans', 'OSato', 'CSato', 'oltv', 'cltv',
       'ocltv', 'ccltv', 'fico', 'dti', 'codti', '%CashWindow', '%Majors',
       'PurpPct_purchase', 'PurpPct_refi', 'PctChannel_Broker',
       'PctChannel_Corr', 'PctChannel_Retail', 'OccPct_owner',
       'OccPct_2ndHome', 'OccPct_investor', 'PropUnitsPct_2-4', 'burnout',
       'wac_min', 'wac_qtl1', 'wac_qtl3', 'wac_max', 'ofico_min', 'ofico_qtl1',
       'ofico_qtl3', 'ofico_max', 'oltv_min', 'oltv_qtl1', 'oltv_qtl3',
       'oltv_max', 'aols_min', 'aols_qtl1', 'aols_qtl3', 'aols_max', 'hpa3m',
       'hpa1', 'hpa5', 'hpalife', 'hpa3m_po', 'hpa1_po', 'hpa5_po',
       'hpalife_po', 'Label'],
      dtype='object')

In [None]:
# Scratch list of attribute column names kept as a named, valid Python list.
# The original bare tuple with indented continuation lines did not parse
# (IndentationError), which stopped a Restart & Run All at this cell.
# NOTE(review): these names also appear in the commented-out attr_df.drop call,
# so they are presumably drop candidates — confirm before using the list.
attr_drop_candidates = [
    'Cpr1', 'Cpr3',
    'Cpr6', 'Cpr12',
    'pbal', 'paydown',
    'Prepay',
    'OCoupon',
    'OWac',
    'PCNLoans', 'PPNLoans',
    'OSato',
    'dti', 'codti',
]



In [None]:
# attr_df.drop(['Cpr1','Cpr3','Cpr6','Cpr12','Cpr24','CprLife','pbal','paydown','Prepay','OCoupon','OWac',
#               'PCNLoans','PPNLoans','OSato','%cltv_80','%cltv_105','%cltv_125','%ccltv_80','%ccltv_105','%ccltv_125',
#               '%FedHold','%CMOHold','dti','codti'],axis=1,inplace=True)

In [None]:
# Drop every row that has a missing value in any of the listed columns.
attr_required_columns = ['CBal','SMM','Wac','aols','fico','OccPct_investor','OccPct_owner','PropUnitsPct_2-4']
attr_df = attr_df.dropna(subset=attr_required_columns)

In [None]:
# Order the attribute rows by pool number, then by as-of date.
attr_df = attr_df.sort_values(by=['poolno','asofdate'])

In [None]:
# Forward-fill missing values within each pool only.
# A frame-wide forward fill on rows ordered by poolno/asofdate lets the first
# records of one pool inherit values from the last record of the previous pool.
attr_filled = attr_df.groupby('poolno', sort=False).ffill()
# Depending on the pandas version the result may or may not carry the grouping
# column, so write back exactly the columns that were returned.
attr_df[attr_filled.columns] = attr_filled

In [None]:
# Cast CBal to float with a single vectorised conversion instead of calling
# float() once per row through apply().
attr_df['CBal'] = attr_df['CBal'].astype(float)

In [None]:
# Give the 'Label' column the name 'pool_issue_month'.
attr_df = attr_df.rename(columns={'Label': 'pool_issue_month'})

In [None]:
# Re-inspect dtypes and non-null counts of attr_df after the clean-up cells above.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
attr_df.info(verbose=True, null_counts=True)

In [None]:
# Remove the 'Label' column from the geographic frame.
geo_df = geo_df.drop(columns='Label')

In [None]:
# Order the geographic rows by pool number, then by as-of date.
geo_df = geo_df.sort_values(by=['poolno','asofdate'])

In [None]:
# Forward-fill missing values within each pool only.
# A frame-wide forward fill on rows ordered by poolno/asofdate lets the first
# records of one pool inherit state percentages from the previous pool.
# NOTE(review): rows whose poolno is missing belong to no group and are not
# filled — confirm poolno is always populated in the geo files.
geo_filled = geo_df.groupby('poolno', sort=False).ffill()
# Depending on the pandas version the result may or may not carry the grouping
# column, so write back exactly the columns that were returned.
geo_df[geo_filled.columns] = geo_filled

In [None]:
# Print dtype and non-null count for every column of geo_df.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
geo_df.info(verbose=True, null_counts=True)

In [None]:
# Two-letter codes, in the same order as the StatePct_* columns of the geo files.
state_codes = (
    'AK AL AR AZ CA CO CT DC DE FL GA GU HI IA '
    'ID IL IN KS KY LA MA MD ME MI MN MO MS MT '
    'NC ND NE NH NJ NM NV NY OH OK OR PA PR RI '
    'SC SD TN TX UT VA VI VT WA WI WV WY'
).split()

# Column names holding the per-state percentages.
geo_columns = ['StatePct_' + code for code in state_codes]

In [None]:
def isfloat(value):
    """Return True when `value` can be converted with float(), otherwise False.

    Parameters
    ----------
    value : object
        Anything; strings, numbers, None and containers are all accepted.

    Returns
    -------
    bool
        True if float(value) succeeds. Note that strings such as 'nan' and
        'inf' convert successfully and therefore count as floats.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # float() raises TypeError for None and containers, ValueError for
        # unparsable strings; both simply mean "not a float" here.
        return False
    return True

In [None]:
# Convert the state-percentage columns to float; unparsable entries become NaN.
# pd.to_numeric works a whole column at a time, avoiding one Python-level call
# per cell, and also turns None into NaN instead of raising.
# NOTE(review): its parsing rules may differ from float() for unusual literals
# (e.g. digit-group underscores) — confirm against the raw files if such values occur.
geo_df[geo_columns] = geo_df[geo_columns].apply(pd.to_numeric, errors='coerce').astype(float)

In [None]:
# Replace every remaining missing value in the geographic frame with 0.
geo_df = geo_df.fillna(0)

In [None]:
# Re-inspect dtypes and non-null counts of geo_df after conversion and zero-filling.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
geo_df.info(verbose=True, null_counts=True)

In [None]:
# Print dtype and non-null count for every column of seller_df.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
seller_df.info(verbose=True, null_counts=True)

In [None]:
# Remove the 'Label' and 'Prefix' columns from the seller frame.
seller_df = seller_df.drop(columns=['Label','Prefix'])

In [None]:
# Order the seller rows by pool number, then by as-of date.
seller_df = seller_df.sort_values(by=['poolno','asofdate'])

In [None]:
# Forward-fill missing values within each pool only.
# A frame-wide forward fill on rows ordered by poolno/asofdate lets the first
# records of one pool inherit seller percentages from the previous pool.
# NOTE(review): rows whose poolno is missing belong to no group and are not
# filled — confirm poolno is always populated in the seller files.
seller_filled = seller_df.groupby('poolno', sort=False).ffill()
# Depending on the pandas version the result may or may not carry the grouping
# column, so write back exactly the columns that were returned.
seller_df[seller_filled.columns] = seller_filled

In [None]:
# Seller codes, in the same order as the SellerPct_* columns of the seller files.
seller_codes = [
    'AMRHT', 'ALS', 'CAFULL', 'CNTL', 'CITIZ', '53', 'FIR', 'FRDOM',
    'GUILD', 'CHASE', 'LLSL', 'MATRX', 'NCM', 'NATIONST', 'NRESM',
    'PNYMAC', 'PILOSI', 'QUICK', 'REG', 'RMSC', 'UNSHFI', 'WFHM',
]

# Column names holding the per-seller percentages.
seller_columns = [f'SellerPct_{code}' for code in seller_codes]

In [None]:
# Convert the seller-percentage columns to float; unparsable entries become NaN.
# pd.to_numeric works a whole column at a time, avoiding one Python-level call
# per cell, and also turns None into NaN instead of raising.
# NOTE(review): its parsing rules may differ from float() for unusual literals
# (e.g. digit-group underscores) — confirm against the raw files if such values occur.
seller_df[seller_columns] = seller_df[seller_columns].apply(pd.to_numeric, errors='coerce').astype(float)

In [None]:
# Replace every remaining missing value in the seller frame with 0.
seller_df = seller_df.fillna(0)

In [None]:
# Re-inspect dtypes and non-null counts of seller_df after conversion and zero-filling.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
seller_df.info(verbose=True, null_counts=True)

# Below we are combining the three DataFrames (attr_df, geo_df, seller_df) into a single DataFrame called df

In [None]:
# Report the shape of each of the three frames before they are merged.
for shape_label, frame in (
    ('attr_df.shape', attr_df),
    ('geo_df.shape', geo_df),
    ('seller_df.shape', seller_df),
):
    # Left-pad the label to 15 characters so the '=' signs line up.
    print(f'{shape_label:<15} = {frame.shape}')

In [None]:
# Remove 'poolno' from the geographic frame; the merge below joins on cusip and asofdate.
geo_df = geo_df.drop(columns='poolno')

In [None]:
# Left-join the geographic columns onto the attribute rows by (cusip, asofdate);
# every attribute row is kept even when no geographic row matches.
merge_keys = ['cusip','asofdate']
df = attr_df.merge(geo_df, how='left', on=merge_keys)

In [None]:
# Print dtype and non-null count for every column of the merged frame.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
df.info(verbose=True, null_counts=True)

In [None]:
# Remove 'poolno' from the seller frame; the merge below joins on cusip and asofdate.
seller_df = seller_df.drop(columns='poolno')

In [None]:
# Left-join the seller columns onto the combined frame by (cusip, asofdate);
# every existing row is kept even when no seller row matches.
seller_merge_keys = ['cusip','asofdate']
df = df.merge(seller_df, how='left', on=seller_merge_keys)

In [None]:
# Replace every missing value left in the merged frame with 0.
df = df.fillna(0)

In [None]:
# Final inspection of dtypes and non-null counts of the merged frame.
# NOTE(review): `null_counts` is the older spelling of `show_counts` in DataFrame.info;
# newer pandas releases only accept `show_counts` — confirm the installed version before switching.
df.info(verbose=True, null_counts=True)

In [None]:
# Report (rows, columns) of the merged frame.
print('df.shape = {}'.format(df.shape))

In [None]:
# Persist the merged frame as a pickle file one directory above the notebook.
df.to_pickle('../jumbos_30yr.pkl')

In [None]:
# Write df into a sequence of small files

#out_dir = '../30y fixed rate data/fnm/conforming/clean data'
out_dir = '../30y fixed rate data/fnm/jumbos/clean data'

# Maximum number of data rows per output CSV.
rows_per_file = 30000

print(f'df.shape = {df.shape}')

rows = df.shape[0]

# Ceiling division: `rows//30000 + 1` produced an extra header-only file whenever
# rows was an exact multiple of the chunk size. An empty frame still yields a
# single header-only file, as before.
N = max(1, -(-rows // rows_per_file))

# Create the target folder if it is missing so the first to_csv call cannot fail on it.
os.makedirs(out_dir, exist_ok=True)

for i in range(N):
    chunk_start = i * rows_per_file
    df.iloc[chunk_start:chunk_start + rows_per_file].to_csv(f'{out_dir}/pools_part{i+1}.csv',index=False)