In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import time
import imp
import os
import os, os.path
import datetime as dt
import dateutil.parser as dp
import dill

%matplotlib inline

In [2]:
# Create lists of data files (3 groups: regular attributes, GEO, and Seller composition)
pools_attributes_files = list()
for root, dirs, files in os.walk("C:/Users/YuriTurygin/Desktop/NN-PPM/data/pools attributes"):
    for name in files:
        pools_attributes_files.append(os.path.join(root, name))        
        
pools_geo_pct_files = list()
for root, dirs, files in os.walk("C:/Users/YuriTurygin/Desktop/NN-PPM/data/geo pct"):
    for name in files:
        pools_geo_pct_files.append(os.path.join(root, name))  
        
pools_seller_pct_files = list()
for root, dirs, files in os.walk("C:/Users/YuriTurygin/Desktop/NN-PPM/data/seller pct"):
    for name in files:
        pools_seller_pct_files.append(os.path.join(root, name)) 
        
print("# of attr files: ",len(pools_attributes_files))
print("# of geo pct files: ",len(pools_geo_pct_files))
print("# of seller pct files: ",len(pools_seller_pct_files))

# of attr files:  121
# of geo pct files:  120
# of seller pct files:  122


In [3]:
# Read in all files with regular pool attributes
attr_df = pd.DataFrame()
for file in pools_attributes_files:
    one_month_issue_pools = pd.read_csv(file)
    if len(attr_df)==0:
        attr_df = one_month_issue_pools
    else:
        # df1.columns.difference(df2.columns)
        # df2.columns.difference(df1.columns)
        if all(attr_df.columns == one_month_issue_pools.columns):
            attr_df = pd.concat([attr_df,one_month_issue_pools])
        else:
            print(f'columns missmatch for {file}')

    #print('Done loading ' + file)

In [4]:
attr_df.columns

Index(['poolno', 'asofdate', 'cusip', 'Prefix', 'spread', 'Cpr1', 'Cpr3',
       'Cpr6', 'Cpr12', 'Cpr24', 'CprLife', 'SMM', 'DayCount', 'OBal', 'CBal',
       'pbal', 'paydown', 'Prepay', 'factor', 'OCoupon', 'Coupon', 'OWac',
       'Wac', 'Wam', 'Age', 'aols', 'waols', 'ONLoans', 'cnloans', 'PCNLoans',
       'PPNLoans', 'OSato', 'CSato', 'oltv', 'cltv', '%cltv_80', '%cltv_105',
       '%cltv_125', '%ccltv_80', '%ccltv_105', '%ccltv_125', 'fico',
       '%FedHold', '%CMOHold', 'dti', 'codti', '%CashWindow', '%Majors',
       'ocltv', 'ccltv', 'PurpPct_purchase', 'PurpPct_refi',
       'PctChannel_Broker', 'PctChannel_Corr', 'PctChannel_Retail',
       'OccPct_investor', 'OccPct_owner', 'PropUnitsPct_2-4', 'Label'],
      dtype='object')

In [5]:
len(attr_df)

10143061

In [6]:
#dill.dump_session('notebook_env_just_attributes.db')

In [7]:
# Read in all files with pools GEO info
geo_df = pd.DataFrame()
for file in pools_geo_pct_files:
    one_month_issue_pools = pd.read_csv(file)
    if len(geo_df)==0:
        geo_df = one_month_issue_pools
    else:
        # df1.columns.difference(df2.columns)
        # df2.columns.difference(df1.columns)
        if all(geo_df.columns == one_month_issue_pools.columns):
            geo_df = pd.concat([geo_df,one_month_issue_pools])
        else:
            print(f'columns missmatch for {file}')

    #print('Done loading ' + file)

In [8]:
geo_df.columns

Index(['poolno', 'asofdate', 'cusip', 'StatePct_AK', 'StatePct_AL',
       'StatePct_AR', 'StatePct_AZ', 'StatePct_CA', 'StatePct_CO',
       'StatePct_CT', 'StatePct_DC', 'StatePct_DE', 'StatePct_FL',
       'StatePct_GA', 'StatePct_GU', 'StatePct_HI', 'StatePct_IA',
       'StatePct_ID', 'StatePct_IL', 'StatePct_IN', 'StatePct_KS',
       'StatePct_KY', 'StatePct_LA', 'StatePct_MA', 'StatePct_MD',
       'StatePct_ME', 'StatePct_MI', 'StatePct_MN', 'StatePct_MO',
       'StatePct_MS', 'StatePct_MT', 'StatePct_NC', 'StatePct_ND',
       'StatePct_NE', 'StatePct_NH', 'StatePct_NJ', 'StatePct_NM',
       'StatePct_NV', 'StatePct_NY', 'StatePct_OH', 'StatePct_OK',
       'StatePct_OR', 'StatePct_PA', 'StatePct_PR', 'StatePct_RI',
       'StatePct_SC', 'StatePct_SD', 'StatePct_TN', 'StatePct_TX',
       'StatePct_UT', 'StatePct_VA', 'StatePct_VI', 'StatePct_VT',
       'StatePct_WA', 'StatePct_WI', 'StatePct_WV', 'StatePct_WY', 'Label'],
      dtype='object')

In [9]:
len(geo_df)

9929896

In [10]:
#dill.dump_session('notebook_env_attr_and_geo.db')

In [11]:
# Read in all files with pools Seller info
seller_df = pd.DataFrame()
for file in pools_seller_pct_files:
    one_month_issue_pools = pd.read_csv(file)
    if len(seller_df)==0:
        seller_df = one_month_issue_pools
    else:
        # df1.columns.difference(df2.columns)
        # df2.columns.difference(df1.columns)
        if all(seller_df.columns == one_month_issue_pools.columns):
            seller_df = pd.concat([seller_df,one_month_issue_pools])
        else:
            print(f'columns missmatch for {file}')
    #print('Done loading ' + file)

In [12]:
seller_df.columns

Index(['poolno', 'asofdate', 'SellerPct_AMRHT', 'SellerPct_ALS',
       'SellerPct_CAFULL', 'SellerPct_CNTL', 'SellerPct_CITIZ', 'SellerPct_53',
       'SellerPct_FIR', 'SellerPct_FRDOM', 'SellerPct_GUILD',
       'SellerPct_CHASE', 'SellerPct_LLSL', 'SellerPct_MATRX', 'SellerPct_NCM',
       'SellerPct_NATIONST', 'SellerPct_NRESM', 'SellerPct_PNYMAC',
       'SellerPct_PILOSI', 'SellerPct_QUICK', 'SellerPct_REG',
       'SellerPct_RMSC', 'SellerPct_UNSHFI', 'SellerPct_WFHM', 'cusip',
       'Prefix', 'Label'],
      dtype='object')

In [13]:
len(seller_df)

9636863

In [14]:
dill.dump_session('notebook_env_all_3_dfs.db')

In [15]:
attr_df.shape

(10143061, 59)

In [25]:
attr_df.drop(['Prefix','Cpr1','Cpr3','Cpr6','Cpr12','Cpr24','CprLife','pbal','paydown','Prepay','OCoupon','OWac','PCNLoans','PPNLoans','OSato','%cltv_80','%cltv_105','%cltv_125','%ccltv_80','%ccltv_105','%ccltv_125','%FedHold','%CMOHold','dti','codti'],axis=1,inplace=True)

In [42]:
attr_df.dropna(subset=['CBal','SMM','Wac','aols','fico','OccPct_investor','OccPct_owner','PropUnitsPct_2-4'],inplace=True)

In [50]:
attr_df.sort_values(by=['poolno','asofdate'],inplace=True)

In [56]:
attr_df.fillna(method='ffill',inplace=True)

In [61]:
attr_df['CBal'] = attr_df['CBal'].apply(lambda x: float(x))

In [64]:
attr_df.rename(columns={'Label':'pool_issue_month'},inplace=True)

In [65]:
attr_df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10113549 entries, 1482 to 164706
Data columns (total 34 columns):
 #   Column             Non-Null Count     Dtype  
---  ------             --------------     -----  
 0   poolno             10113549 non-null  object 
 1   asofdate           10113549 non-null  int64  
 2   cusip              10113549 non-null  object 
 3   spread             10113549 non-null  float64
 4   SMM                10113549 non-null  float64
 5   DayCount           10113549 non-null  float64
 6   OBal               10113549 non-null  float64
 7   CBal               10113549 non-null  float64
 8   factor             10113549 non-null  float64
 9   Coupon             10113549 non-null  float64
 10  Wac                10113549 non-null  float64
 11  Wam                10113549 non-null  float64
 12  Age                10113549 non-null  float64
 13  aols               10113549 non-null  float64
 14  waols              10113549 non-null  float64
 15  ONLoans     

In [66]:
geo_df.sort_values(by=['poolno','asofdate'],inplace=True)

In [67]:
geo_df.fillna(method='ffill',inplace=True)

In [71]:
geo_pct = geo_df[['StatePct_AK','StatePct_AL','StatePct_AR','StatePct_AZ','StatePct_CA','StatePct_CO','StatePct_CT',
                    'StatePct_DC','StatePct_DE','StatePct_FL','StatePct_GA','StatePct_GU','StatePct_HI','StatePct_IA',
                    'StatePct_ID','StatePct_IL','StatePct_IN','StatePct_KS','StatePct_KY','StatePct_LA','StatePct_MA',
                    'StatePct_MD','StatePct_ME','StatePct_MI','StatePct_MN','StatePct_MO','StatePct_MS','StatePct_MT',
                    'StatePct_NC','StatePct_ND','StatePct_NE','StatePct_NH','StatePct_NJ','StatePct_NM','StatePct_NV',
                    'StatePct_NY','StatePct_OH','StatePct_OK','StatePct_OR','StatePct_PA','StatePct_PR','StatePct_RI',
                    'StatePct_SC','StatePct_SD','StatePct_TN','StatePct_TX','StatePct_UT','StatePct_VA','StatePct_VI',
                    'StatePct_VT','StatePct_WA','StatePct_WI','StatePct_WV','StatePct_WY']]

geo_pct = geo_pct.applymap(lambda x: float(x))

ValueError: could not convert string to float: 

In [None]:
geo_df.info(verbose=True, null_counts=True)