In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
import pathlib

In [2]:
# Directory with the project's CSV files: the "CSV files" folder that sits next to
# the current working directory (i.e. inside its parent).
csv_dir = pathlib.Path.cwd().parent.joinpath('CSV files')

# Read the acquisition table, rename its key column to 'loan identifier' (the name the
# performance table uses for the same key), and drop two columns.
acquisition_df = (
    pd.read_csv(csv_dir / 'Final Project Data.csv', sep=",")
    .rename(columns={'Loan Identifier': 'loan identifier'})
    .drop(columns=['Co-borrower Credit Score at Origination', 'Mortgage Insurance Type'])
)
acquisition_df.head()

Unnamed: 0,loan identifier,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),...,First Time Home Buyer Indicator,Loan Purpose,Property Type,Number of Units,Occupancy Type,Property State,Zip Code Short,Primary Mortgage Insurance Percent,Product Type,Relocation Mortgage Indicator
0,100000913397,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,...,N,C,PU,1,P,CA,925,,FRM,N
1,100017539727,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,...,N,P,PU,1,P,TX,770,25.0,FRM,N
2,100018053040,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,...,N,P,SF,1,S,NC,286,25.0,FRM,N
3,100019764317,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,...,Y,P,SF,1,P,IL,600,25.0,FRM,N
4,100019765730,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,...,Y,P,CO,1,P,CA,945,,FRM,N


In [3]:
# NOTE: an earlier export, 'cleaned_Performance.csv', lost most of its observations somewhere
# in the pipeline -- probably when the .txt file was imported into Excel and then converted
# to .csv. The plan is to copy the performance-cleaning code into this notebook so the data
# can go from .txt into mergedData without any .csv conversion in the middle.
# Until that is done, read the full export instead.
#
# "current loan delinquency status" holds numeric codes as well as the letter 'X', so it is
# read explicitly as text. Without a dtype, pandas' default chunked (low_memory) parsing may
# infer integers for some chunks and strings for others; later comparisons against the
# string '0' would then treat an integer 0 as "not equal".
performance_df = pd.read_csv(
    csv_dir / 'cleaned_Performance_full.csv',
    sep=",",
    dtype={"current loan delinquency status": str},
)
performance_df.head()

Unnamed: 0,loan identifier,monthly reporting period,current interest rate,loan age,remaining months to legal maturity,adjusted months to maturity,maturity date,MSA,current loan delinquency status,modification flag
0,100000913397,2019-01-01,5.88,3,357,357.0,2048-10-01,40140,0,N
1,100000913397,2019-02-01,5.88,4,356,356.0,2048-10-01,40140,0,N
2,100000913397,2019-03-01,5.88,5,355,355.0,2048-10-01,40140,0,N
3,100000913397,2019-04-01,5.88,6,354,354.0,2048-10-01,40140,0,N
4,100000913397,2019-05-01,5.88,7,353,353.0,2048-10-01,40140,0,N


In [4]:
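# TODO: unfinished draft for reading the raw, pipe-delimited performance file directly.
# The column-name list below is still a placeholder and must be filled in before enabling this.
#perf_col_names = ['']
#raw_perf_df = pd.read_csv("Performance_2019Q1.csv", sep="|", header=None, names=perf_col_names)
#raw_perf_df.head()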

In [5]:
# Subset of the performance rows whose "current loan delinquency status" is anything other
# than the string '0'. The status is compared as text, so rows with status 'X' are kept too.
# (A bare .unique() call used to precede this; its result was neither displayed nor stored,
# so it has been removed.)
is_nonzero_status = performance_df["current loan delinquency status"] != '0'
delinquency_df = performance_df.loc[is_nonzero_status]
# Caveat: merging the acquisition data against this subset is only a starting point. A loan
# can appear on several rows here, which produces duplicate loan identifiers in the merge,
# and acquisition rows with no performance row in this subset come out with null entries.

In [6]:
# One row per loan: order every report by "monthly reporting period", then keep the last
# row of each "loan identifier" group (i.e. the row with the latest reporting period).
full_perf_df = (
    performance_df
    .sort_values(by="monthly reporting period")
    .groupby("loan identifier")
    .tail(1)
)
full_perf_df.head()

Unnamed: 0,loan identifier,monthly reporting period,current interest rate,loan age,remaining months to legal maturity,adjusted months to maturity,maturity date,MSA,current loan delinquency status,modification flag
1004059,391757295836,2019-02-01,4.88,1,359,0.0,2049-01-01,35620,X,N
209046,160527809683,2019-02-01,5.25,2,358,0.0,2048-12-01,41620,X,N
16118,104765459314,2019-02-01,5.5,2,358,0.0,2048-12-01,31180,X,N
91683,126533834704,2019-02-01,5.63,2,358,0.0,2048-12-01,31080,X,N
15815,104640949538,2019-02-01,5.75,1,359,0.0,2049-01-01,41860,X,N


In [7]:
# Target variable: True when the row's "current loan delinquency status" is anything other
# than the string '0', False otherwise.
# NOTE(review): status 'X' is therefore counted as delinquent -- confirm this is intended.
#
# .assign() returns a new frame rather than writing a column into the frame produced by
# groupby(...).tail(1); writing into that result directly can raise pandas'
# SettingWithCopyWarning. The new column is appended last, exactly as before.
full_perf_df = full_perf_df.assign(
    delinquency=full_perf_df["current loan delinquency status"] != '0'
)
full_perf_df.head()

Unnamed: 0,loan identifier,monthly reporting period,current interest rate,loan age,remaining months to legal maturity,adjusted months to maturity,maturity date,MSA,current loan delinquency status,modification flag,delinquency
1004059,391757295836,2019-02-01,4.88,1,359,0.0,2049-01-01,35620,X,N,True
209046,160527809683,2019-02-01,5.25,2,358,0.0,2048-12-01,41620,X,N,True
16118,104765459314,2019-02-01,5.5,2,358,0.0,2048-12-01,31180,X,N,True
91683,126533834704,2019-02-01,5.63,2,358,0.0,2048-12-01,31080,X,N,True
15815,104640949538,2019-02-01,5.75,1,359,0.0,2049-01-01,41860,X,N,True


In [8]:
# Write the per-loan performance frame (with its delinquency column) to disk;
# the row index is not written.
full_perf_path = csv_dir / 'full_perf_df.csv'
full_perf_df.to_csv(full_perf_path, index=False)

In [9]:
# Attach each loan's performance row to its acquisition row.
# how="left" keeps every acquisition row. Rows with no matching performance record get
# nulls in all performance columns, including the "delinquency" target.
# NOTE(review): the recorded output shows integer performance columns as floats after the
# merge, which suggests such unmatched rows exist -- decide how to treat them before modelling.
# validate="many_to_one" makes pandas raise a MergeError if "loan identifier" is not unique
# in full_perf_df, instead of silently duplicating acquisition rows.
merge_df = acquisition_df.merge(
    full_perf_df,
    on="loan identifier",
    how="left",
    validate="many_to_one",
)
merge_df.head()

Unnamed: 0,loan identifier,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),...,monthly reporting period,current interest rate,loan age,remaining months to legal maturity,adjusted months to maturity,maturity date,MSA,current loan delinquency status,modification flag,delinquency
0,100000913397,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,...,2019-12-01,5.88,14.0,346.0,346.0,2048-10-01,40140.0,0,N,False
1,100017539727,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,...,2019-12-01,4.75,11.0,349.0,349.0,2049-01-01,26420.0,0,N,False
2,100018053040,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,...,2019-12-01,4.88,12.0,348.0,348.0,2048-12-01,0.0,0,N,False
3,100019764317,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,...,2019-10-01,4.88,9.0,351.0,0.0,2049-01-01,16980.0,X,N,True
4,100019765730,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,...,2019-12-01,4.25,8.0,352.0,352.0,2049-04-01,41860.0,0,N,False


In [10]:
# Write the merged acquisition + performance frame to disk; the row index is not written.
merged_path = csv_dir / 'mergedData.csv'
merge_df.to_csv(merged_path, index=False)

In [11]:
#merge_df.isnull().sum()