In [1]:
# Import the required dependencies.
import psycopg2
import pandas as pd
from Config import db_password_cloud
from sqlalchemy import create_engine, func 
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import extract

### *Connection to the database.*

In [2]:
# Database setup: build the SQLAlchemy engine for the team's PostgreSQL database.
# The password is read from the local Config module rather than written here.
# NOTE(review): the password is interpolated into the URL verbatim; if it can
# contain characters such as '@', ':' or '/', it must be URL-encoded first — confirm.
engine = create_engine(
    f'postgresql://team7:{db_password_cloud}'
    '@team7.cpliq65f81hf.ca-central-1.rds.amazonaws.com:5432/modelingPD'
)

In [3]:
# Display the engine's repr to confirm the connection URL (the password is shown masked).
engine

Engine(postgresql://team7:***@team7.cpliq65f81hf.ca-central-1.rds.amazonaws.com:5432/modelingPD)

In [4]:
# Reflect the existing database schema into automap-generated model classes.
Base = automap_base()
# SQLAlchemy 1.4+ reflects via `autoload_with`; the older `reflect=True` form
# was deprecated in 1.4 and removed in 2.0. Fall back to it only when the
# installed version predates `autoload_with` (it rejects the keyword with a
# TypeError).
try:
    Base.prepare(autoload_with=engine)
except TypeError:
    Base.prepare(engine, reflect=True)

In [5]:
# List the names of all the classes that automap generated from the reflected schema.
Base.classes.keys()

['merged', 'acquisition']

In [6]:
# Print a confirmation message. The message is unconditional: this cell does
# not itself test the connection.
print('Database opened successfully')

Database opened successfully


In [7]:
# Open an ORM session bound to the engine (the link from Python to the DB).
session = Session(bind=engine)

In [8]:
# Read every row of the `merged` table (the merged acquisition and performance
# data) into a dataframe, then preview the first 20 rows.
data_df = pd.read_sql(
    sql='SELECT * FROM merged',
    con=engine,
)
data_df.head(20)

Unnamed: 0,loan_identifier,origination_channel,seller_name,original_interest_rate,original_upb,original_loan_term,origination_date,first_payment_date,original_ltv,original_cltv,...,relocation_mortgage_indicator,monthly_reporting_period,current_interest_rate,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,current_loan_deliquency_status,modification_flag


In [9]:
# Release all database resources once the dataframe has been created.
# session.close() only releases connections held by the Session; the read_sql
# call above went through the Engine's own connection pool, so dispose of that
# pool as well to actually close the pooled connection(s) to the server.
session.close()
engine.dispose()

### *Analysis of the mortgage (performance and acquisition) data.*

In [15]:
# Load the merged mortgage data (acquisition + performance) into a pandas dataframe.
file_to_load = 'https://media.githubusercontent.com/media/Azaima-Asghar/Modeling-probability-of-default/TriangleWeek2/mergedData.csv'
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning and
# columns that end up holding a mix of numbers and strings.
mortgage_df = pd.read_csv(file_to_load, low_memory=False)
mortgage_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,loan identifier,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),...,Relocation Mortgage Indicator,monthly reporting period,current interest rate,loan age,remaining months to legal maturity,adjusted months to maturity,maturity date,MSA,current loan delinquency status,modification flag
0,100000913397,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,...,N,,,,,,,,,
1,100017539727,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,...,N,,,,,,,,,
2,100018053040,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,...,N,,,,,,,,,
3,100019764317,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,...,N,2019-10-01,4.88,9.0,351.0,0.0,2049-01-01,16980.0,X,N
4,100019765730,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,...,N,,,,,,,,,


In [11]:
# Get the shape of the merged data as a (rows, columns) tuple.
mortgage_df.shape

(301807, 32)

In [12]:
# Count the non-null values in each column (columns with fewer than the total
# row count contain missing values).
mortgage_df.count()

loan identifier                           301807
Origination Channel                       301807
Seller Name                               301807
Original Interest Rate                    301807
Original UPB                              301807
Original Loan Term                        301807
Origination Date                          301807
First Payment Date                        301807
Original Loan-to-value (LTV)              301807
Original Combined Loan-to-value (CLTV)    301807
Number of Borrowers                       301807
Original Debt to Income Ratio             301710
Borrower Credit Score at Origination      301420
First Time Home Buyer Indicator           301807
Loan Purpose                              301807
Property Type                             301807
Number of Units                           301807
Occupancy Type                            301807
Property State                            301807
Zip Code Short                            301807
Primary Mortgage Ins

In [22]:
# Get the names of all the columns in the mortgage dataframe.
# Note: some names carry a trailing space (e.g. 'Loan Purpose ' and
# 'Primary Mortgage Insurance Percent '), so they must be referenced exactly.
mortgage_df.columns

Index(['loan identifier', 'Origination Channel', 'Seller Name',
       'Original Interest Rate', 'Original UPB', 'Original Loan Term',
       'Origination Date', 'First Payment Date',
       'Original Loan-to-value (LTV)',
       'Original Combined Loan-to-value (CLTV)', 'Number of Borrowers',
       'Original Debt to Income Ratio', 'Borrower Credit Score at Origination',
       'First Time Home Buyer Indicator', 'Loan Purpose ', 'Property Type',
       'Number of Units', 'Occupancy Type', 'Property State', 'Zip Code Short',
       'Primary Mortgage Insurance Percent ', 'Product Type',
       'Relocation Mortgage Indicator', 'monthly reporting period',
       'current interest rate', 'loan age',
       'remaining months to legal maturity', 'adjusted months to maturity',
       'maturity date', 'MSA', 'current loan delinquency status',
       'modification flag'],
      dtype='object')

In [13]:
# Count the number of null (missing) values in each column.
mortgage_df.isnull().sum()

loan identifier                                0
Origination Channel                            0
Seller Name                                    0
Original Interest Rate                         0
Original UPB                                   0
Original Loan Term                             0
Origination Date                               0
First Payment Date                             0
Original Loan-to-value (LTV)                   0
Original Combined Loan-to-value (CLTV)         0
Number of Borrowers                            0
Original Debt to Income Ratio                 97
Borrower Credit Score at Origination         387
First Time Home Buyer Indicator                0
Loan Purpose                                   0
Property Type                                  0
Number of Units                                0
Occupancy Type                                 0
Property State                                 0
Zip Code Short                                 0
Primary Mortgage Ins

In [24]:
# Drop the mortgage-insurance column and the monthly performance columns.
# Not every column containing nulls is removed here: 'Original Debt to Income
# Ratio' and 'Borrower Credit Score at Origination' are kept.
# 'Primary Mortgage Insurance Percent ' has a trailing space in its name.
mortgage_data = mortgage_df.drop(
    columns=[
        'Primary Mortgage Insurance Percent ',
        'monthly reporting period',
        'current interest rate',
        'loan age',
        'remaining months to legal maturity',
        'adjusted months to maturity',
        'maturity date',
        'MSA',
        'current loan delinquency status',
        'modification flag',
    ]
)
mortgage_data.head()

Unnamed: 0,loan identifier,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),...,Borrower Credit Score at Origination,First Time Home Buyer Indicator,Loan Purpose,Property Type,Number of Units,Occupancy Type,Property State,Zip Code Short,Product Type,Relocation Mortgage Indicator
0,100000913397,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,...,692.0,N,C,PU,1,P,CA,925,FRM,N
1,100017539727,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,...,722.0,N,P,PU,1,P,TX,770,FRM,N
2,100018053040,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,...,728.0,N,P,SF,1,S,NC,286,FRM,N
3,100019764317,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,...,730.0,Y,P,SF,1,P,IL,600,FRM,N
4,100019765730,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,...,727.0,Y,P,CO,1,P,CA,945,FRM,N


In [30]:
# Check the shape (rows, columns) of the new dataframe after dropping columns.
mortgage_data.shape

(301807, 22)

In [31]:
# Count the null values remaining in each column of the new dataframe.
mortgage_data.isnull().sum()

loan identifier                             0
Origination Channel                         0
Seller Name                                 0
Original Interest Rate                      0
Original UPB                                0
Original Loan Term                          0
Origination Date                            0
First Payment Date                          0
Original Loan-to-value (LTV)                0
Original Combined Loan-to-value (CLTV)      0
Number of Borrowers                         0
Original Debt to Income Ratio              97
Borrower Credit Score at Origination      387
First Time Home Buyer Indicator             0
Loan Purpose                                0
Property Type                               0
Number of Units                             0
Occupancy Type                              0
Property State                              0
Zip Code Short                              0
Product Type                                0
Relocation Mortgage Indicator     

In [32]:
# Remove every row that has a null in at least one column, keeping the result
# under a new name so the un-cleaned dataframe stays available.
clean_mortgage_data = mortgage_data.dropna(axis=0, how='any')
clean_mortgage_data.head()

Unnamed: 0,loan identifier,Origination Channel,Seller Name,Original Interest Rate,Original UPB,Original Loan Term,Origination Date,First Payment Date,Original Loan-to-value (LTV),Original Combined Loan-to-value (CLTV),...,Borrower Credit Score at Origination,First Time Home Buyer Indicator,Loan Purpose,Property Type,Number of Units,Occupancy Type,Property State,Zip Code Short,Product Type,Relocation Mortgage Indicator
0,100000913397,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",6,324000,360,2018-09-01,2018-11-01,80,80,...,692.0,N,C,PU,1,P,CA,925,FRM,N
1,100017539727,B,OTHER,5,307000,360,2018-12-01,2019-02-01,90,90,...,722.0,N,P,PU,1,P,TX,770,FRM,N
2,100018053040,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",5,256000,360,2018-11-01,2019-01-01,90,90,...,728.0,N,P,SF,1,S,NC,286,FRM,N
3,100019764317,C,"WELLS FARGO BANK, N.A.",5,248000,360,2018-12-01,2019-02-01,90,90,...,730.0,Y,P,SF,1,P,IL,600,FRM,N
4,100019765730,B,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",4,490000,360,2019-03-01,2019-05-01,67,67,...,727.0,Y,P,CO,1,P,CA,945,FRM,N


In [33]:
# Verify that no null values remain in any column of the clean dataframe.
clean_mortgage_data.isnull().sum()

loan identifier                           0
Origination Channel                       0
Seller Name                               0
Original Interest Rate                    0
Original UPB                              0
Original Loan Term                        0
Origination Date                          0
First Payment Date                        0
Original Loan-to-value (LTV)              0
Original Combined Loan-to-value (CLTV)    0
Number of Borrowers                       0
Original Debt to Income Ratio             0
Borrower Credit Score at Origination      0
First Time Home Buyer Indicator           0
Loan Purpose                              0
Property Type                             0
Number of Units                           0
Occupancy Type                            0
Property State                            0
Zip Code Short                            0
Product Type                              0
Relocation Mortgage Indicator             0
dtype: int64

In [34]:
# Count the non-null values in each column of the clean dataframe.
clean_mortgage_data.count()

loan identifier                           301323
Origination Channel                       301323
Seller Name                               301323
Original Interest Rate                    301323
Original UPB                              301323
Original Loan Term                        301323
Origination Date                          301323
First Payment Date                        301323
Original Loan-to-value (LTV)              301323
Original Combined Loan-to-value (CLTV)    301323
Number of Borrowers                       301323
Original Debt to Income Ratio             301323
Borrower Credit Score at Origination      301323
First Time Home Buyer Indicator           301323
Loan Purpose                              301323
Property Type                             301323
Number of Units                           301323
Occupancy Type                            301323
Property State                            301323
Zip Code Short                            301323
Product Type        