In [1]:
from pyspark.sql import SparkSession
import getpass
# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
# The warehouse directory is derived from the login name of the user running
# the kernel, so the path is not tied to a single hard-coded account.
username = getpass.getuser()
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [9]:
# Define (or redefine) the customers_loan_v view. It starts from the customers
# table and LEFT JOINs loans, loans_repayments, loans_defaulters_delinq and
# loans_defaulters_detail_rec_enq, so customers without a match in those
# tables are still returned with NULLs in the corresponding columns.
create_view_sql = """CREATE OR REPLACE view itv010698_lending_club_project.customers_loan_v AS
  SELECT
    l.loan_id,
    c.member_id,
    c.emp_title,
    c.emp_length,
    c.home_ownership, 
    c.annual_income,
    c.address_state,
    c.address_zipcode,
    c.address_country,
    c.grade,
    c.sub_grade,
    c.verification_status,
    c.total_high_credit_limit,
    l.loan_amount,
    l.funded_amount,
    l.loan_term_years,
    l.interest_rate,
    l.monthly_installment,
    l.issue_date,
    l.loan_status,
    l.loan_purpose,
    r.total_principal_received,
    r.total_interest_received,
    r.total_late_fee_received,
    r.total_payment_received,
    r.last_payment_amount,
    r.last_payment_date,
    r.next_payment_date,
    d.delinq_2yrs,
    d.delinq_amnt,
    d.mths_since_last_delinq,
    e.public_records,
    e.public_records_bankruptcies,
    e.enquiry_last_6mths
  FROM itv010698_lending_club_project.customers c
  LEFT JOIN itv010698_lending_club_project.loans l ON c.member_id = l.member_id
  LEFT JOIN itv010698_lending_club_project.loans_repayments r ON l.loan_id = r.loan_id
  LEFT JOIN itv010698_lending_club_project.loans_defaulters_delinq d ON c.member_id = d.member_id
  LEFT JOIN itv010698_lending_club_project.loans_defaulters_detail_rec_enq e ON c.member_id = e.member_id"""
spark.sql(create_view_sql)


### The view definition above is stored in Hive; the data itself is not stored, it is computed at query time

### The data is computed only when you run the query below, so it will take some time to return

In [11]:
# Preview ten rows of the view; the underlying joins run when this executes.
spark.sql(
    "SELECT * "
    "FROM itv010698_lending_club_project.customers_loan_v "
    "LIMIT 10"
)

loan_id,member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,delinq_2yrs,delinq_amnt,mths_since_last_delinq,public_records,public_records_bankruptcies,enquiry_last_6mths
73954656,0022909e95ed98477...,Sheet metal worker,10,RENT,56500.0,CA,917xx,USA,E,,Source Verified,,15000.0,15000.0,5,20.75,,Mar-2016,Charged Off,debt_consolidation,1906.09,2971.37,0.0,4877.4599609375,403.7,Mar-2017,,,0.0,39.0,1,1,1
111588979,0022d5159e0595946...,Deputy Sheriff,4,RENT,91000.0,FL,334xx,USA,B,,Source Verified,,9600.0,9600.0,3,10.42,,Jun-2017,Fully Paid,debt_consolidation,9600.0,153.63,0.0,9753.6279296875,9453.07,Aug-2017,,,,,0,0,0
13017945,002364f85568d9d16...,Principal,2,MORTGAGE,140000.0,TX,756xx,USA,B,,Source Verified,,11000.0,11000.0,3,12.99,,Mar-2014,Fully Paid,debt_consolidation,11000.0,2152.15,0.0,13152.150390625,3887.4,Jun-2016,,,0.0,9.0,0,0,4
94191411,00251c4a9616c4b6f...,Manager,10,MORTGAGE,150000.0,RI,028xx,USA,A,,Verified,,20000.0,20000.0,3,6.99,,Jan-2017,Fully Paid,credit_card,20000.0,1010.43,0.0,21010.42578125,16086.28,Oct-2017,,,,,0,0,0
8095954,0026e938ff256d8ee...,,6,MORTGAGE,41000.0,NC,276xx,USA,B,,Verified,,18225.0,18225.0,3,12.99,,Oct-2013,Fully Paid,debt_consolidation,18225.0,1609.01,0.0,19834.01171875,14923.86,Aug-2014,,,,,0,0,2
69483249,002e5aa8febbe9d5a...,owner,10,RENT,80000.0,CT,064xx,USA,B,,Not Verified,,16000.0,16000.0,3,8.49,,Jan-2016,Fully Paid,credit_card,16000.0,2172.67,0.0,18172.666015625,504.87,Jan-2019,,,0.0,71.0,2,1,0
142268965,0038347c3f5781368...,Engineering Manager,3,MORTGAGE,167000.0,MI,480xx,USA,A,,Not Verified,,15000.0,15000.0,3,6.67,,Oct-2018,Current,debt_consolidation,1908.73,390.21,0.0,2298.93994140625,460.9,Mar-2019,Apr-2019,,,,0,0,0
122647973,003e4e14bf5215532...,Senior Accountant,10,MORTGAGE,75000.0,VA,233xx,USA,A,,Not Verified,,16000.0,16000.0,3,7.21,,Nov-2017,Current,home_improvement,6687.38,1235.49,0.0,7922.8701171875,495.58,Mar-2019,Apr-2019,,0.0,36.0,0,0,2
97388244,0049a2c970ca284b0...,Vice President Wo...,1,MORTGAGE,338000.0,MD,218xx,USA,B,,Verified,,40000.0,40000.0,5,10.49,,Mar-2017,Current,home_improvement,13542.73,7222.23,0.0,20764.9609375,859.56,Mar-2019,Apr-2019,,,,1,0,1
143395607,0051e2e772e2f63e4...,Roof Leader,10,OWN,36000.0,IN,467xx,USA,A,,Not Verified,,5000.0,5000.0,3,6.46,,Nov-2018,Current,debt_consolidation,509.07,101.78,0.0,610.8499755859375,153.16,Mar-2019,Apr-2019,,,,0,0,0


### Say we have a weekly job that runs every 7 days

### The join of the 5 tables is run every 7 days and the results are stored in a table

### Queries against that table return quickly, but the data can be stale (up to 7 days old)

### If that staleness is acceptable, we can go ahead and create a managed table

In [12]:
# Materialise the five-table join into the table customers_loan_t.
# A plain CREATE TABLE ... AS SELECT fails once the table already exists, which
# breaks any re-run of this cell (e.g. a periodic refresh or Restart & Run All).
# Dropping the previous snapshot first makes the cell re-runnable.
# NOTE: the table is briefly absent between the DROP and the CREATE.
spark.sql("DROP TABLE IF EXISTS itv010698_lending_club_project.customers_loan_t")

# Same projection and joins as the customers_loan_v view, but the result is
# written out as table data instead of being recomputed on every read.
spark.sql("""CREATE TABLE itv010698_lending_club_project.customers_loan_t AS
  SELECT
    l.loan_id,
    c.member_id,
    c.emp_title,
    c.emp_length,
    c.home_ownership, 
    c.annual_income,
    c.address_state,
    c.address_zipcode,
    c.address_country,
    c.grade,
    c.sub_grade,
    c.verification_status,
    c.total_high_credit_limit,
    l.loan_amount,
    l.funded_amount,
    l.loan_term_years,
    l.interest_rate,
    l.monthly_installment,
    l.issue_date,
    l.loan_status,
    l.loan_purpose,
    r.total_principal_received,
    r.total_interest_received,
    r.total_late_fee_received,
    r.total_payment_received,
    r.last_payment_amount,
    r.last_payment_date,
    r.next_payment_date,
    d.delinq_2yrs,
    d.delinq_amnt,
    d.mths_since_last_delinq,
    e.public_records,
    e.public_records_bankruptcies,
    e.enquiry_last_6mths
  FROM itv010698_lending_club_project.customers c
  LEFT JOIN itv010698_lending_club_project.loans l ON c.member_id = l.member_id
  LEFT JOIN itv010698_lending_club_project.loans_repayments r ON l.loan_id = r.loan_id
  LEFT JOIN itv010698_lending_club_project.loans_defaulters_delinq d ON c.member_id = d.member_id
  LEFT JOIN itv010698_lending_club_project.loans_defaulters_detail_rec_enq e ON c.member_id = e.member_id""")

In [13]:
# Preview ten rows of the materialised table.
spark.sql(
    "select * "
    "from itv010698_lending_club_project.customers_loan_t "
    "limit 10"
)

loan_id,member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,delinq_2yrs,delinq_amnt,mths_since_last_delinq,public_records,public_records_bankruptcies,enquiry_last_6mths
73954656,0022909e95ed98477...,Sheet metal worker,10,RENT,56500.0,CA,917xx,USA,E,,Source Verified,,15000.0,15000.0,5,20.75,,Mar-2016,Charged Off,debt_consolidation,1906.09,2971.37,0.0,4877.4599609375,403.7,Mar-2017,,,0.0,39.0,1,1,1
111588979,0022d5159e0595946...,Deputy Sheriff,4,RENT,91000.0,FL,334xx,USA,B,,Source Verified,,9600.0,9600.0,3,10.42,,Jun-2017,Fully Paid,debt_consolidation,9600.0,153.63,0.0,9753.6279296875,9453.07,Aug-2017,,,,,0,0,0
13017945,002364f85568d9d16...,Principal,2,MORTGAGE,140000.0,TX,756xx,USA,B,,Source Verified,,11000.0,11000.0,3,12.99,,Mar-2014,Fully Paid,debt_consolidation,11000.0,2152.15,0.0,13152.150390625,3887.4,Jun-2016,,,0.0,9.0,0,0,4
94191411,00251c4a9616c4b6f...,Manager,10,MORTGAGE,150000.0,RI,028xx,USA,A,,Verified,,20000.0,20000.0,3,6.99,,Jan-2017,Fully Paid,credit_card,20000.0,1010.43,0.0,21010.42578125,16086.28,Oct-2017,,,,,0,0,0
8095954,0026e938ff256d8ee...,,6,MORTGAGE,41000.0,NC,276xx,USA,B,,Verified,,18225.0,18225.0,3,12.99,,Oct-2013,Fully Paid,debt_consolidation,18225.0,1609.01,0.0,19834.01171875,14923.86,Aug-2014,,,,,0,0,2
69483249,002e5aa8febbe9d5a...,owner,10,RENT,80000.0,CT,064xx,USA,B,,Not Verified,,16000.0,16000.0,3,8.49,,Jan-2016,Fully Paid,credit_card,16000.0,2172.67,0.0,18172.666015625,504.87,Jan-2019,,,0.0,71.0,2,1,0
142268965,0038347c3f5781368...,Engineering Manager,3,MORTGAGE,167000.0,MI,480xx,USA,A,,Not Verified,,15000.0,15000.0,3,6.67,,Oct-2018,Current,debt_consolidation,1908.73,390.21,0.0,2298.93994140625,460.9,Mar-2019,Apr-2019,,,,0,0,0
122647973,003e4e14bf5215532...,Senior Accountant,10,MORTGAGE,75000.0,VA,233xx,USA,A,,Not Verified,,16000.0,16000.0,3,7.21,,Nov-2017,Current,home_improvement,6687.38,1235.49,0.0,7922.8701171875,495.58,Mar-2019,Apr-2019,,0.0,36.0,0,0,2
97388244,0049a2c970ca284b0...,Vice President Wo...,1,MORTGAGE,338000.0,MD,218xx,USA,B,,Verified,,40000.0,40000.0,5,10.49,,Mar-2017,Current,home_improvement,13542.73,7222.23,0.0,20764.9609375,859.56,Mar-2019,Apr-2019,,,,1,0,1
143395607,0051e2e772e2f63e4...,Roof Leader,10,OWN,36000.0,IN,467xx,USA,A,,Not Verified,,5000.0,5000.0,3,6.46,,Nov-2018,Current,debt_consolidation,509.07,101.78,0.0,610.8499755859375,153.16,Mar-2019,Apr-2019,,,,0,0,0
