# Join the tables

### Import libraries

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
import os
import getpass
import csv

### Access to S3

In [2]:
# Ask spark-submit to pull in the hadoop-aws package, which supplies the
# S3AFileSystem class configured further down in this notebook.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'
)

In [3]:
# Reuse the running Spark context/session if there is one, otherwise start them.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

# Credentials must never be stored in the notebook. Each value is taken from
# the standard AWS environment variable when it is set, and otherwise typed in
# at a hidden prompt.
# NOTE(review): an access key ID used to be hard-coded on this line; since it
# has been shared with the notebook, consider rotating that key.
access_key = os.environ.get('AWS_ACCESS_KEY_ID') or getpass.getpass('AWS access key ID: ')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY') or getpass.getpass('AWS secret access key: ')

 ········································


In [4]:
# Map the s3:// scheme onto the S3A file system implementation and give the
# S3A connector the access/secret key pair defined earlier in the notebook.
hadoop_conf = sc._jsc.hadoopConfiguration()
for option, value in (
    ("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ('fs.s3a.access.key', access_key),
    ('fs.s3a.secret.key', secret_key),
):
    hadoop_conf.set(option, value)

### Get the data frames

In [70]:
# Reshape the rent data from wide format (one column per month) to long
# format (one row per region and month).

rdd_rent = sc.textFile('s3a://msds-durian-candy/rent/Metro_Zri_AllHomesPlusMultifamily.csv.gz')

# The first line of the file is the CSV header; parse it into a list of names.
header_raw = rdd_rent.first()
header_temp = next(csv.reader([header_raw]))

# Column names of the long-format table.
new_header = [
    header_temp[0],      # first original column (RegionID)
    'RegionName',        # the original second column is split into
    'StateCode',         # a region name and a state code
    header_temp[2],      # third original column, copied through
    'Year',              # original columns from index 3 onwards are dates,
    'Month',             # which become Year / Month values in long format
    'ZillowRentIndex',   # the value that was stored under each date column
]

# The first data line describes the entire US rather than an individual
# region, so it is captured here in order to be filtered out later.
US_row = rdd_rent.filter(lambda text: text != header_raw).first()

def unpivot_widerow_to_longrows(row,header_original):
    """Turn one wide rent row into one long row per date column.

    Parameters
    ----------
    row : list of str
        Parsed CSV fields of one data line. ``row[0]`` and ``row[2]`` are
        copied through unchanged, ``row[1]`` is a region name optionally
        followed by ``", <state code>"``, and ``row[3:]`` holds one value per
        date column.
    header_original : list of str
        Parsed CSV header. Entries from index 3 onwards are date labels that
        are split on ``'-'`` (e.g. ``'2010-09'`` -> ``['2010', '09']``).

    Returns
    -------
    list of list of str
        One row per date column, laid out as
        ``[row[0], region name, state code, row[2], *date parts, value]``.

    Raises
    ------
    IndexError
        If ``row`` has fewer value fields than ``header_original`` has date
        columns.
    """
    # Split on the LAST comma only, so that every output row has exactly one
    # region-name field and one state-code field. A name without a comma gets
    # an empty state code; a name with several commas keeps them in the name.
    name_parts = row[1].rsplit(',', 1)
    region_name = name_parts[0].strip()
    state_code = name_parts[1].strip() if len(name_parts) == 2 else ''

    new_row_base = [row[0], region_name, state_code, row[2]]
    prices = row[3:]

    unpivoted_rows = []
    for i, year_month in enumerate(header_original[3:]):
        # Fresh lists are concatenated for every output row; nothing is
        # mutated in place.
        unpivoted_rows.append(new_row_base + year_month.split('-') + [prices[i]])

    return unpivoted_rows

# Drop the header line and the nationwide row, parse every remaining CSV line
# and expand it into long-format rows. All of this stays distributed.
unpivoted_rent = (
    rdd_rent
    .filter(lambda line: line != header_raw and line != US_row)
    .map(lambda row_raw_csv: next(csv.reader([row_raw_csv])))
    .flatMap(lambda row: unpivot_widerow_to_longrows(row, header_temp))
)

# Build the DataFrame straight from the RDD and name the columns in the same
# call. This avoids collecting the whole data set to the driver, re-adding the
# header, re-parallelizing and renaming a hard-coded number of columns.
# `col_names` and `long_rent_rdd` are kept as aliases in case other cells
# refer to them.
col_names = new_header
long_rent_rdd = unpivoted_rent
rent_df = ss.createDataFrame(long_rent_rdd, schema=col_names)

rent_df.show(5)

+--------+----------+---------+--------+----+-----+---------------+
|RegionID|RegionName|StateCode|SizeRank|Year|Month|ZillowRentIndex|
+--------+----------+---------+--------+----+-----+---------------+
|  394913|  New York|       NY|       1|2010|   09|           1708|
|  394913|  New York|       NY|       1|2010|   10|           1707|
|  394913|  New York|       NY|       1|2010|   11|           1708|
|  394913|  New York|       NY|       1|2010|   12|           1709|
|  394913|  New York|       NY|       1|2011|   01|           1704|
+--------+----------+---------+--------+----+-----+---------------+
only showing top 5 rows



In [71]:
# Remaining source tables: census (tract and county level, 2015 and 2017)
# and health-insurance files, all gzipped CSVs with a header row.
def _read_headed_csv(path):
    """Read the CSV at `path` into a DataFrame, taking column names from its first line."""
    return ss.read.csv(path, header=True)

# Census
census_2015_df = _read_headed_csv('s3a://msds-durian-candy/census/acs2015_census_tract_data.csv.gz')
county_2015_df = _read_headed_csv('s3a://msds-durian-candy/census/acs2015_county_data.csv.gz')
census_2017_df = _read_headed_csv('s3a://msds-durian-candy/census/acs2017_census_tract_data.csv.gz')
county_2017_df = _read_headed_csv('s3a://msds-durian-candy/census/acs2017_county_data.csv.gz')

# Insurance
benefits_cost_sharing_df = _read_headed_csv('s3a://msds-durian-candy/insurance/BenefitsCostSharing.csv.gz')
business_rules_df = _read_headed_csv('s3a://msds-durian-candy/insurance/BusinessRules.csv.gz')
network_df = _read_headed_csv('s3a://msds-durian-candy/insurance/Network.csv.gz')
plan_attributes_df = _read_headed_csv('s3a://msds-durian-candy/insurance/PlanAttributes.csv.gz')
rate_df = _read_headed_csv('s3a://msds-durian-candy/insurance/Rate.csv.gz')
service_area_df = _read_headed_csv('s3a://msds-durian-candy/insurance/ServiceArea.csv.gz')

In [79]:
# Preview the first five rows of the 2017 county-level census table.
county_2017_df.show(n=5)

+--------+-------+--------------+--------+-----+------+--------+-----+-----+------+-----+-------+----------------+------+---------+------------+---------------+-------+------------+------------+-------+------+------------+----------+-----+-------+-------+----+-----------+----------+-----------+--------+-----------+----------+------------+----------+------------+
|CountyId|  State|        County|TotalPop|  Men| Women|Hispanic|White|Black|Native|Asian|Pacific|VotingAgeCitizen|Income|IncomeErr|IncomePerCap|IncomePerCapErr|Poverty|ChildPoverty|Professional|Service|Office|Construction|Production|Drive|Carpool|Transit|Walk|OtherTransp|WorkAtHome|MeanCommute|Employed|PrivateWork|PublicWork|SelfEmployed|FamilyWork|Unemployment|
+--------+-------+--------------+--------+-----+------+--------+-----+-----+------+-----+-------+----------------+------+---------+------------+---------------+-------+------------+------------+-------+------+------------+----------+-----+-------+-------+----+----------