### Starter code for group assignment 2
The rent data is loaded first because it is the only dataset in wide format; it cannot be joined with the other datasets until it has been converted to long format.

In [43]:
import ast

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
import os
import getpass
import csv

In [2]:
# Ask PySpark to pull in the hadoop-aws package (it supplies the S3A filesystem
# class configured further down) when the Spark JVM is launched.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'
)

In [3]:
# Reuse the already-running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

In [4]:
# SparkSession is the entry point for the DataFrame API used in the rest of the notebook.
ss = SparkSession.builder.getOrCreate()

In [5]:
# Credentials must never be stored in the notebook. Each value is taken from the
# standard AWS environment variable when it is set; otherwise it is prompted for
# without echoing the input.
access_key = os.environ.get('AWS_ACCESS_KEY_ID') or getpass.getpass('AWS access key ID: ')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY') or getpass.getpass('AWS secret access key: ')

 ········································


In [6]:
# Register the S3A filesystem implementation and hand Hadoop the AWS credentials.
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ('fs.s3a.access.key', access_key),
    ('fs.s3a.secret.key', secret_key),
):
    hadoop_conf.set(conf_key, conf_value)

## Unpivot the data from wide to long format

The rent data is, annoyingly, in wide format (one column per month); the code below converts it to long format (one row per region and month).

In [7]:
# Read the gzipped rent CSV from S3 as an RDD of raw text lines (one string per CSV line, header included).
rdd_rent = sc.textFile('s3a://msds-durian-candy/rent/Metro_Zri_AllHomesPlusMultifamily.csv.gz')

In [8]:
# The first line of the file is the wide-format header; parse it with the csv
# module so quoted fields are handled correctly.
header_raw = rdd_rent.first()
header_temp = list(csv.reader([header_raw]))[0]

# Column names of the long-format table.
new_header = [
    header_temp[0],              # RegionID
    'RegionName', 'StateCode',   # the original RegionName column is split into these two
    header_temp[2],              # SizeRank
    'Year', 'Month',             # original columns from index 3 onwards are dates, one per month
    'ZillowRentIndex',           # the value that sat under each date column
]

In [9]:
# The first data line (the first line that is not the header) is the aggregate for the
# entire US rather than an individual region, so keep a copy of it to filter it out later.
US_row = rdd_rent.filter(lambda line: line != header_raw).first()

In [10]:
def unpivot_widerow_to_longrows(row,header_original):
    """Turn one wide rent row into one long row per date column.

    Parameters
    ----------
    row : list of str
        Parsed CSV fields ``[RegionID, RegionName, SizeRank, value, value, ...]``.
    header_original : list of str
        Parsed CSV header; entries from index 3 onwards are date labels such as
        ``'2010-09'`` that line up with the value fields of ``row``.

    Returns
    -------
    list of list of str
        One ``[RegionID, RegionName, StateCode, SizeRank, Year, Month, value]``
        row per date column of the header.

    Raises
    ------
    IndexError
        If ``row`` has fewer value fields than the header has date columns.
    """
    # Split on the LAST comma only and always produce exactly two fields, so that a
    # region name without a state suffix (or with extra commas) cannot shift the
    # remaining columns out of line with the 7-column long-format header.
    region_name, comma, state_code = row[1].rpartition(',')
    if not comma:
        region_name, state_code = row[1], ''
    new_row_base = [row[0], region_name.strip(), state_code.strip(), row[2]]

    prices = row[3:]

    unpivoted_rows = []
    for i, year_month in enumerate(header_original[3:]):
        # 'YYYY-MM' -> ['YYYY', 'MM']; prices[i] is the value under that date column.
        unpivoted_rows.append(new_row_base + year_month.split('-') + [prices[i]])

    return unpivoted_rows

In [11]:
# Drop the header line and the US-wide aggregate line, parse every remaining
# line as CSV, then expand each wide row into one long row per month.
rows_to_skip = (header_raw, US_row)
unpivoted_rent = (
    rdd_rent
    .filter(lambda raw_line: raw_line not in rows_to_skip)
    .map(lambda raw_line: list(csv.reader([raw_line]))[0])
    .flatMap(lambda fields: unpivot_widerow_to_longrows(fields, header_temp))
)

In [12]:
# collect() brings every unpivoted row back to the driver as a Python list;
# the long-format header is placed in front of it as row 0.
rent_data = [new_header] + unpivoted_rent.collect()

In [13]:
# rent_data[0] is the header row; everything after it is data. Slicing it off on the
# driver avoids a Spark job to fetch the header back and a full-dataset filter to drop it.
col_names = rent_data[0]
long_rent_rdd = sc.parallelize(rent_data[1:])

# Passing the names as `schema` labels all columns in one step (column types are still
# inferred from the data), instead of renaming a hard-coded count of 7 columns one by one.
rent_df = ss.createDataFrame(long_rent_rdd, schema=col_names)

rent_df.show(5)


+--------+----------+---------+--------+----+-----+---------------+
|RegionID|RegionName|StateCode|SizeRank|Year|Month|ZillowRentIndex|
+--------+----------+---------+--------+----+-----+---------------+
|  394913|  New York|       NY|       1|2010|   09|           1708|
|  394913|  New York|       NY|       1|2010|   10|           1707|
|  394913|  New York|       NY|       1|2010|   11|           1708|
|  394913|  New York|       NY|       1|2010|   12|           1709|
|  394913|  New York|       NY|       1|2011|   01|           1704|
+--------+----------+---------+--------+----+-----+---------------+
only showing top 5 rows



### Start with Rate table from insurance, expand from there.

* s3a://msds-durian-candy/census/acs2015_census_tract_data.csv.gz
* s3a://msds-durian-candy/census/acs2015_county_data.csv.gz
* s3a://msds-durian-candy/census/acs2017_census_tract_data.csv.gz
* s3a://msds-durian-candy/census/acs2017_county_data.csv.gz
* s3a://msds-durian-candy/insurance/BenefitsCostSharing.csv.gz
* s3a://msds-durian-candy/insurance/BusinessRules.csv.gz
* s3a://msds-durian-candy/insurance/Network.csv.gz
* s3a://msds-durian-candy/insurance/PlanAttributes.csv.gz
* s3a://msds-durian-candy/insurance/Rate.csv.gz
* s3a://msds-durian-candy/insurance/ServiceArea.csv.gz
* s3a://msds-durian-candy/rent/Metro_Zri_AllHomesPlusMultifamily.csv.gz

In [65]:
def guess_schema(spark_df):
    """Guess a StructType for a DataFrame by peeking at a single sample row.

    PySpark's inferSchema is very slow because it does an extra pass over the
    whole data. This function instead parses the values of one row: the second
    row when there is one (just in case header=True was forgotten and the first
    row holds column names), otherwise the only row.

    Parameters
    ----------
    spark_df : DataFrame
        Only ``spark_df.columns`` and ``spark_df.rdd.take()`` are used.

    Returns
    -------
    StructType
        One nullable field per column: IntegerType for ints that fit in 32 bits,
        LongType for ints that fit in 64 bits, DoubleType for floats, and
        StringType for everything else (including every column of an empty
        DataFrame).

    Notes
    -----
    The result only describes types; it does not convert any values.
    """
    # Fetch the sample once; doing this inside the loop would launch one Spark job per column.
    sample_rows = spark_df.rdd.take(2)
    sample = sample_rows[-1] if sample_rows else None

    struct_field_list = []
    for col in spark_df.columns:
        current_val = None if sample is None else sample[col]
        if isinstance(current_val, str):
            try:
                # literal_eval only parses Python literals; it never executes code.
                current_val = ast.literal_eval(current_val)
            except (SyntaxError, ValueError):
                pass  # not a literal (e.g. 'New York' or '09'): keep it as a string

        # Exact type checks on purpose: bool is a subclass of int but must stay a string here.
        if type(current_val) is int:
            if -2**31 <= current_val < 2**31:
                field_type = IntegerType()
            elif -2**63 <= current_val < 2**63:
                field_type = LongType()
            else:
                field_type = StringType()  # too large for any Spark integer type
        elif type(current_val) is float:
            field_type = DoubleType()
        else:
            field_type = StringType()
        struct_field_list.append(StructField(col, field_type, True))

    return StructType(struct_field_list)

In [67]:
# Baseline before any type guessing: every column of rent_df is a string.
rent_df.printSchema()

root
 |-- RegionID: string (nullable = true)
 |-- RegionName: string (nullable = true)
 |-- StateCode: string (nullable = true)
 |-- SizeRank: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- ZillowRentIndex: string (nullable = true)



In [68]:
# Cast every string column to its guessed type. Re-wrapping rent_df.rdd with the guessed
# schema would only relabel the columns: the values would still be Python strings, and
# Spark's schema verification would reject them as soon as an action touched the data.
rent_df.select([
    rent_df[field.name].cast(field.dataType).alias(field.name)
    for field in guess_schema(rent_df).fields
]).printSchema()

root
 |-- RegionID: integer (nullable = true)
 |-- RegionName: string (nullable = true)
 |-- StateCode: string (nullable = true)
 |-- SizeRank: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- ZillowRentIndex: integer (nullable = true)



In [66]:
# Display the StructType guessed for the rent data.
guess_schema(rent_df)

StructType(List(StructField(RegionID,IntegerType,true),StructField(RegionName,StringType,true),StructField(StateCode,StringType,true),StructField(SizeRank,IntegerType,true),StructField(Year,IntegerType,true),StructField(Month,IntegerType,true),StructField(ZillowRentIndex,IntegerType,true)))

StructType(List(StructField(RegionID,IntegerType,true),StructField(RegionName,StringType,true),StructField(StateCode,StringType,true),StructField(SizeRank,IntegerType,true),StructField(Year,IntegerType,true),StructField(Month,IntegerType,true),StructField(ZillowRentIndex,IntegerType,true)))

In [None]:
# NOTE(review): unfinished scratch cell - `genera` is not defined anywhere visible, so this
# raises NameError under Restart & Run All. Complete it or delete the cell.
genera

In [57]:
# Sanity check of the exact-type comparison that guess_schema relies on.
type(35) is int

True

In [48]:
# Second field (RegionName) of the second row of rent_df.
rent_df.rdd.take(2)[1][1]

'New York'

In [53]:
# A numeric string parses to a real int via literal_eval.
type(ast.literal_eval('35'))

int

In [56]:
# Print the Python type each field of the second row parses to; fields that are
# not valid Python literals are left as the original string.
for entry in rent_df.rdd.take(2)[1]:
    try:
        converted = ast.literal_eval(entry)
    except (SyntaxError, ValueError):
        converted = entry
    print(type(converted))

<class 'int'>
<class 'str'>
<class 'str'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


In [None]:
# NOTE(review): unfinished scratch cell - literal_eval is called without its required
# argument, which raises TypeError when run. Complete it or delete the cell.
ast.literal_eval()

In [35]:
# Inspect the second row of rent_df as a Row object; every field is still a string.
rent_df.rdd.take(2)[1]

Row(RegionID='394913', RegionName='New York', StateCode='NY', SizeRank='1', Year='2010', Month='10', ZillowRentIndex='1707')

In [17]:
# NOTE(review): same check as the earlier rent_df.printSchema() cell; one of the two can go.
rent_df.printSchema()

root
 |-- RegionID: string (nullable = true)
 |-- RegionName: string (nullable = true)
 |-- StateCode: string (nullable = true)
 |-- SizeRank: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- ZillowRentIndex: string (nullable = true)



In [15]:
# Load the insurance Rate table. header=True takes the column names from the first line;
# no schema is supplied, so every column is read as a string.
rate_df_raw = ss.read.csv('s3a://msds-durian-candy/insurance/Rate.csv.gz',header=True)

In [16]:
# Show the raw Rate column names and types.
rate_df_raw.printSchema()

root
 |-- BusinessYear: string (nullable = true)
 |-- StateCode: string (nullable = true)
 |-- IssuerId: string (nullable = true)
 |-- SourceName: string (nullable = true)
 |-- VersionNum: string (nullable = true)
 |-- ImportDate: string (nullable = true)
 |-- IssuerId2: string (nullable = true)
 |-- FederalTIN: string (nullable = true)
 |-- RateEffectiveDate: string (nullable = true)
 |-- RateExpirationDate: string (nullable = true)
 |-- PlanId: string (nullable = true)
 |-- RatingAreaId: string (nullable = true)
 |-- Tobacco: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- IndividualRate: string (nullable = true)
 |-- IndividualTobaccoRate: string (nullable = true)
 |-- Couple: string (nullable = true)
 |-- PrimarySubscriberAndOneDependent: string (nullable = true)
 |-- PrimarySubscriberAndTwoDependents: string (nullable = true)
 |-- PrimarySubscriberAndThreeOrMoreDependents: string (nullable = true)
 |-- CoupleAndOneDependent: string (nullable = true)
 |-- CoupleAnd