## Mean Insurance Individual Rate & Census Poverty

In [1]:
import csv
import os

from pyspark import SparkContext, SparkConf

In [2]:
# Lookup from a full state / territory name to its two-letter abbreviation.
# The census rows are keyed through this table, so a name missing here raises
# KeyError at that point.
# NOTE(review): 'Dakota', 'Orleans' and 'Philippine Islands' are not current
# postal codes — presumably legacy entries; confirm whether they are needed.
us = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Dakota': 'DK',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Orleans': 'OL',
    'Pennsylvania': 'PA',
    'Philippine Islands': 'PI',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

### Setup Spark

In [3]:
# Must run before the SparkContext is created: asks pyspark-shell to pull in the
# hadoop-aws package, which supplies the S3A filesystem class configured below.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'
)

In [4]:
# AWS credentials are read from the environment instead of being stored in the
# notebook. Export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting
# the kernel; a missing variable raises KeyError here rather than surfacing
# later as an opaque S3 authentication failure inside Spark.
# NOTE(review): the key pair that used to be hard-coded in this cell should be
# treated as leaked and rotated.
access_key = os.environ['AWS_ACCESS_KEY_ID']
secret_key = os.environ['AWS_SECRET_ACCESS_KEY']

In [5]:
# Obtain the SparkContext used by every cell below (getOrCreate returns the
# existing context when one is already running in this kernel).
sc = SparkContext.getOrCreate()

In [6]:
# Configure the Hadoop layer underneath Spark for S3 access: route the "s3"
# scheme to the S3A filesystem implementation and hand it the credentials.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set('fs.s3a.access.key', access_key)
hadoop_conf.set('fs.s3a.secret.key', secret_key)

In [7]:
# Switch between the local files (True) and the gzipped copies on S3 (False).
test_mode = False

# Input locations on S3, keyed by dataset.
s3 = dict(
    i_rate='s3a://msds-durian-candy/insurance/Rate.csv.gz',
    i_plan='s3a://msds-durian-candy/insurance/PlanAttributes.csv.gz',
    census='s3a://msds-durian-candy/census/acs2015_county_data.csv.gz',
)

# Local equivalents under the same keys, used when test_mode is on.
test = dict(
    i_rate='../data/test/insurance_Rate.csv',
    i_plan='../data/s3/insurance/PlanAttributes.csv',
    census='../data/s3/census/acs2015_county_data.csv',
)

# Every later cell reads its paths from `source`.
source = s3 if not test_mode else test

In [8]:
def csv_split(x):
    """Split a single line of CSV text into its fields.

    The line is parsed with the csv module (comma delimiter, double-quote
    quoting), so commas inside quoted fields do not split the field.

    Args:
        x: one line of CSV text.

    Returns:
        The list of field strings parsed from the line.
    """
    parsed_rows = csv.reader([x], delimiter=',', quotechar='"')
    return next(parsed_rows)

### Load Insurance Data

In [9]:
# Rate table: parse every line, take the first parsed line as the header, and
# keep only the lines that differ from it.
i_rate = sc.textFile(source['i_rate']).map(csv_split)
i_rate_h = i_rate.first()
i_rate_rows = i_rate.filter(lambda fields: fields != i_rate_h)

# Plan-attributes table: same treatment.
i_plan = sc.textFile(source['i_plan']).map(csv_split)
i_plan_h = i_plan.first()
i_plan_rows = i_plan.filter(lambda fields: fields != i_plan_h)

In [10]:
# Resolve the column positions once on the driver instead of calling
# header.index(...) again for every row inside the lambdas.
i_rate_plan_col = i_rate_h.index('PlanId')
i_rate_state_col = i_rate_h.index('StateCode')
i_rate_value_col = i_rate_h.index('IndividualRate')
i_plan_id_col = i_plan_h.index('StandardComponentId')
i_plan_type_col = i_plan_h.index('PlanType')

# (PlanId, (StateCode, IndividualRate)) for rates strictly between 1 and 9999.
# The rate is parsed to float once and then filtered on the parsed value.
# NOTE(review): the bounds presumably exclude placeholder rates — confirm.
i_rate_kv = (i_rate_rows
             .map(lambda x: (x[i_rate_plan_col],
                             (x[i_rate_state_col], float(x[i_rate_value_col]))))
             .filter(lambda kv: 1 < kv[1][1] < 9999))

# (StandardComponentId, PlanType), deduplicated. A join emits one output row
# per matching right-hand row, so a repeated (id, type) pair would multiply the
# rate rows joined against it and skew the per-state mean.
# NOTE(review): the plan file appears to repeat a component id (e.g. per plan
# variant / year) — confirm; distinct() is a no-op if pairs are already unique.
i_plan_kv = (i_plan_rows
             .map(lambda x: (x[i_plan_id_col], x[i_plan_type_col]))
             .distinct())

In [11]:
def _key_by_state_and_plan_type(joined):
    """Re-key one joined record as ((state, plan_type), (rate, 1)).

    `joined` has the shape (plan_id, ((state, rate), plan_type)); plan_type is
    None when the left outer join found no plan row for the rate's plan id.
    """
    _plan_id, ((state, rate), plan_type) = joined
    return (state, plan_type), (rate, 1)


i_kv = i_rate_kv.leftOuterJoin(i_plan_kv).map(_key_by_state_and_plan_type)

# Sum the rates and the row counts per (state, plan type), divide to get the
# mean rate, then re-key by state alone: (state, (plan_type, mean_rate)).
i_sums = i_kv.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
i_means = i_sums.mapValues(lambda total_n: total_n[0] / total_n[1])
i_r = i_means.map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))

### Load Census Data

In [12]:
# Census table: parse every line, take the first parsed line as the header,
# and keep only the lines that differ from it.
census_lines = sc.textFile(source['census'])
census = census_lines.map(csv_split)
census_h = census.first()
census_rows = census.filter(lambda fields: fields != census_h)

In [13]:
# Key each census row by state abbreviation. The value is a leading 1 (a row
# counter) followed by every column from index 3 onward converted to float,
# with empty strings counted as 0.
census_kv = census_rows.map(
    lambda x: (us[x[census_h.index('State')]],
               [1] + [float(val) if val else 0 for val in x[3:]]))

# Add the vectors element-wise per state, then divide every element by the
# first one (the row count) to get per-state means; element 0 becomes 1.
census_totals = census_kv.reduceByKey(
    lambda list1, list2: list(map(sum, zip(list1, list2))))
census_r = census_totals.mapValues(
    lambda row: [total / row[0] for total in row])

### Join Insurance and Census Data

In [14]:
# Number of census value columns appended to each output row.
n_census_cols = len(census_h[3:])


def _flatten_joined(record):
    """Flatten one joined record into (state, plan_type, mean_rate, *census_means).

    `record` has the shape (state, ((plan_type, mean_rate), census_means)).
    leftOuterJoin puts None on the right-hand side when a state has no census
    entry; such rows are padded with empty strings so they keep the same width
    as the others instead of raising TypeError on None[1:].
    """
    state, ((plan_type, mean_rate), census_means) = record
    if census_means is None:
        census_cols = [''] * n_census_cols
    else:
        # Element 0 is the normalised row counter, not a census column.
        census_cols = census_means[1:]
    return (state, plan_type, mean_rate, *census_cols)


df_r = i_r.leftOuterJoin(census_r).map(_flatten_joined).sortBy(lambda row: row[0])

### Write Results to CSV File

In [15]:
# Output header: the three insurance columns, then the census header from
# index 3 onward.
df_h = ['State', 'PlanType', 'IndividualRate'] + list(census_h[3:])

# Bring the joined result back to the driver as a list of row tuples.
df = df_r.collect()

In [16]:
filename = '../vizdata/mean_idvrate_poverty.csv'
os.makedirs(os.path.dirname(filename), exist_ok=True)
# csv.writer quotes any field containing a comma, quote or newline, which the
# previous plain ",".join did not. newline='' leaves line endings to the csv
# module, and lineterminator='\n' keeps the one-'\n'-per-row layout.
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(df_h)
    # str() keeps the existing text form of each value (e.g. 'None' for a
    # missing plan type) rather than csv's own None handling.
    writer.writerows([str(elem) for elem in row] for row in df)

### Stop Spark

In [17]:
# Shut down the SparkContext; cells that use `sc` cannot be re-run after this
# without creating a new context.
sc.stop()