In [84]:
from pyspark import SparkContext
import pandas as pd
import plotly
import plotly.graph_objects as go
import os
import plotly.express as px

In [85]:
# Reuse the active SparkContext if one already exists; otherwise create one.
sc = SparkContext.getOrCreate()

## Load the Data

In [86]:
# Load the three pre-aggregated CSV files as RDDs of raw text lines.
# The header row is kept as the first element of each RDD, so downstream
# cells have to skip or filter it themselves.
# Columns: State, PlanType, IndividualRate, Poverty
pov_idvrate = sc.textFile("../vizdata/mean_idvrate_poverty.csv")
# Columns: StateCode, Year, MeanZRI, PlanType, IndividualRate
idvrate_year = sc.textFile("../vizdata/mean_zri_and_rate.csv")
# Columns: StateCode, Year, Mean_ZRI
zri_year = sc.textFile("../vizdata/mean_zri_by_state_year.csv")

## Preview the Data We Will Be Using

In [87]:
# Preview the first six lines (the header row plus five data rows).
pov_idvrate.take(6)

['State,PlanType,IndividualRate,Poverty',
 'AK,PPO,699.60680486075,13.341379310344827',
 'AK,Indemnity,64.46076923076926,13.341379310344827',
 'AL,POS,513.9909653706823,21.76268656716418',
 'AL,PPO,257.14269420543195,21.76268656716418',
 'AL,HMO,381.57607377533054,21.76268656716418']

In [88]:
# Preview the first six lines (the header row plus five data rows).
idvrate_year.take(6)

['StateCode,Year,MeanZRI,PlanType,IndividualRate',
 'TX,2014,1069.3242909987669,PPO,226.36977247050865',
 'TX,2014,1069.3242909987669,HMO,319.85069597764453',
 'TX,2014,1069.3242909987669,None,98.36895484949834',
 'TX,2014,1069.3242909987669,EPO,75.0144119000162',
 'TX,2014,1069.3242909987669,POS,465.2272487238162']

In [89]:
# Preview the first six lines (the header row plus five data rows).
zri_year.take(6)

['StateCode,Year,Mean_ZRI',
 'NY,2014,1478.8910891089108',
 'NY,2015,1487.516129032258',
 'NY,2016,1483.1184210526317',
 'CA,2014,1535.36476426799',
 'CA,2015,1692.0721393034826']

## Ridgeline Plot by Plan Type

In [90]:
# Turn the raw CSV lines into (plan type, [individual rates]) key-value pairs.
# The rates stay as strings, and the header row still forms its own group
# under the key 'PlanType'.
indv_rate_plan_map = (
    pov_idvrate
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[1], fields[2]))
    .groupByKey()
    .mapValues(list)
)

In [91]:
# Collect the grouped rates once and pick each plan type by name.
# groupByKey() gives no ordering guarantee, so positional indexing into
# collect() can silently select the wrong plan type (or the header group).
rates_by_plan = {
    plan: [float(rate) for rate in rates]
    for plan, rates in indv_rate_plan_map.collect()
    if plan != 'PlanType'  # skip the group produced by the CSV header row
}

# A missing plan type raises KeyError here instead of plotting the wrong data.
PPO_data = rates_by_plan['PPO']
HMO_data = rates_by_plan['HMO']
Imdemn_data = rates_by_plan['Indemnity']
POS_data = rates_by_plan['POS']
EPO_data = rates_by_plan['EPO']

In [96]:
# Plots the rate distribution of every plan type as horizontal half-violins
# (a ridgeline-style chart). One trace is added per plan type; with no y
# values supplied, the trace name is what labels each row on the y axis.
violin_series = [
    ('PPO', PPO_data),
    ('HMO', HMO_data),
    ('Indemnity', Imdemn_data),
    ('POS', POS_data),
    ('EPO', EPO_data),
]

fig = go.Figure()
for plan_name, plan_rates in violin_series:
    fig.add_trace(go.Violin(x=plan_rates, name=plan_name))
fig.update_traces(orientation='h', side='positive', width=1.9, points=False)

# No yaxis ticktext is set: plotly only honours ticktext together with
# tickvals (tickmode='array'), and the trace names already label the rows.
# All x-axis options are flat magic-underscore keywords so they are merged
# onto the same axis object.
fig.update_layout(
    width=1000,
    height=600,
    xaxis_showgrid=False,
    xaxis_zeroline=False,
    xaxis_range=[0, 800],  # clip the long right tail of the rate distributions
    xaxis_title="Insurance Rate ($)",
    plot_bgcolor='rgba(0,0,0,0)',
    title="Distribution of Rate by Plan Type",
)
fig.show()

## Line Chart of Mean Individual Rate and Zillow Rental Index, Both Normalized

In [48]:
# Min-max normalization helper
def normalize(num):
    """Rescale numbers linearly so the smallest maps to 0 and the largest to 1.

    Parameters
    ----------
    num : iterable of numbers
        Values to rescale. Any iterable is accepted; it is materialised once.

    Returns
    -------
    list of float
        One rescaled value per input value, in the original order.
        An empty input gives an empty list. If every value is identical the
        range is zero, so all values map to 0.0 instead of dividing by zero.
    """
    values = list(num)
    if not values:
        return []

    min_num = min(values)
    span = max(values) - min_num
    if span == 0:
        # Constant input: there is no spread to scale by.
        return [0.0 for _ in values]

    return [(n - min_num) / span for n in values]

In [111]:
# Normalizes the insurance-rate column and builds a pandas DataFrame holding
# the mean normalized PPO rate per year.
idvrate_year_split = idvrate_year.map(lambda x: x.split(","))

# Collect the parsed rows once (one Spark job instead of three) and drop the
# header row, which is the first line of the file.
idvrate_rows = idvrate_year_split.collect()[1:]
year = [row[1] for row in idvrate_rows]
plan_type = [row[3] for row in idvrate_rows]
individ_rate = [float(row[4]) for row in idvrate_rows]

# The min-max scaling uses the rates of ALL plan types; the PPO filter is
# applied afterwards, so the PPO values are relative to the overall range.
normal_individ_rate = normalize(individ_rate)

d = {'year': year, 'ins_rate': normal_individ_rate, 'plan_type': plan_type}
data_year_ins_rate = (
    pd.DataFrame(data=d)
    .loc[lambda frame: frame.plan_type == 'PPO']
    .groupby('year')
    .agg({'ins_rate': 'mean'})
    .reset_index()
)
data_year_ins_rate

Unnamed: 0,year,ins_rate
0,2014,0.402135
1,2015,0.403736
2,2016,0.381485


In [112]:
# Normalizes the ZRI column and builds a pandas DataFrame holding the mean
# normalized Zillow Rental Index per year.
zri_year_split = zri_year.map(lambda x: x.split(","))

# Collect the parsed rows once (one Spark job instead of two) and drop the
# header row, which is the first line of the file.
zri_rows = zri_year_split.collect()[1:]
year = [row[1] for row in zri_rows]
zri_rate = [float(row[2]) for row in zri_rows]

normal_zri_rate = normalize(zri_rate)

d = {'year': year, 'zri_rate': normal_zri_rate}
data2 = (
    pd.DataFrame(data=d)
    .groupby('year')
    .agg({'zri_rate': 'mean'})
    .reset_index()
)
data2

Unnamed: 0,year,zri_rate
0,2014,0.070394
1,2015,0.067788
2,2016,0.062513


In [114]:
# Line chart comparing the yearly mean of the normalized PPO insurance rate
# with the yearly mean of the normalized Zillow Rental Index.
trend_traces = (
    go.Scatter(x=data_year_ins_rate.year, y=data_year_ins_rate.ins_rate, name='PPO Insurance Rate'),
    go.Scatter(x=data2.year, y=data2.zri_rate, name='Zillow Rental Index'),
)

fig = go.Figure()
for trend_trace in trend_traces:
    fig.add_trace(trend_trace)

# Layout options are kept in their original order because update_layout
# applies keyword arguments in the order given.
layout_options = dict(
    xaxis_showgrid=False,
    xaxis_zeroline=False,
    width=1000,
    height=600,
    yaxis=go.layout.YAxis(
        tickvals=[0.1, 0.2, 0.3, 0.4, 0.5],
        range=[0, 0.5],
    ),
    xaxis=go.layout.XAxis(
        range=[2013.99, 2016.05],
        tickvals=[2014, 2015, 2016],
    ),
    plot_bgcolor='rgba(0,0,0,0)',
    title="Trend of Insurance Rate vs Zillow Rental Index",
    xaxis_title="Year",
    yaxis_title="Normalized Amount",
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Shut down the SparkContext now that all Spark work in this notebook is done.
sc.stop()