# Explore Insurance Coverage Data

In [1]:
import pandas as pd

import warnings
warnings.simplefilter(action='ignore')

### First insurance dataset

In [18]:
# read insurance coverage data in
insurance_in = pd.read_csv('data/2017-insurance-coverage.csv', encoding = "ISO-8859-1", header=1)

In [20]:
insurance_in.head()

Unnamed: 0,Id,Id2,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Male:,Margin of Error; Male:,Estimate; Male: - Under 6 years:,Margin of Error; Male: - Under 6 years:,Estimate; Male: - Under 6 years: - With health insurance coverage,...,Estimate; Female: - 65 to 74 years: - With health insurance coverage,Margin of Error; Female: - 65 to 74 years: - With health insurance coverage,Estimate; Female: - 65 to 74 years: - No health insurance coverage,Margin of Error; Female: - 65 to 74 years: - No health insurance coverage,Estimate; Female: - 75 years and over:,Margin of Error; Female: - 75 years and over:,Estimate; Female: - 75 years and over: - With health insurance coverage,Margin of Error; Female: - 75 years and over: - With health insurance coverage,Estimate; Female: - 75 years and over: - No health insurance coverage,Margin of Error; Female: - 75 years and over: - No health insurance coverage
0,0500000US01001,1001,"Autauga County, Alabama",54170,270,26247,253,1941,175,1919,...,2567,50,0,27,1738,137,1738,137,0,27
1,0500000US01003,1003,"Baldwin County, Alabama",200929,427,97535,444,7221,399,6966,...,12134,110,8,12,8217,115,8215,115,2,5
2,0500000US01005,1005,"Barbour County, Alabama",23326,241,11225,218,880,76,831,...,1464,28,6,9,1028,84,1028,84,0,21
3,0500000US01007,1007,"Bibb County, Alabama",20553,190,10329,201,845,128,845,...,948,57,0,21,865,74,865,74,0,21
4,0500000US01009,1009,"Blount County, Alabama",57207,155,28269,153,1980,84,1897,...,3194,70,20,34,2186,104,2178,103,8,11


In [109]:
# look at all columns
list(insurance_in.columns)

['Id',
 'Id2',
 'Geography',
 'Estimate; Total:',
 'Margin of Error; Total:',
 'Estimate; Male:',
 'Margin of Error; Male:',
 'Estimate; Male: - Under 6 years:',
 'Margin of Error; Male: - Under 6 years:',
 'Estimate; Male: - Under 6 years: - With health insurance coverage',
 'Margin of Error; Male: - Under 6 years: - With health insurance coverage',
 'Estimate; Male: - Under 6 years: - No health insurance coverage',
 'Margin of Error; Male: - Under 6 years: - No health insurance coverage',
 'Estimate; Male: - 6 to 18 years:',
 'Margin of Error; Male: - 6 to 18 years:',
 'Estimate; Male: - 6 to 18 years: - With health insurance coverage',
 'Margin of Error; Male: - 6 to 18 years: - With health insurance coverage',
 'Estimate; Male: - 6 to 18 years: - No health insurance coverage',
 'Margin of Error; Male: - 6 to 18 years: - No health insurance coverage',
 'Estimate; Male: - 19 to 25 years:',
 'Margin of Error; Male: - 19 to 25 years:',
 'Estimate; Male: - 19 to 25 years: - With health 

First impression is that insurance coverage data is raw numbers, so I will need to normalize by the population. The data also breaks down into age categories, which I can considers as a factor when looking at vulnerable populations. This dataset includes categories for total population along with insurance coverage, and since the categories don't match up with the age group dataset I explored above, maybe I should just scrap the age dataset and use this instead. And if we're looking at vulnerable populations, I am not too concerned about those 'with health insurance coverage'.

In [57]:
insurance = insurance_in.copy()

In [58]:
# first separate females out to perform operations
colNames = insurance.columns.str.contains('Id2|Geography|Female', case=False)
female = insurance.iloc[:, colNames]
female.head()

Unnamed: 0,Id2,Geography,Estimate; Female:,Margin of Error; Female:,Estimate; Female: - Under 6 years:,Margin of Error; Female: - Under 6 years:,Estimate; Female: - Under 6 years: - With health insurance coverage,Margin of Error; Female: - Under 6 years: - With health insurance coverage,Estimate; Female: - Under 6 years: - No health insurance coverage,Margin of Error; Female: - Under 6 years: - No health insurance coverage,...,Estimate; Female: - 65 to 74 years: - With health insurance coverage,Margin of Error; Female: - 65 to 74 years: - With health insurance coverage,Estimate; Female: - 65 to 74 years: - No health insurance coverage,Margin of Error; Female: - 65 to 74 years: - No health insurance coverage,Estimate; Female: - 75 years and over:,Margin of Error; Female: - 75 years and over:,Estimate; Female: - 75 years and over: - With health insurance coverage,Margin of Error; Female: - 75 years and over: - With health insurance coverage,Estimate; Female: - 75 years and over: - No health insurance coverage,Margin of Error; Female: - 75 years and over: - No health insurance coverage
0,1001,"Autauga County, Alabama",27923,200,1816,143,1795,140,21,24,...,2567,50,0,27,1738,137,1738,137,0,27
1,1003,"Baldwin County, Alabama",103394,312,6489,330,6413,345,76,59,...,12134,110,8,12,8217,115,8215,115,2,5
2,1005,"Barbour County, Alabama",12101,113,860,78,838,77,22,18,...,1464,28,6,9,1028,84,1028,84,0,21
3,1007,"Bibb County, Alabama",10224,114,697,137,655,147,42,47,...,948,57,0,21,865,74,865,74,0,21
4,1009,"Blount County, Alabama",28938,158,2040,141,1965,156,75,53,...,3194,70,20,34,2186,104,2178,103,8,11


In [59]:
# remove the margin of error columns
colNames = female.columns.str.contains('Id2|Geography|Estimate', case=False)
female = female.iloc[:, colNames]
female.head()

Unnamed: 0,Id2,Geography,Estimate; Female:,Estimate; Female: - Under 6 years:,Estimate; Female: - Under 6 years: - With health insurance coverage,Estimate; Female: - Under 6 years: - No health insurance coverage,Estimate; Female: - 6 to 18 years:,Estimate; Female: - 6 to 18 years: - With health insurance coverage,Estimate; Female: - 6 to 18 years: - No health insurance coverage,Estimate; Female: - 19 to 25 years:,...,Estimate; Female: - 45 to 54 years: - No health insurance coverage,Estimate; Female: - 55 to 64 years:,Estimate; Female: - 55 to 64 years: - With health insurance coverage,Estimate; Female: - 55 to 64 years: - No health insurance coverage,Estimate; Female: - 65 to 74 years:,Estimate; Female: - 65 to 74 years: - With health insurance coverage,Estimate; Female: - 65 to 74 years: - No health insurance coverage,Estimate; Female: - 75 years and over:,Estimate; Female: - 75 years and over: - With health insurance coverage,Estimate; Female: - 75 years and over: - No health insurance coverage
0,1001,"Autauga County, Alabama",27923,1816,1795,21,5171,4989,182,2180,...,493,3447,3098,349,2567,2567,0,1738,1738,0
1,1003,"Baldwin County, Alabama",103394,6489,6413,76,16514,15781,733,7622,...,1990,15150,13602,1548,12142,12134,8,8217,8215,2
2,1005,"Barbour County, Alabama",12101,860,838,22,1975,1931,44,985,...,224,1687,1512,175,1470,1464,6,1028,1028,0
3,1007,"Bibb County, Alabama",10224,697,655,42,1542,1539,3,849,...,199,1408,1309,99,948,948,0,865,865,0
4,1009,"Blount County, Alabama",28938,2040,1965,75,4908,4678,230,2171,...,477,3785,3394,391,3214,3194,20,2186,2178,8


In [60]:
# filter data to create list of all columns designated 'No health insurance coverage'
female_without = female.filter(like='No', axis=1)
female_without_columns = female_without.columns
female_without_columns

Index(['Estimate; Female: - Under 6 years: - No health insurance coverage',
       'Estimate; Female: - 6 to 18 years: - No health insurance coverage',
       'Estimate; Female: - 19 to 25 years: - No health insurance coverage',
       'Estimate; Female: - 26 to 34 years: - No health insurance coverage',
       'Estimate; Female: - 35 to 44 years: - No health insurance coverage',
       'Estimate; Female: - 45 to 54 years: - No health insurance coverage',
       'Estimate; Female: - 55 to 64 years: - No health insurance coverage',
       'Estimate; Female: - 65 to 74 years: - No health insurance coverage',
       'Estimate; Female: - 75 years and over: - No health insurance coverage'],
      dtype='object')

In [61]:
# create new columns using sum operations
female['totalFemale_noCoverage'] = female[female_without_columns].sum(axis=1)
female['under18Female_total'] = female['Estimate; Female: - Under 6 years:'] + female['Estimate; Female: - 6 to 18 years:']
female['under18Female_noCoverage'] = female['Estimate; Female: - Under 6 years: - No health insurance coverage'] + female['Estimate; Female: - 6 to 18 years: - No health insurance coverage']
female['over65Female_total'] = female['Estimate; Female: - 65 to 74 years:'] + female['Estimate; Female: - 75 years and over:']
female['over65Female_noCoverage'] = female['Estimate; Female: - 65 to 74 years: - No health insurance coverage'] + female['Estimate; Female: - 75 years and over: - No health insurance coverage']

In [62]:
#inspect all the columns we now have
list(female.columns)

['Id2',
 'Geography',
 'Estimate; Female:',
 'Estimate; Female: - Under 6 years:',
 'Estimate; Female: - Under 6 years: - With health insurance coverage',
 'Estimate; Female: - Under 6 years: - No health insurance coverage',
 'Estimate; Female: - 6 to 18 years:',
 'Estimate; Female: - 6 to 18 years: - With health insurance coverage',
 'Estimate; Female: - 6 to 18 years: - No health insurance coverage',
 'Estimate; Female: - 19 to 25 years:',
 'Estimate; Female: - 19 to 25 years: - With health insurance coverage',
 'Estimate; Female: - 19 to 25 years: - No health insurance coverage',
 'Estimate; Female: - 26 to 34 years:',
 'Estimate; Female: - 26 to 34 years: - With health insurance coverage',
 'Estimate; Female: - 26 to 34 years: - No health insurance coverage',
 'Estimate; Female: - 35 to 44 years:',
 'Estimate; Female: - 35 to 44 years: - With health insurance coverage',
 'Estimate; Female: - 35 to 44 years: - No health insurance coverage',
 'Estimate; Female: - 45 to 54 years:',
 '

In [63]:
female_coverage = female[['Id2',
                         'Geography',
                         'Estimate; Female:',
                         'totalFemale_noCoverage',
                         'Estimate; Female: - Under 6 years:',
                         'Estimate; Female: - Under 6 years: - No health insurance coverage',
                         'under18Female_total',
                         'under18Female_noCoverage',
                         'over65Female_total',
                         'over65Female_noCoverage']]

In [67]:
# rename columns
female_coverage = female_coverage.rename(columns={'Id2': 'GeoId', 
                                        'Geography': 'name',
                                        'Estimate; Female:': 'totalFemale',
                                        'Estimate; Female: - Under 6 years:': 'under6Female_total',
                                        'Estimate; Female: - Under 6 years: - No health insurance coverage': 'under6Female_noCoverage'})
female_coverage.head()

Unnamed: 0,GeoId,name,totalFemale,totalFemale_noCoverage,under6Female_total,under6Female_noCoverage,under18Female_total,under18Female_noCoverage,over65Female_total,over65Female_noCoverage
0,1001,"Autauga County, Alabama",27923,2585,1816,21,6987,203,4305,0
1,1003,"Baldwin County, Alabama",103394,9831,6489,76,23003,809,20359,10
2,1005,"Barbour County, Alabama",12101,1196,860,22,2835,66,2498,6
3,1007,"Bibb County, Alabama",10224,784,697,42,2239,45,1813,0
4,1009,"Blount County, Alabama",28938,2800,2040,75,6948,305,5400,28


Now repeat for the male columns.

In [68]:
# separate males out to perform operations
colNames = insurance.columns.str.contains('Id2|Geography|Male', case=False)
male = insurance.iloc[:, colNames]
male.head()

Unnamed: 0,Id2,Geography,Estimate; Male:,Margin of Error; Male:,Estimate; Male: - Under 6 years:,Margin of Error; Male: - Under 6 years:,Estimate; Male: - Under 6 years: - With health insurance coverage,Margin of Error; Male: - Under 6 years: - With health insurance coverage,Estimate; Male: - Under 6 years: - No health insurance coverage,Margin of Error; Male: - Under 6 years: - No health insurance coverage,...,Estimate; Female: - 65 to 74 years: - With health insurance coverage,Margin of Error; Female: - 65 to 74 years: - With health insurance coverage,Estimate; Female: - 65 to 74 years: - No health insurance coverage,Margin of Error; Female: - 65 to 74 years: - No health insurance coverage,Estimate; Female: - 75 years and over:,Margin of Error; Female: - 75 years and over:,Estimate; Female: - 75 years and over: - With health insurance coverage,Margin of Error; Female: - 75 years and over: - With health insurance coverage,Estimate; Female: - 75 years and over: - No health insurance coverage,Margin of Error; Female: - 75 years and over: - No health insurance coverage
0,1001,"Autauga County, Alabama",26247,253,1941,175,1919,182,22,28,...,2567,50,0,27,1738,137,1738,137,0,27
1,1003,"Baldwin County, Alabama",97535,444,7221,399,6966,474,255,225,...,12134,110,8,12,8217,115,8215,115,2,5
2,1005,"Barbour County, Alabama",11225,218,880,76,831,94,49,59,...,1464,28,6,9,1028,84,1028,84,0,21
3,1007,"Bibb County, Alabama",10329,201,845,128,845,128,0,21,...,948,57,0,21,865,74,865,74,0,21
4,1009,"Blount County, Alabama",28269,153,1980,84,1897,114,83,76,...,3194,70,20,34,2186,104,2178,103,8,11


### Second insurance dataset

In [70]:
# read insurance coverage data in
insurance_in = pd.read_csv('data/2018-uninsured.csv', encoding = "ISO-8859-1", header=1)
insurance_in.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total!!Subject!!Total civilian noninstitutionalized population,Margin of Error!!Total MOE!!Subject!!Total civilian noninstitutionalized population,Estimate!!Total Uninsured!!Subject!!Total civilian noninstitutionalized population,Margin of Error!!Total Uninsured MOE!!Subject!!Total civilian noninstitutionalized population,Estimate!!Total!!AGE!!Under 19 years,Margin of Error!!Total MOE!!AGE!!Under 19 years,Estimate!!Total Uninsured!!AGE!!Under 19 years,Margin of Error!!Total Uninsured MOE!!AGE!!Under 19 years,...,Estimate!!Total Uninsured!!150 to 199 percent of the poverty level,Margin of Error!!Total Uninsured MOE!!150 to 199 percent of the poverty level,Estimate!!Total!!200 to 299 percent of the poverty level,Margin of Error!!Total MOE!!200 to 299 percent of the poverty level,Estimate!!Total Uninsured!!200 to 299 percent of the poverty level,Margin of Error!!Total Uninsured MOE!!200 to 299 percent of the poverty level,Estimate!!Total!!At or above 300 percent of the poverty level,Margin of Error!!Total MOE!!At or above 300 percent of the poverty level,Estimate!!Total Uninsured!!At or above 300 percent of the poverty level,Margin of Error!!Total Uninsured MOE!!At or above 300 percent of the poverty level
0,0500000US04013,"Maricopa County, Arizona",4380451.0,2418.0,473197.0,19357.0,25.4,0.1,19.4,1.3,...,15.6,1.6,17.2,0.6,22.2,1.9,52.5,0.6,28.9,1.7
1,0500000US04019,"Pima County, Arizona",1017187.0,2565.0,97882.0,8228.0,22.9,0.2,17.7,2.4,...,19.0,3.3,16.8,1.2,22.5,4.3,46.1,1.4,20.8,3.5
2,0500000US06019,"Fresno County, California",982821.0,3189.0,73395.0,5731.0,30.0,0.2,7.8,2.2,...,14.5,3.2,16.8,1.3,20.1,3.5,37.9,1.5,24.5,3.8
3,0500000US06029,"Kern County, California",869297.0,5638.0,71840.0,10384.0,31.1,0.2,15.5,5.0,...,18.4,6.4,18.5,1.5,22.1,5.7,36.2,1.8,20.6,5.1
4,0500000US06037,"Los Angeles County, California",10034790.0,5117.0,906316.0,22946.0,23.1,0.1,8.9,0.5,...,14.5,0.9,16.2,0.4,20.7,1.1,50.0,0.5,30.2,1.2


In [71]:
list(insurance_in.columns)

['id',
 'Geographic Area Name',
 'Estimate!!Total!!Subject!!Total civilian noninstitutionalized population',
 'Margin of Error!!Total MOE!!Subject!!Total civilian noninstitutionalized population',
 'Estimate!!Total Uninsured!!Subject!!Total civilian noninstitutionalized population',
 'Margin of Error!!Total Uninsured MOE!!Subject!!Total civilian noninstitutionalized population',
 'Estimate!!Total!!AGE!!Under 19 years',
 'Margin of Error!!Total MOE!!AGE!!Under 19 years',
 'Estimate!!Total Uninsured!!AGE!!Under 19 years',
 'Margin of Error!!Total Uninsured MOE!!AGE!!Under 19 years',
 'Estimate!!Total!!AGE!!Under 19 years!!Under 6 years',
 'Margin of Error!!Total MOE!!AGE!!Under 19 years!!Under 6 years',
 'Estimate!!Total Uninsured!!AGE!!Under 19 years!!Under 6 years',
 'Margin of Error!!Total Uninsured MOE!!AGE!!Under 19 years!!Under 6 years',
 'Estimate!!Total!!AGE!!Under 19 years!!6 to 18 years',
 'Margin of Error!!Total MOE!!AGE!!Under 19 years!!6 to 18 years',
 'Estimate!!Total Unins

This dataset contains tons of information that I don't need. I really only want to keep the most basic uninsured data: total uninsured per county, and then some by age (young children, elderly).

Columns I want to keep:

- 'id'
- 'Geographic Area Name'
- 'Estimate!!Total!!Subject!!Total civilian noninstitutionalized population'
- 'Estimate!!Total Uninsured!!Subject!!Total civilian noninstitutionalized population'
- 'Estimate!!Total!!AGE!!Under 19 years!!Under 6 years'
- 'Estimate!!Total Uninsured!!AGE!!Under 19 years!!Under 6 years'
- 'Estimate!!Total!!AGE!!Under 19 years'
- 'Estimate!!Total Uninsured!!AGE!!Under 19 years'
- 'Estimate!!Total!!AGE!!65 years and older'
- 'Estimate!!Total Uninsured!!AGE!!65 years and older

In [78]:
insurance = insurance_in[['id',
                         'Geographic Area Name',
                         'Estimate!!Total!!Subject!!Total civilian noninstitutionalized population',
                         'Estimate!!Total Uninsured!!Subject!!Total civilian noninstitutionalized population',
                         'Estimate!!Total!!AGE!!Under 19 years!!Under 6 years',
                         'Estimate!!Total Uninsured!!AGE!!Under 19 years!!Under 6 years',
                         'Estimate!!Total!!AGE!!Under 19 years',
                         'Estimate!!Total Uninsured!!AGE!!Under 19 years',
                         'Estimate!!Total!!AGE!!65 years and older',
                         'Estimate!!Total Uninsured!!AGE!!65 years and older']]

insurance.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total!!Subject!!Total civilian noninstitutionalized population,Estimate!!Total Uninsured!!Subject!!Total civilian noninstitutionalized population,Estimate!!Total!!AGE!!Under 19 years!!Under 6 years,Estimate!!Total Uninsured!!AGE!!Under 19 years!!Under 6 years,Estimate!!Total!!AGE!!Under 19 years,Estimate!!Total Uninsured!!AGE!!Under 19 years,Estimate!!Total!!AGE!!65 years and older,Estimate!!Total Uninsured!!AGE!!65 years and older
0,0500000US04013,"Maricopa County, Arizona",4380451.0,473197.0,7.6,4.4,25.4,19.4,15.1,1.1
1,0500000US04019,"Pima County, Arizona",1017187.0,97882.0,6.7,3.2,22.9,17.7,20.0,1.1
2,0500000US06019,"Fresno County, California",982821.0,73395.0,9.6,2.1,30.0,7.8,12.1,1.5
3,0500000US06029,"Kern County, California",869297.0,71840.0,9.2,2.7,31.1,15.5,11.1,0.9
4,0500000US06037,"Los Angeles County, California",10034790.0,906316.0,7.1,2.0,23.1,8.9,13.4,2.0


In [79]:
# rename columns
insurance = insurance.rename(columns={'Geographic Area Name': 'name',
                                     'Estimate!!Total!!Subject!!Total civilian noninstitutionalized population': 'totalPop',
                                     'Estimate!!Total Uninsured!!Subject!!Total civilian noninstitutionalized population': 'totalUninsured',
                                     'Estimate!!Total!!AGE!!Under 19 years!!Under 6 years': 'under6',
                                     'Estimate!!Total Uninsured!!AGE!!Under 19 years!!Under 6 years': 'under6Uninsured',
                                     'Estimate!!Total!!AGE!!Under 19 years': 'under19',
                                     'Estimate!!Total Uninsured!!AGE!!Under 19 years': 'under19Uninsured',
                                     'Estimate!!Total!!AGE!!65 years and older': 'over65',
                                     'Estimate!!Total Uninsured!!AGE!!65 years and older': 'over65Uninsured'})

insurance.head()

Unnamed: 0,id,name,totalPop,totalUninsured,under6,under6Uninsured,under19,under19Uninsured,over65,over65Uninsured
0,0500000US04013,"Maricopa County, Arizona",4380451.0,473197.0,7.6,4.4,25.4,19.4,15.1,1.1
1,0500000US04019,"Pima County, Arizona",1017187.0,97882.0,6.7,3.2,22.9,17.7,20.0,1.1
2,0500000US06019,"Fresno County, California",982821.0,73395.0,9.6,2.1,30.0,7.8,12.1,1.5
3,0500000US06029,"Kern County, California",869297.0,71840.0,9.2,2.7,31.1,15.5,11.1,0.9
4,0500000US06037,"Los Angeles County, California",10034790.0,906316.0,7.1,2.0,23.1,8.9,13.4,2.0
