# SC1015 : Introduction to Data Science and Artificial Intelligence
# Mini-Project : Spread of Covid in US Counties

In [166]:
# Basic Libraries
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

In [167]:
# Full county-level Covid time series: one row per county per date, with
# cumulative-looking Confirmed / Deaths columns (TODO confirm they are cumulative).
usa_county = pd.read_csv("usa_county_wise.csv")
usa_county.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,63072001,PR,PRI,630,72001.0,Adjuntas,Puerto Rico,US,18.180117,-66.754367,"Adjuntas, Puerto Rico, US",1/22/20,0,0
4,63072003,PR,PRI,630,72003.0,Aguada,Puerto Rico,US,18.360255,-67.175131,"Aguada, Puerto Rico, US",1/22/20,0,0


Here, we have a dataset with over 600,000 data points. It covers all 3,000+ counties over a span of more than 6 months, recording the number of confirmed Covid cases and deaths per day. While this is very extensive, we felt that not all of the data was relevant: we wanted to show a general trend, so working with all 600,000 data points was not necessary. The code below was used to truncate the data set by filtering on a specific date, in this case 27 July 2020.

In [168]:
# Build the single-day snapshot that the rest of the notebook reads back in.
# The work is skipped when the file already exists, so an existing snapshot is
# never overwritten, while a fresh "Restart & Run All" can still recreate it.
SNAPSHOT_CSV = Path('usa_county_27july.csv')
if not SNAPSHOT_CSV.exists():
    usa_county_27july = (
        usa_county[usa_county['Date'] == '7/27/20']
        # The snapshot is later merged with the demographic tables on
        # 'Geographic Identifier - FIPS Code', so the raw 'FIPS' / 'Admin2'
        # columns are renamed to the names the snapshot file is read with.
        .rename(columns={'FIPS': 'Geographic Identifier - FIPS Code',
                         'Admin2': 'County_Name'})
    )
    usa_county_27july.to_csv(SNAPSHOT_CSV, index=False)

Having done that, we will start importing all the relevant data sets, namely:
> Average Household Size and Population Density  
> Health Insurance Coverage  
> Income and Benefits  
> Occupation by Sex  
> Population and Poverty Status  
> USA County Covid Tracking

We felt these demographic factors might give insight into how and why Covid may have spread differently in each county. We hope to merge these data sets, as part of our data cleaning, so that we may view the data points together.

In [169]:
# Average household size and population density, per county.
housePop = pd.read_csv("Average_Household_Size_and_Population_Density_-_County.csv")
housePop.head(n=5)

Unnamed: 0,COUNTYNS,Geographic Identifier - FIPS Code,Area of Land (square meters),Area of Water (square meters),Name,State,Average Household Size,Average Household Size - Margin of Error,Average Household Size of Owner-Occupied Unit,Average Household Size of Owner-Occupied Unit - Margin of Error,...,Total Population,Total Population - Margin of Error,Population Density (people per square kilometer),created_user,created_date,last_edited_user,last_edited_date,Shape__Area,Shape__Length,Population Density - Margin of Error
0,161526,1001,1539602000.0,25706961,Autauga County,Alabama,2.59,0.05,2.59,0.07,...,55200,,35.853419,esri_demographics,1/4/2020 20:53,esri_demographics,1/4/2020 20:53,2.066037,0.150256,
1,161527,1003,4117547000.0,1133055836,Baldwin County,Alabama,2.61,0.04,2.66,0.06,...,208107,,50.541504,esri_demographics,1/4/2020 20:53,esri_demographics,1/4/2020 20:53,4.483746,0.409904,
2,161528,1005,2292145000.0,50538698,Barbour County,Alabama,2.49,0.07,2.44,0.11,...,25782,,11.247981,esri_demographics,1/4/2020 20:53,esri_demographics,1/4/2020 20:53,2.695262,0.22327,
3,161529,1007,1612167000.0,9602089,Bibb County,Alabama,2.99,0.14,3.05,0.18,...,22527,,13.973114,esri_demographics,1/4/2020 20:53,esri_demographics,1/4/2020 20:53,1.887514,0.156473,
4,161530,1009,1670104000.0,15015423,Blount County,Alabama,2.77,0.05,2.85,0.07,...,57645,,34.515816,esri_demographics,1/4/2020 20:53,esri_demographics,1/4/2020 20:53,2.423552,0.164405,


In [170]:
# Summary statistics of the numeric columns; the `count` row doubles as a
# quick row-count / missing-value check.
housePop.describe()

Unnamed: 0,COUNTYNS,Geographic Identifier - FIPS Code,Area of Land (square meters),Area of Water (square meters),Average Household Size,Average Household Size - Margin of Error,Average Household Size of Owner-Occupied Unit,Average Household Size of Owner-Occupied Unit - Margin of Error,Average Household Size of Renter-Occupied Unit,Average Household Size of Renter-Occupied Unit - Margin of Error,Total Population,Total Population - Margin of Error,Population Density (people per square kilometer),Shape__Area,Shape__Length,Population Density - Margin of Error
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3219.0,3219.0,3220.0,3220.0,3220.0,116.0,3220.0,3220.0,3220.0,0.0
mean,962889.7,31393.60528,2844093000.0,214325100.0,2.529093,0.081494,2.563088,0.102336,2.44495,0.202314,101332.3,126.905172,112.623419,2.547094,0.342884,
std,517423.6,16292.078954,9255389000.0,1220804000.0,0.279374,0.067781,0.274885,0.07981,0.393263,0.182878,326096.4,55.460127,693.24194,3.0572,1.788998,
min,23901.0,1001.0,5300265.0,0.0,1.34,0.01,1.79,0.01,1.36,0.01,75.0,16.0,0.014362,0.131432,0.00055,
25%,484988.8,19032.5,1085268000.0,7009854.0,2.35,0.04,2.4,0.05,2.2,0.1,11214.25,94.25,6.636347,1.631522,0.114605,
50%,974158.5,30024.0,1565689000.0,19386010.0,2.49,0.07,2.52,0.09,2.4,0.17,25950.5,116.5,17.82277,1.984084,0.164951,
75%,1419973.0,46105.5,2368884000.0,60692410.0,2.65,0.1,2.69,0.13,2.64,0.25,66552.25,157.0,51.347991,2.629431,0.2466,
max,2516404.0,72153.0,377000000000.0,25992280000.0,4.97,1.14,4.88,1.46,6.81,5.49,10098050.0,380.0,27819.8048,63.716966,74.257081,


In [171]:
# Health insurance coverage (2015-2019 table), per county.
healthInc = pd.read_csv("Health_Insurance_Coverage_-_Counties_2015-2019.csv")
healthInc.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,County_Name,LSAD,ALAND,AWATER,GEO_PARENT_NAME,...,Civilian noninstitutionalized population under 19 years,Civilian noninstitutionalized population under 19 years *margin of error,Total Civilian Noninstitutionalized Population under 19 yrs - No health insurance coverage,Total Civilian Noninstitutionalized Population under 19 yrs - No health insurance coverage *margin of error,Percent of Population with No Health Insurance Coverage,Percent of Population with No Health Insurance Coverage *margin of error,Total Civilian Noninstitutionalized Population - No health insurance coverage,Total Civilian Noninstitutionalized Population - No health insurance coverage *margin of error,Shape__Area,Shape__Length
0,1,1,161526,0500000US01001,1001,Autauga,6,1539602000.0,25706961,Alabama,...,13915,166,242,131,7.1,1.0,3856,561,2.052664,0.150257
1,1,3,161527,0500000US01003,1003,Baldwin,6,4117622000.0,1132980868,Alabama,...,48427,382,1836,537,8.9,0.7,18679,1449,4.278237,0.409922
2,1,5,161528,0500000US01005,1005,Barbour,6,2292160000.0,50523213,Alabama,...,5594,76,184,110,11.3,1.5,2544,342,2.566226,0.223253
3,1,7,161529,0500000US01007,1007,Bibb,6,1612167000.0,9602089,Alabama,...,4759,77,94,82,10.7,2.2,2201,466,1.886955,0.156525
4,1,9,161530,0500000US01009,1009,Blount,6,1670104000.0,15015467,Alabama,...,13974,151,822,347,10.8,1.4,6159,810,2.392512,0.164404


In [172]:
# Summary statistics of the numeric columns; the `count` row doubles as a
# quick row-count / missing-value check.
healthInc.describe()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,LSAD,ALAND,AWATER,POP_DENSITY,Employer Insurance only - Pop < 19,Employer Insurance only - Pop < 19 *margin of error,...,Civilian noninstitutionalized population under 19 years,Civilian noninstitutionalized population under 19 years *margin of error,Total Civilian Noninstitutionalized Population under 19 yrs - No health insurance coverage,Total Civilian Noninstitutionalized Population under 19 yrs - No health insurance coverage *margin of error,Percent of Population with No Health Insurance Coverage,Percent of Population with No Health Insurance Coverage *margin of error,Total Civilian Noninstitutionalized Population - No health insurance coverage,Total Civilian Noninstitutionalized Population - No health insurance coverage *margin of error,Shape__Area,Shape__Length
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,...,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0
mean,31.290683,102.92205,962889.7,31393.60528,6.569565,2844436000.0,213976100.0,112.750562,11564.554969,524.306211,...,24350.4,162.774845,1232.237888,233.813043,9.549627,1.734658,8839.888199,605.52764,2.581581,0.342835
std,16.277207,106.693846,517423.6,16292.078954,2.652226,9256066000.0,1220470000.0,692.745183,36062.877369,605.854098,...,79115.33,146.676597,5043.523352,293.690871,5.086483,1.404545,35466.447087,749.167229,3.558888,1.790048
min,1.0,1.0,23901.0,1001.0,0.0,5300262.0,0.0,0.014261,1.0,4.0,...,1.0,4.0,0.0,2.0,0.0,0.1,0.0,4.0,0.12611,0.00055
25%,19.0,35.0,484988.8,19032.5,6.0,1085267000.0,7004659.0,6.683385,944.25,192.0,...,2590.5,72.0,114.0,72.0,5.7,0.9,923.75,212.0,1.62224,0.114586
50%,30.0,79.0,974158.5,30024.0,6.0,1565686000.0,19302000.0,17.895454,2415.0,340.5,...,6118.5,121.0,325.5,150.0,8.6,1.4,2259.5,400.0,1.975052,0.164877
75%,46.0,133.0,1419973.0,46105.5,6.0,2368903000.0,60223250.0,51.768885,7056.5,614.0,...,15791.75,195.0,832.0,283.25,12.0,2.2,5625.5,699.0,2.594089,0.2466
max,72.0,840.0,2516404.0,72153.0,25.0,377000000000.0,25973300000.0,27806.76695,937617.0,8901.0,...,2346589.0,1712.0,158267.0,4518.0,46.3,36.3,965181.0,11067.0,71.549539,74.257061


In [173]:
# Household income and benefits (2015-2019 table), per county.
incBen = pd.read_csv("Income_and_Benefits_-_Counties_2015-2019.csv")
incBen.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,GEO_PARENT_NAME,...,Total Households - With Supplemental Security Income,Total Households - With Supplemental Security Income *margin of error,Total Households - With cash public assistance income,Total Households - With cash public assistance income *margin of error,Total Households - With Food Stamp/SNAP benefits in the past 12 months,Total Households - With Food Stamp/SNAP benefits in the past 12 months *margin of error,"Percent of Households with income in the past 12 months that was less than $75,000","Percent of Households with income in the past 12 months that was less than $75,000 *margin of error",SHAPE_Length,SHAPE_Area
0,1,1,161526,0500000US01001,1001,Autauga,6,1539602000.0,25706961,Alabama,...,1302,235,253,102,2746,407,60.9,3.7,2.052664,0.150257
1,1,3,161527,0500000US01003,1003,Baldwin,6,4117622000.0,1132980868,Alabama,...,3568,408,910,264,6269,678,61.1,2.0,4.278237,0.409922
2,1,5,161528,0500000US01005,1005,Barbour,6,2292160000.0,50523213,Alabama,...,1019,184,244,80,2509,238,80.1,4.7,2.566226,0.223253
3,1,7,161529,0500000US01007,1007,Bibb,6,1612167000.0,9602089,Alabama,...,625,183,148,91,1113,253,69.0,6.7,1.886955,0.156525
4,1,9,161530,0500000US01009,1009,Blount,6,1670104000.0,15015467,Alabama,...,1721,286,361,137,1977,264,70.1,3.6,2.392512,0.164404


In [174]:
# Summary statistics of the numeric columns; the `count` row doubles as a
# quick row-count / missing-value check.
incBen.describe()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,LSAD,ALAND,AWATER,POP_DENSITY,Households: Receiving Food Stamps/SNAP (%),Households: Receiving Food Stamps/SNAP (%) *margin of error,...,Total Households - With Supplemental Security Income,Total Households - With Supplemental Security Income *margin of error,Total Households - With cash public assistance income,Total Households - With cash public assistance income *margin of error,Total Households - With Food Stamp/SNAP benefits in the past 12 months,Total Households - With Food Stamp/SNAP benefits in the past 12 months *margin of error,"Percent of Households with income in the past 12 months that was less than $75,000","Percent of Households with income in the past 12 months that was less than $75,000 *margin of error",SHAPE_Length,SHAPE_Area
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,...,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3142.0,3142.0,3220.0,3220.0
mean,31.290683,102.92205,962889.7,31393.60528,6.569565,2844537000.0,213976100.0,112.750562,13.553571,2.143696,...,2002.686025,204.53354,915.497516,133.21646,4549.725776,306.002795,66.598918,4.706429,2.581581,0.342835
std,16.277207,106.693846,517423.6,16292.078954,2.652226,9257767000.0,1220470000.0,692.745183,8.156595,1.362611,...,6644.036851,217.6244,3351.434522,157.858683,14204.635178,315.133197,10.240971,3.000541,3.558888,1.790048
min,1.0,1.0,23901.0,1001.0,0.0,5300262.0,0.0,0.014261,0.0,0.1,...,0.0,2.0,0.0,2.0,0.0,2.0,22.8,0.2,0.12611,0.00055
25%,19.0,35.0,484988.8,19032.5,6.0,1085267000.0,7004659.0,6.683385,8.4,1.2,...,225.75,74.0,81.0,44.0,523.0,120.0,61.2,2.8,1.62224,0.114586
50%,30.0,79.0,974158.5,30024.0,6.0,1565686000.0,19302000.0,17.895454,12.2,1.9,...,615.5,150.0,213.0,87.0,1373.5,221.5,68.0,4.4,1.975052,0.164877
75%,46.0,133.0,1419973.0,46105.5,6.0,2368903000.0,60223250.0,51.768885,16.5,2.8,...,1522.75,251.0,590.25,161.0,3431.0,377.0,73.6,6.0,2.594089,0.2466
max,72.0,840.0,2516404.0,72153.0,25.0,377039000000.0,25973300000.0,27806.76695,62.8,19.1,...,223761.0,2663.0,112441.0,2009.0,289808.0,3638.0,89.1,69.1,71.549539,74.257061


In [175]:
# Occupation by sex (2015-2019 table), per county.
occupSex = pd.read_csv("Occupation_by_Sex_-_Counties_2015-2019.csv")
occupSex.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,GEO_PARENT_NAME,...,Percent of Civilian Employed Population 16 Years and Over in Food Preparation and Serving Related Occupations,Percent of Civilian Employed Population 16 Years and Over in Food Preparation and Serving Related Occupations*margin of error,Percent of Civilian Employed Population 16 Years and Over in Sales and Related Occupations,Percent of Civilian Employed Population 16 Years and Over in Sales and Related Occupations*margin of error,"Male - Natural resources, construction, and maintenance occupations","Percent of Civilian Employed Population 16 Years and Over in Natural Resources, Construction, and Maintenance Occupations","Percent of Civilian Employed Population 16 Years and Over in Natural Resources, Construction, and Maintenance Occupations*margin of error","Total - Production, transportation, and material moving occupations*margin of error",SHAPE_Length,SHAPE_Area
0,1,1,161526,0500000US01001,1001,Autauga,6,1539602000.0,25706961,Alabama,...,6.0,1.4,10.2,1.6,1882,7.8,1.2,553,2.052664,0.150257
1,1,3,161527,0500000US01003,1003,Baldwin,6,4117622000.0,1132980868,Alabama,...,6.3,0.7,13.1,1.1,8375,9.3,1.0,1053,4.278237,0.409922
2,1,5,161528,0500000US01005,1005,Barbour,6,2292160000.0,50523213,Alabama,...,4.2,1.3,7.4,1.7,1122,14.1,2.5,263,2.566226,0.223253
3,1,7,161529,0500000US01007,1007,Bibb,6,1612167000.0,9602089,Alabama,...,3.6,1.5,11.5,3.1,1356,16.8,3.7,359,1.886955,0.156525
4,1,9,161530,0500000US01009,1009,Blount,6,1670104000.0,15015467,Alabama,...,3.2,0.9,8.1,1.3,3428,16.2,1.7,448,2.392512,0.164404


In [176]:
# Summary statistics of the numeric columns; the `count` row doubles as a
# quick row-count / missing-value check.
occupSex.describe()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,LSAD,ALAND,AWATER,Total - Civilian employed population 16 years and over,Total - Civilian employed population 16 years and over *margin of error,Male - Civilian employed population 16 years and over,...,Percent of Civilian Employed Population 16 Years and Over in Food Preparation and Serving Related Occupations,Percent of Civilian Employed Population 16 Years and Over in Food Preparation and Serving Related Occupations*margin of error,Percent of Civilian Employed Population 16 Years and Over in Sales and Related Occupations,Percent of Civilian Employed Population 16 Years and Over in Sales and Related Occupations*margin of error,"Male - Natural resources, construction, and maintenance occupations","Percent of Civilian Employed Population 16 Years and Over in Natural Resources, Construction, and Maintenance Occupations","Percent of Civilian Employed Population 16 Years and Over in Natural Resources, Construction, and Maintenance Occupations*margin of error","Total - Production, transportation, and material moving occupations*margin of error",SHAPE_Length,SHAPE_Area
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,...,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0
mean,31.290683,102.92205,962889.7,31393.60528,6.569565,2844537000.0,213976100.0,48407.05,638.408075,25418.0,...,5.457453,1.599907,9.231553,1.931398,4077.551553,12.538665,2.265466,403.540683,2.581581,0.342835
std,16.277207,106.693846,517423.6,16292.078954,2.652226,9257767000.0,1220470000.0,160707.0,677.716564,85584.93,...,1.830762,1.358426,2.166414,1.386342,12262.954236,4.166201,1.774543,422.525752,3.558888,1.790048
min,1.0,1.0,23901.0,1001.0,0.0,5300262.0,0.0,33.0,11.0,25.0,...,0.0,0.1,0.0,0.1,0.0,0.0,0.1,5.0,0.12611,0.00055
25%,19.0,35.0,484988.8,19032.5,6.0,1085267000.0,7004659.0,4608.0,253.0,2469.0,...,4.4,0.9,7.9,1.1,583.0,9.7,1.1,156.0,1.62224,0.114586
50%,30.0,79.0,974158.5,30024.0,6.0,1565686000.0,19302000.0,10750.0,437.0,5711.5,...,5.3,1.3,9.3,1.7,1311.5,12.1,1.9,282.0,1.975052,0.164877
75%,46.0,133.0,1419973.0,46105.5,6.0,2368903000.0,60223250.0,29483.0,748.25,15582.25,...,6.3,1.925,10.6,2.4,3189.5,14.8,2.9,485.25,2.594089,0.2466
max,72.0,840.0,2516404.0,72153.0,25.0,377039000000.0,25973300000.0,4929863.0,9150.0,2663007.0,...,20.1,42.4,22.1,42.4,359772.0,46.0,42.4,5282.0,71.549539,74.257061


In [177]:
# Population and poverty status (2015-2019 table), per county.
popPov = pd.read_csv("Population_and_Poverty_Status_-_Counties_2015-2019.csv")
popPov.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,GEO_PARENT_NAME,...,Percentage of people whose income in the past 12 months is below the poverty level - Related children of the householder under 5 years,Percentage of people whose income in the past 12 months is below the poverty level - Related children of the householder under 5 years *margin of error,Percentage of people whose income in the past 12 months is below the poverty level - 65 years and over,Percentage of people whose income in the past 12 months is below the poverty level - 65 years and over *margin of error,Percentage of people whose income in the past 12 months is below the poverty level,Percentage of people whose income in the past 12 months is below the poverty level - All people *margin of error,Total NonFamily Hhlds w/Male Householder *margin of error,Total NonFamily Hhlds w/Female Householder *margin of error,Shape__Area,Shape__Length
0,1,1,161526,0500000US01001,1001,Autauga,6,1539602000.0,25706961,Alabama,...,21.2,6.2,8.7,2.1,15.2,1.8,390,385,2.052664,0.150257
1,1,3,161527,0500000US01003,1003,Baldwin,6,4117622000.0,1132980868,Alabama,...,16.0,3.7,7.4,1.4,10.4,0.9,891,1079,4.278237,0.409922
2,1,5,161528,0500000US01005,1005,Barbour,6,2292160000.0,50523213,Alabama,...,59.0,7.1,16.8,3.1,30.7,2.4,235,219,2.566226,0.223253
3,1,7,161529,0500000US01007,1007,Bibb,6,1612167000.0,9602089,Alabama,...,17.3,12.6,6.9,3.2,18.1,4.5,216,228,1.886955,0.156525
4,1,9,161530,0500000US01009,1009,Blount,6,1670104000.0,15015467,Alabama,...,21.0,5.4,10.9,2.8,13.6,1.7,368,378,2.392512,0.164404


In [178]:
# Summary statistics of the numeric columns; the `count` row doubles as a
# quick row-count / missing-value check.
popPov.describe()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,LSAD,ALAND,AWATER,POP_DENSITY,Total Households Below the Poverty Level,Total Households Below the Poverty Level *margin of error,...,Percentage of people whose income in the past 12 months is below the poverty level - Related children of the householder under 5 years,Percentage of people whose income in the past 12 months is below the poverty level - Related children of the householder under 5 years *margin of error,Percentage of people whose income in the past 12 months is below the poverty level - 65 years and over,Percentage of people whose income in the past 12 months is below the poverty level - 65 years and over *margin of error,Percentage of people whose income in the past 12 months is below the poverty level,Percentage of people whose income in the past 12 months is below the poverty level - All people *margin of error,Total NonFamily Hhlds w/Male Householder *margin of error,Total NonFamily Hhlds w/Female Householder *margin of error,Shape__Area,Shape__Length
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,...,3219.0,3219.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0
mean,31.290683,102.92205,962889.7,31393.60528,6.569565,2844436000.0,213976100.0,112.750562,5014.081988,331.46087,...,24.442591,9.207766,10.665559,3.136925,15.893385,2.508944,341.631988,345.832609,2.581581,0.342835
std,16.277207,106.693846,517423.6,16292.078954,2.652226,9256066000.0,1220470000.0,692.745183,15831.216075,335.315926,...,14.312147,7.020335,6.730415,3.05497,8.087809,1.608233,354.654053,372.099228,3.558888,1.790048
min,1.0,1.0,23901.0,1001.0,0.0,5300262.0,0.0,0.014261,2.0,2.0,...,0.0,0.5,0.0,0.2,2.4,0.1,7.0,12.0,0.12611,0.00055
25%,19.0,35.0,484988.8,19032.5,6.0,1085267000.0,7004659.0,6.683385,661.0,135.0,...,14.75,4.8,7.1,1.6,10.7,1.4,140.0,135.0,1.62224,0.114586
50%,30.0,79.0,974158.5,30024.0,6.0,1565686000.0,19302000.0,17.895454,1557.0,240.0,...,22.0,7.8,9.2,2.6,14.4,2.2,237.0,235.0,1.975052,0.164877
75%,46.0,133.0,1419973.0,46105.5,6.0,2368903000.0,60223250.0,51.768885,3812.0,409.25,...,31.1,11.6,12.0,4.0,18.8,3.3,406.0,405.25,2.594089,0.2466
max,72.0,840.0,2516404.0,72153.0,25.0,377000000000.0,25973300000.0,27806.76695,481979.0,4539.0,...,100.0,100.0,60.0,100.0,64.5,19.0,4343.0,4295.0,71.549539,74.257061


In [179]:
# Covid counts for the single-date (7/27/20) snapshot.  In this file the FIPS
# column is named 'Geographic Identifier - FIPS Code', the same label housePop uses.
usa_covid = pd.read_csv("usa_county_27july.csv")
usa_covid.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,Geographic Identifier - FIPS Code,County_Name,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",7/27/20,932,20
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",7/27/20,2727,17
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",7/27/20,529,4
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",7/27/20,327,2
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",7/27/20,601,1


In [180]:
# Summary statistics of the numeric columns; comparing the `count` row with the
# demographic tables shows how many rows each data set has.
usa_covid.describe()

Unnamed: 0,UID,code3,Geographic Identifier - FIPS Code,Lat,Long_,Confirmed,Deaths
count,3340.0,3340.0,3330.0,3340.0,3340.0,3340.0,3340.0
mean,83429580.0,834.491617,33061.684685,36.707212,-88.601474,1284.508683,44.314671
std,4315345.0,36.498055,18638.940791,9.062922,21.718982,6844.463318,448.443711
min,16.0,16.0,60.0,-14.271,-174.1596,0.0,0.0
25%,84018110.0,840.0,19079.5,33.895587,-97.790204,39.0,0.0
50%,84029210.0,840.0,31014.0,38.002344,-89.48671,151.0,2.0
75%,84046120.0,840.0,47130.5,41.573069,-82.311265,569.0,12.0
max,84100000.0,850.0,99999.0,69.314792,145.6739,224051.0,23500.0


As we can see, most of the data sets have 3,220 data points, while the Covid data set has 3,340 data points. Looking into the file, we found out why: the Covid data set also includes categories for travellers. People who travelled out of state and caught Covid were still registered, but under a different category from their home state. As such, there are some additional data points which are not reflected in the other data sets. To enable us to perform a coherent study, we decided to ignore these data points for now, and will do a separate investigation on them later.

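Now, we will start merging the data sets one at a time with the pandas merge command, using the GNIS County Code (`COUNTYNS`) as the key. The Covid data set has no GNIS code, so it is merged last, using the FIPS code as the key instead. (FOLLOW UP)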

In [181]:
# Start the combined table: inner join (the pandas default) on the GNIS county code.
county_covid = housePop.merge(healthInc, how="inner", on="COUNTYNS")

In [182]:
# Add the income-and-benefits columns (inner join on the GNIS county code).
county_covid = county_covid.merge(incBen, how="inner", on="COUNTYNS")

In [183]:
# Add the occupation-by-sex columns (inner join on the GNIS county code).
county_covid = county_covid.merge(occupSex, how="inner", on="COUNTYNS")

In [184]:
# Add the population-and-poverty columns (inner join on the GNIS county code).
#
# popPov repeats identifier columns (STATEFP, GEOID, ...) that the earlier merges
# have already suffixed to *_x / *_y.  A plain pd.merge would emit those suffixed
# labels a second time: pandas >= 2.0 raises MergeError for that, older versions
# only warned and produced duplicate labels.  Apply the default suffixes by hand
# and keep the first occurrence of every label, which yields the same columns
# that survive the notebook's subsequent duplicate-column removal.
shared_cols = [col for col in popPov.columns
               if col != "COUNTYNS" and col in county_covid.columns]
demographics = county_covid.rename(columns={col: f"{col}_x" for col in shared_cols})
demographics = demographics.loc[:, ~demographics.columns.duplicated()]
poverty = popPov.rename(columns={col: f"{col}_y" for col in shared_cols})
# Drop popPov columns whose suffixed label is already present on the left side.
poverty = poverty.loc[:, ~poverty.columns.isin(demographics.columns.difference(["COUNTYNS"]))]
county_covid = pd.merge(demographics, poverty, on="COUNTYNS")

In [185]:
# Disabled intermediate export of the demographics-only merge.  The same file name
# is written by a later cell, after the Covid counts have been merged in.
#county_covid.to_csv('county_megaset.csv', index=False)

In [186]:
# Inspect the FIPS column, which is the key for the upcoming merge with the Covid data.
county_covid['Geographic Identifier - FIPS Code'].describe()

count     3220.000000
mean     31393.605280
std      16292.078954
min       1001.000000
25%      19032.500000
50%      30024.000000
75%      46105.500000
max      72153.000000
Name: Geographic Identifier - FIPS Code, dtype: float64

Since we are merging similar data sets, some of the resulting columns have identical names. To avoid future complications, we will remove those duplicates first.

In [187]:
# Where a column label occurs more than once, keep only its first occurrence.
county_covid = county_covid.loc[:, np.logical_not(county_covid.columns.duplicated(keep="first"))]

In [188]:
# Attach the Covid counts.  This join is keyed on the FIPS code rather than
# COUNTYNS; being an inner join, Covid rows without a matching county are dropped.
county_covid = county_covid.merge(usa_covid, how="inner", on="Geographic Identifier - FIPS Code")

In [189]:
# Summary statistics of the merged table; the `count` row shows how many
# counties remain after the joins.
county_covid.describe()

Unnamed: 0,COUNTYNS,Geographic Identifier - FIPS Code,Area of Land (square meters),Area of Water (square meters),Average Household Size,Average Household Size - Margin of Error,Average Household Size of Owner-Occupied Unit,Average Household Size of Owner-Occupied Unit - Margin of Error,Average Household Size of Renter-Occupied Unit,Average Household Size of Renter-Occupied Unit - Margin of Error,...,Total NonFamily Hhlds w/Male Householder *margin of error,Total NonFamily Hhlds w/Female Householder *margin of error,Shape__Area,Shape__Length,UID,code3,Lat,Long_,Confirmed,Deaths
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3219.0,3219.0,3220.0,3220.0,...,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0
mean,962889.7,31393.60528,2844093000.0,214325100.0,2.529093,0.081494,2.563088,0.102336,2.44495,0.202314,...,341.631988,345.832609,2.581581,0.342835,83522700.0,834.913043,37.965788,-91.661759,1316.137578,45.27764
std,517423.6,16292.078954,9255389000.0,1220804000.0,0.279374,0.067781,0.274885,0.07981,0.393263,0.182878,...,354.654053,372.099228,3.558888,1.790048,3222722.0,32.290983,6.0952,13.388721,6962.304363,456.31641
min,23901.0,1001.0,5300265.0,0.0,1.34,0.01,1.79,0.01,1.36,0.01,...,7.0,12.0,0.12611,0.00055,63072000.0,630.0,17.982429,-174.1596,0.0,0.0
25%,484988.8,19032.5,1085268000.0,7009854.0,2.35,0.04,2.4,0.05,2.2,0.1,...,140.0,135.0,1.62224,0.114586,84018060.0,840.0,34.353621,-98.086287,43.0,0.0
50%,974158.5,30024.0,1565689000.0,19386010.0,2.49,0.07,2.52,0.09,2.4,0.17,...,237.0,235.0,1.975052,0.164877,84029100.0,840.0,38.212711,-89.945118,158.0,2.0
75%,1419973.0,46105.5,2368884000.0,60692410.0,2.65,0.1,2.69,0.13,2.64,0.25,...,406.0,405.25,2.594089,0.2466,84045040.0,840.0,41.694461,-82.990516,585.0,13.0
max,2516404.0,72153.0,377000000000.0,25992280000.0,4.97,1.14,4.88,1.46,6.81,5.49,...,4343.0,4295.0,71.549539,74.257061,84056040.0,840.0,69.314792,-65.28813,224051.0,23500.0


In [190]:
# Persist the merged table (without the pandas index) for the later analysis steps.
county_covid.to_csv('county_megaset.csv', index=False)

Now we have the data set that we want to work with, on which we will conduct our Exploratory Data Analysis.