In [1]:
# The following program cleans and then outputs the cleaned broadband data.
# The data was originally downloaded as an Excel file from the following link, then saved as a CSV.
#    https://techdatasociety.asu.edu/broadband-data-portal/dataaccess/countydata

# Dependencies
import pandas as pd

# File locations used by the notebook (relative to the working directory).
# import_path: the raw broadband CSV that gets read in.
import_path: str = "CSV_files/broadband_long2000-2018rev.csv"
# output_path: where the cleaned table is written.
output_path: str = "CSV_files/clean_broadband.csv"

In [2]:
# Load the raw table from disk.
broadband_df = pd.read_csv(import_path)

# Build the cleaned frame in a single chain:
#   1. discard every row that has at least one missing value,
#   2. give the columns human-readable names,
#   3. multiply broadband access by 100 so it is expressed as a percentage.
clean_df = (
    broadband_df
    .dropna(how="any")
    .rename(
        columns={
            "statenam": "State",
            "county": "County",
            "year": "Year",
            "id": "GEO ID",
            "broadband": "Broadband (%)",
            "cfips": "CFIPS",
        }
    )
    .assign(**{"Broadband (%)": lambda frame: 100 * frame["Broadband (%)"]})
)

# Display the cleaned frame as the cell's output.
clean_df

Unnamed: 0,State,County,Year,GEO ID,Broadband (%),CFIPS
17,Alabama,Autauga County,2017,0500000US01001,61.818182,1001
18,Alabama,Autauga County,2018,0500000US01001,78.899997,1001
26,Alabama,Baldwin County,2007,0500000US01003,58.852088,1003
27,Alabama,Baldwin County,2008,0500000US01003,59.151804,1003
28,Alabama,Baldwin County,2009,0500000US01003,59.451514,1003
...,...,...,...,...,...,...
59488,Wyoming,Uinta County,2018,0500000US56041,88.200003,56041
59506,Wyoming,Washakie County,2017,0500000US56043,62.295079,56043
59507,Wyoming,Washakie County,2018,0500000US56043,78.299993,56043
59525,Wyoming,Weston County,2017,0500000US56045,58.620691,56045


In [3]:
# Sanity check: report the extremes of the percentage column so it can be
# confirmed by eye that every value lies in an acceptable range.
broadband_pct = clean_df["Broadband (%)"]
print(f'The maximum value is {broadband_pct.max()}')
print(f'The minimum value is {broadband_pct.min()}')

The maximum value is 97.388148
The minimum value is 1.125842


In [4]:
# Drop all duplicate rows, which are the ones with the same year and location
# (same "Year" and "GEO ID"); the first row of each such pair is kept.
# drop_duplicates returns a new DataFrame instead of modifying clean_df in
# place, so the result must be assigned back -- otherwise the duplicates would
# still be present when clean_df is written to CSV.
clean_df = clean_df.drop_duplicates(subset=["Year", "GEO ID"])

# Display the de-duplicated frame as the cell's output.
clean_df

Unnamed: 0,State,County,Year,GEO ID,Broadband (%),CFIPS
17,Alabama,Autauga County,2017,0500000US01001,61.818182,1001
18,Alabama,Autauga County,2018,0500000US01001,78.899997,1001
26,Alabama,Baldwin County,2007,0500000US01003,58.852088,1003
27,Alabama,Baldwin County,2008,0500000US01003,59.151804,1003
28,Alabama,Baldwin County,2009,0500000US01003,59.451514,1003
...,...,...,...,...,...,...
59488,Wyoming,Uinta County,2018,0500000US56041,88.200003,56041
59506,Wyoming,Washakie County,2017,0500000US56043,62.295079,56043
59507,Wyoming,Washakie County,2018,0500000US56043,78.299993,56043
59525,Wyoming,Weston County,2017,0500000US56045,58.620691,56045


In [5]:
# Write clean_df to output_path as CSV; index=False leaves the row index
# out of the file.
clean_df.to_csv(path_or_buf=output_path, index=False)