# Setup

## Import Libraries

In [1]:
import pandas as pd
import os
import json

## Set file locations

In [2]:
# tract data
tract_data_csv = "../00_Data/cleaned_data/cleaned_pdb_tract.csv"
# tract_data_json = "../00_Data/cleaned_data/cleaned_pdb_tract.json"

# block data
block_data_csv = "../00_Data/cleaned_data/cleaned_pdb_block_group.csv"
# block_data_json = "../00_Data/cleaned_data/cleaned_pdb_block_group.json"

# site data
site_data_csv = "../00_Data/cleaned_data/cleaned_priorities_list.csv"
# site_data_json = "../00_Data/cleaned_data/cleaned_priorities_list.json"

# inspection csv
inspection_csv = "../00_Data/inspection_files/inspection_file.csv"

# cleaned, merged data
merged_data_csv = "../00_Data/cleaned_data/cleaned_merged_data.csv"
merged_data_json = "../00_Data/cleaned_data/cleaned_merged_data.json"

# Import Data

In [3]:
# Import census data
census_df = pd.read_csv(block_data_csv)
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220354 entries, 0 to 220353
Columns: 345 entries, fips_block_group to has_superfund
dtypes: float64(337), int64(6), object(2)
memory usage: 580.0+ MB


In [4]:
# Import superfund site data
site_df = pd.read_csv(site_data_csv)
site_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Columns: 359 entries, fips_full to pct_bilq_mailout_count_cen_2010
dtypes: float64(342), int64(4), object(13)
memory usage: 3.7+ MB


# Merge Data

In [5]:
merged_df = pd.merge(census_df, site_df, how='outer', on='fips_block_group')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220426 entries, 0 to 220425
Columns: 703 entries, fips_block_group to pct_bilq_mailout_count_cen_2010_y
dtypes: float64(687), int64(1), object(15)
memory usage: 1.2+ GB


## Inspect/Clean the Merged Data
Looking at the .info() above, we notice that there are some entries that may be missing some information, i.e. there are FIPS block IDs that are not part of the census data. Let's take a closer look at those. Later, we need to make some decisions about how we deal with that data.

### Inspection

In [6]:
inspection_df = merged_df[merged_df['state_name_x'].isnull()]

In [7]:
inspection_df.to_csv(inspection_csv, index = False)

In [8]:
merged_df = merged_df.dropna(subset=['state_name_x'])

### Cleaning

In [9]:
# drop the duplicate columns, and the 'moe' columns
column_names = list(merged_df.columns.values)

exclusion_list = ['_y', 'moe']
selected_columns = [x for x in column_names if all(y not in x for y in exclusion_list)]

clean_merged_df = merged_df[selected_columns]

# tidy up the column names
clean_merged_df.columns = clean_merged_df.columns.str.rstrip('_x')

# fill in nulls with zeros
clean_merged_df.fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [10]:
clean_merged_df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220423 entries, 0 to 220422
Data columns (total 240 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fips_block_group                  220423 non-null  int64  
 1   state                             220423 non-null  float64
 2   state_name                        220423 non-null  object 
 3   county                            220423 non-null  float64
 4   county_name                       220423 non-null  object 
 5   tract                             220423 non-null  float64
 6   block_group                       220423 non-null  float64
 7   flag                              220423 non-null  float64
 8   land_area                         220423 non-null  float64
 9   aian_land                         220423 non-null  float64
 10  urbanized_area_pop_cen_2010       220423 non-null  float64
 11  urban_cluster_pop_cen_2010        220423 non-null  

# Export the Merged Data

In [11]:
# Export to csv
clean_merged_df.to_csv(merged_data_csv, index = False)

In [12]:
# # Export to flat json
# clean_merged_df.to_json(merged_data_json, orient='records')