# Multivariate GHG Emission Clustering with CO2 and Socioeconomic Data: ***Data Pipeline***
GE Vernova (Team 2): Dan Dryer, Joanna Rashid, Nhu Y Pham


## Social Vulnerability Data from CDC 2014-2020
cleaning, merging, and imputing missing values
Data source: https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html
Metadata: https://www.atsdr.cdc.gov/placeandhealth/svi/documentation/pdf/SVI2018Documentation_01192022_1.pdf

In [13]:
#load packages
import pandas as pd
import numpy as np

In [15]:
#read in cdc SVI data
cdc_svi_2014 = pd.read_csv('https://storage.googleapis.com/oco2-sedac-2014-2018/CDC_SVI_Raw_Data/SVI2014_US.csv')
cdc_svi_2016 = pd.read_csv('https://storage.googleapis.com/oco2-sedac-2014-2018/CDC_SVI_Raw_Data/SVI2016_US.csv')
cdc_svi_2018 = pd.read_csv('https://storage.googleapis.com/oco2-sedac-2014-2018/CDC_SVI_Raw_Data/SVI2018_US.csv')
cdc_svi_2020 = pd.read_csv('https://storage.googleapis.com/oco2-sedac-2014-2018/CDC_SVI_Raw_Data/SVI2020_US.csv')

### Subsetting variables of interest and joining data

Variables are selected are percentile rankings for the socioeconomic vulnerability index, and its four major areas (housing, minority status, housing and transportation type, household composition). Population, number of housing units, number of household, and unemployment rate are also selected based on consistent availability and calculation method from 2014-2020. Additional geographic identifiers are selected as well.


In [16]:
#select rows of interest 2014
cdc_svi_2014 = cdc_svi_2014[['ST', 'STATE', 'ST_ABBR', 'STCNTY', 'COUNTY', 'FIPS', 'LOCATION',
                             'E_TOTPOP','E_HU','E_HH', 'E_UNEMP','RPL_THEME1','RPL_THEME2','RPL_THEME3',
                             'RPL_THEME4','RPL_THEMES']]

#select rows of interest 2016
cdc_svi_2016 = cdc_svi_2016[['FIPS','E_TOTPOP','E_HU','E_HH', 'E_UNEMP','RPL_THEME1',
                            'RPL_THEME2','RPL_THEME3','RPL_THEME4','RPL_THEMES']]

#select rows of interest 2018
cdc_svi_2018 = cdc_svi_2018[['FIPS', 'E_TOTPOP','E_HU', 'E_HH', 'E_UNEMP', 'RPL_THEME1',
                             'RPL_THEME2','RPL_THEME3','RPL_THEME4','RPL_THEMES']]

#select var 2020
cdc_svi_2020 = cdc_svi_2020[['FIPS', 'E_TOTPOP','E_HU', 'E_HH', 'E_UNEMP', 'RPL_THEME1',
                             'RPL_THEME2','RPL_THEME3','RPL_THEME4','RPL_THEMES']]

In [17]:
#rename columns to add year identifier
#add suffix to select rows of interest 2014
cdc_svi_2014_renamed = cdc_svi_2014.add_suffix('_2014')
cdc_svi_2016_renamed = cdc_svi_2016.add_suffix('_2016')
cdc_svi_2018_renamed = cdc_svi_2018.add_suffix('_2018')
cdc_svi_2020_renamed = cdc_svi_2020.add_suffix('_2020')

#drop _2014 from cdc_svi_2014_renamed
cdc_svi_2014_renamed.rename(columns={'ST_2014':'ST', 'STATE_2014':'STATE', 'ST_ABBR_2014':'ST_ABBR',
                                                            'STCNTY_2014':'STCNTY', 'COUNTY_2014':'COUNTY', 'FIPS_2014':'FIPS',
                                                            'LOCATION_2014':'LOCATION'}, inplace=True)

#rename FIPS column 'FIPS'
cdc_svi_2016_renamed.rename(columns={'FIPS_2016':'FIPS'}, inplace=True)
cdc_svi_2018_renamed.rename(columns={'FIPS_2018':'FIPS'}, inplace=True)
cdc_svi_2020_renamed.rename(columns={'FIPS_2020':'FIPS'}, inplace=True)

In [18]:
#join all dfs on FIPS
df_join = pd.merge(cdc_svi_2014_renamed, cdc_svi_2016_renamed, on='FIPS', how='outer')
df_join = pd.merge(df_join, cdc_svi_2018_renamed, on='FIPS', how='outer')
df_join = pd.merge(df_join, cdc_svi_2020_renamed, on='FIPS', how='outer')

In [20]:
df_join.describe()

Unnamed: 0,ST,STCNTY,FIPS,E_TOTPOP_2014,E_HU_2014,E_HH_2014,E_UNEMP_2014,RPL_THEME1_2014,RPL_THEME2_2014,RPL_THEME3_2014,...,RPL_THEMES_2018,E_TOTPOP_2020,E_HU_2020,E_HH_2020,E_UNEMP_2020,RPL_THEME1_2020,RPL_THEME2_2020,RPL_THEME3_2020,RPL_THEME4_2020,RPL_THEMES_2020
count,72842.0,72842.0,95623.0,72842.0,72842.0,72842.0,72842.0,72382.0,72842.0,72842.0,...,72837.0,84122.0,84122.0,84122.0,84122.0,84122.0,84122.0,84122.0,84122.0,84122.0
mean,27.742745,27828.738969,27866840000.0,4312.169957,1822.314503,1595.385794,199.126617,0.5,-5.040728,-5.040732,...,-8.611694,3882.091581,1645.618875,1454.485378,105.44823,-7.912152,-7.436899,-5.726554,-8.660736,-8.898314
std,15.788699,15813.196589,16003120000.0,2074.881738,849.374029,745.286957,136.658091,0.288681,74.230478,74.230478,...,94.996397,1657.514889,672.645603,606.484129,89.162035,91.309095,88.713532,78.639731,95.248981,96.464885
min,1.0,1001.0,1001020000.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0,...,-999.0,0.0,0.0,0.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0
25%,12.0,12127.0,12115000000.0,2894.0,1256.0,1089.0,104.0,0.25,0.25,0.25,...,0.2431,2704.0,1189.0,1031.0,44.0,0.2436,0.244,0.2439,0.243,0.2429
50%,27.0,27129.0,27151960000.0,4048.0,1717.0,1512.0,170.0,0.5,0.5,0.5,...,0.4954,3727.0,1589.0,1405.0,84.0,0.4958,0.496,0.4961,0.4954,0.4952
75%,41.0,41039.0,42003410000.0,5410.75,2273.0,2008.0,263.0,0.75,0.75,0.75,...,0.7477,4888.0,2044.0,1827.0,142.0,0.7479,0.7479,0.7479,0.7477,0.7476
max,56.0,56045.0,56045950000.0,46330.0,24087.0,19099.0,2364.0,1.0,1.0,1.0,...,1.0,39373.0,13037.0,8078.0,1459.0,1.0,1.0,0.9959,1.0,1.0


In [21]:
#for all values in RPL columns, replace -999 with NaN
df_join = df_join.replace(-999, np.nan)

### Imputing values for 2015, 2017, and 2019
Imputation is conducted by taking the average of the preceding and succeeding year's values for each variable. For example, the value for 2015 is the average of the 2014 and 2016 values. This is done for each variable in the dataset.

In [22]:
# Create a list of years to impute (2015, 2017, and 2019)
years_to_impute = [2015, 2017, 2019]

# Iterate over the years to impute
for year in years_to_impute:
    # Iterate over the columns
    for column in df_join.columns:
        if column.startswith('E_') or column.startswith('RPL_'):
            # Extract the column prefix and check if it matches 'E_' or 'RPL_'
            column_prefix = column.split('_')[0]
            name = column.split('_')[1]
            if column_prefix in ['E', 'RPL']:
                # Create a new column name for the imputed values
                new_column_name = f'{column_prefix}_{name}_{year}'

                # Compute the average for the current column
                preceding_year_column = f'{column_prefix}_{name}_{year - 1}'
                succeeding_year_column = f'{column_prefix}_{name}_{year + 1}'

                #impute with mean of subsequent and prior year
                imputed_values = (
                        (df_join[preceding_year_column] + df_join[succeeding_year_column]) / 2
                )

                # Add the new column with the imputed values to the data frame
                df_join[new_column_name] = imputed_values


In [23]:
#convert from wide to long creating new year column
df_svi_long = pd.wide_to_long(df_join, stubnames=['E_TOTPOP', 'E_HU', 'E_HH', 'E_UNEMP', 'RPL_THEME1',
                                                   'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'RPL_THEMES'],
                               i=['ST', 'STATE', 'ST_ABBR', 'STCNTY', 'COUNTY', 'FIPS', 'LOCATION'],
                               j='year', sep='_', suffix='\w+').reset_index()

In [24]:
#rename columns to desciptive labels
df_svi_long.rename(columns={'E_TOTPOP':'total_population', 'E_HU':'housing_units',
                             'E_HH':'num_households', 'E_UNEMP':'unemployment',
                            'RPL_THEME1':'socioeconomic',
                             'RPL_THEME2':'household_comp', 'RPL_THEME3':'minority_status', 'RPL_THEME4':'housing_type',
                             'RPL_THEMES':'overall_svi'}, inplace=True)

In [26]:
df_svi_long.describe()

Unnamed: 0,ST,STCNTY,FIPS,year,total_population,housing_units,num_households,unemployment,socioeconomic,household_comp,minority_status,housing_type,overall_svi
count,509894.0,509894.0,669361.0,669361.0,509651.0,509651.0,509651.0,509626.0,505423.0,505972.0,506642.0,505500.0,505197.0
mean,27.742745,27828.738969,27866840000.0,2017.0,4243.128509,1792.404765,1575.319822,148.233169,0.500908,0.502084,0.500335,0.501739,0.501316
std,15.788606,15813.10355,16003050000.0,2.000001,2044.698469,827.452984,733.363478,113.422655,0.287023,0.283026,0.286495,0.285716,0.287429
min,1.0,1001.0,1001020000.0,2014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,12127.0,12115000000.0,2015.0,2856.0,1245.0,1080.0,70.0,0.2524,0.2602,0.2515,0.2558,0.253
50%,27.0,27129.0,27151960000.0,2017.0,3993.0,1695.0,1495.0,121.5,0.50095,0.5033,0.50005,0.5021,0.5015
75%,41.0,41039.0,42003410000.0,2019.0,5313.0,2227.0,1973.0,196.5,0.7494,0.7446,0.7479,0.7481,0.75025
max,56.0,56045.0,56045950000.0,2020.0,70271.0,26436.0,21337.0,2364.0,1.0,1.0,1.0,1.0,1.0


## Importing atmospheric CO2 data from GOSAT/OCO-2 composite raster data
Data Source Methodology: https://www.tandfonline.com/doi/figure/10.1080/20964471.2022.2033149?scroll=top&needAccess=true&role=tab
Data Source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/4WDTD8

*The global 1° land mapping XCO2 dataset (Mapping-XCO2) is derived from satellite XCO2 retrievals of GOSAT and OCO-2 spanning the period of April 2009 to December 2020. The data product is provided in GeoTIFF format and include two temporal resolutions: 3 days and month. The 3-day data files include gridded XCO2 and mapping uncertainty, which are named like “MappingXCO2_Date.nc” and “MappingUncertainty_Date.nc”. The flag “Date” is defined as date ID of 1426 time-units started from 20 April 2009. The monthly data files only include XCO2 data and named like “MappingXCO2_YYYY_MM.tif”. The number of “YYYY” and “MM” represent year and month, respectively. The domain of the dataset covers global land ranging from 56° S to 65° N and 169° W to 180° E. The spatial reference of the dataset is Geographic Lat/Lon. The unit of XCO2 data is ppm while the nodata values were assigned to NaN. (2021-03-26)*

US Bounding Coordinates Used to Trim Data for reference:
Longitude:
West Bounding Coordinate: -180.000000
East Bounding Coordinate: -65.000000

Latitude:
North Bounding Coordinate: 72.000000
South Bounding Coordinate: 17.000000

In [31]:
from osgeo import gdal
import re
import os
import glob
import requests
import zipfile

In [32]:
def raster_to_df(path):
    """
    :param path: path to geotif file
    :return: pandas df
    Opens a geotif file and converts to pandas df
    Labels year and month based on file name
    trims based on coordinates of continental US
    """
    #open tif file
    raster = gdal.Open(path)

    # Extract the filename from the file path
    filename = os.path.basename(path)

    # Extract the year and month using regular expressions
    match = re.search(r'(\d{4})(\d{2})', filename)
    year = match.group(1)
    month = match.group(2)
    new_file_name = f"MappingXCO2_{year}{month}.xyz"

    #convert raster to xyz file
    gdf = gdal.Translate(new_file_name, raster)

    #open xyz file as pandas df
    df = pd.read_csv(new_file_name, sep = " ", header=None)

    #rename columns
    df.columns = ["longitude","latitude", f"xco2_{year}_{month}"]

    #trim df_co2 to bounding coordinates
    df = df[(df['longitude'] >= -180.00) & (df['longitude'] <= -65.00)]
    df = df[(df['latitude'] >= 17.00) & (df['latitude'] <= 72.00)]

    return df

In [33]:
#import all co2 month geotiff files in directory on google cloud storage
url = 'https://storage.googleapis.com/oco2-sedac-2014-2018/MappingXCO2_2014-2020.zip'

destination_dir = 'co2_data'

# Create the destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Download the file
response = requests.get(url)
zip_path = os.path.join(destination_dir, 'MappingXCO2.zip')
with open(zip_path, 'wb') as file:
    file.write(response.content)

# Unzip the downloaded file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(destination_dir)

# Get a list of paths to each file
file_paths = []
for root, dirs, files in os.walk(destination_dir):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

# Remove the zip file
os.remove(zip_path)

In [34]:
#iterate through monthly co2 files in folder and create dfs
dfs = []

for file in file_paths[86:]:
    df = raster_to_df(file)
    dfs.append(df)


In [35]:
# join dfs together on latitude, longitude
df_co2 = dfs[0]
for df in dfs[1:]:
    df_co2 = df_co2.merge(df, how='outer', on=['latitude', 'longitude'])

In [36]:
df_co2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6325 entries, 0 to 6324
Data columns (total 86 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   longitude     6325 non-null   float64
 1   latitude      6325 non-null   float64
 2   xco2_2018_01  2498 non-null   float64
 3   xco2_2018_03  2498 non-null   float64
 4   xco2_2018_02  2498 non-null   float64
 5   xco2_2016_09  2498 non-null   float64
 6   xco2_2018_06  2498 non-null   float64
 7   xco2_2018_12  2498 non-null   float64
 8   xco2_2018_07  2498 non-null   float64
 9   xco2_2016_08  2498 non-null   float64
 10  xco2_2014_08  2498 non-null   float64
 11  xco2_2018_11  2498 non-null   float64
 12  xco2_2018_05  2498 non-null   float64
 13  xco2_2018_04  2498 non-null   float64
 14  xco2_2018_10  2498 non-null   float64
 15  xco2_2014_09  2498 non-null   float64
 16  xco2_2019_03  2498 non-null   float64
 17  xco2_2020_07  2498 non-null   float64
 18  xco2_2020_06  2498 non-null 

In [37]:
#2014 standard deviation
df_std = df_co2.copy()

df_std[2014] = df_std[['xco2_2014_01', 'xco2_2014_02', 'xco2_2014_03', 'xco2_2014_04', 'xco2_2014_05',
                         'xco2_2014_06', 'xco2_2014_07', 'xco2_2014_08', 'xco2_2014_09', 'xco2_2014_10',
                         'xco2_2014_11', 'xco2_2014_12']].std(axis=1)

#2015 standard deviation
df_std[2015] = df_std[['xco2_2015_01', 'xco2_2015_02', 'xco2_2015_03', 'xco2_2015_04', 'xco2_2015_05',
                         'xco2_2015_06', 'xco2_2015_07', 'xco2_2015_08', 'xco2_2015_09', 'xco2_2015_10',
                         'xco2_2015_11', 'xco2_2015_12']].std(axis=1)

#2016 standard deviation
df_std[2016] = df_std[['xco2_2016_01', 'xco2_2016_02', 'xco2_2016_03', 'xco2_2016_04', 'xco2_2016_05',
                         'xco2_2016_06', 'xco2_2016_07', 'xco2_2016_08', 'xco2_2016_09', 'xco2_2016_10',
                         'xco2_2016_11', 'xco2_2016_12']].std(axis=1)

#2017 standard deviation
df_std[2017] = df_std[['xco2_2017_01', 'xco2_2017_02', 'xco2_2017_03', 'xco2_2017_04', 'xco2_2017_05',
                         'xco2_2017_06', 'xco2_2017_07', 'xco2_2017_08', 'xco2_2017_09', 'xco2_2017_10',
                         'xco2_2017_11', 'xco2_2017_12']].std(axis=1)

#2018 standard deviation
df_std[2018] = df_std[['xco2_2018_01', 'xco2_2018_02', 'xco2_2018_03', 'xco2_2018_04', 'xco2_2018_05',
                         'xco2_2018_06', 'xco2_2018_07', 'xco2_2018_08', 'xco2_2018_09', 'xco2_2018_10',
                         'xco2_2018_11', 'xco2_2018_12']].std(axis=1)

#2019 standard deviation
df_std[2019] = df_std[['xco2_2019_01', 'xco2_2019_02', 'xco2_2019_03', 'xco2_2019_04', 'xco2_2019_05',
                         'xco2_2019_06', 'xco2_2019_07', 'xco2_2019_08', 'xco2_2019_09', 'xco2_2019_10',
                         'xco2_2019_11', 'xco2_2019_12']].std(axis=1)

#2020 standard deviation
df_std[2020] = df_std[['xco2_2020_01', 'xco2_2020_02', 'xco2_2020_03', 'xco2_2020_04', 'xco2_2020_05',
                         'xco2_2020_06', 'xco2_2020_07', 'xco2_2020_08', 'xco2_2020_09', 'xco2_2020_10',
                         'xco2_2020_11', 'xco2_2020_12']].std(axis=1)

# annual columns only
df_std = df_std[['latitude', 'longitude', 2014, 2015, 2016, 2017, 2018, 2019, 2020]]

#convert from wide to long with year as column
df_std = df_std.melt(id_vars=['latitude', 'longitude'], var_name='year', value_name='xco2_std')

df_std.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44275 entries, 0 to 44274
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   44275 non-null  float64
 1   longitude  44275 non-null  float64
 2   year       44275 non-null  object 
 3   xco2_std   17486 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.4+ MB


In [38]:
# create annual averages
df_co2[2014] = df_co2[['xco2_2014_01', 'xco2_2014_02', 'xco2_2014_03', 'xco2_2014_04', 'xco2_2014_05',
                              'xco2_2014_06', 'xco2_2014_07', 'xco2_2014_08', 'xco2_2014_09', 'xco2_2014_10',
                              'xco2_2014_11', 'xco2_2014_12']].mean(axis=1)

df_co2[2015] = df_co2[['xco2_2015_01', 'xco2_2015_02', 'xco2_2015_03', 'xco2_2015_04', 'xco2_2015_05',
                                'xco2_2015_06', 'xco2_2015_07', 'xco2_2015_08', 'xco2_2015_09', 'xco2_2015_10',
                                'xco2_2015_11', 'xco2_2015_12']].mean(axis=1)

df_co2[2016] = df_co2[['xco2_2016_01', 'xco2_2016_02', 'xco2_2016_03', 'xco2_2016_04', 'xco2_2016_05',
                                'xco2_2016_06', 'xco2_2016_07', 'xco2_2016_08', 'xco2_2016_09', 'xco2_2016_10',
                                'xco2_2016_11', 'xco2_2016_12']].mean(axis=1)

df_co2[2017] = df_co2[['xco2_2017_01', 'xco2_2017_02', 'xco2_2017_03', 'xco2_2017_04', 'xco2_2017_05',
                                'xco2_2017_06', 'xco2_2017_07', 'xco2_2017_08', 'xco2_2017_09', 'xco2_2017_10',
                                'xco2_2017_11', 'xco2_2017_12']].mean(axis=1)

df_co2[2018] = df_co2[['xco2_2018_01', 'xco2_2018_02', 'xco2_2018_03', 'xco2_2018_04', 'xco2_2018_05',
                                'xco2_2018_06', 'xco2_2018_07', 'xco2_2018_08', 'xco2_2018_09', 'xco2_2018_10',
                                'xco2_2018_11', 'xco2_2018_12']].mean(axis=1)

df_co2[2019] = df_co2[['xco2_2019_01', 'xco2_2019_02', 'xco2_2019_03', 'xco2_2019_04', 'xco2_2019_05',
                                'xco2_2019_06', 'xco2_2019_07', 'xco2_2019_08', 'xco2_2019_09', 'xco2_2019_10',
                                'xco2_2019_11', 'xco2_2019_12']].mean(axis=1)

df_co2[2020] = df_co2[['xco2_2020_01', 'xco2_2020_02', 'xco2_2020_03', 'xco2_2020_04', 'xco2_2020_05',
                                'xco2_2020_06', 'xco2_2020_07', 'xco2_2020_08', 'xco2_2020_09', 'xco2_2020_10',
                                'xco2_2020_11', 'xco2_2020_12']].mean(axis=1)
# annual columns only
df_co2 = df_co2[['latitude', 'longitude', 2014, 2015, 2016, 2017, 2018, 2019, 2020]]

#convert from wide to long with year as column
df_co2 = df_co2.melt(id_vars=['latitude', 'longitude'], var_name='year', value_name='xco2')

df_co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44275 entries, 0 to 44274
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   44275 non-null  float64
 1   longitude  44275 non-null  float64
 2   year       44275 non-null  object 
 3   xco2       17486 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.4+ MB


In [39]:
#join df_std and df_co2
df_co2 = df_co2.merge(df_std, on=['latitude', 'longitude', 'year'], how='left')

In [40]:
#drop rows with missing values
df_co2 = df_co2.dropna()

## Importing 2019 census tract data and joining with CO2 data
Data Source: https://catalog.data.gov/dataset/2019-cartographic-boundary-shapefile-current-census-tract-for-united-states-1-500000
Metadata: https://www.atsdr.cdc.gov/placeandhealth/svi/documentation/pdf/SVI2018Documentation_01192022_1.pdf

Census tracts are polygons and CO2 values are assigned to coordinates that are at the center of one degree pixels from the raster data. The CO2 values are left joined to the census tract data, so that each census tract will receive a CO2 value for the one degree pixel with which the tract most overlaps.

In [41]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os

In [42]:
#import census tract level shape file
tract = gpd.read_file(r'/Users/joannarashid/Documents/GitHub/Team-Project-Practicum-6748/cb_2019_us_tract_500k/cb_2019_us_tract_500k.shp')

#establish coordinate points which are the center point of each 1degree pixel of the CO2 raster data
df_co2['coords'] = list(zip(df_co2['longitude'],df_co2['latitude']))
df_co2['coords'] = df_co2['coords'].apply(Point)

#convert points to geodataframe
points = gpd.GeoDataFrame(df_co2, geometry='coords', crs=tract.crs)

# join census tract with points
# s_nearest is used to left join co2 data to census tract data
# each census tract polygon will be assigned the raster value of the closest C02 point
# which represents the 1 degree pixel that the census tract most overlaps
df_join_tract = gpd.sjoin_nearest(tract, points, how='left')




In [43]:
df_join_tract.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 518077 entries, 0 to 73867
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   STATEFP      518077 non-null  object  
 1   COUNTYFP     518077 non-null  object  
 2   TRACTCE      518077 non-null  object  
 3   AFFGEOID     518077 non-null  object  
 4   GEOID        518077 non-null  object  
 5   NAME         518077 non-null  object  
 6   LSAD         518077 non-null  object  
 7   ALAND        518077 non-null  int64   
 8   AWATER       518077 non-null  int64   
 9   geometry     518077 non-null  geometry
 10  index_right  518077 non-null  int64   
 11  latitude     518077 non-null  float64 
 12  longitude    518077 non-null  float64 
 13  year         518077 non-null  object  
 14  xco2         518077 non-null  float64 
 15  xco2_std     518077 non-null  float64 
dtypes: float64(4), geometry(1), int64(3), object(8)
memory usage: 67.2+ MB


## Join CO2 data with SVI
The CO2 data is joined with the SVI data on the FIPS code and year. The SVI data is joined with the CO2 data such that all census tracts for which CO2 values are present will be preserved and those without dropped.


In [44]:
#rename df_co2_01_2018 GEOID to FIPS
df_join_tract.rename(columns = {"GEOID" : "FIPS"}, inplace= True)

#keep geopandas df for later
df_join_gpd = df_join_tract

#converting all FIPS to int
df_join_tract['FIPS'] = df_join_tract['FIPS'].astype(int)
df_svi_long['FIPS'] = df_svi_long['FIPS'].astype(int)

#join svi data to df_join_tract
df_join_tract = df_join_tract.merge(df_svi_long, on=['FIPS', 'year'], how='left')

In [96]:
df_join_tract.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 518077 entries, 0 to 518076
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   STATEFP           518077 non-null  object  
 1   COUNTYFP          518077 non-null  object  
 2   TRACTCE           518077 non-null  object  
 3   AFFGEOID          518077 non-null  object  
 4   FIPS              518077 non-null  int64   
 5   NAME              518077 non-null  object  
 6   LSAD              518077 non-null  object  
 7   ALAND             518077 non-null  int64   
 8   AWATER            518077 non-null  int64   
 9   geometry          518077 non-null  geometry
 10  index_right       518077 non-null  int64   
 11  latitude          518077 non-null  float64 
 12  longitude         518077 non-null  float64 
 13  year              518077 non-null  object  
 14  xco2              518077 non-null  float64 
 15  xco2_std          518077 non-null  float64 

In [73]:
#convert to pandas dataframe
df_join_tract = pd.DataFrame(df_join_tract)

In [75]:
#droping nas which are census tracts that did not overlap with the CO2 raster data
final_df = df_join_tract.dropna()

In [77]:
final_df.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,FIPS,NAME,LSAD,ALAND,AWATER,geometry,...,LOCATION,total_population,housing_units,num_households,unemployment,socioeconomic,household_comp,minority_status,housing_type,overall_svi
0,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,"Census Tract 7028.08, Burlington County, New J...",3204.0,1123.0,1041.0,244.0,0.6944,0.7248,0.7987,0.1516,0.6031
1,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,"Census Tract 7028.08, Burlington County, New J...",3383.0,1136.0,1058.0,164.0,0.5653,0.8186,0.7807,0.0617,0.4835
2,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,"Census Tract 7028.08, Burlington County, New J...",3193.0,1103.0,1064.0,226.0,0.6477,0.7343,0.8944,0.1606,0.6107
3,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,"Census Tract 7028.08, Burlington County, New J...",3447.0,1141.0,1053.0,77.0,0.5848,0.6889,0.9285,0.0703,0.4542
4,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,"Census Tract 7028.08, Burlington County, New J...",3198.5,1113.0,1052.5,235.0,0.67105,0.72955,0.84655,0.1561,0.6069


In [78]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487736 entries, 0 to 518076
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   STATEFP           487736 non-null  object  
 1   COUNTYFP          487736 non-null  object  
 2   TRACTCE           487736 non-null  object  
 3   AFFGEOID          487736 non-null  object  
 4   FIPS              487736 non-null  int64   
 5   NAME              487736 non-null  object  
 6   LSAD              487736 non-null  object  
 7   ALAND             487736 non-null  int64   
 8   AWATER            487736 non-null  int64   
 9   geometry          487736 non-null  geometry
 10  index_right       487736 non-null  int64   
 11  latitude          487736 non-null  float64 
 12  longitude         487736 non-null  float64 
 13  year              487736 non-null  object  
 14  xco2              487736 non-null  float64 
 15  xco2_std          487736 non-null  float64 
 16  ST

## Computing additional CO2 features that capture temporal changes

In [79]:
#change in co2 since previous year for each FIPS
final_df['co2_1yr_change'] = final_df.groupby(['FIPS'])['xco2'].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['co2_1yr_change'] = final_df.groupby(['FIPS'])['xco2'].diff()


In [80]:
#total change since 2014 for each FIPS
final_df['co2_6yr_change'] = final_df.groupby(['FIPS'])['xco2'].diff(periods=6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['co2_6yr_change'] = final_df.groupby(['FIPS'])['xco2'].diff(periods=6)


In [81]:
#percent change since previous year for each FIPS
final_df['co2_1yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['co2_1yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change()


In [82]:
#percent change since 2014 for each FIPS
final_df['co2_6yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=6)
#5 year average co2 for each FIPS
final_df['co2_5yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=5)
#4 year average co2 for each FIPS
final_df['co2_4yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=4)
#3 year average co2 for each FIPS
final_df['co2_3yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=3)
#2 year average co2 for each FIPS
final_df['co2_2yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['co2_6yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=6)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['co2_5yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['co2_4yr_pct_cha

In [83]:
#rename xco2 to avg_co2
final_df.rename(columns = {"xco2" : "avg_co2"}, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns = {"xco2" : "avg_co2"}, inplace= True)


In [53]:
final_df.head(20)

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,FIPS,NAME,LSAD,ALAND,AWATER,geometry,...,housing_type,overall_svi,co2_1yr_change,co2_6yr_change,co2_1yr_pct_change,co2_6yr_pct_change,co2_5yr_pct_change,co2_4yr_pct_change,co2_3yr_pct_change,co2_2yr_pct_change
0,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.1516,0.6031,,,,,,,,
1,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.0617,0.4835,4.470362,,0.011071,,,,,
2,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.1606,0.6107,-10.519547,,-0.025767,,,,,-0.014981
3,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.0703,0.4542,15.565801,,0.039137,,,,0.023569,0.012361
4,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.1561,0.6069,-12.719866,,-0.030777,,,-0.007933,-0.018796,0.007155
5,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.10665,0.5433,5.451052,,0.013608,,0.005567,-0.005444,0.020861,-0.017587
6,34,5,702808,1400000US34005702808,34005702808,7028.08,CT,1510553,0,"POLYGON ((-74.90801 40.03250, -74.90496 40.036...",...,0.066,0.46885,4.626818,6.874621,0.011395,0.017026,0.005889,0.032494,-0.006392,0.025158
7,39,153,504200,1400000US39153504200,39153504200,5042.0,CT,808151,0,"POLYGON ((-81.51807 41.05957, -81.51805 41.062...",...,0.9113,0.9626,,,,,,,,
8,39,153,504200,1400000US39153504200,39153504200,5042.0,CT,808151,0,"POLYGON ((-81.51807 41.05957, -81.51805 41.062...",...,0.66785,0.83985,-12.712186,,-0.030804,,,,,
9,39,153,504200,1400000US39153504200,39153504200,5042.0,CT,808151,0,"POLYGON ((-81.51807 41.05957, -81.51805 41.062...",...,0.5177,0.79745,5.694221,,0.014237,,,,,-0.017006


In [85]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487736 entries, 0 to 518076
Data columns (total 39 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   STATEFP             487736 non-null  object  
 1   COUNTYFP            487736 non-null  object  
 2   TRACTCE             487736 non-null  object  
 3   AFFGEOID            487736 non-null  object  
 4   FIPS                487736 non-null  int64   
 5   NAME                487736 non-null  object  
 6   LSAD                487736 non-null  object  
 7   ALAND               487736 non-null  int64   
 8   AWATER              487736 non-null  int64   
 9   geometry            487736 non-null  geometry
 10  index_right         487736 non-null  int64   
 11  latitude            487736 non-null  float64 
 12  longitude           487736 non-null  float64 
 13  year                487736 non-null  object  
 14  avg_co2             487736 non-null  float64 
 15  xco2_std         

In [87]:
#export final_df to csv
final_df.to_csv(r'/Users/joannarashid/Documents/GitHub/Team-Project-Practicum-6748/final_df.csv', index=False)