## Social Vulnerability Data from CDC 2014-2018
cleaning, merging, and imputing missing values
Data source: https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html
Metadata: https://www.atsdr.cdc.gov/placeandhealth/svi/documentation/pdf/SVI2018Documentation_01192022_1.pdf

In [82]:
#load packages
import pandas as pd
import numpy as np

In [83]:
#read in cdc SVI data
cdc_svi_2014 = pd.read_csv('CDC_SVI_Data/SVI2014_US.csv')
cdc_svi_2016 = pd.read_csv('CDC_SVI_Data/SVI2016_US.csv')
cdc_svi_2018 = pd.read_csv('CDC_SVI_Data/SVI2018_US.csv')
cdc_svi_2020 = pd.read_csv('CDC_SVI_Data/SVI_2020_US.csv')

### Subsetting variables of interest and joining data

Variables and seldcted along with location labels from 2014, with FIPS code only in subsequent years for joining


In [84]:
#select rows of interest 2014
cdc_svi_2014 = cdc_svi_2014[['ST', 'STATE', 'ST_ABBR', 'STCNTY', 'COUNTY', 'FIPS', 'LOCATION',
                             'E_TOTPOP','E_HU','E_HH', 'E_UNEMP','RPL_THEME1','RPL_THEME2','RPL_THEME3',
                             'RPL_THEME4','RPL_THEMES']]

#select rows of interest 2016
cdc_svi_2016 = cdc_svi_2016[['FIPS','E_TOTPOP','E_HU','E_HH', 'E_UNEMP','RPL_THEME1',
                            'RPL_THEME2','RPL_THEME3','RPL_THEME4','RPL_THEMES']]

#select rows of interest 2018
cdc_svi_2018 = cdc_svi_2018[['FIPS', 'E_TOTPOP','E_HU', 'E_HH', 'E_UNEMP', 'RPL_THEME1',
                             'RPL_THEME2','RPL_THEME3','RPL_THEME4','RPL_THEMES']]

#select var 2020
cdc_svi_2020 = cdc_svi_2020[['FIPS', 'E_TOTPOP','E_HU', 'E_HH', 'E_UNEMP', 'RPL_THEME1',
                             'RPL_THEME2','RPL_THEME3','RPL_THEME4','RPL_THEMES']]

In [85]:
#rename columns to add year identifier
#add suffix to select rows of interest 2014
cdc_svi_2014_renamed = cdc_svi_2014.add_suffix('_2014')
cdc_svi_2016_renamed = cdc_svi_2016.add_suffix('_2016')
cdc_svi_2018_renamed = cdc_svi_2018.add_suffix('_2018')
cdc_svi_2020_renamed = cdc_svi_2020.add_suffix('_2020')

#drop _2014 from cdc_svi_2014_renamed
cdc_svi_2014_renamed.rename(columns={'ST_2014':'ST', 'STATE_2014':'STATE', 'ST_ABBR_2014':'ST_ABBR',
                                                            'STCNTY_2014':'STCNTY', 'COUNTY_2014':'COUNTY', 'FIPS_2014':'FIPS',
                                                            'LOCATION_2014':'LOCATION'}, inplace=True)

#rename FIPS column 'FIPS'
cdc_svi_2016_renamed.rename(columns={'FIPS_2016':'FIPS'}, inplace=True)
cdc_svi_2018_renamed.rename(columns={'FIPS_2018':'FIPS'}, inplace=True)
cdc_svi_2020_renamed.rename(columns={'FIPS_2020':'FIPS'}, inplace=True)

In [86]:
#join all dfs on FIPS
df_join = pd.merge(cdc_svi_2014_renamed, cdc_svi_2016_renamed, on='FIPS', how='outer')
df_join = pd.merge(df_join, cdc_svi_2018_renamed, on='FIPS', how='outer')
df_join = pd.merge(df_join, cdc_svi_2020_renamed, on='FIPS', how='outer')

### Imputing values for 2015, 2017, and 2019

In [87]:
# Create a list of years to impute (2015, 2017, and 2019)
years_to_impute = [2015, 2017, 2019]

# Iterate over the years to impute
for year in years_to_impute:
    # Iterate over the columns
    for column in df_join.columns:
        if column.startswith('E_') or column.startswith('RPL_'):
            # Extract the column prefix and check if it matches 'E_' or 'RPL_'
            column_prefix = column.split('_')[0]
            name = column.split('_')[1]
            if column_prefix in ['E', 'RPL']:
                # Create a new column name for the imputed values
                new_column_name = f'{column_prefix}_{name}_{year}'

                # Compute the average for the current column
                preceding_year_column = f'{column_prefix}_{name}_{year - 1}'
                succeeding_year_column = f'{column_prefix}_{name}_{year + 1}'

                #impute with mean of subsequent and prior year
                imputed_values = (
                        (df_join[preceding_year_column] + df_join[succeeding_year_column]) / 2
                )

                # Add the new column with the imputed values to the data frame
                df_join[new_column_name] = imputed_values


In [88]:
#convert from wide to long creating new year column
df_svi_long = pd.wide_to_long(df_join, stubnames=['E_TOTPOP', 'E_HU', 'E_HH', 'E_UNEMP', 'RPL_THEME1',
                                                   'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'RPL_THEMES'],
                               i=['ST', 'STATE', 'ST_ABBR', 'STCNTY', 'COUNTY', 'FIPS', 'LOCATION'],
                               j='year', sep='_', suffix='\w+').reset_index()

In [89]:
#rename columns
df_svi_long.rename(columns={'E_TOTPOP':'total_population', 'E_HU':'housing_units',
                             'E_HH':'num_households', 'E_UNEMP':'unemployment',
                            'RPL_THEME1':'socioeconomic',
                             'RPL_THEME2':'household_comp', 'RPL_THEME3':'minority_status', 'RPL_THEME4':'housing_type',
                             'RPL_THEMES':'overall_svi'}, inplace=True)

In [90]:
df_svi_long.head()

Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,year,total_population,housing_units,num_households,unemployment,socioeconomic,household_comp,minority_status,housing_type,overall_svi
0,36.0,New York,NY,36065.0,Oneida,36065023400,"Census Tract 234, Oneida County, New York",2014,4432.0,2456.0,2094.0,157.0,0.5811,0.9442,0.3143,0.4844,0.6302
1,36.0,New York,NY,36065.0,Oneida,36065023400,"Census Tract 234, Oneida County, New York",2016,4848.0,2459.0,2154.0,90.0,0.3631,0.9081,0.0733,0.4161,0.4056
2,36.0,New York,NY,36065.0,Oneida,36065023400,"Census Tract 234, Oneida County, New York",2018,4904.0,2394.0,2225.0,84.0,0.4684,0.7004,0.3336,0.4771,0.4893
3,36.0,New York,NY,36065.0,Oneida,36065023400,"Census Tract 234, Oneida County, New York",2020,4617.0,2339.0,2089.0,58.0,0.3147,0.6998,0.1768,0.7912,0.5298
4,36.0,New York,NY,36065.0,Oneida,36065023400,"Census Tract 234, Oneida County, New York",2015,4640.0,2457.5,2124.0,123.5,0.4721,0.92615,0.1938,0.45025,0.5179


In [91]:
#check imputation results
result = df_svi_long['year'].value_counts().reset_index()
result.columns = ['year', 'row_count']
result = result.sort_values('year')
result

Unnamed: 0,year,row_count
0,2014,95623
4,2015,95623
1,2016,95623
5,2017,95623
2,2018,95623
6,2019,95623
3,2020,95623


## Importing atmospheric CO2 data from GOSAT/OCO-2 composite raster data
Data Source Methodology: https://www.tandfonline.com/doi/figure/10.1080/20964471.2022.2033149?scroll=top&needAccess=true&role=tab
Data Source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/4WDTD8

The global 1° land mapping XCO2 dataset (Mapping-XCO2) is derived from satellite XCO2 retrievals of GOSAT and OCO-2 spanning the period of April 2009 to December 2020. The data product is provided in GeoTIFF format and include two temporal resolutions: 3 days and month. The 3-day data files include gridded XCO2 and mapping uncertainty, which are named like “MappingXCO2_Date.nc” and “MappingUncertainty_Date.nc”. The flag “Date” is defined as date ID of 1426 time-units started from 20 April 2009. The monthly data files only include XCO2 data and named like “MappingXCO2_YYYY_MM.tif”. The number of “YYYY” and “MM” represent year and month, respectively. The domain of the dataset covers global land ranging from 56° S to 65° N and 169° W to 180° E. The spatial reference of the dataset is Geographic Lat/Lon. The unit of XCO2 data is ppm while the nodata values were assigned to NaN. (2021-03-26)

US Bounding Coordinates Used to Trim Data:
Longitude:
West Bounding Coordinate: -180.000000
East Bounding Coordinate: -65.000000

Latitude:
North Bounding Coordinate: 72.000000
South Bounding Coordinate: 17.000000

In [92]:
from osgeo import gdal
import re
import glob

In [93]:
def raster_to_df(path):
    """
    :param path: path to geotif file
    :return: pandas df
    Opens a geotif file and converts to pandas df
    Labels year and month based on file name
    trims based on coordinates of continental US
    """
    #open tif file
    raster = gdal.Open(path)

    # Extract the filename from the file path
    filename = os.path.basename(path)

    # Extract the year and month using regular expressions
    match = re.search(r'(\d{4})(\d{2})', filename)
    year = match.group(1)
    month = match.group(2)
    new_file_name = f"MappingXCO2_{year}{month}.xyz"

    #convert raster to xyz file
    gdf = gdal.Translate(new_file_name, raster)

    #open xyz file as pandas df
    df = pd.read_csv(new_file_name, sep = " ", header=None)

    #rename columns
    df.columns = ["longitude","latitude", f"xco2_{year}_{month}"]

    #trim df_co2 to bounding coordinates
    df = df[(df['longitude'] >= -180.00) & (df['longitude'] <= -65.00)]
    df = df[(df['latitude'] >= 17.00) & (df['latitude'] <= 72.00)]

    return df

In [94]:
#iterate through monthly co2 files in folder and create dfs
co2_files = glob.glob("/Users/joannarashid/Documents/GitHub/Team-Project-Practicum-6748/MappingXCO2_month/MappingXCO2_2014-2020/*.tif")

#convert to dfs
dfs = []

for file in co2_files:
    df = raster_to_df(file)
    dfs.append(df)


In [95]:
# join dfs together on latitude, longitude
df_co2 = dfs[0]
for df in dfs[1:]:
    df_co2 = df_co2.merge(df, how='outer', on=['latitude', 'longitude'])

In [96]:
#2014 standard deviation
df_std = df_co2.copy()

df_std[2014] = df_std[['xco2_2014_01', 'xco2_2014_02', 'xco2_2014_03', 'xco2_2014_04', 'xco2_2014_05',
                         'xco2_2014_06', 'xco2_2014_07', 'xco2_2014_08', 'xco2_2014_09', 'xco2_2014_10',
                         'xco2_2014_11', 'xco2_2014_12']].std(axis=1)

#2015 standard deviation
df_std[2015] = df_std[['xco2_2015_01', 'xco2_2015_02', 'xco2_2015_03', 'xco2_2015_04', 'xco2_2015_05',
                         'xco2_2015_06', 'xco2_2015_07', 'xco2_2015_08', 'xco2_2015_09', 'xco2_2015_10',
                         'xco2_2015_11', 'xco2_2015_12']].std(axis=1)

#2016 standard deviation
df_std[2016] = df_std[['xco2_2016_01', 'xco2_2016_02', 'xco2_2016_03', 'xco2_2016_04', 'xco2_2016_05',
                         'xco2_2016_06', 'xco2_2016_07', 'xco2_2016_08', 'xco2_2016_09', 'xco2_2016_10',
                         'xco2_2016_11', 'xco2_2016_12']].std(axis=1)

#2017 standard deviation
df_std[2017] = df_std[['xco2_2017_01', 'xco2_2017_02', 'xco2_2017_03', 'xco2_2017_04', 'xco2_2017_05',
                         'xco2_2017_06', 'xco2_2017_07', 'xco2_2017_08', 'xco2_2017_09', 'xco2_2017_10',
                         'xco2_2017_11', 'xco2_2017_12']].std(axis=1)

#2018 standard deviation
df_std[2018] = df_std[['xco2_2018_01', 'xco2_2018_02', 'xco2_2018_03', 'xco2_2018_04', 'xco2_2018_05',
                         'xco2_2018_06', 'xco2_2018_07', 'xco2_2018_08', 'xco2_2018_09', 'xco2_2018_10',
                         'xco2_2018_11', 'xco2_2018_12']].std(axis=1)

#2019 standard deviation
df_std[2019] = df_std[['xco2_2019_01', 'xco2_2019_02', 'xco2_2019_03', 'xco2_2019_04', 'xco2_2019_05',
                         'xco2_2019_06', 'xco2_2019_07', 'xco2_2019_08', 'xco2_2019_09', 'xco2_2019_10',
                         'xco2_2019_11', 'xco2_2019_12']].std(axis=1)

#2020 standard deviation
df_std[2020] = df_std[['xco2_2020_01', 'xco2_2020_02', 'xco2_2020_03', 'xco2_2020_04', 'xco2_2020_05',
                         'xco2_2020_06', 'xco2_2020_07', 'xco2_2020_08', 'xco2_2020_09', 'xco2_2020_10',
                         'xco2_2020_11', 'xco2_2020_12']].std(axis=1)

# annual columns only
df_std = df_std[['latitude', 'longitude', 2014, 2015, 2016, 2017, 2018, 2019, 2020]]

#convert from wide to long with year as column
df_std = df_std.melt(id_vars=['latitude', 'longitude'], var_name='year', value_name='xco2_std')

df_std.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44275 entries, 0 to 44274
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   44275 non-null  float64
 1   longitude  44275 non-null  float64
 2   year       44275 non-null  object 
 3   xco2_std   17486 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.4+ MB


In [97]:
# create annual averages
df_co2[2014] = df_co2[['xco2_2014_01', 'xco2_2014_02', 'xco2_2014_03', 'xco2_2014_04', 'xco2_2014_05',
                              'xco2_2014_06', 'xco2_2014_07', 'xco2_2014_08', 'xco2_2014_09', 'xco2_2014_10',
                              'xco2_2014_11', 'xco2_2014_12']].mean(axis=1)

df_co2[2015] = df_co2[['xco2_2015_01', 'xco2_2015_02', 'xco2_2015_03', 'xco2_2015_04', 'xco2_2015_05',
                                'xco2_2015_06', 'xco2_2015_07', 'xco2_2015_08', 'xco2_2015_09', 'xco2_2015_10',
                                'xco2_2015_11', 'xco2_2015_12']].mean(axis=1)

df_co2[2016] = df_co2[['xco2_2016_01', 'xco2_2016_02', 'xco2_2016_03', 'xco2_2016_04', 'xco2_2016_05',
                                'xco2_2016_06', 'xco2_2016_07', 'xco2_2016_08', 'xco2_2016_09', 'xco2_2016_10',
                                'xco2_2016_11', 'xco2_2016_12']].mean(axis=1)

df_co2[2017] = df_co2[['xco2_2017_01', 'xco2_2017_02', 'xco2_2017_03', 'xco2_2017_04', 'xco2_2017_05',
                                'xco2_2017_06', 'xco2_2017_07', 'xco2_2017_08', 'xco2_2017_09', 'xco2_2017_10',
                                'xco2_2017_11', 'xco2_2017_12']].mean(axis=1)

df_co2[2018] = df_co2[['xco2_2018_01', 'xco2_2018_02', 'xco2_2018_03', 'xco2_2018_04', 'xco2_2018_05',
                                'xco2_2018_06', 'xco2_2018_07', 'xco2_2018_08', 'xco2_2018_09', 'xco2_2018_10',
                                'xco2_2018_11', 'xco2_2018_12']].mean(axis=1)

df_co2[2019] = df_co2[['xco2_2019_01', 'xco2_2019_02', 'xco2_2019_03', 'xco2_2019_04', 'xco2_2019_05',
                                'xco2_2019_06', 'xco2_2019_07', 'xco2_2019_08', 'xco2_2019_09', 'xco2_2019_10',
                                'xco2_2019_11', 'xco2_2019_12']].mean(axis=1)

df_co2[2020] = df_co2[['xco2_2020_01', 'xco2_2020_02', 'xco2_2020_03', 'xco2_2020_04', 'xco2_2020_05',
                                'xco2_2020_06', 'xco2_2020_07', 'xco2_2020_08', 'xco2_2020_09', 'xco2_2020_10',
                                'xco2_2020_11', 'xco2_2020_12']].mean(axis=1)
# annual columns only
df_co2 = df_co2[['latitude', 'longitude', 2014, 2015, 2016, 2017, 2018, 2019, 2020]]

#convert from wide to long with year as column
df_co2 = df_co2.melt(id_vars=['latitude', 'longitude'], var_name='year', value_name='xco2')

df_co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44275 entries, 0 to 44274
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   44275 non-null  float64
 1   longitude  44275 non-null  float64
 2   year       44275 non-null  object 
 3   xco2       17486 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.4+ MB


In [98]:
#join df_std and df_co2
df_co2 = df_co2.merge(df_std, on=['latitude', 'longitude', 'year'], how='left')

In [99]:
df_co2.isna().sum()

latitude         0
longitude        0
year             0
xco2         26789
xco2_std     26789
dtype: int64

In [101]:
#drop rows with missing values
df_co2 = df_co2.dropna()

## Importing 2019 census tract data and joining with CO2 data
Data Source: https://catalog.data.gov/dataset/2019-cartographic-boundary-shapefile-current-census-tract-for-united-states-1-500000
Metadata: https://www.atsdr.cdc.gov/placeandhealth/svi/documentation/pdf/SVI2018Documentation_01192022_1.pdf

Census tracts are polygons and CO2 values are assigned to coordinates that are at the center of one degree pixels from the raster data. The CO2 values are left joined to the census tract data, so that each census tract will receive a CO2 value for the one degree pixel, that tract most overlaps.

In [102]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from osgeo import ogr
import os

In [103]:
#show fields in shapefile
shapefile_name = "/Users/joannarashid/Documents/Documents - Joanna’s MacBook Pro/School/Vernova/cb_2019_us_tract_500k/cb_2019_us_tract_500k.shp"

driver = ogr.GetDriverByName("ESRI Shapefile")
dataSource = driver.Open(shapefile_name, 0)
layer = dataSource.GetLayer()
layerDefinition = layer.GetLayerDefn()

for i in range(layerDefinition.GetFieldCount()):
    print(layerDefinition.GetFieldDefn(i).GetName())

AttributeError: 'NoneType' object has no attribute 'GetLayer'

In [104]:
#import census tract level shape file
tract = gpd.read_file(r'/Users/joannarashid/Documents/GitHub/Team-Project-Practicum-6748/cb_2019_us_tract_500k/cb_2019_us_tract_500k.shp')

#establish coordinate points which are the center point of each 1degree pixel of the CO2 raster data
df_co2['coords'] = list(zip(df_co2['longitude'],df_co2['latitude']))
df_co2['coords'] = df_co2['coords'].apply(Point)

#convert points to geodataframe
points = gpd.GeoDataFrame(df_co2, geometry='coords', crs=tract.crs)

# join census tract with points
# s_nearest is used to left join co2 data to census tract data
# each census tract polygon will be assigned the raster value of the closest C02 point
# which represents the 1 degree pixel that the census tract most overlaps
df_join_tract = gpd.sjoin_nearest(tract, points, how='left')




In [105]:
df_join_tract.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 518077 entries, 0 to 73867
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   STATEFP      518077 non-null  object  
 1   COUNTYFP     518077 non-null  object  
 2   TRACTCE      518077 non-null  object  
 3   AFFGEOID     518077 non-null  object  
 4   GEOID        518077 non-null  object  
 5   NAME         518077 non-null  object  
 6   LSAD         518077 non-null  object  
 7   ALAND        518077 non-null  int64   
 8   AWATER       518077 non-null  int64   
 9   geometry     518077 non-null  geometry
 10  index_right  518077 non-null  int64   
 11  latitude     518077 non-null  float64 
 12  longitude    518077 non-null  float64 
 13  year         518077 non-null  object  
 14  xco2         518077 non-null  float64 
 15  xco2_std     518077 non-null  float64 
dtypes: float64(4), geometry(1), int64(3), object(8)
memory usage: 67.2+ MB


## Join CO2 data with SVI data

In [106]:
#convert to pandas dataframe
df_join_tract = pd.DataFrame(df_join_tract)

In [107]:
df_join_tract.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 518077 entries, 0 to 73867
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   STATEFP      518077 non-null  object  
 1   COUNTYFP     518077 non-null  object  
 2   TRACTCE      518077 non-null  object  
 3   AFFGEOID     518077 non-null  object  
 4   GEOID        518077 non-null  object  
 5   NAME         518077 non-null  object  
 6   LSAD         518077 non-null  object  
 7   ALAND        518077 non-null  int64   
 8   AWATER       518077 non-null  int64   
 9   geometry     518077 non-null  geometry
 10  index_right  518077 non-null  int64   
 11  latitude     518077 non-null  float64 
 12  longitude    518077 non-null  float64 
 13  year         518077 non-null  object  
 14  xco2         518077 non-null  float64 
 15  xco2_std     518077 non-null  float64 
dtypes: float64(4), geometry(1), int64(3), object(8)
memory usage: 67.2+ MB


In [109]:
#rename df_co2_01_2018 GEOID to FIPS
df_join_tract.rename(columns = {"GEOID" : "FIPS"}, inplace= True)

#converting all FIPS to int
df_join_tract['FIPS'] = df_join_tract['FIPS'].astype(int)

df_svi_long['FIPS'] = df_svi_long['FIPS'].astype(int)

In [122]:
# Specify the desired columns
co2_columns = ['longitude', 'latitude','xco2', 'xco2_std', 'year']

#left join df_svi_long with df_co2 on FIPS and year
final_df = df_join_tract.merge(df_svi_long, on=['FIPS', 'year'], how='left')[co2_columns + list(df_svi_long.columns)]

In [123]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 518077 entries, 0 to 518076
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   longitude         518077 non-null  float64
 1   latitude          518077 non-null  float64
 2   xco2              518077 non-null  float64
 3   xco2_std          518077 non-null  float64
 4   year              518077 non-null  object 
 5   ST                510734 non-null  float64
 6   STATE             510734 non-null  object 
 7   ST_ABBR           510734 non-null  object 
 8   STCNTY            510734 non-null  float64
 9   COUNTY            510734 non-null  object 
 10  FIPS              518077 non-null  int64  
 11  LOCATION          510734 non-null  object 
 12  year              518077 non-null  object 
 13  total_population  487784 non-null  float64
 14  housing_units     487784 non-null  float64
 15  num_households    487784 non-null  float64
 16  unemployment      48

In [124]:
#droping nas which are census tracts that did not overlap with the CO2 raster data
final_df.dropna(inplace = True)

In [125]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487736 entries, 0 to 518076
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   longitude         487736 non-null  float64
 1   latitude          487736 non-null  float64
 2   xco2              487736 non-null  float64
 3   xco2_std          487736 non-null  float64
 4   year              487736 non-null  object 
 5   ST                487736 non-null  float64
 6   STATE             487736 non-null  object 
 7   ST_ABBR           487736 non-null  object 
 8   STCNTY            487736 non-null  float64
 9   COUNTY            487736 non-null  object 
 10  FIPS              487736 non-null  int64  
 11  LOCATION          487736 non-null  object 
 12  year              487736 non-null  object 
 13  total_population  487736 non-null  float64
 14  housing_units     487736 non-null  float64
 15  num_households    487736 non-null  float64
 16  unemployment      48

In [121]:
final_df.head()

Unnamed: 0,longitude,latitude,xco2,year,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,...,year.1,total_population,housing_units,num_households,unemployment,socioeconomic,household_comp,minority_status,housing_type,overall_svi
0,-74.5,40.5,403.779935,2016,34.0,New Jersey,NJ,34005.0,Burlington,34005702808,...,2016,3204.0,1123.0,1041.0,244.0,0.6944,0.7248,0.7987,0.1516,0.6031
1,-74.5,40.5,408.250298,2018,34.0,New Jersey,NJ,34005.0,Burlington,34005702808,...,2018,3383.0,1136.0,1058.0,164.0,0.5653,0.8186,0.7807,0.0617,0.4835
2,-74.5,40.5,397.730751,2014,34.0,New Jersey,NJ,34005.0,Burlington,34005702808,...,2014,3193.0,1103.0,1064.0,226.0,0.6477,0.7343,0.8944,0.1606,0.6107
3,-74.5,40.5,413.296552,2020,34.0,New Jersey,NJ,34005.0,Burlington,34005702808,...,2020,3447.0,1141.0,1053.0,77.0,0.5848,0.6889,0.9285,0.0703,0.4542
4,-74.5,40.5,400.576686,2015,34.0,New Jersey,NJ,34005.0,Burlington,34005702808,...,2015,3198.5,1113.0,1052.5,235.0,0.67105,0.72955,0.84655,0.1561,0.6069


## Computing additional CO2 features that capture temporal changes

In [126]:
#change in co2 since previous year for each FIPS
final_df['co2_1yr_change'] = final_df.groupby(['FIPS'])['xco2'].diff()

In [127]:
#total change since 2014 for each FIPS
final_df['co2_6yr_change'] = final_df.groupby(['FIPS'])['xco2'].diff(periods=6)

In [128]:
#percent change since previous year for each FIPS
final_df['co2_1yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change()

In [129]:
#percent change since 2014 for each FIPS
final_df['co2_6yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=6)
#5 year average co2 for each FIPS
final_df['co2_5yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=5)
#4 year average co2 for each FIPS
final_df['co2_4yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=4)
#3 year average co2 for each FIPS
final_df['co2_3yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=3)
#2 year average co2 for each FIPS
final_df['co2_2yr_pct_change'] = final_df.groupby(['FIPS'])['xco2'].pct_change(periods=2)

In [130]:
#rename xco2 to avg_co2
final_df.rename(columns = {"xco2" : "avg_co2"}, inplace= True)

In [131]:
final_df.head(20)

Unnamed: 0,longitude,latitude,avg_co2,xco2_std,year,ST,STATE,ST_ABBR,STCNTY,COUNTY,...,housing_type,overall_svi,co2_1yr_change,co2_6yr_change,co2_1yr_pct_change,co2_6yr_pct_change,co2_5yr_pct_change,co2_4yr_pct_change,co2_3yr_pct_change,co2_2yr_pct_change
0,-74.5,40.5,403.779935,2.676985,2016,34.0,New Jersey,NJ,34005.0,Burlington,...,0.1516,0.6031,,,,,,,,
1,-74.5,40.5,408.250298,2.72218,2018,34.0,New Jersey,NJ,34005.0,Burlington,...,0.0617,0.4835,4.470362,,0.011071,,,,,
2,-74.5,40.5,397.730751,3.297426,2014,34.0,New Jersey,NJ,34005.0,Burlington,...,0.1606,0.6107,-10.519547,,-0.025767,,,,,-0.014981
3,-74.5,40.5,413.296552,2.7893,2020,34.0,New Jersey,NJ,34005.0,Burlington,...,0.0703,0.4542,15.565801,,0.039137,,,,0.023569,0.012361
4,-74.5,40.5,400.576686,2.817643,2015,34.0,New Jersey,NJ,34005.0,Burlington,...,0.1561,0.6069,-12.719866,,-0.030777,,,-0.007933,-0.018796,0.007155
5,-74.5,40.5,406.027738,2.733697,2017,34.0,New Jersey,NJ,34005.0,Burlington,...,0.10665,0.5433,5.451052,,0.013608,,0.005567,-0.005444,0.020861,-0.017587
6,-74.5,40.5,410.654556,2.893552,2019,34.0,New Jersey,NJ,34005.0,Burlington,...,0.066,0.46885,4.626818,6.874621,0.011395,0.017026,0.005889,0.032494,-0.006392,0.025158
7,-81.5,41.5,412.676758,2.713684,2020,39.0,Ohio,OH,39153.0,Summit,...,0.9113,0.9626,,,,,,,,
8,-81.5,41.5,399.964572,2.747933,2015,39.0,Ohio,OH,39153.0,Summit,...,0.66785,0.83985,-12.712186,,-0.030804,,,,,
9,-81.5,41.5,405.658793,2.820804,2017,39.0,Ohio,OH,39153.0,Summit,...,0.5177,0.79745,5.694221,,0.014237,,,,,-0.017006


In [72]:
#export to csv
final_df.to_csv(r'/Users/joannarashid/Documents/GitHub/Team-Project-Practicum-6748/2014_2020_all_tract.csv', index = False)