### Data
The code below reads in Geotiff files from NASA Socioeconomic Data and Applications Center (SEDAC)
Documentation for the U.S. Social Vulnerability Index Grids, v1 (2000, 2010, 2014, 2016, 2018).

Using the gdal from the osgeo package, latitude and longitude values are generated for each pixel in the raster.
The raster values are then read into a Pandas DataFrame with the latitude and longitude values as columns.
The resulting DataFrame is then merged with the other years to create a single DataFrame with all years and economic values whcih include: Socioeconomic Ranking, Household Composition & Disability, Minority Status & Language, Housing Type & Transportation, and overall Social Vulnerability Index Ranking.

In [1]:
#base dependencies
import pandas as pd
import numpy as np

#GIS packages
from osgeo import gdal
from geopy.geocoders import Nominatim
import geopandas as gpd
import georasters as gr

#utility packages
import os
import requests
from bs4 import BeautifulSoup
import re

#plots
import plotly.express as px

### Raw Data

In [2]:
# Raw data as geoTifs hosted on GitHub Repo
tif_urls = ["https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2014_tract_household_nad83.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2014_tract_housing_nad83.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2014_tract_minority_nad83.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2014_tract_overall_nad83.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2014_tract_socioeconomic_nad83.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2016_tract_household_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2016_tract_housing_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2016_tract_minority_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2016_tract_overall_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2016_tract_socioeconomic_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2018_tract_household_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2018_tract_housing_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2018_tract_minority_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2018_tract_overall_nad83_nopop.tif",
            "https://github.com/DanDryer/Team-Project-Practicum-6748/raw/main/SEDAC_SVI_data/svi_2018_tract_socioeconomic_nad83_nopop.tif"
            ]



### Data Cleaning

In [3]:
def geotif_to_pandas(path, value_name):
    """
    This function converts a GeoTIFF file to a GeoDataFrame and adds latitude and longitude values for each pixel
    :param path: The path to the GeoTIFF file
    :param value_name: The name of the column for raster values in the GeoDataFrame
    :return: A GeoDataFrame with latitude, longitude, and raster values
    """
    # Read the GeoTIFF file 
    geotif = gr.from_file(path)
    
    #converts to pandas df
    geodf = geotif.to_pandas()

    # Extract the latitude, longitude, and raster values
    df = geodf.loc[:,[True, # Raster values
                      'x', # Latitude
                      'y']]# Longitude
    
    # Rename columns
    column_names = {True: value_name, 'x': 'Latitude', 'y' : 'Longitude'} 
    df = df.rename(columns=column_names)
    
    return df

def add_county_zip(df):
    
    """
    This function uses the latitude and longitude data and a Python client called Geopy to find and add the county information to the dataset.
    :param df: The dataframe that the county information will be added to
    :return: A Pandas DataFrame with latitude, longitude, raster values, and county
    """

    df['coords'] = df['Latitude'].map(str) + ',' + df['Longitude'].map(str)
    
    county = []
    for i in df['coords']:
        locator = Nominatim(user_agent='myGeocoder')
        location = locator.reverse(i)
        value = (location.raw['address']).get('county')
        county.append(value)

    df['County'] = county
    
    zip = []
    for i in df['coords']:
        locator = Nominatim(user_agent='myGeocoder')
        location = locator.reverse(i)
        value = (location.raw['address']).get('postcode')
        zip.append(value)

    return df

In [None]:
### Warning this cell is computationally expensive

# List of dfs for each processed geotiff file
dfs = []

# Iterate through tif_urls and process each file
for tif_url in tif_urls:
    # Extract the year and value name from the URL
    filename = os.path.basename(tif_url)
    match = re.search(r"svi_(\d+)_tract_([a-zA-Z]+)", filename)
    if match:
        year = match.group(1)
        value_name = year + "_" + match.group(2)

        # Call geotif_to_pandas function with the appropriate arguments
        df = geotif_to_pandas(tif_url, value_name)
        
        # Append the df to list
        dfs.append(df)
    else:
        pass
    
# Perform inner join on 'Latitude' and 'Longitude'
df_merged = dfs[0]  # Start with the first DataFrame

for df in dfs[1:]:
    df_merged = pd.merge(df_merged, df, on=['Latitude', 'Longitude'], how='inner')

### could not run this, I kept getting errors related to the Nominatum function ###
#df_merged = add_county_zip(df_merged)

df_svi = df_merged

In [6]:
df_merged.describe()

Unnamed: 0,2014_household,Latitude,Longitude,2014_housing,2014_minority,2014_overall,2014_socioeconomic,2016_household,2016_housing,2016_minority,2016_overall,2016_socioeconomic,2018_household,2018_housing,2018_minority,2018_overall,2018_socioeconomic
count,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0,9153038.0
mean,0.5890963,-100.1339,41.04003,0.5427965,0.3270933,0.4974725,0.5044819,0.5907746,0.533363,0.3183558,0.4960319,0.5090534,0.6012385,0.5322033,0.3154114,0.5002789,0.5172087
std,0.2393126,18.56516,8.028895,0.2481619,0.2510189,0.2445421,0.2375708,0.2389508,0.2572207,0.2479444,0.2482178,0.2398771,0.2362909,0.2590478,0.2444966,0.2494314,0.2406032
min,0.0,-178.175,18.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.4028,-109.0667,35.55,0.3428,0.1162,0.3002,0.3135,0.4169,0.3193,0.111625,0.2923,0.3123,0.4295,0.315,0.113,0.2946,0.3161
50%,0.6123,-96.85833,40.20833,0.5459,0.2554,0.4927,0.5053,0.6184,0.5331,0.2503,0.4918,0.5082,0.629,0.5362,0.2546,0.4982,0.5144
75%,0.7853,-87.25,44.65833,0.7546,0.5102,0.6856,0.6956,0.7808,0.7564,0.4953,0.6887,0.7044,0.7886,0.7546,0.4748,0.6947,0.7105
max,1.0,-66.96667,71.36667,1.0,1.0,1.0,1.0,1.0,1.0,0.9999,1.0,0.9999,1.0,1.0,1.0,1.0,1.0


### Data Visualization

In [None]:
# Visualization is displayed inline
%matplotlib inline

# Set your Mapbox access token
mapbox_access_token = 'pk.eyJ1IjoianJhc2hpZCIsImEiOiJjbGlsMXhjZ3EwNzF3M2VwZWE1NDZtMjB4In0.CesHDGB6S6qDVDtTTeQxvw'

# Define the base figure using the first variable
fig = px.scatter_mapbox(df_svi, lat='Latitude', lon='Longitude',
                        hover_data=[df_svi.columns[2]], zoom=10)

# Add toggling options for each variable
for column in merged_df.columns[3:]:
    fig.add_trace(px.scatter_mapbox(df_svi,
                                    lat='Latitude',
                                    lon='Longitude',
                                    hover_data=[column]).data[0])

# Update the layout to include map style and title
fig.update_layout(mapbox_style="open-street-map", mapbox_accesstoken=mapbox_access_token)
fig.update_layout(title="Interactive Point Map")

# Show the interactive plot inline in the Jupyter Notebook
fig.show(renderer='notebook')


### Creating Additional Values