# Imports

In [3]:
import pandas as pd
import geopandas as gpd
from pathlib import Path
import os
import matplotlib.pyplot as plt


# State fips codes for later merging with the shapefile

In [4]:

state_fips_to_full_info = {
    '01': ['AL', 'Alabama'],
    '02': ['AK', 'Alaska'],
    '04': ['AZ', 'Arizona'],
    '05': ['AR', 'Arkansas'],
    '06': ['CA', 'California'],
    '08': ['CO', 'Colorado'],
    '09': ['CT', 'Connecticut'],
    '10': ['DE', 'Delaware'],
    '11': ['DC', 'District of Columbia'],
    '12': ['FL', 'Florida'],
    '13': ['GA', 'Georgia'],
    '15': ['HI', 'Hawaii'],
    '16': ['ID', 'Idaho'],
    '17': ['IL', 'Illinois'],
    '18': ['IN', 'Indiana'],
    '19': ['IA', 'Iowa'],
    '20': ['KS', 'Kansas'],
    '21': ['KY', 'Kentucky'],
    '22': ['LA', 'Louisiana'],
    '23': ['ME', 'Maine'],
    '24': ['MD', 'Maryland'],
    '25': ['MA', 'Massachusetts'],
    '26': ['MI', 'Michigan'],
    '27': ['MN', 'Minnesota'],
    '28': ['MS', 'Mississippi'],
    '29': ['MO', 'Missouri'],
    '30': ['MT', 'Montana'],
    '31': ['NE', 'Nebraska'],
    '32': ['NV', 'Nevada'],
    '33': ['NH', 'New Hampshire'],
    '34': ['NJ', 'New Jersey'],
    '35': ['NM', 'New Mexico'],
    '36': ['NY', 'New York'],
    '37': ['NC', 'North Carolina'],
    '38': ['ND', 'North Dakota'],
    '39': ['OH', 'Ohio'],
    '40': ['OK', 'Oklahoma'],
    '41': ['OR', 'Oregon'],
    '42': ['PA', 'Pennsylvania'],
    '44': ['RI', 'Rhode Island'],
    '45': ['SC', 'South Carolina'],
    '46': ['SD', 'South Dakota'],
    '47': ['TN', 'Tennessee'],
    '48': ['TX', 'Texas'],
    '49': ['UT', 'Utah'],
    '50': ['VT', 'Vermont'],
    '51': ['VA', 'Virginia'],
    '53': ['WA', 'Washington'],
    '54': ['WV', 'West Virginia'],
    '55': ['WI', 'Wisconsin'],
    '56': ['WY', 'Wyoming']
}

# Invert the dictionary to map state names to FIPS codes
name_to_fips = {info[1]: fips for fips, info in state_fips_to_full_info.items()}






# loads all csvs in a folder into a df (if they have the necessary columns)

In [5]:
def load_all_csv_to_df(directory):
    path = Path(directory)
    csv_files = list(path.glob('*.csv'))
    all_data_frames = []

    required_columns = ['date', 'cumulative_deceased', 'new_deceased', 'subregion2_name', 'population', 'subregion1_name']
    
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        # Check if required columns are present
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            # Report missing columns and skip this file
            print(f"File '{file_path}' is missing columns: {missing_columns}")
            continue
        all_data_frames.append(df)
    
    # Concatenate all dataframes that have the required columns
    combined_df = pd.concat(all_data_frames, ignore_index=True)
    return combined_df


# Color assignment functions

In [6]:
# progressive color assignment
def assign_color(rate):
    """Assigns a color based on the normalized incidence rate, handling NaN values."""
    # Check if rate is NaN
    if pd.isna(rate):
        # Return a default color, e.g., gray
        return '#808080'  # Gray color in hex
    else:
        red = int(rate * 255)
        green = 255 - red
        blue = 0  # Blue component remains 0 throughout
        # Convert RGB to hex
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)


In [7]:
# binning color assignment
def assign_color_by_bin(rate, num_bins):
    if pd.isna(rate) or num_bins <= 0:
        # Return a default color, e.g., gray, if the input is NA or num_bins is invalid
        return '#808080'  # Gray color in hex
    else:
        # Calculate the index of the bin to which the rate belongs
        bin_index = min(int(rate * num_bins), num_bins - 1)
        
        # Calculate the color components based on the bin index
        red = int(bin_index / (num_bins - 1) * 255)
        green = 255 - red
        blue = 0  # Blue component remains 0 throughout

        # Convert RGB to hex
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)


# maps state_fips to name of state in original dataset

In [8]:
def map_state_fips(dataframe, state_name_col='subregion1_name'):
    """Maps state names to FIPS codes within the provided DataFrame and ensures they are stored as two-digit strings."""
    dataframe['state_fips'] = dataframe[state_name_col].map(name_to_fips)
    dataframe['state_fips'] = dataframe['state_fips'].apply(lambda x: x.zfill(2) if isinstance(x, str) else None)
    return dataframe


In [9]:
def process_data(all_data_df, time_frames, color_funcs, save_dir, **kwargs):
    # force date column to be datetime
    all_data_df['date'] = pd.to_datetime(all_data_df['date'])
    
    # filter out columns where new_deceased < 0
    all_data_df = all_data_df[all_data_df['new_deceased'] >= 0]
    if 'subregion1_name' in all_data_df.columns:
        all_data_df = map_state_fips(all_data_df)
    for time_frame in time_frames:
        for color_func in color_funcs:
            group_cols = ['state_fips', pd.Grouper(key='date', freq=time_frame)]
            agg_dict = {
                'population': 'last',
                'new_deceased': 'sum',
                'cumulative_deceased': 'last'
            }
            aggregated_data = all_data_df.groupby(group_cols).agg(agg_dict).reset_index()
            aggregated_data['incidence_rate'] = (aggregated_data['new_deceased'] / aggregated_data['population']) * 100000
            min_rate = aggregated_data['incidence_rate'].min()
            max_rate = aggregated_data['incidence_rate'].max()
            if max_rate > min_rate:
                aggregated_data['normalized_incidence_rate'] = (aggregated_data['incidence_rate'] - min_rate) / (max_rate - min_rate)
            else:
                aggregated_data['normalized_incidence_rate'] = 0.0
            if color_func == 'assign_color':
                aggregated_data['color'] = aggregated_data['normalized_incidence_rate'].apply(assign_color)
            elif color_func == 'assign_color_by_bin':
                num_bins = kwargs.get('num_bins', 5)
                aggregated_data['color'] = aggregated_data['normalized_incidence_rate'].apply(lambda x: assign_color_by_bin(x, num_bins))
            Path(save_dir).mkdir(parents=True, exist_ok=True)
            filename = f"aggregated_data_{time_frame}_{color_func}.csv"
            aggregated_data.to_csv(Path(save_dir) / filename, index=False)


# filter the csv to only include the columns_to_keep for later usage in joining with shapefile df

In [10]:
def filter_csv_by_date_and_columns(csv_file_path, date, columns_to_keep):
    df = pd.read_csv(csv_file_path)
    df['date'] = pd.to_datetime(df['date'])
    filtered_df = df[df['date'] == pd.Timestamp(date)]
    filtered_df = filtered_df[columns_to_keep]
    filtered_df['state_fips'] = pd.to_numeric(filtered_df['state_fips'], errors='coerce').fillna(-1).astype(int)
    filtered_df['state_fips'] = filtered_df['state_fips'].apply(lambda x: f"{x:02d}" if x != -1 else None)
    filtered_df.dropna(subset=['state_fips'], inplace=True)
    return filtered_df

# Merge with filtered_df with shapefile

In [13]:
import geopandas as gpd

def merge_shapefile_with_data(us_counties, filtered_df, output_file_path=None):
    # Ensure the 'state_fips' and 'color' columns are handled correctly
    filtered_df['state_fips'] = filtered_df['state_fips'].astype(str)  # Ensure state_fips is string
    # Fill missing color values with grey or assign grey if the color column is missing
    filtered_df['color'] = filtered_df.get('color', pd.Series(index=filtered_df.index, dtype=str)).fillna('#808080')
    
    # Merge the filtered DataFrame with the GeoDataFrame on state FIPS and subregion name
    merged_gdf = us_counties.merge(filtered_df, left_on=['STATEFP', 'NAMELSAD'], right_on=['state_fips', 'subregion2_name'], how='left')

    # Ensure that any rows in merged_gdf without a 'color' value are set to grey
    merged_gdf['color'].fillna('#808080', inplace=True)
    
    # Optionally convert 'date' column to string to ensure compatibility with all file formats
    if 'date' in merged_gdf.columns:
        merged_gdf['date'] = merged_gdf['date'].astype(str)

    # Optionally save the merged GeoDataFrame to a file if an output path is provided
    if output_file_path:
        merged_gdf.to_file(output_file_path, driver='GeoJSON')  # Using GeoJSON as an example, adjust as necessary

    return merged_gdf


# Plot merged_gdf geospatial plot

In [14]:
import matplotlib.pyplot as plt
import geopandas as gpd

def plot_counties_from_merged_gdf(merged_gdf, title, output_png_path):
    # Define the continental US bounds
    continental_us_bounds = {
        "minx": -130,
        "miny": 24,
        "maxx": -66,
        "maxy": 50
    }
    
    # Create a figure and axis with specified figsize and resolution (DPI)
    fig, ax = plt.subplots(figsize=(15, 8), dpi=300)
    
    # Set the bounds for the continental US
    ax.set_xlim(continental_us_bounds["minx"], continental_us_bounds["maxx"])
    ax.set_ylim(continental_us_bounds["miny"], continental_us_bounds["maxy"])
    
    # Plot all counties in the dataset with a neutral color to provide a base map
    merged_gdf.plot(ax=ax, color='lightgrey', edgecolor='black', linewidth=0.4)
    
    # Overlay the counties with specified colors where available
    # Ensure that color values are correctly interpreted
    merged_gdf.dropna(subset=['color']).plot(ax=ax, color=merged_gdf['color'], edgecolor='black', linewidth=0.4)
    
    # Adjust plot parameters
    ax.set_title(title)
    ax.set_axis_off()
    
    # Save the figure to a PNG file with the specified path and high resolution
    plt.savefig(output_png_path, dpi=300, bbox_inches='tight')
    
    # Display the plot
    plt.show()


# Run the functions to generate all plots

In [15]:
import pandas as pd
import os
import glob
from pathlib import Path
import geopandas as gpd
import matplotlib.pyplot as plt

# Set directories and paths
csv_files_directory = '../All CSVs'  # Adjust if different
directory = 'Modified Data'
output_dir = os.path.join(directory, "Processed CSVs")
shapefile_path = 'tl_2023_us_county.shp'  # Adjust path as needed

# Load shapefile
shape_df = gpd.read_file(shapefile_path)

# Define time intervals and color functions
time_intervals = ['2M', '3M']#['3D', '7D', '14D', '21D', '1M', '2M']
color_functions = ['assign_color', 'assign_color_by_bin']
num_bins = 5  # Number of bins for binning color function, applicable only when using assign_color_by_bin

# Ensure output directories exist
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Load all CSV files that contain the necessary columns
all_data_df = load_all_csv_to_df(csv_files_directory)

# Process each time interval for both color functions
for interval in time_intervals:
    for color_func in color_functions:
        if color_func == 'assign_color':
            process_data(all_data_df, [interval], [color_func], output_dir)
        else:
            process_data(all_data_df, [interval], [color_func], output_dir, num_bins=num_bins)

# Update file_list to match the new files
pattern = os.path.join(output_dir, 'aggregated_data_*.csv')
file_list = glob.glob(pattern)

# Columns to keep for plotting
columns_to_keep = ['date', 'state_fips', 'subregion2_name', 'color']

# Iterate through the list of file paths
for file_path in file_list:
    filename = os.path.basename(file_path)
    parts = filename.replace('aggregated_data_', '').replace('.csv', '').split('_')
    interval = parts[1]
    color_method = parts[0]

    df = pd.read_csv(file_path)
    unique_dates = sorted(df['date'].unique())

    for date in unique_dates:
        # Filter and merge data for each unique date
        filtered_df = filter_csv_by_date_and_columns(file_path, date, columns_to_keep)
        merged_gdf = merge_shapefile_with_data(shape_df, filtered_df)

        # Plot and save as PNG
        plot_title = f"{date} - {interval} - {color_method}"
        output_png_path = f"{output_dir}/{date} - {interval} - {color_method}.png"
        plot_counties_from_merged_gdf(merged_gdf, plot_title, output_png_path)


File '..\All CSVs\US_AK_02060.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_AK_02105.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_AK_02164.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_AK_02282.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_CA_SFO.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_PR_72001.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_PR_72003.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_PR_72005.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_PR_72007.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_PR_72009.csv' is missing columns: ['cumulative_deceased', 'new_deceased']
File '..\All CSVs\US_PR_72011.csv' is missing column

  dataframe['state_fips'] = dataframe[state_name_col].map(name_to_fips)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['state_fips'] = dataframe[state_name_col].map(name_to_fips)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['state_fips'] = dataframe['state_fips'].apply(lambda x: x.zfill(2) if isinstance(x, str) else None)
  group_cols = ['state_fips', pd.Grouper(key='date', freq=time_frame)]


MemoryError: Unable to allocate 9.38 GiB for an array with shape (522, 2412020) and data type float64