# This will be used for binning (Collin believes)

In [None]:
import pandas as pd
#import geopandas as gpd
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:

# Lookup table keyed by two-digit state FIPS code (as a zero-padded string).
# Each value is a two-item list: [postal abbreviation, full state name].
# Covers the 50 states plus the District of Columbia (51 entries).
state_fips_to_full_info = {
    '01': ['AL', 'Alabama'],
    '02': ['AK', 'Alaska'],
    '04': ['AZ', 'Arizona'],
    '05': ['AR', 'Arkansas'],
    '06': ['CA', 'California'],
    '08': ['CO', 'Colorado'],
    '09': ['CT', 'Connecticut'],
    '10': ['DE', 'Delaware'],
    '11': ['DC', 'District of Columbia'],
    '12': ['FL', 'Florida'],
    '13': ['GA', 'Georgia'],
    '15': ['HI', 'Hawaii'],
    '16': ['ID', 'Idaho'],
    '17': ['IL', 'Illinois'],
    '18': ['IN', 'Indiana'],
    '19': ['IA', 'Iowa'],
    '20': ['KS', 'Kansas'],
    '21': ['KY', 'Kentucky'],
    '22': ['LA', 'Louisiana'],
    '23': ['ME', 'Maine'],
    '24': ['MD', 'Maryland'],
    '25': ['MA', 'Massachusetts'],
    '26': ['MI', 'Michigan'],
    '27': ['MN', 'Minnesota'],
    '28': ['MS', 'Mississippi'],
    '29': ['MO', 'Missouri'],
    '30': ['MT', 'Montana'],
    '31': ['NE', 'Nebraska'],
    '32': ['NV', 'Nevada'],
    '33': ['NH', 'New Hampshire'],
    '34': ['NJ', 'New Jersey'],
    '35': ['NM', 'New Mexico'],
    '36': ['NY', 'New York'],
    '37': ['NC', 'North Carolina'],
    '38': ['ND', 'North Dakota'],
    '39': ['OH', 'Ohio'],
    '40': ['OK', 'Oklahoma'],
    '41': ['OR', 'Oregon'],
    '42': ['PA', 'Pennsylvania'],
    '44': ['RI', 'Rhode Island'],
    '45': ['SC', 'South Carolina'],
    '46': ['SD', 'South Dakota'],
    '47': ['TN', 'Tennessee'],
    '48': ['TX', 'Texas'],
    '49': ['UT', 'Utah'],
    '50': ['VT', 'Vermont'],
    '51': ['VA', 'Virginia'],
    '53': ['WA', 'Washington'],
    '54': ['WV', 'West Virginia'],
    '55': ['WI', 'Wisconsin'],
    '56': ['WY', 'Wyoming']
}

In [22]:

def update_column_names_in_csv(folder_path):
    """
    Rename the 'locality_name' column to 'subregion2_name' in every CSV of a folder.

    Each '*.csv' file directly inside the folder is loaded. A file is rewritten in
    place (without the index column) only when it has 'locality_name' and does not
    already have 'subregion2_name'; every other file is left untouched.

    Parameters:
    - folder_path: Path of the folder to scan. Sub-folders are not searched.
    """
    for csv_path in Path(folder_path).glob('*.csv'):
        frame = pd.read_csv(csv_path)
        already_renamed = 'subregion2_name' in frame.columns
        has_old_name = 'locality_name' in frame.columns
        if already_renamed or not has_old_name:
            continue  # nothing to rename in this file

        print(f"Updating '{csv_path.name}': 'locality_name' -> 'subregion2_name'")
        renamed = frame.rename(columns={'locality_name': 'subregion2_name'})
        renamed.to_csv(csv_path, index=False)

def make_color_lighter(color, rate, num_bins):
    """
    Blend an RGB color toward white in discrete steps selected by a rate.

    The rate is truncated to a bin index, int(rate * num_bins). The share of white
    mixed in is that index divided by num_bins, so bin 0 returns the color
    unchanged and a rate of 1.0 yields pure white.

    Args:
    color (tuple): The original color as an RGB tuple with values in the range [0, 1].
    rate (float): A value between 0 and 1 that selects the bin.
    num_bins (int): How many lightening steps the rate range is divided into.

    Returns:
    str: The blended color as a lower-case '#rrggbb' string.
    """
    white = (1, 1, 1)
    # Share of white to mix in, quantised to multiples of 1 / num_bins
    white_share = int(rate * num_bins) / num_bins
    keep_share = 1 - white_share

    hex_pairs = []
    for channel, white_channel in zip(color, white):
        blended = keep_share * channel + white_share * white_channel
        hex_pairs.append(f"{int(round(255 * blended)):02x}")
    return '#' + ''.join(hex_pairs)

def make_color_lighter(color, rate, bins):
    """
    Map a rate in [0, 1] onto a discrete green-to-red color scale.

    The scale has `bins` evenly spaced colors running from green (0, 255, 0) to
    red (255, 0, 0). The color index is int(rate * (bins - 1)), so the last color
    (pure red) is only reached when rate is exactly 1.0.

    Args:
    color: Not used by this implementation; accepted so existing call sites keep working.
    rate (float): A value between 0.0 and 1.0. NaN is treated as "no data".
    bins (int): Number of distinct colors on the scale (at least 1).

    Returns:
    str: The color as an upper-case '#RRGGBB' string, or '#808080' (grey) when rate is NaN.

    Raises:
    ValueError: If bins is smaller than 1 or rate lies outside [0.0, 1.0].
    """
    if bins < 1:
        raise ValueError("Number of bins must be at least 1.")

    # NaN is the only value that is not equal to itself. Return the same grey the
    # rest of this file uses for missing data instead of failing the range check.
    if rate != rate:
        return '#808080'

    if not (0.0 <= rate <= 1.0):
        raise ValueError("Rate must be between 0.0 and 1.0.")

    start_color = (0, 255, 0)  # green
    end_color = (255, 0, 0)    # red

    # A single bin has no gradient to step through; dividing by (bins - 1) would
    # raise ZeroDivisionError, so return the start color directly.
    if bins == 1:
        return '#{:02X}{:02X}{:02X}'.format(*start_color)

    last_index = bins - 1
    # Per-channel change between two neighbouring colors on the scale
    step = [(end_color[i] - start_color[i]) / last_index for i in range(3)]
    color_index = int(rate * last_index)
    final_color = [int(start_color[i] + step[i] * color_index) for i in range(3)]

    return '#{:02X}{:02X}{:02X}'.format(*final_color)
            
def assign_color(rate):
    """
    Turn a normalized rate into a hex color on a continuous green-to-red ramp.

    Missing rates (NaN/None) map to grey '#808080'. Otherwise the red channel is
    int(rate * 255), the green channel is its complement to 255, and blue is 0.

    Parameters:
    - rate: Expected to lie in [0, 1]; values outside that range are not checked.

    Returns:
    - Lower-case '#rrggbb' string.
    """
    if pd.isna(rate):
        return '#808080'

    red_level = int(rate * 255)
    green_level = 255 - red_level
    return f'#{red_level:02x}{green_level:02x}{0:02x}'

def process_files_in_folder_pathlib(folder_path):
    """
    Collect the CSV files that sit directly inside a folder.

    Entries matching '*.csv' that are not regular files (for example a directory
    whose name ends in '.csv') are reported on stdout and left out.

    Parameters:
    - folder_path: Path of the folder to scan. Sub-folders are not searched.

    Returns:
    - List of matching file paths as strings, in the order glob yields them.
    """
    csv_paths = []
    for entry in Path(folder_path).glob('*.csv'):
        if not entry.is_file():
            print(f"{entry} is a directory, skipping.\n")
            continue
        csv_paths.append(str(entry))
    return csv_paths

# Reverse lookup: full state name -> two-digit FIPS code (index 1 of each entry is the full name)
name_to_fips = {names[1]: code for code, names in state_fips_to_full_info.items()}



def process_files_and_aggregate_data_generic(csv_files, frequency, save_path):
    """
    Aggregate per-file statistics at a given frequency, assign a binned color to each row, and save everything as one CSV.

    For every CSV that contains all required columns, rows are grouped by
    'subregion1_name', 'subregion2_name' and date bucket. Negative death totals
    are set to 0, an incidence rate per 100,000 population is computed, min-max
    normalized within that file, and turned into a color with make_color_lighter.
    Rows whose normalized rate is NaN (for example missing or zero population)
    get the grey '#808080' instead of aborting the run. Files that lack a
    required column are skipped silently.

    NOTE(review): normalization happens per input file, so colors are only
    comparable within one file - confirm that this is intended.

    Parameters:
    - csv_files: List of file paths to CSV files.
    - frequency: String specifying the frequency for data aggregation (e.g., 'M' for monthly, 'Y' for yearly).
    - save_path: String specifying the path where the aggregated DataFrame should be saved as a CSV file. Missing parent folders are created.

    Returns:
    - A DataFrame containing the aggregated rows of all processed files (empty if none qualified), or None when csv_files is empty. The DataFrame is also saved to save_path.
    """
    color = (0.5, 0, 0)  # dark red
    num_bins = 5  # number of bins
    missing_color = '#808080'  # grey for rows without a usable rate

    if not csv_files:
        print("No CSV files provided.")
        return None

    required_columns = ['date', 'cumulative_deceased', 'new_deceased', 'subregion2_name', 'population', 'subregion1_name']

    # Collect the per-file results and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    per_file_frames = []

    for file_path in csv_files:
        df = pd.read_csv(file_path)

        if any(col not in df.columns for col in required_columns):
            continue  # Skip files that cannot be aggregated

        df['date'] = pd.to_datetime(df['date'])

        # Define the aggregation dictionary based on available columns
        agg_dict = {
            'population': 'last',
            'new_deceased': 'sum',
            'cumulative_deceased': 'last'
        }
        if 'new_persons_fully_vaccinated' in df.columns and 'cumulative_persons_fully_vaccinated' in df.columns:
            agg_dict.update({
                'new_persons_fully_vaccinated': 'sum',
                'cumulative_persons_fully_vaccinated': 'last'
            })

        # Both subregion columns are guaranteed by the required-column check above
        group_cols = ['subregion1_name', 'subregion2_name', pd.Grouper(key='date', freq=frequency)]
        aggregated_data = df.groupby(group_cols).agg(agg_dict).reset_index()

        # State names that are not in name_to_fips end up as NaN here
        aggregated_data['state_fips'] = aggregated_data['subregion1_name'].map(name_to_fips)

        # Negative totals are treated as zero
        aggregated_data.loc[aggregated_data['new_deceased'] < 0, 'new_deceased'] = 0
        aggregated_data['incidence_rate'] = (aggregated_data['new_deceased'] / aggregated_data['population']) * 100000

        min_rate = aggregated_data['incidence_rate'].min()
        max_rate = aggregated_data['incidence_rate'].max()
        if max_rate > min_rate:
            aggregated_data['normalized_incidence_rate'] = (aggregated_data['incidence_rate'] - min_rate) / (max_rate - min_rate)
        else:
            # All values are the same (or all missing): there is no spread to normalize
            aggregated_data['normalized_incidence_rate'] = 0.0

        # A NaN rate would fail make_color_lighter's range check, so map it to grey here
        aggregated_data['color'] = aggregated_data['normalized_incidence_rate'].apply(
            lambda rate: missing_color if pd.isna(rate) else make_color_lighter(color, rate, num_bins)
        )

        per_file_frames.append(aggregated_data)

    if per_file_frames:
        aggregated_stats = pd.concat(per_file_frames, ignore_index=True)
    else:
        aggregated_stats = pd.DataFrame()

    # Ensure the directory exists before trying to save the file
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    aggregated_stats.to_csv(save_path, index=False)

    return aggregated_stats

# Folder holding the input CSV files (relative to the current working directory)
folder_path = "../All CSVs"
csv_files = process_files_in_folder_pathlib(folder_path)

# Aggregation windows to produce; each string is passed on as the pandas Grouper `freq`.
# NOTE(review): pandas 2.2 deprecates the 'M' and 'Y' aliases in favour of 'ME'/'YE' -
# confirm against the installed version before changing; it would also change the file names below.
freq_list = ['6M','1Y']

# Write one aggregated CSV per frequency; the frequency string is part of the file name
for freq in freq_list:
    output_path = f'Modified Data/aggregated_stats_4BinsMethod{freq}.csv'
    aggregated_data = process_files_and_aggregate_data_generic(csv_files, freq, output_path)
    


# This is the start of our 4/18/2024 changes to Gil's code

In [2]:
import pandas as pd
#import geopandas as gpd
import matplotlib.pyplot as plt
from pathlib import Path


In [3]:
def assign_color(rate):
    """
    Turn a normalized rate into a hex color on a continuous green-to-red ramp.

    Missing rates (NaN/None) map to grey '#808080'. Otherwise the red channel is
    int(rate * 255), the green channel is its complement to 255, and blue is 0.

    Parameters:
    - rate: Expected to lie in [0, 1]; values outside that range are not checked.

    Returns:
    - Lower-case '#rrggbb' string.
    """
    if pd.isna(rate):
        return '#808080'

    red_level = int(rate * 255)
    green_level = 255 - red_level
    return f'#{red_level:02x}{green_level:02x}{0:02x}'

In [4]:
# Lookup table keyed by two-digit state FIPS code (as a zero-padded string).
# Each value is a two-item list: [postal abbreviation, full state name].
# Covers the 50 states plus the District of Columbia (51 entries).
state_fips_to_full_info = {
    '01': ['AL', 'Alabama'],
    '02': ['AK', 'Alaska'],
    '04': ['AZ', 'Arizona'],
    '05': ['AR', 'Arkansas'],
    '06': ['CA', 'California'],
    '08': ['CO', 'Colorado'],
    '09': ['CT', 'Connecticut'],
    '10': ['DE', 'Delaware'],
    '11': ['DC', 'District of Columbia'],
    '12': ['FL', 'Florida'],
    '13': ['GA', 'Georgia'],
    '15': ['HI', 'Hawaii'],
    '16': ['ID', 'Idaho'],
    '17': ['IL', 'Illinois'],
    '18': ['IN', 'Indiana'],
    '19': ['IA', 'Iowa'],
    '20': ['KS', 'Kansas'],
    '21': ['KY', 'Kentucky'],
    '22': ['LA', 'Louisiana'],
    '23': ['ME', 'Maine'],
    '24': ['MD', 'Maryland'],
    '25': ['MA', 'Massachusetts'],
    '26': ['MI', 'Michigan'],
    '27': ['MN', 'Minnesota'],
    '28': ['MS', 'Mississippi'],
    '29': ['MO', 'Missouri'],
    '30': ['MT', 'Montana'],
    '31': ['NE', 'Nebraska'],
    '32': ['NV', 'Nevada'],
    '33': ['NH', 'New Hampshire'],
    '34': ['NJ', 'New Jersey'],
    '35': ['NM', 'New Mexico'],
    '36': ['NY', 'New York'],
    '37': ['NC', 'North Carolina'],
    '38': ['ND', 'North Dakota'],
    '39': ['OH', 'Ohio'],
    '40': ['OK', 'Oklahoma'],
    '41': ['OR', 'Oregon'],
    '42': ['PA', 'Pennsylvania'],
    '44': ['RI', 'Rhode Island'],
    '45': ['SC', 'South Carolina'],
    '46': ['SD', 'South Dakota'],
    '47': ['TN', 'Tennessee'],
    '48': ['TX', 'Texas'],
    '49': ['UT', 'Utah'],
    '50': ['VT', 'Vermont'],
    '51': ['VA', 'Virginia'],
    '53': ['WA', 'Washington'],
    '54': ['WV', 'West Virginia'],
    '55': ['WI', 'Wisconsin'],
    '56': ['WY', 'Wyoming']
}


In [5]:
def process_files_in_folder_pathlib(folder_path):
    """
    Collect the CSV files that sit directly inside a folder.

    Entries matching '*.csv' that are not regular files (for example a directory
    whose name ends in '.csv') are reported on stdout and left out.

    Parameters:
    - folder_path: Path of the folder to scan. Sub-folders are not searched.

    Returns:
    - List of matching file paths as strings, in the order glob yields them.
    """
    csv_paths = []
    for entry in Path(folder_path).glob('*.csv'):
        if not entry.is_file():
            print(f"{entry} is a directory, skipping.\n")
            continue
        csv_paths.append(str(entry))
    return csv_paths

# Reverse lookup: full state name -> two-digit FIPS code (index 1 of each entry is the full name)
name_to_fips = {names[1]: code for code, names in state_fips_to_full_info.items()}


In [6]:
def update_column_names_in_csv(folder_path):
    """
    Rename the 'locality_name' column to 'subregion2_name' in every CSV of a folder.

    Each '*.csv' file directly inside the folder is loaded. A file is rewritten in
    place (without the index column) only when it has 'locality_name' and does not
    already have 'subregion2_name'; every other file is left untouched.

    Parameters:
    - folder_path: Path of the folder to scan. Sub-folders are not searched.
    """
    for csv_path in Path(folder_path).glob('*.csv'):
        frame = pd.read_csv(csv_path)
        already_renamed = 'subregion2_name' in frame.columns
        has_old_name = 'locality_name' in frame.columns
        if already_renamed or not has_old_name:
            continue  # nothing to rename in this file

        print(f"Updating '{csv_path.name}': 'locality_name' -> 'subregion2_name'")
        renamed = frame.rename(columns={'locality_name': 'subregion2_name'})
        renamed.to_csv(csv_path, index=False)

In [7]:
def process_files_and_aggregate_data_generic(csv_files, frequency, save_path):
    """
    Aggregate per-file statistics at a given frequency, assign a color to each row, and save everything as one CSV.

    For every CSV that contains all required columns, rows are grouped by
    'subregion1_name', 'subregion2_name' and date bucket. Negative death totals
    are set to 0, an incidence rate per 100,000 population is computed, min-max
    normalized within that file, and turned into a color with assign_color.
    Files that lack a required column are skipped silently.

    NOTE(review): normalization happens per input file, so colors are only
    comparable within one file - confirm that this is intended.

    Parameters:
    - csv_files: List of file paths to CSV files.
    - frequency: String specifying the frequency for data aggregation (e.g., 'M' for monthly, 'Y' for yearly).
    - save_path: String specifying the path where the aggregated DataFrame should be saved as a CSV file. Missing parent folders are created.

    Returns:
    - A DataFrame containing the aggregated rows of all processed files (empty if none qualified), or None when csv_files is empty. The DataFrame is also saved to save_path.
    """
    if not csv_files:
        print("No CSV files provided.")
        return None

    required_columns = ['date', 'cumulative_deceased', 'new_deceased', 'subregion2_name', 'population', 'subregion1_name']

    # Collect the per-file results and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    per_file_frames = []

    for file_path in csv_files:
        df = pd.read_csv(file_path)

        if any(col not in df.columns for col in required_columns):
            continue  # Skip files that cannot be aggregated

        df['date'] = pd.to_datetime(df['date'])

        # Define the aggregation dictionary based on available columns
        agg_dict = {
            'population': 'last',
            'new_deceased': 'sum',
            'cumulative_deceased': 'last'
        }
        if 'new_persons_fully_vaccinated' in df.columns and 'cumulative_persons_fully_vaccinated' in df.columns:
            agg_dict.update({
                'new_persons_fully_vaccinated': 'sum',
                'cumulative_persons_fully_vaccinated': 'last'
            })

        # Both subregion columns are guaranteed by the required-column check above
        group_cols = ['subregion1_name', 'subregion2_name', pd.Grouper(key='date', freq=frequency)]
        aggregated_data = df.groupby(group_cols).agg(agg_dict).reset_index()

        # State names that are not in name_to_fips end up as NaN here
        aggregated_data['state_fips'] = aggregated_data['subregion1_name'].map(name_to_fips)

        # Negative totals are treated as zero
        aggregated_data.loc[aggregated_data['new_deceased'] < 0, 'new_deceased'] = 0
        aggregated_data['incidence_rate'] = (aggregated_data['new_deceased'] / aggregated_data['population']) * 100000

        min_rate = aggregated_data['incidence_rate'].min()
        max_rate = aggregated_data['incidence_rate'].max()
        if max_rate > min_rate:
            aggregated_data['normalized_incidence_rate'] = (aggregated_data['incidence_rate'] - min_rate) / (max_rate - min_rate)
        else:
            # All values are the same (or all missing): there is no spread to normalize
            aggregated_data['normalized_incidence_rate'] = 0.0

        # assign_color itself maps NaN rates to grey
        aggregated_data['color'] = aggregated_data['normalized_incidence_rate'].apply(assign_color)

        per_file_frames.append(aggregated_data)

    if per_file_frames:
        aggregated_stats = pd.concat(per_file_frames, ignore_index=True)
    else:
        aggregated_stats = pd.DataFrame()

    # Ensure the directory exists before trying to save the file
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    aggregated_stats.to_csv(save_path, index=False)

    return aggregated_stats

In [23]:


# Folder holding the input CSV files (relative to the current working directory)
folder_path = "../All CSVs"
csv_files = process_files_in_folder_pathlib(folder_path)

# Aggregation windows to produce; each string is passed on as the pandas Grouper `freq`.
# NOTE(review): pandas 2.2 deprecates the 'M' and 'Y' aliases in favour of 'ME'/'YE' -
# confirm against the installed version before changing; it would also change the file names below.
freq_list = [ '6M', '1Y']

# Write one aggregated CSV per frequency; the frequency string is part of the file name
for freq in freq_list:
    output_path = f'Modified Data/aggregated_monthly_stats_{freq}.csv'
    aggregated_data = process_files_and_aggregate_data_generic(csv_files, freq, output_path)
    

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os

def plot_counties_from_csv(shapefile_gdf, csv_file_path, output_folder, title_format="Plot for {}"):
    """
    Render one county map per date found in an aggregated CSV and save each as a PNG.

    Counties are matched on state FIPS code and county name whenever the CSV has a
    'state_fips' column and the shapefile has 'STATEFP'. Matching on the name alone
    lets same-named counties in different states (e.g. "Washington County")
    overwrite each other's color. CSV rows without a usable FIPS code, or inputs
    lacking those columns, still fall back to name-only matching.

    Parameters:
    - shapefile_gdf: GeoDataFrame with a 'NAMELSAD' column and, ideally, 'STATEFP'. Its 'color' column is added/overwritten for every date.
    - csv_file_path: Path of a CSV with 'date', 'subregion2_name' and 'color' columns, optionally 'state_fips'.
    - output_folder: Folder that receives one '<YYYY-MM-DD>.png' per date; created if missing.
    - title_format: Format string whose '{}' placeholder receives the date.

    Returns:
    - None. The figures are written to output_folder.
    """
    missing_color = '#808080'  # grey for counties without data on a given date

    # Define the continental US bounds
    continental_us_bounds = {
        "minx": -130,
        "miny": 24,
        "maxx": -66,
        "maxy": 50
    }

    def fips_codes(series):
        # Normalise 1, 1.0, '1' and '01' alike to the two-character form '01'
        # (a CSV round trip drops the leading zero); unparseable values become None.
        numeric = pd.to_numeric(series, errors='coerce')
        return [None if pd.isna(code) else f'{int(code):02d}' for code in numeric]

    df = pd.read_csv(csv_file_path)

    # Rename columns so they line up with the shapefile's column names
    df.rename(columns={'state_fips': 'STATEFP', 'subregion2_name': 'NAMELSAD'}, inplace=True)
    df['date'] = pd.to_datetime(df['date'])

    match_on_state = 'STATEFP' in df.columns and 'STATEFP' in shapefile_gdf.columns
    if match_on_state:
        df['STATEFP'] = fips_codes(df['STATEFP'])
        shape_states = fips_codes(shapefile_gdf['STATEFP'])
    else:
        shape_states = [None] * len(shapefile_gdf)
    shape_names = list(shapefile_gdf['NAMELSAD'])

    # Make sure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    for date, group in df.groupby('date'):
        # Two lookups: (state, county) for rows with a FIPS code, county name only for the rest
        by_state_and_name = {}
        by_name = {}
        csv_states = group['STATEFP'] if match_on_state else [None] * len(group)
        for state, name, county_color in zip(csv_states, group['NAMELSAD'], group['color']):
            if pd.isna(state):
                by_name[name] = county_color
            else:
                by_state_and_name[(state, name)] = county_color

        county_colors = []
        for state, name in zip(shape_states, shape_names):
            county_color = by_state_and_name.get((state, name), by_name.get(name, missing_color))
            # An empty color cell in the CSV is treated like a missing county
            county_colors.append(missing_color if pd.isna(county_color) else county_color)
        shapefile_gdf['color'] = county_colors

        # Plotting setup
        fig, ax = plt.subplots(figsize=(15, 8), dpi=300)
        ax.set_xlim(continental_us_bounds["minx"], continental_us_bounds["maxx"])
        ax.set_ylim(continental_us_bounds["miny"], continental_us_bounds["maxy"])

        # Plot the counties with a neutral color to provide a base map
        shapefile_gdf.plot(ax=ax, color='lightgrey', edgecolor='black', linewidth=0.4)

        # Plot the counties with colors from the 'color' column
        shapefile_gdf.plot(ax=ax, color=shapefile_gdf['color'], edgecolor='black', linewidth=0.4)

        # Title and saving
        title_date = date.strftime('%Y-%m-%d')
        ax.set_title(title_format.format(title_date))
        ax.set_axis_off()
        output_png_path = os.path.join(output_folder, f"{title_date}.png")
        fig.savefig(output_png_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

# Input/output locations (absolute Windows paths on the author's machine).
# NOTE(review): the CSV name ends in '14D', whereas the aggregation cells in this
# file only generate '6M' and '1Y' outputs - confirm this is the intended input.
csv_file_path = r"C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\ChangedData\aggregated_stats_4BinsMethod14D.csv"
shp_file_path = r'C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\GeoShapeData\tl_2023_us_county\tl_2023_us_county.shp'
output_folder = r'C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\GeoPlot'

# Load the county shapefile into a GeoDataFrame
shape_df = gpd.read_file(shp_file_path)
#shape_df.sort_values(by=['STATEFP', 'NAMELSAD'], ascending=[True, True], inplace=True) #SORTED
# Render one PNG per date; '{}' in the title is replaced by the date
title_format = "COVID-19 Incidence on {}"
plot_counties_from_csv(shape_df, csv_file_path, output_folder, title_format)


In [24]:
#gils code for plots. Example use below. This plots a scaling too. 
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import os
from matplotlib.colors import LinearSegmentedColormap, Normalize
from matplotlib.colorbar import ColorbarBase

def plot_counties_from_csv(shapefile_gdf, csv_file_path, output_folder, title_format="Plot for {}"):
    """
    Render one county map per date found in an aggregated CSV, with a five-step color legend, and save each as a PNG.

    Counties are matched on state FIPS code and county name whenever the CSV has a
    'state_fips' column and the shapefile has 'STATEFP'. Matching on the name alone
    lets same-named counties in different states (e.g. "Washington County")
    overwrite each other's color. CSV rows without a usable FIPS code, or inputs
    lacking those columns, still fall back to name-only matching.

    Parameters:
    - shapefile_gdf: GeoDataFrame with a 'NAMELSAD' column and, ideally, 'STATEFP'. Its 'color' column is added/overwritten for every date.
    - csv_file_path: Path of a CSV with 'date', 'subregion2_name' and 'color' columns, optionally 'state_fips'.
    - output_folder: Folder that receives one '<YYYY-MM-DD>.png' per date; created if missing.
    - title_format: Format string whose '{}' placeholder receives the date.

    Returns:
    - None. The figures are written to output_folder.
    """
    missing_color = '#808080'  # grey for counties without data on a given date

    def fips_codes(series):
        # Normalise 1, 1.0, '1' and '01' alike to the two-character form '01'
        # (a CSV round trip drops the leading zero); unparseable values become None.
        numeric = pd.to_numeric(series, errors='coerce')
        return [None if pd.isna(code) else f'{int(code):02d}' for code in numeric]

    df = pd.read_csv(csv_file_path)
    # Rename columns so they line up with the shapefile's column names
    df.rename(columns={'state_fips': 'STATEFP', 'subregion2_name': 'NAMELSAD'}, inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    os.makedirs(output_folder, exist_ok=True)

    match_on_state = 'STATEFP' in df.columns and 'STATEFP' in shapefile_gdf.columns
    if match_on_state:
        df['STATEFP'] = fips_codes(df['STATEFP'])
        shape_states = fips_codes(shapefile_gdf['STATEFP'])
    else:
        shape_states = [None] * len(shapefile_gdf)
    shape_names = list(shapefile_gdf['NAMELSAD'])

    # Legend colors: the five steps of the green-to-red scale
    colors = ["#00FF00", "#3FBF00", "#7F7F00", "#BF3F00", "#FF0000"]
    cmap = LinearSegmentedColormap.from_list("rate_scale", colors, N=5)
    norm = Normalize(vmin=0, vmax=1)

    for date, group in df.groupby('date'):
        # Two lookups: (state, county) for rows with a FIPS code, county name only for the rest
        by_state_and_name = {}
        by_name = {}
        csv_states = group['STATEFP'] if match_on_state else [None] * len(group)
        for state, name, county_color in zip(csv_states, group['NAMELSAD'], group['color']):
            if pd.isna(state):
                by_name[name] = county_color
            else:
                by_state_and_name[(state, name)] = county_color

        county_colors = []
        for state, name in zip(shape_states, shape_names):
            county_color = by_state_and_name.get((state, name), by_name.get(name, missing_color))
            # An empty color cell in the CSV is treated like a missing county
            county_colors.append(missing_color if pd.isna(county_color) else county_color)
        shapefile_gdf['color'] = county_colors

        fig, ax = plt.subplots(figsize=(15, 10))  # Increased height to accommodate the colorbar below
        ax.set_xlim([-130, -66])
        ax.set_ylim([24, 50])
        shapefile_gdf.plot(ax=ax, color='lightgrey', edgecolor='black', linewidth=0.4)
        shapefile_gdf.plot(ax=ax, color=shapefile_gdf['color'], edgecolor='black', linewidth=0.4)

        # Adjust the main plot's position to leave space for the colorbar
        ax_position = ax.get_position()  # Get the bounding box of the main plot
        ax.set_position([ax_position.x0, ax_position.y0 + ax_position.height * 0.1,  # Adjust bottom
                         ax_position.width, ax_position.height * 0.9])  # Adjust height

        # Create a new axes for the colorbar at the bottom
        colorbar_ax = fig.add_axes([ax_position.x0, ax_position.y0 * 0.1, ax_position.width, 0.03])  # Position for the colorbar

        # Create and configure the colorbar
        cbar = ColorbarBase(colorbar_ax, cmap=cmap, norm=norm, orientation='horizontal')
        cbar.set_label('Rate')
        # With N=5 over [0, 1] the bar shows five equal swatches, each 0.2 wide. Put
        # each label at the centre of its swatch so it sits under the color it names.
        cbar.set_ticks([0.1, 0.3, 0.5, 0.7, 0.9])
        cbar.set_ticklabels(['0.0-0.24', '0.25-0.49', '0.5-0.74', '0.75-0.99', '1.0'])

        # Set the title and save the figure
        title_date = date.strftime('%Y-%m-%d')
        ax.set_title(title_format.format(title_date))
        ax.set_axis_off()
        fig.savefig(os.path.join(output_folder, f"{title_date}.png"), dpi=300, bbox_inches='tight')
        plt.close(fig)

# Input/output locations (absolute Windows paths on the author's machine).
# NOTE(review): the CSV name ends in '14D', whereas the aggregation cells in this
# file only generate '6M' and '1Y' outputs - confirm this is the intended input.
csv_file_path = r"C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\ChangedData\aggregated_stats_4BinsMethod14D.csv"
shp_file_path = r'C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\GeoShapeData\tl_2023_us_county\tl_2023_us_county.shp'
output_folder = r'C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\GeoPlot'

# Load the county shapefile into a GeoDataFrame
shape_df = gpd.read_file(shp_file_path)
#shape_df.sort_values(by=['STATEFP', 'NAMELSAD'], ascending=[True, True], inplace=True) #SORTED
# Render one PNG per date; '{}' in the title is replaced by the date
title_format = "COVID-19 Incidence on {}"
plot_counties_from_csv(shape_df, csv_file_path, output_folder, title_format)
