In [None]:
import pandas as pd
#import geopandas as gpd
import matplotlib.pyplot as plt
from pathlib import Path

# Lookup table keyed by two-character state FIPS code (zero-padded string).
# Each value is a two-element list: [postal abbreviation, full state name].
# Holds 51 entries: the 50 states plus the District of Columbia. The codes
# are not contiguous ('03', '07', '14', '43' and '52' have no entry).
state_fips_to_full_info = {
    '01': ['AL', 'Alabama'],
    '02': ['AK', 'Alaska'],
    '04': ['AZ', 'Arizona'],
    '05': ['AR', 'Arkansas'],
    '06': ['CA', 'California'],
    '08': ['CO', 'Colorado'],
    '09': ['CT', 'Connecticut'],
    '10': ['DE', 'Delaware'],
    '11': ['DC', 'District of Columbia'],
    '12': ['FL', 'Florida'],
    '13': ['GA', 'Georgia'],
    '15': ['HI', 'Hawaii'],
    '16': ['ID', 'Idaho'],
    '17': ['IL', 'Illinois'],
    '18': ['IN', 'Indiana'],
    '19': ['IA', 'Iowa'],
    '20': ['KS', 'Kansas'],
    '21': ['KY', 'Kentucky'],
    '22': ['LA', 'Louisiana'],
    '23': ['ME', 'Maine'],
    '24': ['MD', 'Maryland'],
    '25': ['MA', 'Massachusetts'],
    '26': ['MI', 'Michigan'],
    '27': ['MN', 'Minnesota'],
    '28': ['MS', 'Mississippi'],
    '29': ['MO', 'Missouri'],
    '30': ['MT', 'Montana'],
    '31': ['NE', 'Nebraska'],
    '32': ['NV', 'Nevada'],
    '33': ['NH', 'New Hampshire'],
    '34': ['NJ', 'New Jersey'],
    '35': ['NM', 'New Mexico'],
    '36': ['NY', 'New York'],
    '37': ['NC', 'North Carolina'],
    '38': ['ND', 'North Dakota'],
    '39': ['OH', 'Ohio'],
    '40': ['OK', 'Oklahoma'],
    '41': ['OR', 'Oregon'],
    '42': ['PA', 'Pennsylvania'],
    '44': ['RI', 'Rhode Island'],
    '45': ['SC', 'South Carolina'],
    '46': ['SD', 'South Dakota'],
    '47': ['TN', 'Tennessee'],
    '48': ['TX', 'Texas'],
    '49': ['UT', 'Utah'],
    '50': ['VT', 'Vermont'],
    '51': ['VA', 'Virginia'],
    '53': ['WA', 'Washington'],
    '54': ['WV', 'West Virginia'],
    '55': ['WI', 'Wisconsin'],
    '56': ['WY', 'Wyoming']
}

def update_column_names_in_csv(folder_path):
    """
    Rename the 'locality_name' column to 'subregion2_name' in the CSV files of a folder.

    Every '*.csv' file directly inside the folder is read. A file is rewritten
    in place (without the index column) only when it has a 'locality_name'
    column and does not already have a 'subregion2_name' column; all other
    files are left untouched.

    Parameters:
    - folder_path: Path of the folder whose CSV files are checked.
    """
    old_name, new_name = 'locality_name', 'subregion2_name'

    for csv_path in Path(folder_path).glob('*.csv'):
        frame = pd.read_csv(csv_path)
        columns = frame.columns

        # Nothing to do when the target column already exists or the source column is absent.
        if new_name in columns or old_name not in columns:
            continue

        print(f"Updating '{csv_path.name}': 'locality_name' -> 'subregion2_name'")
        frame.rename(columns={old_name: new_name}).to_csv(csv_path, index=False)

def assign_color(rate):
    """
    Map a normalized rate to a hex color string on a green-to-red scale.

    Parameters:
    - rate: Number expected in the range [0, 1], or a missing value (NaN/None).
      Values outside [0, 1] are clamped to the nearest bound.

    Returns:
    - '#808080' (gray) when the rate is missing; otherwise '#rrgg00', where the
      red channel is int(rate * 255) and the green channel is its complement,
      so 0 gives '#00ff00' and 1 gives '#ff0000'.
    """
    if pd.isna(rate):
        return '#808080'

    # Clamp first: an out-of-range rate would otherwise produce a channel below
    # 0 or above 255 and therefore a malformed hex string (e.g. '#17e-7f').
    rate = min(max(rate, 0.0), 1.0)

    red = int(rate * 255)
    green = 255 - red
    return f'#{red:02x}{green:02x}00'

def process_files_in_folder_pathlib(folder_path):
    """
    Collect the paths of the '*.csv' entries found directly inside a folder.

    Entries that match '*.csv' but are not regular files are reported on
    stdout and left out of the result.

    Parameters:
    - folder_path: Path of the folder to scan (not searched recursively).

    Returns:
    - List of path strings, in the order the glob yields them.
    """
    csv_paths = []
    for entry in Path(folder_path).glob('*.csv'):
        if not entry.is_file():
            print(f"{entry} is a directory, skipping.\n")
            continue
        csv_paths.append(str(entry))
    return csv_paths

# Reverse lookup: full state name (second element of each entry) -> FIPS code string.
name_to_fips = {
    names[1]: code
    for code, names in state_fips_to_full_info.items()
}


def process_files_and_aggregate_data_generic(csv_files, frequency, save_path):
    """
    Aggregate a list of CSV files at a given time frequency and save the combined result.

    Each file is read, its share of NaN cells is printed, and files lacking any
    required column are reported and skipped. The remaining files are grouped by
    ('subregion1_name', 'subregion2_name', date window) and aggregated:
    'new_deceased' is summed, while 'population' and 'cumulative_deceased' keep
    the last value of each window. When both vaccination columns are present,
    they are aggregated the same way (new -> sum, cumulative -> last).

    Columns added to every aggregated file:
    - 'state_fips': 'subregion1_name' mapped through `name_to_fips`.
    - 'incidence_rate': new deaths per 100,000 population in the window.
    - 'normalized_incidence_rate': min-max scaling of 'incidence_rate', computed
      within that file only (NaN when all rates in the file are equal).
    - 'color': `assign_color` applied to the normalized rate.

    Parameters:
    - csv_files: List of file paths to CSV files.
    - frequency: pandas offset alias used for the date windows (e.g. '7D', 'M').
      NOTE(review): recent pandas releases deprecate 'M'/'Y' in favor of
      'ME'/'YE' -- confirm against the installed version.
    - save_path: Path where the combined DataFrame is written as CSV; missing
      parent directories are created.

    Returns:
    - The combined DataFrame (empty if every file was skipped), or None when
      `csv_files` is empty. The DataFrame is also written to `save_path`.
    """
    if not csv_files:
        print("No CSV files provided.")
        return None

    required_columns = ['date', 'cumulative_deceased', 'new_deceased', 'subregion2_name', 'population', 'subregion1_name']
    vaccination_columns = ['new_persons_fully_vaccinated', 'cumulative_persons_fully_vaccinated']

    # Collected per-file results; concatenated once after the loop so the work
    # stays linear in the number of files instead of re-copying on every append.
    per_file_results = []

    total_files = len(csv_files)
    files_with_nan = 0

    for file_path in csv_files:
        df = pd.read_csv(file_path)

        # Report the percentage of NaN data in the file. A header-only file has
        # no cells at all, so report 0 rather than dividing by zero.
        total_values = df.size
        nan_values = df.isna().sum().sum()
        nan_percentage = (nan_values / total_values) * 100 if total_values else 0.0
        print(f"File '{file_path}' has {nan_percentage:.2f}% NaN values.")

        if nan_percentage > 0:
            files_with_nan += 1

        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"File '{file_path}' is missing columns: {missing_columns}")
            continue  # Skip to the next file

        df['date'] = pd.to_datetime(df['date'])
        # The 'last' aggregations below take the last row of each window in row
        # order, so sort chronologically (stable sort) to make that the most
        # recent value even when the file is not already ordered by date.
        df = df.sort_values('date', kind='mergesort')

        agg_dict = {
            'population': 'last',
            'new_deceased': 'sum',
            'cumulative_deceased': 'last'
        }
        if all(col in df.columns for col in vaccination_columns):
            agg_dict.update({
                'new_persons_fully_vaccinated': 'sum',
                'cumulative_persons_fully_vaccinated': 'last'
            })

        # Both subregion columns are guaranteed by the required-column check.
        # Note: groupby drops rows whose group keys are NaN by default.
        group_cols = [
            'subregion1_name',
            'subregion2_name',
            pd.Grouper(key='date', freq=frequency),
        ]
        aggregated_data = df.groupby(group_cols).agg(agg_dict).reset_index()

        # Names absent from name_to_fips map to NaN.
        aggregated_data['state_fips'] = aggregated_data['subregion1_name'].map(name_to_fips)

        # Deaths per 100,000 population, then min-max scaled within this file.
        incidence = (aggregated_data['new_deceased'] / aggregated_data['population']) * 100000
        lowest, highest = incidence.min(), incidence.max()
        aggregated_data['incidence_rate'] = incidence
        aggregated_data['normalized_incidence_rate'] = (incidence - lowest) / (highest - lowest)
        aggregated_data['color'] = aggregated_data['normalized_incidence_rate'].apply(assign_color)

        per_file_results.append(aggregated_data)

    if per_file_results:
        aggregated_stats = pd.concat(per_file_results, ignore_index=True)
    else:
        # Every file was skipped; keep the original behavior of saving an empty frame.
        aggregated_stats = pd.DataFrame()

    # Ensure the directory exists before trying to save the file
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    aggregated_stats.to_csv(save_path, index=False)
    print(f"Aggregated data saved to {save_path}")
    print(f"{files_with_nan}/{total_files} files ({(files_with_nan/total_files)*100:.2f}%) had NaN values.")

    return aggregated_stats




# Gather the path of every CSV file found directly inside the input folder
# (the path is relative to the current working directory).
folder_path = "../All CSVs"
csv_files = process_files_in_folder_pathlib(folder_path)

# Aggregation windows to produce: day-based ('3D' ... '28D') and month-based
# ('M', '2M', '3M'). Each value is passed through as the grouping frequency.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favor of
# 'ME' -- confirm against the installed version.
freq_list = ['3D', '7D', '14D', '21D', '28D', 'M', '2M', '3M']

# Run the aggregation once per frequency; each run reads all input files again
# and writes its own CSV. The output file name contains "monthly" for every
# frequency, including the day-based ones. After the loop, `aggregated_data`
# holds only the result of the last frequency.
for freq in freq_list:
    output_path = f'Modified Data/aggregated_monthly_stats_{freq}.csv'
    aggregated_data = process_files_and_aggregate_data_generic(csv_files, freq, output_path)
    

File '../All CSVs/US_IN_18057.csv' has 6.59% NaN values.
File '../All CSVs/US_VA_51011.csv' has 9.81% NaN values.
File '../All CSVs/US_AR_05107.csv' has 11.79% NaN values.
File '../All CSVs/US_OH_39143.csv' has 9.58% NaN values.
File '../All CSVs/US_NE_31039.csv' has 9.21% NaN values.
File '../All CSVs/US_SD_46025.csv' has 5.90% NaN values.
File '../All CSVs/US_IL_17129.csv' has 8.63% NaN values.
File '../All CSVs/US_ND_38051.csv' has 6.03% NaN values.
File '../All CSVs/US_NC_37079.csv' has 12.30% NaN values.
File '../All CSVs/US_IL_17127.csv' has 9.22% NaN values.
File '../All CSVs/US_VT_50015.csv' has 12.96% NaN values.
File '../All CSVs/US_NE_31097.csv' has 8.46% NaN values.
File '../All CSVs/US_ND_38065.csv' has 6.21% NaN values.
File '../All CSVs/US_NE_31033.csv' has 9.06% NaN values.
File '../All CSVs/US_OR_41033.csv' has 9.03% NaN values.
File '../All CSVs/US_GA_13019.csv' has 20.88% NaN values.
File '../All CSVs/US_MI_26063.csv' has 12.59% NaN values.
File '../All CSVs/US_NE_31