In [None]:
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import os
from matplotlib.colors import LinearSegmentedColormap, Normalize
from matplotlib.colorbar import ColorbarBase
from pathlib import Path
import numpy as np

**Description of function below**  
The function no longer assigns color. That is done by running another function.  
***Things to change before running***  
freq_list  
output_path  



In [3]:
import pandas as pd
# Lookup table keyed by the two-character state FIPS code (kept as a string so
# the leading zero survives). Each value is [postal abbreviation, full name].
# Covers the 50 states plus the District of Columbia (51 entries).
state_fips_to_full_info = {
    '01': ['AL', 'Alabama'],
    '02': ['AK', 'Alaska'],
    '04': ['AZ', 'Arizona'],
    '05': ['AR', 'Arkansas'],
    '06': ['CA', 'California'],
    '08': ['CO', 'Colorado'],
    '09': ['CT', 'Connecticut'],
    '10': ['DE', 'Delaware'],
    '11': ['DC', 'District of Columbia'],
    '12': ['FL', 'Florida'],
    '13': ['GA', 'Georgia'],
    '15': ['HI', 'Hawaii'],
    '16': ['ID', 'Idaho'],
    '17': ['IL', 'Illinois'],
    '18': ['IN', 'Indiana'],
    '19': ['IA', 'Iowa'],
    '20': ['KS', 'Kansas'],
    '21': ['KY', 'Kentucky'],
    '22': ['LA', 'Louisiana'],
    '23': ['ME', 'Maine'],
    '24': ['MD', 'Maryland'],
    '25': ['MA', 'Massachusetts'],
    '26': ['MI', 'Michigan'],
    '27': ['MN', 'Minnesota'],
    '28': ['MS', 'Mississippi'],
    '29': ['MO', 'Missouri'],
    '30': ['MT', 'Montana'],
    '31': ['NE', 'Nebraska'],
    '32': ['NV', 'Nevada'],
    '33': ['NH', 'New Hampshire'],
    '34': ['NJ', 'New Jersey'],
    '35': ['NM', 'New Mexico'],
    '36': ['NY', 'New York'],
    '37': ['NC', 'North Carolina'],
    '38': ['ND', 'North Dakota'],
    '39': ['OH', 'Ohio'],
    '40': ['OK', 'Oklahoma'],
    '41': ['OR', 'Oregon'],
    '42': ['PA', 'Pennsylvania'],
    '44': ['RI', 'Rhode Island'],
    '45': ['SC', 'South Carolina'],
    '46': ['SD', 'South Dakota'],
    '47': ['TN', 'Tennessee'],
    '48': ['TX', 'Texas'],
    '49': ['UT', 'Utah'],
    '50': ['VT', 'Vermont'],
    '51': ['VA', 'Virginia'],
    '53': ['WA', 'Washington'],
    '54': ['WV', 'West Virginia'],
    '55': ['WI', 'Wisconsin'],
    '56': ['WY', 'Wyoming']
}

# Inverse lookup: full state name -> FIPS code string. The match is exact, so a
# name spelled or capitalized differently from the table above is not found.
name_to_fips = {info[1]: fips for fips, info in state_fips_to_full_info.items()}

# Rename the 'locality_name' column to 'subregion2_name' in a folder's CSV files
def update_column_names_in_csv(folder_path):
    """Rewrite, in place, every CSV in `folder_path` that still uses 'locality_name'.

    A file is touched only when it has a 'locality_name' column and no
    'subregion2_name' column; all other files are left exactly as they are.
    Only the top level of the folder is scanned (pattern ``*.csv``).

    Args:
        folder_path: Directory containing the CSV files.
    """
    old_name = 'locality_name'
    new_name = 'subregion2_name'

    for csv_file in Path(folder_path).glob('*.csv'):
        frame = pd.read_csv(csv_file)
        headers = frame.columns

        # Nothing to do if the new name already exists or the old one is absent.
        if new_name in headers or old_name not in headers:
            continue

        print(f"Updating '{csv_file.name}': 'locality_name' -> 'subregion2_name'")
        renamed = frame.rename(columns={old_name: new_name})
        renamed.to_csv(csv_file, index=False)
            
# Function to list CSV files in a folder
def process_files_in_folder_pathlib(folder_path):
    """Return the paths of the files matching ``*.csv`` directly inside a folder.

    Matches that are not regular files (for example a directory whose name
    ends in ``.csv``) are reported on stdout and left out of the result.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[str]: One path string per CSV file, in the order glob yields them.
    """
    csv_paths = []
    for entry in Path(folder_path).glob('*.csv'):
        if not entry.is_file():
            print(f"{entry} is a directory, skipping.\n")
            continue
        csv_paths.append(str(entry))
    return csv_paths

# Function to process files and aggregate data
def process_files_and_aggregate_data(csv_files, frequency, save_path):
    """Aggregate accident CSVs per state / county / time bucket and save the result.

    For every input file that has all required columns, the rows are:
      1. restricted to those with a positive, non-missing 'Population';
      2. grouped by STATENAME, state_fips (looked up from STATENAME through
         `name_to_fips`), COUNTYNAME and the 'date' column bucketed at
         `frequency`;
      3. aggregated: first 'Population' of the group, and the sums of
         'DRUNK_DR', 'FATALS' and 'PERSONS';
      4. extended with an 'incidence_rate_<col>' column for each summed column,
         computed as ceil(sum / Population * 100000).

    The per-file results are stacked into one DataFrame, which is written to
    `save_path` (overwriting any existing file) when it is not empty.

    Args:
        csv_files: Iterable-with-length of CSV paths to read.
        frequency: Pandas offset alias passed to ``pd.Grouper(freq=...)``.
        save_path: Destination CSV path; parent directories are created.

    Returns:
        pandas.DataFrame: The stacked aggregated rows (empty if nothing was
        aggregated).
    """
    # Loop-invariant definitions, hoisted out of the per-file loop.
    required_columns = [
        'date', 'STATENAME', 'COUNTYNAME', 'DRUNK_DR',
        'PERSONS', 'Population', 'FATALS', 'WEATHERNAME', 'LGT_CONDNAME'
    ]
    agg_dict = {
        'Population': 'first',
        'DRUNK_DR': 'sum',
        'FATALS': 'sum',
        'PERSONS': 'sum'
    }
    rate_sources = [col for col in agg_dict if col != 'Population']

    # Collect per-file results and concatenate once at the end: concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    grouped_frames = []
    total_files = len(csv_files)

    for file_path in csv_files:
        df = pd.read_csv(file_path)
        print(f"Processing {file_path}...")

        # Skip files that do not carry every required column
        missing_columns = set(required_columns) - set(df.columns)
        if missing_columns:
            print(f"File '{file_path}' is missing columns: {missing_columns}")
            continue

        # Ensure 'date' column is in datetime format
        df['date'] = pd.to_datetime(df['date'])

        # Map STATENAME to state_fips code before grouping
        df['state_fips'] = df['STATENAME'].map(name_to_fips)

        # Filter rows where Population is not NA and greater than 0
        df = df[df['Population'].notna() & (df['Population'] > 0)]

        # groupby() drops rows whose key is NaN by default, so rows with a state
        # name missing from name_to_fips vanish from the output. Say so instead
        # of losing them silently.
        unmapped = df.loc[df['state_fips'].isna(), 'STATENAME'].dropna().unique()
        if len(unmapped) > 0:
            print(
                f"Warning: '{file_path}' has STATENAME values without a FIPS code; "
                f"their rows are excluded from the aggregation: {sorted(map(str, unmapped))}"
            )

        # Group on both 'STATENAME' and 'state_fips' so both appear in the output
        group_cols = ['STATENAME', 'state_fips', 'COUNTYNAME', pd.Grouper(key='date', freq=frequency)]
        grouped_df = df.groupby(group_cols).agg(agg_dict).reset_index()

        # Incidence rates per 100,000 population, rounded up
        for col in rate_sources:
            grouped_df[f'incidence_rate_{col}'] = np.ceil(
                (grouped_df[col] / grouped_df['Population']) * 100000
            )

        grouped_frames.append(grouped_df)

    if grouped_frames:
        aggregated_stats = pd.concat(grouped_frames, ignore_index=True)
    else:
        aggregated_stats = pd.DataFrame()

    # Save aggregated data if not empty
    if not aggregated_stats.empty:
        output_file = Path(save_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists
        aggregated_stats.to_csv(output_file, index=False)  # This will overwrite the existing file
        print(f"Aggregated data saved to {output_file}")
    else:
        print("No data to save.")

    print(f"{total_files} files processed. Some files may have had NaN values.")
    return aggregated_stats

"""
example use

folder_path = r"C:\Users\A404007\Desktop\Gils Folder\computer vis\project Data\DataWareHouse\Changed data\ExcelentData\AllDates"
csv_files = process_files_in_folder_pathlib(folder_path)

# Frequency list: one aggregated CSV is written per frequency
freq_list = ['1D','1W','1M','6M','1Y' ]

for freq in freq_list:
    output_path = r'C:\Users\A404007\Desktop\Gils Folder\computer vis\project Data\DataWareHouse\Changed data\InterestingCalculations\ACCIDENTDATA_ALLDATA_{}.csv'.format(freq)
    aggregated_data = process_files_and_aggregate_data(csv_files, freq, output_path)
"""


NameError: name 'Path' is not defined

**Description of function below**  
The function only assigns colors. It adds a color column to the CSV file: if the file has `incidence_rate_PERSONS`, it will now also have a column named `color_incidence_rate_PERSONS`, which records the color assigned to each value.  
***Things to change before running***  
define your color bins, ((min, max), color)  
freq_list  
path  


In [None]:
def color_mapping(csv_file, attribute, bin_assignment):
    """Add a 'color_<attribute>' column to a CSV file and write it back in place.

    Each value of `attribute` receives the color of the first bin whose
    half-open range ``min <= value < max`` contains it. Values matched by no
    bin (including missing values) receive grey, '#808080'.

    Args:
        csv_file: Path of the CSV to read and overwrite.
        attribute: Name of the numeric column to classify.
        bin_assignment: Sequence of ``((min, max), color)`` pairs, checked in order.
    """
    fallback_color = '#808080'

    def pick_color(rate):
        # First matching bin wins; fall back to grey when none matches.
        matches = (
            color
            for bounds, color in bin_assignment
            if bounds[0] <= rate < bounds[1]
        )
        return next(matches, fallback_color)

    table = pd.read_csv(csv_file)
    table['color_' + attribute] = [pick_color(rate) for rate in table[attribute]]

    # Overwrite the CSV file so it carries the new color column
    table.to_csv(csv_file, index=False)
# Color bins passed to color_mapping as ((min, max), color) pairs. color_mapping
# treats each range as half-open (min <= value < max) and uses the first bin
# that matches; values that fall in no bin get its grey default.
color_bins = [
    ((0, 5), '#FFFFCC'),
    ((5, 20), '#FFEDA0'),
    ((20, 40), '#FED976'),
    ((40, 80), '#FEB24C'),  # Previously a duplicated range; adjusted to continue the sequence
    ((80, 160), '#FD8D3C'),
    ((160, 320), '#FC4E2A'),
    ((320, float('inf')), '#800026')  # Open-ended top bin; adjust this color as needed
]
"""
example use: 

freq_list = ['1D','1M','1W','1Y','6M']

for freq in freq_list:
    path = r'C:\Users\A404007\Desktop\Gils Folder\computer vis\project Data\DataWareHouse\Changed data\InterestingCalculations\ACCIDENTDATA_ALLDATA_{}.csv'.format(freq)
    color_mapping(path, 'incidence_rate_PERSONS', color_bins)
"""

**Description of function below**  
The function finds the unique colors present for each date, determines the minimum and maximum value covered by each color, and builds the legend from those ranges. `color_size` controls how big the legend circles are in the plot; `labelspacing` controls how much space there is between those circles.  
Note: make sure to change the 'Covid Death Range of' text in `min_max_list = [f"{row['Min']} - {row['Max']} Covid Death Range of \n\nFrequency: {row['RawSize']}" for index, row in color_stats_sorted.iterrows()]` so that it makes sense for whatever you are plotting.  
***Things to change before running***  
define your color bins, (min, max, color)  
freq_list  
path


In [None]:


def _format_state_fips(code):
    """Return a state FIPS code as a zero-padded two-character string ('' if not numeric)."""
    try:
        return f"{int(float(code)):02d}"
    except (TypeError, ValueError, OverflowError):
        return ''


def _county_keys(table, use_state):
    """Return one county lookup key per row: 'SS|name' when use_state, else the name."""
    names = table['NAMELSAD']
    if not use_state:
        return names
    return table['STATEFP'].map(_format_state_fips) + '|' + names.astype(str)


def plot_counties_from_csv(shapefile_gdf, csv_file_path, output_folder, attribute, color_size, labelspacing, title_format="Plot for {}", legend_label="Covid Death Range of"):
    """Save one county map (PNG) per distinct date found in a CSV file.

    The CSV columns 'state_fips' and 'subregion2_name' are renamed to 'STATEFP'
    and 'NAMELSAD' to line up with the shapefile columns. For every date, each
    county polygon is filled with the color stored in `attribute` for that
    county; polygons without a matching CSV row are drawn grey ('#808080').

    Counties are matched on state FIPS code plus county name when both the CSV
    and the shapefile have a 'STATEFP' column, so same-named counties in
    different states keep their own colors. If either side lacks 'STATEFP',
    matching falls back to the county name alone.

    The legend shows one entry per color present on that date, labeled with the
    minimum and maximum of the underlying value column (`attribute` with the
    'color_' prefix removed) and the number of rows that have that color.

    Args:
        shapefile_gdf: GeoDataFrame of county polygons with a 'NAMELSAD' column
            (and optionally 'STATEFP'). It is not modified.
        csv_file_path: CSV with 'date', the county name column, `attribute`
            and the underlying value column.
        output_folder: Directory for the PNG files; created if missing.
        attribute: Name of the color column, e.g. 'color_incidence_rate'.
        color_size: Marker size of the legend circles.
        labelspacing: Vertical spacing between legend entries.
        title_format: Title template; '{}' receives the date as YYYY-MM-DD.
        legend_label: Text placed after each 'min - max' range in the legend.
    """
    missing_color = '#808080'

    df = pd.read_csv(csv_file_path)
    df.rename(columns={'state_fips': 'STATEFP', 'subregion2_name': 'NAMELSAD'}, inplace=True)
    df['date'] = pd.to_datetime(df['date'])

    os.makedirs(output_folder, exist_ok=True)

    # Build the join keys once; they do not depend on the date being plotted.
    use_state = 'STATEFP' in df.columns and 'STATEFP' in shapefile_gdf.columns
    csv_keys = _county_keys(df, use_state)
    shape_keys = _county_keys(shapefile_gdf, use_state)

    new_attribute = attribute.replace("color_", "")

    for date, group in df.groupby('date'):
        date_label = date.strftime('%Y-%m-%d')

        # value_counts() ignores missing colors, so iterate over non-missing
        # colors only; a NaN color would otherwise fail the count lookup.
        color_counts = group[attribute].value_counts()
        data = []
        for color in group[attribute].dropna().unique():
            values = group.loc[group[attribute] == color, new_attribute]
            data.append({
                'Color': color,
                'Min': values.min(),
                'Max': values.max(),
                'RawSize': color_counts[color],
            })

        # Explicit columns keep the sort valid even when no color is present.
        color_stats_sorted = pd.DataFrame(
            data, columns=['Color', 'Min', 'Max', 'RawSize']
        ).sort_values(by='Max', ascending=True)

        # The grey "Missing data" entry always comes first in the legend.
        colors_sorted_list = [missing_color] + color_stats_sorted['Color'].tolist()
        min_max_list = ["Missing data"] + [
            f"{row.Min} - {row.Max} {legend_label} \n\nFrequency: {row.RawSize}"
            for row in color_stats_sorted.itertuples(index=False)
        ]
        colors_size_list = [color_size] * len(colors_sorted_list)

        # County key -> color for this date (a later row wins on duplicates).
        color_dict = dict(zip(csv_keys.loc[group.index], group[attribute]))
        county_colors = shape_keys.map(color_dict).fillna(missing_color)

        fig, ax = plt.subplots(figsize=(15, 10))
        try:
            ax.set_xlim([-130, -66])
            ax.set_ylim([24, 50])
            shapefile_gdf.plot(ax=ax, color='lightgrey', edgecolor='black', linewidth=0.4)
            shapefile_gdf.plot(ax=ax, color=county_colors, edgecolor='black', linewidth=0.4)

            legend_entries = [
                plt.Line2D([0], [0], marker='o', color=color, label=label, markersize=size, linestyle='')
                for label, color, size in zip(min_max_list, colors_sorted_list, colors_size_list)
            ]
            ax.legend(handles=legend_entries, loc='center left', bbox_to_anchor=(-0.10, 0.5),
                      handlelength=2, handletextpad=2, labelspacing=labelspacing, borderaxespad=1, fontsize='x-small')

            ax.set_title(title_format.format(date_label))
            ax.set_axis_off()
            fig.savefig(os.path.join(output_folder, f"{date_label}.png"), dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

    print("All maps have been saved successfully.")
    
"""
example use: 
    # File paths
csv_file_path = r"C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\ChangedData\aggregated_stats_4BinsMethod3D.csv"
shp_file_path = r'C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\GeoShapeData\tl_2023_us_county\tl_2023_us_county.shp'
output_folder = r'C:\Users\A404007\Desktop\Gils Folder\Data Mining\projectdata\GeoPlot\aggregated_stats_4BinsMethod3D_Latest'
attribute = 'color_incidence_rate'
# Load shapefile into GeoDataFrame
shape_df = gpd.read_file(shp_file_path)
#shape_df.sort_values(by=['STATEFP', 'NAMELSAD'], ascending=[True, True], inplace=True) #SORTED
# Call the function
title_format = "Death Rate to COVID  per 100,000 People:{}"
plot_counties_from_csv(shape_df, csv_file_path, output_folder,attribute, 10, 1.0, title_format)

"""