In [2]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt

# Set pandas option to display all columns
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.reset_option('max_rows')


In [3]:
columns_to_load = [
     'location_key'
    , 'date'
    , 'place_id'
    , 'wikidata_id'
    , 'datacommons_id'
    , 'country_code'
    , 'country_name'
    , 'subregion1_code'
    , 'subregion1_name'
    , 'subregion2_code'
    , 'subregion2_name'
    , 'new_confirmed'
    , 'new_deceased'
    , 'cumulative_confirmed'
    , 'cumulative_deceased'
    , 'new_persons_fully_vaccinated'
    , 'cumulative_persons_fully_vaccinated'
    , 'population'
    , 'population_male'
    , 'population_female'
    , 'population_age_00_09'
    , 'population_age_10_19'
    , 'population_age_20_29'
    , 'population_age_30_39'
    , 'population_age_40_49'
    , 'population_age_50_59'
    , 'population_age_60_69'
    , 'population_age_70_79'
    , 'population_age_80_and_older'
    , 'openstreetmap_id'
    , 'latitude'
    , 'longitude'
    , 'area_sq_km'
    , 'life_expectancy'
    , 'mobility_retail_and_recreation'
    , 'mobility_grocery_and_pharmacy'
    , 'mobility_parks'
    , 'mobility_transit_stations'
    , 'mobility_workplaces'
    , 'mobility_residential'
    , 'average_temperature_celsius'
    , 'minimum_temperature_celsius'
    , 'maximum_temperature_celsius'
    , 'rainfall_mm'
    , 'dew_point'
    , 'relative_humidity'
]

In [4]:
# Directory path to your CSV files
directory_path = "../Test CSVs/"  # Example directory path

# Initialize an empty DataFrame to store data from all CSV files
full_df = pd.DataFrame(columns=columns_to_load)

# Loop over the list of CSV files
for file in glob.glob(directory_path + '*.csv'):
    # Read the CSV file without specifying columns to ensure we don't miss any data
    df = pd.read_csv(file)
    
    # Ensure all desired columns are present, add them with NaN values if they are missing
    for column in columns_to_load:
        if column not in df.columns:
            df[column] = np.nan

    # Reorder and select only the desired columns to maintain consistency
    df = df[columns_to_load]
    
    # Append the contents of the file to the full DataFrame
    full_df = pd.concat([full_df, df], ignore_index=True)

# Optionally, reset the index of the final DataFrame
full_df.reset_index(drop=True, inplace=True)



  full_df = pd.concat([full_df, df], ignore_index=True)
  full_df = pd.concat([full_df, df], ignore_index=True)
  full_df = pd.concat([full_df, df], ignore_index=True)


In [21]:
# Data Prep
freq_df = full_df.copy()
freq_df['date'] = pd.to_datetime(freq_df['date'])
freq_df['quarter'] = freq_df['date'].dt.to_period('Q').dt.start_time
freq_df['month'] = freq_df['date'].dt.to_period('M').dt.to_timestamp()
freq_df['week'] = freq_df['date'] - pd.to_timedelta(freq_df['date'].dt.weekday, unit='d')
freq_df['county_name'] = freq_df['subregion2_name'] + ', ' + freq_df['subregion1_name']

columns_to_keep = [
    'county_name', 'location_key', 'quarter', 'month', 'week', 'date',
    'new_confirmed', 'new_deceased', 'cumulative_confirmed', 'cumulative_deceased',
    'new_persons_fully_vaccinated', 'cumulative_persons_fully_vaccinated',
    'population', 'population_male', 'population_female',
    'population_age_00_09', 'population_age_10_19', 'population_age_20_29',
    'population_age_30_39', 'population_age_40_49', 'population_age_50_59',
    'population_age_60_69', 'population_age_70_79', 'population_age_80_and_older',
    'area_sq_km',
    'life_expectancy',
    'average_temperature_celsius', 'minimum_temperature_celsius',
    'maximum_temperature_celsius', 'rainfall_mm',
    'relative_humidity'
]

freq_df = freq_df[columns_to_keep]

# Convert object types to float64 where applicable
for column in freq_df.select_dtypes(include=['object']).columns:
    try:
        freq_df[column] = freq_df[column].astype(float)
    except ValueError:
        print(f"Conversion failed for column: {column}")


aggregations = {
    'new_confirmed': 'sum',
    'new_deceased': 'sum',
    'cumulative_confirmed': 'max',
    'cumulative_deceased': 'max',
    'new_persons_fully_vaccinated': 'sum',
    'cumulative_persons_fully_vaccinated': 'max',
    'population': 'max',
    'population_male': 'max',
    'population_female': 'max',
    'population_age_00_09': 'max',
    'population_age_10_19': 'max',
    'population_age_20_29': 'max',
    'population_age_30_39': 'max',
    'population_age_40_49': 'max',
    'population_age_50_59': 'max',
    'population_age_60_69': 'max',
    'population_age_70_79': 'max',
    'population_age_80_and_older': 'max',
    'area_sq_km': 'max',
    'life_expectancy': 'max',
    'average_temperature_celsius': 'mean',
    'minimum_temperature_celsius': 'mean',
    'maximum_temperature_celsius': 'mean',
    'rainfall_mm': 'sum',
    'relative_humidity': 'mean'
}


# Group and aggregate for weekly, monthly, quarterly
weekly_df = freq_df.groupby(['county_name', 'week']).agg(aggregations).reset_index()
# monthly_df = freq_df.groupby(['county_name', 'month']).agg(aggregations).reset_index()
# quarterly_df = freq_df.groupby(['county_name', 'quarter']).agg(aggregations).reset_index()

# Format descriptive columns
weekly_df['description'] = weekly_df['county_name'] + " - Week of " + weekly_df['week'].dt.strftime('%Y-%m-%d')
# monthly_df['description'] = monthly_df['county_name'] + " - Month of " + monthly_df['month'].dt.strftime('%Y-%m')
# quarterly_df['description'] = quarterly_df['county_name'] + " - Quarter of " + quarterly_df['quarter'].dt.strftime('%Y-Q%q')





Conversion failed for column: county_name
Conversion failed for column: location_key


In [22]:
# Normalize the specified metrics by population to get per capita values
normalize_columns = [
    'new_confirmed',
    'new_deceased',
    'cumulative_confirmed',
    'cumulative_deceased',
    'new_persons_fully_vaccinated',
    'cumulative_persons_fully_vaccinated',
    'population_male', 'population_female',
    'population_age_00_09', 'population_age_10_19', 'population_age_20_29',
    'population_age_30_39', 'population_age_40_49', 'population_age_50_59',
    'population_age_60_69', 'population_age_70_79', 'population_age_80_and_older',
]

# Function to apply normalization
def normalize_by_population(df, columns):
    for col in columns:
        df[f'{col}_per_100k'] = df[col] / df['population'] * 100000
    return df

# Apply normalization
weekly_df = normalize_by_population(weekly_df, normalize_columns)
weekly_df.head()

Unnamed: 0,county_name,week,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,new_persons_fully_vaccinated,cumulative_persons_fully_vaccinated,population,population_male,population_female,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,description,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k
0,"Adams County, Colorado",2019-12-30,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,2.713334,-4.903333,10.97679,0.0,43.75703,"Adams County, Colorado - Week of 2019-12-30",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
1,"Adams County, Colorado",2020-01-06,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,0.588095,-6.238889,8.563492,0.0,43.450591,"Adams County, Colorado - Week of 2020-01-06",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
2,"Adams County, Colorado",2020-01-13,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,-0.452381,-7.979365,8.264286,0.0,41.711824,"Adams County, Colorado - Week of 2020-01-13",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
3,"Adams County, Colorado",2020-01-20,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,2.561565,-4.45034,11.145692,0.0,42.763934,"Adams County, Colorado - Week of 2020-01-20",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
4,"Adams County, Colorado",2020-01-27,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,3.066667,-4.392857,11.653616,1.200453,48.735622,"Adams County, Colorado - Week of 2020-01-27",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593


In [23]:
change_columns = [
    'new_confirmed_per_100k',
    'new_deceased_per_100k',
    'cumulative_confirmed_per_100k',
    'cumulative_deceased_per_100k',
    'new_persons_fully_vaccinated_per_100k',
    'cumulative_persons_fully_vaccinated_per_100k',
    'average_temperature_celsius',
    'minimum_temperature_celsius',
    'maximum_temperature_celsius',
    'rainfall_mm',
    'relative_humidity'
]


# Calculate the percentage change within each group


# Ensure the DataFrame is sorted by date to correctly apply pct_change

weekly_df.sort_values(by=['county_name', 'week'], inplace=True)
weekly_df.set_index(['county_name', weekly_df.columns[1]], inplace=True)  # Set index as county_name and the time period
for col in change_columns:
    weekly_df[f'{col}_pct_change'] = weekly_df.groupby('county_name')[col].pct_change(fill_method=None).multiply(100)

# Reset index after calculations
weekly_df.reset_index(inplace=True)


In [24]:
for col in weekly_df.columns:
    print(col)

county_name
week
new_confirmed
new_deceased
cumulative_confirmed
cumulative_deceased
new_persons_fully_vaccinated
cumulative_persons_fully_vaccinated
population
population_male
population_female
population_age_00_09
population_age_10_19
population_age_20_29
population_age_30_39
population_age_40_49
population_age_50_59
population_age_60_69
population_age_70_79
population_age_80_and_older
area_sq_km
life_expectancy
average_temperature_celsius
minimum_temperature_celsius
maximum_temperature_celsius
rainfall_mm
relative_humidity
description
new_confirmed_per_100k
new_deceased_per_100k
cumulative_confirmed_per_100k
cumulative_deceased_per_100k
new_persons_fully_vaccinated_per_100k
cumulative_persons_fully_vaccinated_per_100k
population_male_per_100k
population_female_per_100k
population_age_00_09_per_100k
population_age_10_19_per_100k
population_age_20_29_per_100k
population_age_30_39_per_100k
population_age_40_49_per_100k
population_age_50_59_per_100k
population_age_60_69_per_100k
populat

In [25]:
weekly_df.tail(500)

Unnamed: 0,county_name,week,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,new_persons_fully_vaccinated,cumulative_persons_fully_vaccinated,population,population_male,population_female,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,description,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k,new_confirmed_per_100k_pct_change,new_deceased_per_100k_pct_change,cumulative_confirmed_per_100k_pct_change,cumulative_deceased_per_100k_pct_change,new_persons_fully_vaccinated_per_100k_pct_change,cumulative_persons_fully_vaccinated_per_100k_pct_change,average_temperature_celsius_pct_change,minimum_temperature_celsius_pct_change,maximum_temperature_celsius_pct_change,rainfall_mm_pct_change,relative_humidity_pct_change
58146,"Yates County, New York",2021-04-19,15.0,0.0,1125.0,26.0,0.0,,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,6.722222,1.000793,13.477778,6.451600,60.267959,"Yates County, New York - Week of 2021-04-19",59.995200,0.0,4499.640029,103.991681,0.000000,,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,-21.052632,,1.351351,0.0,,,-18.259025,-76.535175,-4.857415,-76.803653,-19.378277
58147,"Yates County, New York",2021-04-26,17.0,0.0,1142.0,26.0,432.0,8195.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,9.914286,4.113492,17.739683,31.312556,62.996033,"Yates County, New York - Week of 2021-04-26",67.994560,0.0,4567.634589,103.991681,1727.861771,32777.377810,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,13.333333,,1.511111,0.0,inf,,47.485244,311.023111,31.621717,385.345589,4.526573
58148,"Yates County, New York",2021-05-03,20.0,0.0,1162.0,26.0,547.0,8742.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,9.707936,4.988889,16.224603,20.940889,77.742800,"Yates County, New York - Week of 2021-05-03",79.993601,0.0,4647.628190,103.991681,2187.824974,34965.202784,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,17.647059,,1.751313,0.0,26.620370,6.674802,-2.081333,21.281104,-8.540622,-33.123029,23.409042
58149,"Yates County, New York",2021-05-10,6.0,0.0,1168.0,26.0,303.0,9045.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,10.869048,3.092857,17.934921,8.627533,52.186355,"Yates County, New York - Week of 2021-05-10",23.998080,0.0,4671.626270,103.991681,1211.903048,36177.105832,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,-70.000000,,0.516351,0.0,-44.606947,3.466026,11.960434,-38.005093,10.541505,-58.800541,-32.873071
58150,"Yates County, New York",2021-05-17,4.0,0.0,1172.0,26.0,274.0,9319.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,19.763492,10.791270,28.157143,0.191911,54.239643,"Yates County, New York - Week of 2021-05-17",15.998720,0.0,4687.624990,103.991681,1095.912327,37273.018159,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,-33.333333,,0.342466,0.0,-9.570957,3.029298,81.832785,248.909439,56.996196,-97.775598,3.934532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58641,"Yuma County, Colorado",2022-08-15,0.0,0.0,,,1.0,4476.0,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,22.316667,14.610318,31.771428,4.080329,55.260483,"Yuma County, Colorado - Week of 2022-08-15",0.000000,0.0,,,10.041169,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-50.000000,0.022346,-13.859020,-10.479478,-10.310526,307.263579,29.454447
58642,"Yuma County, Colorado",2022-08-22,0.0,0.0,,,0.0,4476.0,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,23.825397,13.609524,33.813492,2.394857,38.153665,"Yuma County, Colorado - Week of 2022-08-22",0.000000,0.0,,,0.000000,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-100.000000,0.000000,6.760553,-6.849910,6.427358,-41.307257,-30.956693
58643,"Yuma County, Colorado",2022-08-29,0.0,0.0,,,0.0,,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,23.581129,13.655379,34.371605,1.320901,39.698171,"Yuma County, Colorado - Week of 2022-08-29",0.000000,0.0,,,0.000000,,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,,,-1.025242,0.336933,1.650563,-44.844264,4.048120
58644,"Yuma County, Colorado",2022-09-05,0.0,0.0,,,11.0,4487.0,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,21.013404,11.675926,32.332716,0.464457,42.166975,"Yuma County, Colorado - Week of 2022-09-05",0.000000,0.0,,,110.452857,45054.724370,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,inf,,-10.888898,-14.495775,-5.931899,-64.837864,6.218938


In [26]:
columns_to_keep = [
    'county_name'
    , 'week'
    , 'description'
    , 'area_sq_km'
    , 'life_expectancy'
    , 'average_temperature_celsius'
    , 'minimum_temperature_celsius'
    , 'maximum_temperature_celsius'
    , 'rainfall_mm'
    , 'relative_humidity'
    , 'new_confirmed_per_100k'
    , 'new_deceased_per_100k'
    , 'cumulative_confirmed_per_100k'
    , 'cumulative_deceased_per_100k'
    , 'new_persons_fully_vaccinated_per_100k'
    , 'cumulative_persons_fully_vaccinated_per_100k'
    , 'population_male_per_100k'
    , 'population_female_per_100k'
    , 'population_age_00_09_per_100k'
    , 'population_age_10_19_per_100k'
    , 'population_age_20_29_per_100k'
    , 'population_age_30_39_per_100k'
    , 'population_age_40_49_per_100k'
    , 'population_age_50_59_per_100k'
    , 'population_age_60_69_per_100k'
    , 'population_age_70_79_per_100k'
    , 'population_age_80_and_older_per_100k'
    , 'new_confirmed_per_100k_pct_change'
    , 'new_deceased_per_100k_pct_change'
    , 'cumulative_confirmed_per_100k_pct_change'
    , 'cumulative_deceased_per_100k_pct_change'
    , 'new_persons_fully_vaccinated_per_100k_pct_change'
    , 'cumulative_persons_fully_vaccinated_per_100k_pct_change'
    , 'average_temperature_celsius_pct_change'
    , 'minimum_temperature_celsius_pct_change'
    , 'maximum_temperature_celsius_pct_change'
    , 'rainfall_mm_pct_change'
    , 'relative_humidity_pct_change'
]

weekly_df = weekly_df[columns_to_keep]
weekly_df

Unnamed: 0,county_name,week,description,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k,new_confirmed_per_100k_pct_change,new_deceased_per_100k_pct_change,cumulative_confirmed_per_100k_pct_change,cumulative_deceased_per_100k_pct_change,new_persons_fully_vaccinated_per_100k_pct_change,cumulative_persons_fully_vaccinated_per_100k_pct_change,average_temperature_celsius_pct_change,minimum_temperature_celsius_pct_change,maximum_temperature_celsius_pct_change,rainfall_mm_pct_change,relative_humidity_pct_change
0,"Adams County, Colorado",2019-12-30,"Adams County, Colorado - Week of 2019-12-30",3102.0,77.977528,2.713334,-4.903333,10.976790,0.000000,43.757030,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,,,,,
1,"Adams County, Colorado",2020-01-06,"Adams County, Colorado - Week of 2020-01-06",3102.0,77.977528,0.588095,-6.238889,8.563492,0.000000,43.450591,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,-78.325729,27.237710,-21.985462,,-0.700319
2,"Adams County, Colorado",2020-01-13,"Adams County, Colorado - Week of 2020-01-13",3102.0,77.977528,-0.452381,-7.979365,8.264286,0.000000,41.711824,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,-176.923055,27.897216,-3.493977,,-4.001713
3,"Adams County, Colorado",2020-01-20,"Adams County, Colorado - Week of 2020-01-20",3102.0,77.977528,2.561565,-4.450340,11.145692,0.000000,42.763934,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,-666.240740,-44.226888,34.865760,,2.522331
4,"Adams County, Colorado",2020-01-27,"Adams County, Colorado - Week of 2020-01-27",3102.0,77.977528,3.066667,-4.392857,11.653616,1.200453,48.735622,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,19.718495,-1.291657,4.557132,inf,13.964309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58641,"Yuma County, Colorado",2022-08-15,"Yuma County, Colorado - Week of 2022-08-15",6136.0,79.650000,22.316667,14.610318,31.771428,4.080329,55.260483,0.0,0.0,,,10.041169,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-50.0,0.022346,-13.859020,-10.479478,-10.310526,307.263579,29.454447
58642,"Yuma County, Colorado",2022-08-22,"Yuma County, Colorado - Week of 2022-08-22",6136.0,79.650000,23.825397,13.609524,33.813492,2.394857,38.153665,0.0,0.0,,,0.000000,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-100.0,0.000000,6.760553,-6.849910,6.427358,-41.307257,-30.956693
58643,"Yuma County, Colorado",2022-08-29,"Yuma County, Colorado - Week of 2022-08-29",6136.0,79.650000,23.581129,13.655379,34.371605,1.320901,39.698171,0.0,0.0,,,0.000000,,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,,,-1.025242,0.336933,1.650563,-44.844264,4.048120
58644,"Yuma County, Colorado",2022-09-05,"Yuma County, Colorado - Week of 2022-09-05",6136.0,79.650000,21.013404,11.675926,32.332716,0.464457,42.166975,0.0,0.0,,,110.452857,45054.724370,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,inf,,-10.888898,-14.495775,-5.931899,-64.837864,6.218938


In [27]:
# Function to apply binning
def apply_binning(df):
    # area_sq_km
    bins = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, float('inf')]
    labels = ['<1000', '1000-2000', '2000-3000', '3000-4000', '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000', '9000-10000', '>10000']
    df['area_sq_km_bins'] = pd.cut(df['area_sq_km'], bins=bins, labels=labels, right=False)


    # life_expectancy
    bins = [0, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, float('inf')]
    labels = ['<70', '70-72', '72-74', '74-76', '76-78', '78-80', '80-82', '82-84', '84-86', '86-88', '>88']
    df['life_expectancy_bins'] = pd.cut(df['life_expectancy'], bins=bins, labels=labels, right=False)

    # average_temperature_celsius, minimum_temperature_celsius, maximum_temperature_celsius
    temp_bins = [df['average_temperature_celsius'].min(), 0, 15, 25, 35, df['average_temperature_celsius'].max()]
    temp_labels = ['<0°C', '0-15°C', '15-25°C', '25-35°C', '>35°C']
    df['average_temperature_celsius_bins'] = pd.cut(df['average_temperature_celsius'], bins=temp_bins, labels=temp_labels, right=False)
    df['minimum_temperature_celsius_bins'] = pd.cut(df['minimum_temperature_celsius'], bins=temp_bins, labels=temp_labels, right=False)
    df['maximum_temperature_celsius_bins'] = pd.cut(df['maximum_temperature_celsius'], bins=temp_bins, labels=temp_labels, right=False)

    # rainfall_mm
    bins = [0, 5, 10, 15, 20, float('inf')]
    labels = ['<5mm', '5-10mm', '10-15mm', '15-20mm', '>20mm']
    df['rainfall_mm_bins'] = pd.cut(df['rainfall_mm'], bins=bins, labels=labels, right=False)

    # relative_humidity
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    labels = ['<10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%', '60-70%', '70-80%', '80-90%', '>90%']
    df['relative_humidity_bins'] = pd.cut(df['relative_humidity'], bins=bins, labels=labels, right=False)

    # new_confirmed_per_100k
    bins = [-100000, 50, 100, 150, 200, 250, 300, 350, 400, float('inf')]
    labels = ['<50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350-400', '>400']
    df['new_confirmed_per_100k_bins'] = pd.cut(df['new_confirmed_per_100k'], bins=bins, labels=labels, right=False)

    # cumulative_confirmed_per_100k
    bins = [-100000, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, float('inf')]
    labels = ['<1000', '1000-2000', '2000-3000', '3000-4000', '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000', '9000-10000', '>10000']
    df['cumulative_confirmed_per_100k_bins'] = pd.cut(df['cumulative_confirmed_per_100k'], bins=bins, labels=labels, right=False)

    # cumulative_deceased_per_100k
    bins = [-100000, 100, 200, 300, 400, 500, 600, 700, 800, float('inf')]
    labels = ['<100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '>800']
    df['cumulative_deceased_per_100k_bins'] = pd.cut(df['cumulative_deceased_per_100k'], bins=bins, labels=labels, right=False)

    # new_persons_fully_vaccinated_per_100k
    bins = [-100000, 50, 100, 150, 200, 250, 300, 350, 400, float('inf')]
    labels = ['<50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350-400', '>400']
    df['new_persons_fully_vaccinated_per_100k_bins'] = pd.cut(df['new_persons_fully_vaccinated_per_100k'], bins=bins, labels=labels, right=False)

    # cumulative_persons_fully_vaccinated_per_100k
    bins = [-100000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, float('inf')]
    labels = ['<10000', '10000-20000', '20000-30000', '30000-40000', '40000-50000', '50000-60000', '60000-70000', '70000-80000', '80000-90000', '90000-100000', '>100000']
    df['cumulative_persons_fully_vaccinated_per_100k_bins'] = pd.cut(df['cumulative_persons_fully_vaccinated_per_100k'], bins=bins, labels=labels, right=False)

    # population_male_per_100k population_female_per_100k
    bins = [40000, 42000, 44000, 46000, 48000, 50000, 52000, 54000, 56000, 58000, 60000, float('inf')]
    labels = ['<42000', '42000-44000', '44000-46000', '46000-48000', '48000-50000', '50000-52000', '52000-54000', '54000-56000', '56000-58000', '58000-60000', '>60000']
    df['population_male_per_100k_bins'] = pd.cut(df['population_male_per_100k'], bins=bins, labels=labels, right=False)
    df['population_female_per_100k_bins'] = pd.cut(df['population_female_per_100k'], bins=bins, labels=labels, right=False)

    # population_age_xxx
    bins = [0, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000, float('inf')]
    labels = ['<2000', '2000-4000', '4000-6000', '6000-8000', '8000-10000', '10000-12000', '12000-14000', '14000-16000', '16000-18000', '18000-20000', '>20000']
    df['population_age_00_09_per_100k_bins'] = pd.cut(df['population_age_00_09_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_10_19_per_100k_bins'] = pd.cut(df['population_age_10_19_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_20_29_per_100k_bins'] = pd.cut(df['population_age_20_29_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_30_39_per_100k_bins'] = pd.cut(df['population_age_30_39_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_40_49_per_100k_bins'] = pd.cut(df['population_age_40_49_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_50_59_per_100k_bins'] = pd.cut(df['population_age_50_59_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_60_69_per_100k_bins'] = pd.cut(df['population_age_60_69_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_70_79_per_100k_bins'] = pd.cut(df['population_age_70_79_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_80_and_older_per_100k_bins'] = pd.cut(df['population_age_80_and_older_per_100k'], bins=bins, labels=labels, right=False)


    # new_confirmed_per_100k_pct_change
    bins = [-100, -50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50, 100]
    labels = ['<-50%', '-50% - -40%', '-40% - -30%', '-30% - -20%', '-20% - -10%', '-10% - 0%', '0% - 10%', '10% - 20%', '20% - 30%', '30% - 40%', '40% - 50%', '>50%']
    df['new_confirmed_per_100k_pct_change_bins'] = pd.cut(df['new_confirmed_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['new_deceased_per_100k_pct_change_bins'] = pd.cut(df['new_deceased_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['cumulative_confirmed_per_100k_pct_change_bins'] = pd.cut(df['cumulative_confirmed_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['cumulative_deceased_per_100k_pct_change_bins'] = pd.cut(df['cumulative_deceased_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['cumulative_persons_fully_vaccinated_per_100k_pct_change_bins'] = pd.cut(df['cumulative_persons_fully_vaccinated_per_100k_pct_change'], bins=bins, labels=labels, right=False)

    df['average_temperature_celsius_pct_change_bins'] = pd.cut(df['average_temperature_celsius_pct_change'], bins=bins, labels=labels, right=False)
    df['minimum_temperature_celsius_pct_change_bins'] = pd.cut(df['minimum_temperature_celsius_pct_change'], bins=bins, labels=labels, right=False)
    df['maximum_temperature_celsius_pct_change_bins'] = pd.cut(df['maximum_temperature_celsius_pct_change'], bins=bins, labels=labels, right=False)

    df['rainfall_mm_pct_change_bins'] = pd.cut(df['rainfall_mm_pct_change'], bins=bins, labels=labels, right=False)
    df['relative_humidity_pct_change_bins'] = pd.cut(df['relative_humidity_pct_change'], bins=bins, labels=labels, right=False)

    
    
apply_binning(weekly_df)

In [28]:
sample_df = weekly_df.sample(n=10)
sample_df


Unnamed: 0,county_name,week,description,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k,new_confirmed_per_100k_pct_change,new_deceased_per_100k_pct_change,cumulative_confirmed_per_100k_pct_change,cumulative_deceased_per_100k_pct_change,new_persons_fully_vaccinated_per_100k_pct_change,cumulative_persons_fully_vaccinated_per_100k_pct_change,average_temperature_celsius_pct_change,minimum_temperature_celsius_pct_change,maximum_temperature_celsius_pct_change,rainfall_mm_pct_change,relative_humidity_pct_change,area_sq_km_bins,life_expectancy_bins,average_temperature_celsius_bins,minimum_temperature_celsius_bins,maximum_temperature_celsius_bins,rainfall_mm_bins,relative_humidity_bins,new_confirmed_per_100k_bins,cumulative_confirmed_per_100k_bins,cumulative_deceased_per_100k_bins,new_persons_fully_vaccinated_per_100k_bins,cumulative_persons_fully_vaccinated_per_100k_bins,population_male_per_100k_bins,population_female_per_100k_bins,population_age_00_09_per_100k_bins,population_age_10_19_per_100k_bins,population_age_20_29_per_100k_bins,population_age_30_39_per_100k_bins,population_age_40_49_per_100k_bins,population_age_50_59_per_100k_bins,population_age_60_69_per_100k_bins,population_age_70_79_per_100k_bins,population_age_80_and_older_per_100k_bins,new_confirmed_per_100k_pct_change_bins,new_deceased_per_100k_pct_change_bins,cumulative_confirmed_per_100k_pct_change_bins,cumulative_deceased_per_100k_pct_change_bins,cumulative_persons_fully_vaccinated_per_100k_pct_change_bins,average_temperature_celsius_pct_change_bins,minimum_temperature_celsius_pct_change_bins,maximum_temperature_celsius_pct_change_bins,rainfall_mm_pct_change_bins,relative_humidity_pct_change_bins
27808,"Levy County, Florida",2022-04-04,"Levy County, Florida - Week of 2022-04-04",3658.0,74.925,19.894445,14.110317,25.307937,4.0386,68.297319,9.931473,0.0,25434.501937,404.707518,81.934651,48540.073493,48939.815275,51060.184725,10599.364386,10658.953223,10862.548416,10135.068031,10673.850432,14792.928791,15728.970106,11309.464694,5238.851922,inf,-100.0,0.039062,0.0,-31.25,0.169083,-0.93661,-6.040589,-3.565488,-83.341441,-10.996018,3000-4000,74-76,15-25°C,0-15°C,25-35°C,<5mm,60-70%,<50,>10000,400-500,50-100,40000-50000,48000-50000,50000-52000,10000-12000,10000-12000,10000-12000,10000-12000,10000-12000,14000-16000,14000-16000,10000-12000,4000-6000,,<-50%,0% - 10%,0% - 10%,0% - 10%,-10% - 0%,-10% - 0%,-10% - 0%,<-50%,-20% - -10%
2194,"Ashe County, North Carolina",2021-03-22,"Ashe County, North Carolina - Week of 2021-03-22",1105.0,77.483333,13.06746,6.745414,19.532892,58.869036,70.891398,100.795162,0.0,7279.650577,149.326166,0.0,,49184.30582,50815.69418,9340.351663,10340.836973,9299.286968,10288.572815,12162.616194,15134.206891,16567.738082,11229.327659,5637.062754,80.0,,1.404056,0.0,,,50.022778,99.23165,25.645519,56.871621,13.678564,1000-2000,76-78,0-15°C,0-15°C,15-25°C,>20mm,70-80%,100-150,7000-8000,100-200,<50,,48000-50000,50000-52000,8000-10000,10000-12000,8000-10000,10000-12000,12000-14000,14000-16000,16000-18000,10000-12000,4000-6000,>50%,,0% - 10%,0% - 10%,,>50%,>50%,20% - 30%,>50%,10% - 20%
2342,"Ashland County, Wisconsin",2021-05-03,"Ashland County, Wisconsin - Week of 2021-05-03",5941.0,,5.59127,1.197619,10.613492,0.472722,65.264347,161.279917,0.0,8760.725115,96.76795,2496.613122,44139.087801,50190.310303,49809.689697,11760.531579,13153.990065,11754.080382,11270.24063,10644.47455,14695.826076,14347.461454,7786.594413,4586.800852,38.888889,,1.875469,0.0,2.380952,5.995352,-13.218726,-44.428987,-18.697478,-92.588381,-4.101189,5000-6000,,0-15°C,0-15°C,0-15°C,<5mm,60-70%,150-200,8000-9000,<100,>400,40000-50000,50000-52000,48000-50000,10000-12000,12000-14000,10000-12000,10000-12000,10000-12000,14000-16000,14000-16000,6000-8000,4000-6000,30% - 40%,,0% - 10%,0% - 10%,0% - 10%,-20% - -10%,-50% - -40%,-20% - -10%,<-50%,-10% - 0%
51078,"Sutter County, California",2021-11-29,"Sutter County, California - Week of 2021-11-29",1576.0,78.635,10.595238,5.166667,17.889771,0.0,85.083171,157.997588,5.197289,15223.899214,193.339154,727.620473,54492.536693,49766.121991,50233.878009,14152.218203,14361.149225,13735.395618,12909.026652,11639.848655,12500.519729,10227.225479,6405.139079,4069.477361,186.792453,inf,1.04871,2.762431,212.5,1.353337,2.072891,42.03637,-0.549047,,19.927688,1000-2000,78-80,0-15°C,0-15°C,15-25°C,<5mm,80-90%,150-200,>10000,100-200,>400,50000-60000,48000-50000,50000-52000,14000-16000,14000-16000,12000-14000,12000-14000,10000-12000,12000-14000,10000-12000,6000-8000,4000-6000,,,0% - 10%,0% - 10%,0% - 10%,0% - 10%,40% - 50%,-10% - 0%,,10% - 20%
8472,"Clark County, Wisconsin",2021-10-18,"Clark County, Wisconsin - Week of 2021-10-18",3157.0,,7.677778,1.147619,15.052381,6.03885,72.014464,153.307686,5.785196,13725.376761,225.622632,260.333806,33747.939024,50351.450638,49648.549362,16401.029765,15585.317173,10288.970524,10422.030025,10751.786179,13528.680108,11671.632293,6667.438026,4683.115906,-47.0,-33.333333,1.129582,2.631579,-35.251799,0.777403,-36.47229,-84.670835,-13.833991,-43.76478,-0.94315,3000-4000,,0-15°C,0-15°C,15-25°C,5-10mm,70-80%,150-200,>10000,200-300,250-300,30000-40000,50000-52000,48000-50000,16000-18000,14000-16000,10000-12000,10000-12000,10000-12000,12000-14000,10000-12000,6000-8000,4000-6000,-50% - -40%,-40% - -30%,0% - 10%,0% - 10%,0% - 10%,-40% - -30%,<-50%,-20% - -10%,-50% - -40%,-10% - 0%
8394,"Clark County, Wisconsin",2020-04-20,"Clark County, Wisconsin - Week of 2020-04-20",3157.0,,5.937302,-0.369841,13.610317,2.483555,55.236739,8.677794,2.892598,54.959359,5.785196,0.0,,50351.450638,49648.549362,16401.029765,15585.317173,10288.970524,10422.030025,10751.786179,13528.680108,11671.632293,6667.438026,4683.115906,-62.5,0.0,18.75,100.0,,,1176.621013,-92.264276,112.845972,-79.885554,-4.50419,3000-4000,,0-15°C,<0°C,0-15°C,<5mm,50-60%,<50,<1000,<100,<50,,50000-52000,48000-50000,16000-18000,14000-16000,10000-12000,10000-12000,10000-12000,12000-14000,10000-12000,6000-8000,4000-6000,<-50%,0% - 10%,10% - 20%,,,,<-50%,,<-50%,-10% - 0%
58208,"Yates County, New York",2022-06-27,"Yates County, New York - Week of 2022-06-27",974.0,80.58,20.643651,13.573016,27.962698,17.7546,60.034394,0.0,0.0,,,35.99712,53395.728342,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,,,,,350.0,0.067461,-2.243688,4.325015,-3.816439,152.34657,-4.729586,<1000,80-82,15-25°C,0-15°C,25-35°C,15-20mm,60-70%,<50,,,<50,50000-60000,48000-50000,50000-52000,12000-14000,12000-14000,12000-14000,8000-10000,8000-10000,12000-14000,14000-16000,8000-10000,4000-6000,,,,,0% - 10%,-10% - 0%,0% - 10%,-10% - 0%,,-10% - 0%
54693,"Walworth County, Wisconsin",2020-06-08,"Walworth County, Wisconsin - Week of 2020-06-08",577.0,,18.057936,12.331746,24.502381,11.449151,61.500583,35.002431,0.0,457.948469,16.528926,0.0,,49854.156539,50145.843461,10687.408848,13873.602333,14485.172581,10834.224599,11605.250365,14567.81721,12716.57754,7184.248906,4045.697618,-10.0,,8.275862,0.0,,,-12.579245,-18.996977,-12.030204,117.905946,-1.545825,<1000,,15-25°C,0-15°C,15-25°C,10-15mm,60-70%,<50,<1000,<100,<50,,48000-50000,50000-52000,10000-12000,12000-14000,14000-16000,10000-12000,10000-12000,14000-16000,12000-14000,6000-8000,4000-6000,-10% - 0%,,0% - 10%,0% - 10%,,-20% - -10%,-20% - -10%,-20% - -10%,,-10% - 0%
35521,"Northampton County, North Carolina",2020-05-25,"Northampton County, North Carolina - Week of 2...",1426.0,76.96,21.271429,17.56746,26.886508,16.657966,85.577381,45.248869,10.055304,739.064857,70.387129,0.0,,48838.612368,51161.387632,9270.990447,10367.018602,11156.35998,9019.607843,10603.31825,15550.527903,16626.44545,11070.889894,6334.841629,0.0,0.0,6.521739,16.666667,,,18.66643,23.102161,16.407809,-60.90589,-3.010215,1000-2000,76-78,15-25°C,15-25°C,25-35°C,15-20mm,80-90%,<50,<1000,<100,<50,,48000-50000,50000-52000,8000-10000,10000-12000,10000-12000,8000-10000,10000-12000,14000-16000,16000-18000,10000-12000,6000-8000,0% - 10%,0% - 10%,0% - 10%,10% - 20%,,10% - 20%,20% - 30%,10% - 20%,<-50%,-10% - 0%
50614,"Summit County, Colorado",2021-03-08,"Summit County, Colorado - Week of 2021-03-08",1604.0,82.05,-2.496208,-7.777513,3.785362,13.377334,60.273605,233.614536,0.0,10204.412719,22.712524,0.0,,54192.083063,45807.916937,9367.293965,8406.87865,16521.73913,17407.527579,14815.055159,13533.419857,12897.469176,5684.620376,1365.996106,63.636364,,2.342987,0.0,,,-21.954787,-25.934042,-35.415631,637.333254,19.771588,1000-2000,82-84,<0°C,<0°C,0-15°C,10-15mm,60-70%,200-250,>10000,<100,<50,,54000-56000,44000-46000,8000-10000,8000-10000,16000-18000,16000-18000,14000-16000,12000-14000,12000-14000,4000-6000,<2000,>50%,,0% - 10%,0% - 10%,,-30% - -20%,-30% - -20%,-40% - -30%,,10% - 20%


In [29]:
columns_to_keep = ['county_name'
    , 'week'
    , 'description'
    , 'area_sq_km_bins'
    , 'life_expectancy_bins'
    , 'average_temperature_celsius_bins'
    , 'minimum_temperature_celsius_bins'
    , 'maximum_temperature_celsius_bins'
    , 'rainfall_mm_bins'
    , 'relative_humidity_bins'
    , 'new_confirmed_per_100k_bins'
    , 'cumulative_confirmed_per_100k_bins'
    , 'cumulative_deceased_per_100k_bins'
    , 'new_persons_fully_vaccinated_per_100k_bins'
    , 'cumulative_persons_fully_vaccinated_per_100k_bins'
    , 'population_male_per_100k_bins'
    , 'population_female_per_100k_bins'
    , 'population_age_00_09_per_100k_bins'
    , 'population_age_10_19_per_100k_bins'
    , 'population_age_20_29_per_100k_bins'
    , 'population_age_30_39_per_100k_bins'
    , 'population_age_40_49_per_100k_bins'
    , 'population_age_50_59_per_100k_bins'
    , 'population_age_60_69_per_100k_bins'
    , 'population_age_70_79_per_100k_bins'
    , 'population_age_80_and_older_per_100k_bins'
    , 'new_confirmed_per_100k_pct_change_bins'
    , 'new_deceased_per_100k_pct_change_bins'
    , 'cumulative_confirmed_per_100k_pct_change_bins'
    , 'cumulative_deceased_per_100k_pct_change_bins'
    , 'cumulative_persons_fully_vaccinated_per_100k_pct_change_bins'
    , 'average_temperature_celsius_pct_change_bins'
    , 'minimum_temperature_celsius_pct_change_bins'
    , 'maximum_temperature_celsius_pct_change_bins'
    , 'rainfall_mm_pct_change_bins'
    , 'relative_humidity_pct_change_bins'
    ]

weekly_df_binned = weekly_df[columns_to_keep]   

In [30]:
for column in weekly_df_binned.columns:
    print(column)

county_name
week
description
area_sq_km_bins
life_expectancy_bins
average_temperature_celsius_bins
minimum_temperature_celsius_bins
maximum_temperature_celsius_bins
rainfall_mm_bins
relative_humidity_bins
new_confirmed_per_100k_bins
cumulative_confirmed_per_100k_bins
cumulative_deceased_per_100k_bins
new_persons_fully_vaccinated_per_100k_bins
cumulative_persons_fully_vaccinated_per_100k_bins
population_male_per_100k_bins
population_female_per_100k_bins
population_age_00_09_per_100k_bins
population_age_10_19_per_100k_bins
population_age_20_29_per_100k_bins
population_age_30_39_per_100k_bins
population_age_40_49_per_100k_bins
population_age_50_59_per_100k_bins
population_age_60_69_per_100k_bins
population_age_70_79_per_100k_bins
population_age_80_and_older_per_100k_bins
new_confirmed_per_100k_pct_change_bins
new_deceased_per_100k_pct_change_bins
cumulative_confirmed_per_100k_pct_change_bins
cumulative_deceased_per_100k_pct_change_bins
cumulative_persons_fully_vaccinated_per_100k_pct_chan

In [31]:
weekly_df_binned.head()

Unnamed: 0,county_name,week,description,area_sq_km_bins,life_expectancy_bins,average_temperature_celsius_bins,minimum_temperature_celsius_bins,maximum_temperature_celsius_bins,rainfall_mm_bins,relative_humidity_bins,new_confirmed_per_100k_bins,cumulative_confirmed_per_100k_bins,cumulative_deceased_per_100k_bins,new_persons_fully_vaccinated_per_100k_bins,cumulative_persons_fully_vaccinated_per_100k_bins,population_male_per_100k_bins,population_female_per_100k_bins,population_age_00_09_per_100k_bins,population_age_10_19_per_100k_bins,population_age_20_29_per_100k_bins,population_age_30_39_per_100k_bins,population_age_40_49_per_100k_bins,population_age_50_59_per_100k_bins,population_age_60_69_per_100k_bins,population_age_70_79_per_100k_bins,population_age_80_and_older_per_100k_bins,new_confirmed_per_100k_pct_change_bins,new_deceased_per_100k_pct_change_bins,cumulative_confirmed_per_100k_pct_change_bins,cumulative_deceased_per_100k_pct_change_bins,cumulative_persons_fully_vaccinated_per_100k_pct_change_bins,average_temperature_celsius_pct_change_bins,minimum_temperature_celsius_pct_change_bins,maximum_temperature_celsius_pct_change_bins,rainfall_mm_pct_change_bins,relative_humidity_pct_change_bins
0,"Adams County, Colorado",2019-12-30,"Adams County, Colorado - Week of 2019-12-30",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,,,,,
1,"Adams County, Colorado",2020-01-06,"Adams County, Colorado - Week of 2020-01-06",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,<-50%,20% - 30%,-30% - -20%,,-10% - 0%
2,"Adams County, Colorado",2020-01-13,"Adams County, Colorado - Week of 2020-01-13",3000-4000,76-78,<0°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,,20% - 30%,-10% - 0%,,-10% - 0%
3,"Adams County, Colorado",2020-01-20,"Adams County, Colorado - Week of 2020-01-20",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,,-50% - -40%,30% - 40%,,0% - 10%
4,"Adams County, Colorado",2020-01-27,"Adams County, Colorado - Week of 2020-01-27",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,10% - 20%,-10% - 0%,0% - 10%,,10% - 20%


### Frequent Pattern Analysis


In [18]:
!pip install mlxtend

Looking in indexes: https://pypi.org/simple, https://pypi-user:****@fetch.jfrog.io/artifactory/api/pypi/fetchrewards-pypi-release-local/simple
Collecting mlxtend
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.1
[0m

In [32]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Ensure all data in the DataFrame columns are categorical
columns_to_convert = weekly_df_binned.columns.difference(['county_name', 'week', 'description'])
for col in columns_to_convert:
    weekly_df_binned.loc[:, col] = weekly_df_binned[col].astype(str)

# Convert DataFrame into a list of transactions (ensure all items are strings)
transactions = weekly_df_binned.drop(['county_name', 'week', 'description'], axis=1).applymap(str).values.tolist()

# Encode the transactions
encoder = TransactionEncoder()
encoded_array = encoder.fit_transform(transactions)
encoded_df = pd.DataFrame(encoded_array, columns=encoder.columns_)

# Find frequent itemsets with a minimum support threshold
frequent_itemsets = apriori(encoded_df, min_support=0.05, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Display the rules
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype(str)
A value is trying to be set on 