In [1]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt

# Set pandas option to display all columns
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.reset_option('max_rows')


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
columns_to_load = [
     'location_key'
    , 'date'
    , 'place_id'
    , 'wikidata_id'
    , 'datacommons_id'
    , 'country_code'
    , 'country_name'
    , 'subregion1_code'
    , 'subregion1_name'
    , 'subregion2_code'
    , 'subregion2_name'
    , 'new_confirmed'
    , 'new_deceased'
    , 'cumulative_confirmed'
    , 'cumulative_deceased'
    , 'new_persons_fully_vaccinated'
    , 'cumulative_persons_fully_vaccinated'
    , 'population'
    , 'population_male'
    , 'population_female'
    , 'population_age_00_09'
    , 'population_age_10_19'
    , 'population_age_20_29'
    , 'population_age_30_39'
    , 'population_age_40_49'
    , 'population_age_50_59'
    , 'population_age_60_69'
    , 'population_age_70_79'
    , 'population_age_80_and_older'
    , 'openstreetmap_id'
    , 'latitude'
    , 'longitude'
    , 'area_sq_km'
    , 'life_expectancy'
    , 'mobility_retail_and_recreation'
    , 'mobility_grocery_and_pharmacy'
    , 'mobility_parks'
    , 'mobility_transit_stations'
    , 'mobility_workplaces'
    , 'mobility_residential'
    , 'average_temperature_celsius'
    , 'minimum_temperature_celsius'
    , 'maximum_temperature_celsius'
    , 'rainfall_mm'
    , 'dew_point'
    , 'relative_humidity'
]

In [3]:
# Directory path to your CSV files
directory_path = "../Test CSVs/"  # Example directory path

# Initialize an empty DataFrame to store data from all CSV files
full_df = pd.DataFrame(columns=columns_to_load)

# Loop over the list of CSV files
for file in glob.glob(directory_path + '*.csv'):
    # Read the CSV file without specifying columns to ensure we don't miss any data
    df = pd.read_csv(file)
    
    # Ensure all desired columns are present, add them with NaN values if they are missing
    for column in columns_to_load:
        if column not in df.columns:
            df[column] = np.nan

    # Reorder and select only the desired columns to maintain consistency
    df = df[columns_to_load]
    
    # Append the contents of the file to the full DataFrame
    full_df = pd.concat([full_df, df], ignore_index=True)

# Optionally, reset the index of the final DataFrame
full_df.reset_index(drop=True, inplace=True)



  full_df = pd.concat([full_df, df], ignore_index=True)
  full_df = pd.concat([full_df, df], ignore_index=True)


In [4]:
# Data Prep
freq_df = full_df.copy()
freq_df['date'] = pd.to_datetime(freq_df['date'])
freq_df['quarter'] = freq_df['date'].dt.to_period('Q').dt.start_time
freq_df['month'] = freq_df['date'].dt.to_period('M').dt.to_timestamp()
freq_df['week'] = freq_df['date'] - pd.to_timedelta(freq_df['date'].dt.weekday, unit='d')
freq_df['county_name'] = freq_df['subregion2_name'] + ', ' + freq_df['subregion1_name']

columns_to_keep = [
    'county_name', 'location_key', 'quarter', 'month', 'week', 'date',
    'new_confirmed', 'new_deceased', 'cumulative_confirmed', 'cumulative_deceased',
    'new_persons_fully_vaccinated', 'cumulative_persons_fully_vaccinated',
    'population', 'population_male', 'population_female',
    'population_age_00_09', 'population_age_10_19', 'population_age_20_29',
    'population_age_30_39', 'population_age_40_49', 'population_age_50_59',
    'population_age_60_69', 'population_age_70_79', 'population_age_80_and_older',
    'area_sq_km',
    'life_expectancy',
    'average_temperature_celsius', 'minimum_temperature_celsius',
    'maximum_temperature_celsius', 'rainfall_mm',
    'relative_humidity'
]

freq_df = freq_df[columns_to_keep]

# Convert object types to float64 where applicable
for column in freq_df.select_dtypes(include=['object']).columns:
    try:
        freq_df[column] = freq_df[column].astype(float)
    except ValueError:
        print(f"Conversion failed for column: {column}")


aggregations = {
    'new_confirmed': 'sum',
    'new_deceased': 'sum',
    'cumulative_confirmed': 'max',
    'cumulative_deceased': 'max',
    'new_persons_fully_vaccinated': 'sum',
    'cumulative_persons_fully_vaccinated': 'max',
    'population': 'max',
    'population_male': 'max',
    'population_female': 'max',
    'population_age_00_09': 'max',
    'population_age_10_19': 'max',
    'population_age_20_29': 'max',
    'population_age_30_39': 'max',
    'population_age_40_49': 'max',
    'population_age_50_59': 'max',
    'population_age_60_69': 'max',
    'population_age_70_79': 'max',
    'population_age_80_and_older': 'max',
    'area_sq_km': 'max',
    'life_expectancy': 'max',
    'average_temperature_celsius': 'mean',
    'minimum_temperature_celsius': 'mean',
    'maximum_temperature_celsius': 'mean',
    'rainfall_mm': 'sum',
    'relative_humidity': 'mean'
}


# Group and aggregate for weekly, monthly, quarterly
weekly_df = freq_df.groupby(['county_name', 'week']).agg(aggregations).reset_index()
# monthly_df = freq_df.groupby(['county_name', 'month']).agg(aggregations).reset_index()
# quarterly_df = freq_df.groupby(['county_name', 'quarter']).agg(aggregations).reset_index()

# Format descriptive columns
weekly_df['description'] = weekly_df['county_name'] + " - Week of " + weekly_df['week'].dt.strftime('%Y-%m-%d')
# monthly_df['description'] = monthly_df['county_name'] + " - Month of " + monthly_df['month'].dt.strftime('%Y-%m')
# quarterly_df['description'] = quarterly_df['county_name'] + " - Quarter of " + quarterly_df['quarter'].dt.strftime('%Y-Q%q')





Conversion failed for column: county_name
Conversion failed for column: location_key


In [5]:
# Normalize the specified metrics by population to get per capita values
normalize_columns = [
    'new_confirmed',
    'new_deceased',
    'cumulative_confirmed',
    'cumulative_deceased',
    'new_persons_fully_vaccinated',
    'cumulative_persons_fully_vaccinated',
    'population_male', 'population_female',
    'population_age_00_09', 'population_age_10_19', 'population_age_20_29',
    'population_age_30_39', 'population_age_40_49', 'population_age_50_59',
    'population_age_60_69', 'population_age_70_79', 'population_age_80_and_older',
]

# Function to apply normalization
def normalize_by_population(df, columns):
    for col in columns:
        df[f'{col}_per_100k'] = df[col] / df['population'] * 100000
    return df

# Apply normalization
weekly_df = normalize_by_population(weekly_df, normalize_columns)
weekly_df.head()

Unnamed: 0,county_name,week,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,new_persons_fully_vaccinated,cumulative_persons_fully_vaccinated,population,population_male,population_female,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,description,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k
0,"Adams County, Colorado",2019-12-30,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,2.713334,-4.903333,10.97679,0.0,43.75703,"Adams County, Colorado - Week of 2019-12-30",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
1,"Adams County, Colorado",2020-01-06,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,0.588095,-6.238889,8.563492,0.0,43.450591,"Adams County, Colorado - Week of 2020-01-06",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
2,"Adams County, Colorado",2020-01-13,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,-0.452381,-7.979365,8.264286,0.0,41.711824,"Adams County, Colorado - Week of 2020-01-13",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
3,"Adams County, Colorado",2020-01-20,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,2.561565,-4.45034,11.145692,0.0,42.763934,"Adams County, Colorado - Week of 2020-01-20",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593
4,"Adams County, Colorado",2020-01-27,0.0,0.0,,,0.0,,503590.0,254232.0,249358.0,74850.0,73029.0,71893.0,81068.0,67359.0,59799.0,43573.0,21329.0,10690.0,3102.0,77.977528,3.066667,-4.392857,11.653616,1.200453,48.735622,"Adams County, Colorado - Week of 2020-01-27",0.0,0.0,,,0.0,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593


In [6]:
change_columns = [
    'new_confirmed_per_100k',
    'new_deceased_per_100k',
    'cumulative_confirmed_per_100k',
    'cumulative_deceased_per_100k',
    'new_persons_fully_vaccinated_per_100k',
    'cumulative_persons_fully_vaccinated_per_100k',
    'average_temperature_celsius',
    'minimum_temperature_celsius',
    'maximum_temperature_celsius',
    'rainfall_mm',
    'relative_humidity'
]


# Calculate the percentage change within each group


# Ensure the DataFrame is sorted by date to correctly apply pct_change

weekly_df.sort_values(by=['county_name', 'week'], inplace=True)
weekly_df.set_index(['county_name', weekly_df.columns[1]], inplace=True)  # Set index as county_name and the time period
for col in change_columns:
    weekly_df[f'{col}_pct_change'] = weekly_df.groupby('county_name')[col].pct_change(fill_method=None).multiply(100)

# Reset index after calculations
weekly_df.reset_index(inplace=True)


In [7]:
for col in weekly_df.columns:
    print(col)

county_name
week
new_confirmed
new_deceased
cumulative_confirmed
cumulative_deceased
new_persons_fully_vaccinated
cumulative_persons_fully_vaccinated
population
population_male
population_female
population_age_00_09
population_age_10_19
population_age_20_29
population_age_30_39
population_age_40_49
population_age_50_59
population_age_60_69
population_age_70_79
population_age_80_and_older
area_sq_km
life_expectancy
average_temperature_celsius
minimum_temperature_celsius
maximum_temperature_celsius
rainfall_mm
relative_humidity
description
new_confirmed_per_100k
new_deceased_per_100k
cumulative_confirmed_per_100k
cumulative_deceased_per_100k
new_persons_fully_vaccinated_per_100k
cumulative_persons_fully_vaccinated_per_100k
population_male_per_100k
population_female_per_100k
population_age_00_09_per_100k
population_age_10_19_per_100k
population_age_20_29_per_100k
population_age_30_39_per_100k
population_age_40_49_per_100k
population_age_50_59_per_100k
population_age_60_69_per_100k
populat

In [8]:
weekly_df.tail(500)

Unnamed: 0,county_name,week,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,new_persons_fully_vaccinated,cumulative_persons_fully_vaccinated,population,population_male,population_female,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,description,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k,new_confirmed_per_100k_pct_change,new_deceased_per_100k_pct_change,cumulative_confirmed_per_100k_pct_change,cumulative_deceased_per_100k_pct_change,new_persons_fully_vaccinated_per_100k_pct_change,cumulative_persons_fully_vaccinated_per_100k_pct_change,average_temperature_celsius_pct_change,minimum_temperature_celsius_pct_change,maximum_temperature_celsius_pct_change,rainfall_mm_pct_change,relative_humidity_pct_change
58146,"Yates County, New York",2021-04-19,15.0,0.0,1125.0,26.0,0.0,,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,6.722222,1.000793,13.477778,6.451600,60.267959,"Yates County, New York - Week of 2021-04-19",59.995200,0.0,4499.640029,103.991681,0.000000,,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,-21.052632,,1.351351,0.0,,,-18.259025,-76.535175,-4.857415,-76.803653,-19.378277
58147,"Yates County, New York",2021-04-26,17.0,0.0,1142.0,26.0,432.0,8195.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,9.914286,4.113492,17.739683,31.312556,62.996033,"Yates County, New York - Week of 2021-04-26",67.994560,0.0,4567.634589,103.991681,1727.861771,32777.377810,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,13.333333,,1.511111,0.0,inf,,47.485244,311.023111,31.621717,385.345589,4.526573
58148,"Yates County, New York",2021-05-03,20.0,0.0,1162.0,26.0,547.0,8742.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,9.707936,4.988889,16.224603,20.940889,77.742800,"Yates County, New York - Week of 2021-05-03",79.993601,0.0,4647.628190,103.991681,2187.824974,34965.202784,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,17.647059,,1.751313,0.0,26.620370,6.674802,-2.081333,21.281104,-8.540622,-33.123029,23.409042
58149,"Yates County, New York",2021-05-10,6.0,0.0,1168.0,26.0,303.0,9045.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,10.869048,3.092857,17.934921,8.627533,52.186355,"Yates County, New York - Week of 2021-05-10",23.998080,0.0,4671.626270,103.991681,1211.903048,36177.105832,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,-70.000000,,0.516351,0.0,-44.606947,3.466026,11.960434,-38.005093,10.541505,-58.800541,-32.873071
58150,"Yates County, New York",2021-05-17,4.0,0.0,1172.0,26.0,274.0,9319.0,25002.0,12162.0,12840.0,3082.0,3308.0,3316.0,2485.0,2404.0,3455.0,3604.0,2153.0,1195.0,974.0,80.58,19.763492,10.791270,28.157143,0.191911,54.239643,"Yates County, New York - Week of 2021-05-17",15.998720,0.0,4687.624990,103.991681,1095.912327,37273.018159,48644.108471,51355.891529,12327.013839,13230.941525,13262.938965,9939.204864,9615.230782,13818.894488,14414.846812,8611.311095,4779.617631,-33.333333,,0.342466,0.0,-9.570957,3.029298,81.832785,248.909439,56.996196,-97.775598,3.934532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58641,"Yuma County, Colorado",2022-08-15,0.0,0.0,,,1.0,4476.0,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,22.316667,14.610318,31.771428,4.080329,55.260483,"Yuma County, Colorado - Week of 2022-08-15",0.000000,0.0,,,10.041169,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-50.000000,0.022346,-13.859020,-10.479478,-10.310526,307.263579,29.454447
58642,"Yuma County, Colorado",2022-08-22,0.0,0.0,,,0.0,4476.0,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,23.825397,13.609524,33.813492,2.394857,38.153665,"Yuma County, Colorado - Week of 2022-08-22",0.000000,0.0,,,0.000000,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-100.000000,0.000000,6.760553,-6.849910,6.427358,-41.307257,-30.956693
58643,"Yuma County, Colorado",2022-08-29,0.0,0.0,,,0.0,,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,23.581129,13.655379,34.371605,1.320901,39.698171,"Yuma County, Colorado - Week of 2022-08-29",0.000000,0.0,,,0.000000,,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,,,-1.025242,0.336933,1.650563,-44.844264,4.048120
58644,"Yuma County, Colorado",2022-09-05,0.0,0.0,,,11.0,4487.0,9959.0,4956.0,5003.0,1520.0,1364.0,1113.0,1223.0,1117.0,1220.0,1142.0,769.0,491.0,6136.0,79.65,21.013404,11.675926,32.332716,0.464457,42.166975,"Yuma County, Colorado - Week of 2022-09-05",0.000000,0.0,,,110.452857,45054.724370,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,inf,,-10.888898,-14.495775,-5.931899,-64.837864,6.218938


In [9]:
columns_to_keep = [
    'county_name'
    , 'week'
    , 'description'
    , 'area_sq_km'
    , 'life_expectancy'
    , 'average_temperature_celsius'
    , 'minimum_temperature_celsius'
    , 'maximum_temperature_celsius'
    , 'rainfall_mm'
    , 'relative_humidity'
    , 'new_confirmed_per_100k'
    , 'new_deceased_per_100k'
    , 'cumulative_confirmed_per_100k'
    , 'cumulative_deceased_per_100k'
    , 'new_persons_fully_vaccinated_per_100k'
    , 'cumulative_persons_fully_vaccinated_per_100k'
    , 'population_male_per_100k'
    , 'population_female_per_100k'
    , 'population_age_00_09_per_100k'
    , 'population_age_10_19_per_100k'
    , 'population_age_20_29_per_100k'
    , 'population_age_30_39_per_100k'
    , 'population_age_40_49_per_100k'
    , 'population_age_50_59_per_100k'
    , 'population_age_60_69_per_100k'
    , 'population_age_70_79_per_100k'
    , 'population_age_80_and_older_per_100k'
    , 'new_confirmed_per_100k_pct_change'
    , 'new_deceased_per_100k_pct_change'
    , 'cumulative_confirmed_per_100k_pct_change'
    , 'cumulative_deceased_per_100k_pct_change'
    , 'new_persons_fully_vaccinated_per_100k_pct_change'
    , 'cumulative_persons_fully_vaccinated_per_100k_pct_change'
    , 'average_temperature_celsius_pct_change'
    , 'minimum_temperature_celsius_pct_change'
    , 'maximum_temperature_celsius_pct_change'
    , 'rainfall_mm_pct_change'
    , 'relative_humidity_pct_change'
]

weekly_df = weekly_df[columns_to_keep]
weekly_df

Unnamed: 0,county_name,week,description,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k,new_confirmed_per_100k_pct_change,new_deceased_per_100k_pct_change,cumulative_confirmed_per_100k_pct_change,cumulative_deceased_per_100k_pct_change,new_persons_fully_vaccinated_per_100k_pct_change,cumulative_persons_fully_vaccinated_per_100k_pct_change,average_temperature_celsius_pct_change,minimum_temperature_celsius_pct_change,maximum_temperature_celsius_pct_change,rainfall_mm_pct_change,relative_humidity_pct_change
0,"Adams County, Colorado",2019-12-30,"Adams County, Colorado - Week of 2019-12-30",3102.0,77.977528,2.713334,-4.903333,10.976790,0.000000,43.757030,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,,,,,
1,"Adams County, Colorado",2020-01-06,"Adams County, Colorado - Week of 2020-01-06",3102.0,77.977528,0.588095,-6.238889,8.563492,0.000000,43.450591,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,-78.325729,27.237710,-21.985462,,-0.700319
2,"Adams County, Colorado",2020-01-13,"Adams County, Colorado - Week of 2020-01-13",3102.0,77.977528,-0.452381,-7.979365,8.264286,0.000000,41.711824,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,-176.923055,27.897216,-3.493977,,-4.001713
3,"Adams County, Colorado",2020-01-20,"Adams County, Colorado - Week of 2020-01-20",3102.0,77.977528,2.561565,-4.450340,11.145692,0.000000,42.763934,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,-666.240740,-44.226888,34.865760,,2.522331
4,"Adams County, Colorado",2020-01-27,"Adams County, Colorado - Week of 2020-01-27",3102.0,77.977528,3.066667,-4.392857,11.653616,1.200453,48.735622,0.0,0.0,,,0.000000,,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,,,19.718495,-1.291657,4.557132,inf,13.964309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58641,"Yuma County, Colorado",2022-08-15,"Yuma County, Colorado - Week of 2022-08-15",6136.0,79.650000,22.316667,14.610318,31.771428,4.080329,55.260483,0.0,0.0,,,10.041169,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-50.0,0.022346,-13.859020,-10.479478,-10.310526,307.263579,29.454447
58642,"Yuma County, Colorado",2022-08-22,"Yuma County, Colorado - Week of 2022-08-22",6136.0,79.650000,23.825397,13.609524,33.813492,2.394857,38.153665,0.0,0.0,,,0.000000,44944.271513,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,-100.0,0.000000,6.760553,-6.849910,6.427358,-41.307257,-30.956693
58643,"Yuma County, Colorado",2022-08-29,"Yuma County, Colorado - Week of 2022-08-29",6136.0,79.650000,23.581129,13.655379,34.371605,1.320901,39.698171,0.0,0.0,,,0.000000,,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,,,-1.025242,0.336933,1.650563,-44.844264,4.048120
58644,"Yuma County, Colorado",2022-09-05,"Yuma County, Colorado - Week of 2022-09-05",6136.0,79.650000,21.013404,11.675926,32.332716,0.464457,42.166975,0.0,0.0,,,110.452857,45054.724370,49764.032533,50235.967467,15262.576564,13696.154232,11175.820866,12280.349433,11215.985541,12250.225926,11467.014761,7721.658801,4930.213877,,,,,inf,,-10.888898,-14.495775,-5.931899,-64.837864,6.218938


In [10]:
# Function to apply binning
def apply_binning(df):
    # area_sq_km
    bins = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, float('inf')]
    labels = ['<1000', '1000-2000', '2000-3000', '3000-4000', '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000', '9000-10000', '>10000']
    df['area_sq_km_bins'] = pd.cut(df['area_sq_km'], bins=bins, labels=labels, right=False)


    # life_expectancy
    bins = [0, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, float('inf')]
    labels = ['<70', '70-72', '72-74', '74-76', '76-78', '78-80', '80-82', '82-84', '84-86', '86-88', '>88']
    df['life_expectancy_bins'] = pd.cut(df['life_expectancy'], bins=bins, labels=labels, right=False)

    # average_temperature_celsius, minimum_temperature_celsius, maximum_temperature_celsius
    temp_bins = [df['average_temperature_celsius'].min(), 0, 15, 25, 35, df['average_temperature_celsius'].max()]
    temp_labels = ['<0°C', '0-15°C', '15-25°C', '25-35°C', '>35°C']
    df['average_temperature_celsius_bins'] = pd.cut(df['average_temperature_celsius'], bins=temp_bins, labels=temp_labels, right=False)
    df['minimum_temperature_celsius_bins'] = pd.cut(df['minimum_temperature_celsius'], bins=temp_bins, labels=temp_labels, right=False)
    df['maximum_temperature_celsius_bins'] = pd.cut(df['maximum_temperature_celsius'], bins=temp_bins, labels=temp_labels, right=False)

    # rainfall_mm
    bins = [0, 5, 10, 15, 20, float('inf')]
    labels = ['<5mm', '5-10mm', '10-15mm', '15-20mm', '>20mm']
    df['rainfall_mm_bins'] = pd.cut(df['rainfall_mm'], bins=bins, labels=labels, right=False)

    # relative_humidity
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    labels = ['<10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%', '60-70%', '70-80%', '80-90%', '>90%']
    df['relative_humidity_bins'] = pd.cut(df['relative_humidity'], bins=bins, labels=labels, right=False)

    # new_confirmed_per_100k
    bins = [-100000, 50, 100, 150, 200, 250, 300, 350, 400, float('inf')]
    labels = ['<50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350-400', '>400']
    df['new_confirmed_per_100k_bins'] = pd.cut(df['new_confirmed_per_100k'], bins=bins, labels=labels, right=False)

    # cumulative_confirmed_per_100k
    bins = [-100000, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, float('inf')]
    labels = ['<1000', '1000-2000', '2000-3000', '3000-4000', '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000', '9000-10000', '>10000']
    df['cumulative_confirmed_per_100k_bins'] = pd.cut(df['cumulative_confirmed_per_100k'], bins=bins, labels=labels, right=False)

    # cumulative_deceased_per_100k
    bins = [-100000, 100, 200, 300, 400, 500, 600, 700, 800, float('inf')]
    labels = ['<100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '>800']
    df['cumulative_deceased_per_100k_bins'] = pd.cut(df['cumulative_deceased_per_100k'], bins=bins, labels=labels, right=False)

    # new_persons_fully_vaccinated_per_100k
    bins = [-100000, 50, 100, 150, 200, 250, 300, 350, 400, float('inf')]
    labels = ['<50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350-400', '>400']
    df['new_persons_fully_vaccinated_per_100k_bins'] = pd.cut(df['new_persons_fully_vaccinated_per_100k'], bins=bins, labels=labels, right=False)

    # cumulative_persons_fully_vaccinated_per_100k
    bins = [-100000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, float('inf')]
    labels = ['<10000', '10000-20000', '20000-30000', '30000-40000', '40000-50000', '50000-60000', '60000-70000', '70000-80000', '80000-90000', '90000-100000', '>100000']
    df['cumulative_persons_fully_vaccinated_per_100k_bins'] = pd.cut(df['cumulative_persons_fully_vaccinated_per_100k'], bins=bins, labels=labels, right=False)

    # population_male_per_100k population_female_per_100k
    bins = [40000, 42000, 44000, 46000, 48000, 50000, 52000, 54000, 56000, 58000, 60000, float('inf')]
    labels = ['<42000', '42000-44000', '44000-46000', '46000-48000', '48000-50000', '50000-52000', '52000-54000', '54000-56000', '56000-58000', '58000-60000', '>60000']
    df['population_male_per_100k_bins'] = pd.cut(df['population_male_per_100k'], bins=bins, labels=labels, right=False)
    df['population_female_per_100k_bins'] = pd.cut(df['population_female_per_100k'], bins=bins, labels=labels, right=False)

    # population_age_xxx
    bins = [0, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000, float('inf')]
    labels = ['<2000', '2000-4000', '4000-6000', '6000-8000', '8000-10000', '10000-12000', '12000-14000', '14000-16000', '16000-18000', '18000-20000', '>20000']
    df['population_age_00_09_per_100k_bins'] = pd.cut(df['population_age_00_09_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_10_19_per_100k_bins'] = pd.cut(df['population_age_10_19_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_20_29_per_100k_bins'] = pd.cut(df['population_age_20_29_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_30_39_per_100k_bins'] = pd.cut(df['population_age_30_39_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_40_49_per_100k_bins'] = pd.cut(df['population_age_40_49_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_50_59_per_100k_bins'] = pd.cut(df['population_age_50_59_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_60_69_per_100k_bins'] = pd.cut(df['population_age_60_69_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_70_79_per_100k_bins'] = pd.cut(df['population_age_70_79_per_100k'], bins=bins, labels=labels, right=False)
    df['population_age_80_and_older_per_100k_bins'] = pd.cut(df['population_age_80_and_older_per_100k'], bins=bins, labels=labels, right=False)


    # new_confirmed_per_100k_pct_change
    bins = [-100, -50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50, 100]
    labels = ['<-50%', '-50% - -40%', '-40% - -30%', '-30% - -20%', '-20% - -10%', '-10% - 0%', '0% - 10%', '10% - 20%', '20% - 30%', '30% - 40%', '40% - 50%', '>50%']
    df['new_confirmed_per_100k_pct_change_bins'] = pd.cut(df['new_confirmed_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['new_deceased_per_100k_pct_change_bins'] = pd.cut(df['new_deceased_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['cumulative_confirmed_per_100k_pct_change_bins'] = pd.cut(df['cumulative_confirmed_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['cumulative_deceased_per_100k_pct_change_bins'] = pd.cut(df['cumulative_deceased_per_100k_pct_change'], bins=bins, labels=labels, right=False)
    df['cumulative_persons_fully_vaccinated_per_100k_pct_change_bins'] = pd.cut(df['cumulative_persons_fully_vaccinated_per_100k_pct_change'], bins=bins, labels=labels, right=False)

    df['average_temperature_celsius_pct_change_bins'] = pd.cut(df['average_temperature_celsius_pct_change'], bins=bins, labels=labels, right=False)
    df['minimum_temperature_celsius_pct_change_bins'] = pd.cut(df['minimum_temperature_celsius_pct_change'], bins=bins, labels=labels, right=False)
    df['maximum_temperature_celsius_pct_change_bins'] = pd.cut(df['maximum_temperature_celsius_pct_change'], bins=bins, labels=labels, right=False)

    df['rainfall_mm_pct_change_bins'] = pd.cut(df['rainfall_mm_pct_change'], bins=bins, labels=labels, right=False)
    df['relative_humidity_pct_change_bins'] = pd.cut(df['relative_humidity_pct_change'], bins=bins, labels=labels, right=False)

    
    
apply_binning(weekly_df)

In [25]:
for col in weekly_df.columns:
    print(col)
    

county_name
week
description
area_sq_km
life_expectancy
average_temperature_celsius
minimum_temperature_celsius
maximum_temperature_celsius
rainfall_mm
relative_humidity
new_confirmed_per_100k
new_deceased_per_100k
cumulative_confirmed_per_100k
cumulative_deceased_per_100k
new_persons_fully_vaccinated_per_100k
cumulative_persons_fully_vaccinated_per_100k
population_male_per_100k
population_female_per_100k
population_age_00_09_per_100k
population_age_10_19_per_100k
population_age_20_29_per_100k
population_age_30_39_per_100k
population_age_40_49_per_100k
population_age_50_59_per_100k
population_age_60_69_per_100k
population_age_70_79_per_100k
population_age_80_and_older_per_100k
new_confirmed_per_100k_pct_change
new_deceased_per_100k_pct_change
cumulative_confirmed_per_100k_pct_change
cumulative_deceased_per_100k_pct_change
new_persons_fully_vaccinated_per_100k_pct_change
cumulative_persons_fully_vaccinated_per_100k_pct_change
average_temperature_celsius_pct_change
minimum_temperature_ce

In [11]:
sample_df = weekly_df.sample(n=10)
sample_df


Unnamed: 0,county_name,week,description,area_sq_km,life_expectancy,average_temperature_celsius,minimum_temperature_celsius,maximum_temperature_celsius,rainfall_mm,relative_humidity,new_confirmed_per_100k,new_deceased_per_100k,cumulative_confirmed_per_100k,cumulative_deceased_per_100k,new_persons_fully_vaccinated_per_100k,cumulative_persons_fully_vaccinated_per_100k,population_male_per_100k,population_female_per_100k,population_age_00_09_per_100k,population_age_10_19_per_100k,population_age_20_29_per_100k,population_age_30_39_per_100k,population_age_40_49_per_100k,population_age_50_59_per_100k,population_age_60_69_per_100k,population_age_70_79_per_100k,population_age_80_and_older_per_100k,new_confirmed_per_100k_pct_change,new_deceased_per_100k_pct_change,cumulative_confirmed_per_100k_pct_change,cumulative_deceased_per_100k_pct_change,new_persons_fully_vaccinated_per_100k_pct_change,cumulative_persons_fully_vaccinated_per_100k_pct_change,average_temperature_celsius_pct_change,minimum_temperature_celsius_pct_change,maximum_temperature_celsius_pct_change,rainfall_mm_pct_change,relative_humidity_pct_change,area_sq_km_bins,life_expectancy_bins,average_temperature_celsius_bins,minimum_temperature_celsius_bins,maximum_temperature_celsius_bins,rainfall_mm_bins,relative_humidity_bins,new_confirmed_per_100k_bins,cumulative_confirmed_per_100k_bins,cumulative_deceased_per_100k_bins,new_persons_fully_vaccinated_per_100k_bins,cumulative_persons_fully_vaccinated_per_100k_bins,population_male_per_100k_bins,population_female_per_100k_bins,population_age_00_09_per_100k_bins,population_age_10_19_per_100k_bins,population_age_20_29_per_100k_bins,population_age_30_39_per_100k_bins,population_age_40_49_per_100k_bins,population_age_50_59_per_100k_bins,population_age_60_69_per_100k_bins,population_age_70_79_per_100k_bins,population_age_80_and_older_per_100k_bins,new_confirmed_per_100k_pct_change_bins,new_deceased_per_100k_pct_change_bins,cumulative_confirmed_per_100k_pct_change_bins,cumulative_deceased_per_100k_pct_change_bins,cumulative_persons_fully_vaccinated_per_100k_pct_change_bins,average_temperature_celsius_pct_change_bins,minimum_temperature_celsius_pct_change_bins,maximum_temperature_celsius_pct_change_bins,rainfall_mm_pct_change_bins,relative_humidity_pct_change_bins
16006,"Franklin County, Florida",2021-12-13,"Franklin County, Florida - Week of 2021-12-13",2687.0,75.375,18.779762,15.695436,22.679563,7.1755,88.057603,68.236097,0.0,19106.107131,383.828045,213.237803,47876.151484,56798.021153,43201.978847,8973.046742,8555.100648,13587.512794,13271.920846,11659.843057,13442.511088,15464.005459,10934.834527,4111.224838,100.0,,0.358423,0.0,4.166667,0.447387,-1.639627,2.049753,-3.429615,-5.042017,-3.150758,2000-3000,74-76,15-25°C,15-25°C,15-25°C,5-10mm,80-90%,50-100,>10000,300-400,200-250,40000-50000,56000-58000,42000-44000,8000-10000,8000-10000,12000-14000,12000-14000,10000-12000,12000-14000,14000-16000,10000-12000,4000-6000,,,0% - 10%,0% - 10%,0% - 10%,-10% - 0%,0% - 10%,-10% - 0%,-10% - 0%,-10% - 0%
4448,"Brown County, Wisconsin",2020-11-16,"Brown County, Wisconsin - Week of 2020-11-16",1594.0,,2.089683,-2.21746,7.209524,1.580445,66.949243,522.612144,3.056211,8282.332806,50.045461,0.0,,49558.377457,50441.622543,13216.968086,13410.655481,13451.532308,13485.914686,12024.281599,14008.144803,11082.968498,5826.284946,3493.249593,-29.045643,-20.0,6.734935,6.504065,,,-63.877074,-259.474883,-38.737525,-92.945324,-13.606178,1000-2000,,0-15°C,<0°C,0-15°C,<5mm,60-70%,>400,8000-9000,<100,<50,,48000-50000,50000-52000,12000-14000,12000-14000,12000-14000,12000-14000,12000-14000,14000-16000,10000-12000,4000-6000,2000-4000,-30% - -20%,-30% - -20%,0% - 10%,0% - 10%,,<-50%,,-40% - -30%,<-50%,-20% - -10%
133,"Adams County, Colorado",2022-07-18,"Adams County, Colorado - Week of 2022-07-18",3102.0,77.977528,25.764286,17.326984,35.05097,5.894917,34.927698,0.0,0.0,,,77.046804,71053.436327,50483.925416,49516.074584,14863.281638,14501.677952,14276.097619,16098.016243,13375.762029,11874.540797,8652.475228,4235.389901,2122.758593,,,,,4.864865,0.108553,5.180794,7.934939,2.569144,2321.735951,-1.411878,3000-4000,76-78,25-35°C,15-25°C,>35°C,5-10mm,30-40%,<50,,,50-100,70000-80000,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,0% - 10%,0% - 10%,0% - 10%,0% - 10%,,-10% - 0%
39373,"Perquimans County, North Carolina",2020-09-28,"Perquimans County, North Carolina - Week of 20...",852.0,77.0,19.007937,14.540564,24.673721,22.468921,79.620578,66.969269,0.0,1458.441848,22.32309,0.0,,48173.227175,51826.772825,10097.477491,10655.554729,9226.877,10171.887789,10298.385297,14792.767319,16154.475779,12627.427636,5975.14696,-10.0,,4.812834,0.0,,,2.074159,0.770799,4.173648,594.022441,-0.693896,<1000,76-78,15-25°C,0-15°C,15-25°C,>20mm,70-80%,50-100,1000-2000,<100,<50,,48000-50000,50000-52000,10000-12000,10000-12000,8000-10000,10000-12000,10000-12000,14000-16000,16000-18000,12000-14000,4000-6000,-10% - 0%,,0% - 10%,0% - 10%,,0% - 10%,0% - 10%,0% - 10%,,-10% - 0%
53911,"Vernon County, Wisconsin",2021-10-11,"Vernon County, Wisconsin - Week of 2021-10-11",2114.0,,12.575397,7.733333,18.274603,14.2748,73.307652,286.626278,3.257117,9136.212625,179.141424,351.768614,49895.772262,50302.911862,49697.088138,14087.030161,14093.544395,9312.096932,10351.117191,10810.37066,14617.940199,13992.573774,7732.395284,5002.931405,-14.563107,0.0,3.238866,1.851852,25.581395,0.710012,-30.988675,-47.186992,-20.517778,38.765432,-12.73249,2000-3000,,0-15°C,0-15°C,15-25°C,10-15mm,70-80%,250-300,9000-10000,100-200,350-400,40000-50000,50000-52000,48000-50000,14000-16000,14000-16000,8000-10000,10000-12000,10000-12000,14000-16000,12000-14000,6000-8000,4000-6000,-20% - -10%,0% - 10%,0% - 10%,0% - 10%,0% - 10%,-40% - -30%,-50% - -40%,-30% - -20%,30% - 40%,-20% - -10%
18154,"Grand County, Colorado",2022-04-18,"Grand County, Colorado - Week of 2022-04-18",4842.0,80.533333,6.742857,-0.414286,15.034921,2.302128,42.806099,123.665712,0.0,23021.348607,169.226764,71.595939,64664.149961,53475.657381,46524.342619,8396.250976,10589.690185,12177.818276,13394.949232,12607.393908,16304.347826,17098.411872,7153.085134,2278.05259,-24.0,,0.54008,0.0,-21.428571,0.120931,1913.270524,-93.387383,87.564354,-0.314196,-14.17269,4000-5000,80-82,0-15°C,<0°C,15-25°C,<5mm,40-50%,100-150,>10000,100-200,50-100,60000-70000,52000-54000,46000-48000,8000-10000,10000-12000,12000-14000,12000-14000,12000-14000,16000-18000,16000-18000,6000-8000,2000-4000,-30% - -20%,,0% - 10%,0% - 10%,0% - 10%,,<-50%,>50%,-10% - 0%,-20% - -10%
3477,"Bertie County, North Carolina",2021-04-26,"Bertie County, North Carolina - Week of 2021-0...",1920.0,77.0,19.327513,12.293827,27.090653,0.0,56.239354,83.06079,0.0,9079.58262,223.225873,581.425531,25774.801433,50365.986606,49634.013394,9328.76499,10294.346675,12817.318175,10912.111301,11446.815138,15039.19431,15054.768208,9282.043295,5824.637907,6.666667,,0.923254,0.0,inf,,53.219246,130.20096,35.474767,-100.0,-15.307345,1000-2000,76-78,15-25°C,0-15°C,25-35°C,<5mm,50-60%,50-100,9000-10000,200-300,>400,20000-30000,50000-52000,48000-50000,8000-10000,10000-12000,12000-14000,10000-12000,10000-12000,14000-16000,14000-16000,8000-10000,4000-6000,0% - 10%,,0% - 10%,0% - 10%,,>50%,,30% - 40%,<-50%,-20% - -10%
9798,"Colusa County, California",2019-12-30,"Colusa County, California - Week of 2019-12-30",2995.0,80.22,8.974321,3.535185,15.652469,2.12725,81.848466,0.0,0.0,,,0.0,,51003.290236,48996.709764,15158.255712,15366.791788,13174.845915,12864.358867,11520.459706,11849.483294,10547.291348,6154.131331,3364.382038,,,,,,,,,,,,2000-3000,80-82,0-15°C,0-15°C,15-25°C,<5mm,80-90%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,12000-14000,12000-14000,10000-12000,10000-12000,10000-12000,6000-8000,2000-4000,,,,,,,,,,
21284,"Hillsborough County, Florida",2022-05-30,"Hillsborough County, Florida - Week of 2022-05-30",3279.0,77.533083,26.812169,22.774251,31.655203,28.203575,72.254361,0.0,0.0,,,93.780489,65314.886566,48892.787453,51107.212547,12496.845948,12534.414215,14473.595676,14418.925435,13450.000561,13111.325431,10344.31037,5949.243588,3221.338776,,,,,16.0451,0.143789,-4.230188,-3.501102,-3.23998,435.838797,9.434361,3000-4000,76-78,25-35°C,15-25°C,25-35°C,>20mm,70-80%,<50,,,50-100,60000-70000,48000-50000,50000-52000,12000-14000,12000-14000,14000-16000,14000-16000,12000-14000,12000-14000,10000-12000,4000-6000,2000-4000,,,,,0% - 10%,-10% - 0%,-10% - 0%,-10% - 0%,,0% - 10%
10684,"Crawford County, Wisconsin",2020-08-24,"Crawford County, Wisconsin - Week of 2020-08-24",1552.0,,23.984127,18.080952,30.624603,9.9822,71.532078,43.212544,0.0,629.668498,0.0,0.0,,51959.997531,48040.002469,10173.467498,12327.921477,10475.955306,10858.694981,10648.805482,15087.351071,15303.413791,9901.845793,5222.544602,-22.222222,,7.368421,,,,13.634653,24.990401,9.258997,346.027787,2.908918,1000-2000,,15-25°C,15-25°C,25-35°C,5-10mm,70-80%,<50,<1000,<100,<50,,50000-52000,48000-50000,10000-12000,12000-14000,10000-12000,10000-12000,10000-12000,14000-16000,14000-16000,8000-10000,4000-6000,-30% - -20%,,0% - 10%,,,10% - 20%,20% - 30%,0% - 10%,,0% - 10%


In [17]:
columns_to_keep = ['county_name'
    , 'week'
    , 'description'
    , 'area_sq_km_bins'
    , 'life_expectancy_bins'
    , 'average_temperature_celsius_bins'
    , 'minimum_temperature_celsius_bins'
    , 'maximum_temperature_celsius_bins'
    , 'rainfall_mm_bins'
    , 'relative_humidity_bins'
    , 'new_confirmed_per_100k_bins'
    , 'cumulative_confirmed_per_100k_bins'
    , 'cumulative_deceased_per_100k_bins'
    , 'new_persons_fully_vaccinated_per_100k_bins'
    , 'cumulative_persons_fully_vaccinated_per_100k_bins'
    , 'population_male_per_100k_bins'
    , 'population_female_per_100k_bins'
    , 'population_age_00_09_per_100k_bins'
    , 'population_age_10_19_per_100k_bins'
    , 'population_age_20_29_per_100k_bins'
    , 'population_age_30_39_per_100k_bins'
    , 'population_age_40_49_per_100k_bins'
    , 'population_age_50_59_per_100k_bins'
    , 'population_age_60_69_per_100k_bins'
    , 'population_age_70_79_per_100k_bins'
    , 'population_age_80_and_older_per_100k_bins'
    , 'new_confirmed_per_100k_pct_change_bins'
    , 'new_deceased_per_100k_pct_change_bins'
    , 'cumulative_confirmed_per_100k_pct_change_bins'
    , 'cumulative_deceased_per_100k_pct_change_bins'
    , 'cumulative_persons_fully_vaccinated_per_100k_pct_change_bins'
    , 'average_temperature_celsius_pct_change_bins'
    , 'minimum_temperature_celsius_pct_change_bins'
    , 'maximum_temperature_celsius_pct_change_bins'
    , 'rainfall_mm_pct_change_bins'
    , 'relative_humidity_pct_change_bins'
    ]

weekly_df_binned = weekly_df[columns_to_keep]   

In [13]:
for column in weekly_df_binned.columns:
    print(column)

county_name
week
description
area_sq_km_bins
life_expectancy_bins
average_temperature_celsius_bins
minimum_temperature_celsius_bins
maximum_temperature_celsius_bins
rainfall_mm_bins
relative_humidity_bins
new_confirmed_per_100k_bins
cumulative_confirmed_per_100k_bins
cumulative_deceased_per_100k_bins
new_persons_fully_vaccinated_per_100k_bins
cumulative_persons_fully_vaccinated_per_100k_bins
population_male_per_100k_bins
population_female_per_100k_bins
population_age_00_09_per_100k_bins
population_age_10_19_per_100k_bins
population_age_20_29_per_100k_bins
population_age_30_39_per_100k_bins
population_age_40_49_per_100k_bins
population_age_50_59_per_100k_bins
population_age_60_69_per_100k_bins
population_age_70_79_per_100k_bins
population_age_80_and_older_per_100k_bins
new_confirmed_per_100k_pct_change_bins
new_deceased_per_100k_pct_change_bins
cumulative_confirmed_per_100k_pct_change_bins
cumulative_deceased_per_100k_pct_change_bins
cumulative_persons_fully_vaccinated_per_100k_pct_chan

In [14]:
weekly_df_binned.head()

Unnamed: 0,county_name,week,description,area_sq_km_bins,life_expectancy_bins,average_temperature_celsius_bins,minimum_temperature_celsius_bins,maximum_temperature_celsius_bins,rainfall_mm_bins,relative_humidity_bins,new_confirmed_per_100k_bins,cumulative_confirmed_per_100k_bins,cumulative_deceased_per_100k_bins,new_persons_fully_vaccinated_per_100k_bins,cumulative_persons_fully_vaccinated_per_100k_bins,population_male_per_100k_bins,population_female_per_100k_bins,population_age_00_09_per_100k_bins,population_age_10_19_per_100k_bins,population_age_20_29_per_100k_bins,population_age_30_39_per_100k_bins,population_age_40_49_per_100k_bins,population_age_50_59_per_100k_bins,population_age_60_69_per_100k_bins,population_age_70_79_per_100k_bins,population_age_80_and_older_per_100k_bins,new_confirmed_per_100k_pct_change_bins,new_deceased_per_100k_pct_change_bins,cumulative_confirmed_per_100k_pct_change_bins,cumulative_deceased_per_100k_pct_change_bins,cumulative_persons_fully_vaccinated_per_100k_pct_change_bins,average_temperature_celsius_pct_change_bins,minimum_temperature_celsius_pct_change_bins,maximum_temperature_celsius_pct_change_bins,rainfall_mm_pct_change_bins,relative_humidity_pct_change_bins
0,"Adams County, Colorado",2019-12-30,"Adams County, Colorado - Week of 2019-12-30",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,,,,,
1,"Adams County, Colorado",2020-01-06,"Adams County, Colorado - Week of 2020-01-06",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,<-50%,20% - 30%,-30% - -20%,,-10% - 0%
2,"Adams County, Colorado",2020-01-13,"Adams County, Colorado - Week of 2020-01-13",3000-4000,76-78,<0°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,,20% - 30%,-10% - 0%,,-10% - 0%
3,"Adams County, Colorado",2020-01-20,"Adams County, Colorado - Week of 2020-01-20",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,,-50% - -40%,30% - 40%,,0% - 10%
4,"Adams County, Colorado",2020-01-27,"Adams County, Colorado - Week of 2020-01-27",3000-4000,76-78,0-15°C,<0°C,0-15°C,<5mm,40-50%,<50,,,<50,,50000-52000,48000-50000,14000-16000,14000-16000,14000-16000,16000-18000,12000-14000,10000-12000,8000-10000,4000-6000,2000-4000,,,,,,10% - 20%,-10% - 0%,0% - 10%,,10% - 20%


### Frequent Pattern Analysis


In [18]:
!pip install mlxtend

Looking in indexes: https://pypi.org/simple, https://pypi-user:****@fetch.jfrog.io/artifactory/api/pypi/fetchrewards-pypi-release-local/simple
Collecting mlxtend
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.1
[0m

In [15]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

## The following cell is having issues - running out of memory. 

In [None]:
# Ensure all data in the DataFrame columns are categorical
columns_to_convert = weekly_df_binned.columns.difference(['county_name', 'week', 'description'])
for col in columns_to_convert:
    weekly_df_binned.loc[:, col] = weekly_df_binned[col].astype(str)

# Convert DataFrame into a list of transactions (ensure all items are strings)
transactions = weekly_df_binned.drop(['county_name', 'week', 'description'], axis=1).applymap(str).values.tolist()

# Encode the transactions
encoder = TransactionEncoder()
encoded_array = encoder.fit_transform(transactions)
encoded_df = pd.DataFrame(encoded_array, columns=encoder.columns_)

# Find frequent itemsets with a minimum support threshold
frequent_itemsets = apriori(encoded_df, min_support=0.05, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Display the rules
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype(str)
A value is trying to be set on 

## The following cell is having issues - running out of memory. 

In [16]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Optimize data types by converting to 'category'
columns_to_convert = weekly_df_binned.columns.difference(['county_name', 'week', 'description'])
for col in columns_to_convert:
    weekly_df_binned[col] = weekly_df_binned[col].astype('category')

# Filtering example: Select only rows where certain conditions are met (customize this as needed)
weekly_df_binned = weekly_df_binned[weekly_df_binned['new_confirmed_per_100k_bins'] != '0-10']

# Convert DataFrame into a list of transactions
transactions = weekly_df_binned.drop(['county_name', 'week', 'description'], axis=1).apply(lambda row: [f"{idx}_{val}" for idx, val in row.items() if val], axis=1).tolist()

# Encode the transactions using TransactionEncoder
encoder = TransactionEncoder()
encoded_array = encoder.fit_transform(transactions)
encoded_df = pd.DataFrame(encoded_array, columns=encoder.columns_).astype('int8')  # using 'int8' for reduced memory usage

# Find frequent itemsets and generate association rules
frequent_itemsets = apriori(encoded_df, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype('category')


MemoryError: Unable to allocate 82.4 GiB for an array with shape (376946, 4, 58646) and data type int8

In [20]:
columns_to_keep = ['county_name'
    , 'week'
    , 'description'
    , 'area_sq_km_bins'
    , 'life_expectancy_bins'
    , 'average_temperature_celsius_bins'
    , 'minimum_temperature_celsius_bins'
    , 'maximum_temperature_celsius_bins'
    , 'rainfall_mm_bins'
    , 'relative_humidity_bins'
    , 'new_confirmed_per_100k_bins'
    , 'cumulative_confirmed_per_100k_bins'
    , 'cumulative_deceased_per_100k_bins'
    , 'new_persons_fully_vaccinated_per_100k_bins'
    , 'cumulative_persons_fully_vaccinated_per_100k_bins'
    , 'population_male_per_100k_bins'
    , 'new_confirmed_per_100k_pct_change_bins'
    , 'new_deceased_per_100k_pct_change_bins'
    , 'cumulative_confirmed_per_100k_pct_change_bins'
    , 'cumulative_deceased_per_100k_pct_change_bins'
    , 'cumulative_persons_fully_vaccinated_per_100k_pct_change_bins'
    , 'average_temperature_celsius_pct_change_bins'
    ]

weekly_df_binned = weekly_df[columns_to_keep]   

In [22]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from scipy import sparse

# Convert columns to 'category' to optimize memory usage
columns_to_convert = weekly_df_binned.columns.difference(['county_name', 'week', 'description'])
for col in columns_to_convert:
    weekly_df_binned[col] = weekly_df_binned[col].astype('category')

# Convert DataFrame into a list of transactions
transactions = weekly_df_binned.drop(['county_name', 'week', 'description'], axis=1).apply(
    lambda row: [f"{idx}_{val}" for idx, val in row.items() if val], axis=1).tolist()

# Encode the transactions using TransactionEncoder
encoder = TransactionEncoder()
encoded_array = encoder.fit_transform(transactions)

# Convert to sparse DataFrame to save memory
encoded_sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse.csr_matrix(encoded_array), columns=encoder.columns_)

# Find frequent itemsets and generate association rules using sparse data
frequent_itemsets = apriori(encoded_sparse_df, min_support=0.05, use_colnames=True, low_memory=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_df_binned[col] = weekly_df_binned[col].astype('category')
  encoded_sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse.csr_matrix(encoded_array), columns=encoder.columns_)


                                              antecedents  \
0                             (area_sq_km_bins_1000-2000)   
1                             (area_sq_km_bins_1000-2000)   
2                             (area_sq_km_bins_1000-2000)   
3                             (area_sq_km_bins_1000-2000)   
4                             (area_sq_km_bins_1000-2000)   
...                                                   ...   
201285  (cumulative_confirmed_per_100k_pct_change_bins...   
201286  (cumulative_confirmed_per_100k_bins_nan, cumul...   
201287  (minimum_temperature_celsius_bins_15-25°C, cum...   
201288  (cumulative_confirmed_per_100k_bins_nan, minim...   
201289  (minimum_temperature_celsius_bins_15-25°C, cum...   

                                              consequents   support  \
0       (cumulative_confirmed_per_100k_pct_change_bins...  0.198155   
1       (cumulative_deceased_per_100k_pct_change_bins_...  0.199775   
2       (cumulative_persons_fully_vaccinated_per_100k_

In [24]:
pd.set_option('display.max_colwidth', None)  # None to display all content
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(area_sq_km_bins_1000-2000),(cumulative_confirmed_per_100k_pct_change_bins_0% - 10%),0.314770,0.630955,0.198155,0.629523,0.997731,-0.000451,0.996135,-0.003309
1,(area_sq_km_bins_1000-2000),(cumulative_deceased_per_100k_pct_change_bins_0% - 10%),0.314770,0.627409,0.199775,0.634670,1.011573,0.002286,1.019875,0.016696
2,(area_sq_km_bins_1000-2000),(cumulative_persons_fully_vaccinated_per_100k_pct_change_bins_nan),0.314770,0.516608,0.160795,0.510834,0.988823,-0.001817,0.988196,-0.016227
3,(area_sq_km_bins_1000-2000),(new_deceased_per_100k_pct_change_bins_nan),0.314770,0.660608,0.211080,0.670585,1.015103,0.003141,1.030288,0.021713
4,(area_sq_km_bins_1000-2000),(new_persons_fully_vaccinated_per_100k_bins_<50),0.314770,0.567302,0.176056,0.559317,0.985925,-0.002513,0.981881,-0.020408
...,...,...,...,...,...,...,...,...,...,...
201285,"(cumulative_confirmed_per_100k_pct_change_bins_nan, cumulative_persons_fully_vaccinated_per_100k_pct_change_bins_0% - 10%)","(new_confirmed_per_100k_pct_change_bins_nan, cumulative_deceased_per_100k_bins_nan, maximum_temperature_celsius_bins_25-35°C, new_confirmed_per_100k_bins_<50, minimum_temperature_celsius_bins_15-25°C, new_deceased_per_100k_pct_change_bins_nan, cumulative_confirmed_per_100k_bins_nan, cumulative_deceased_per_100k_pct_change_bins_nan)",0.108788,0.064847,0.054906,0.504702,7.783004,0.047851,1.888063,0.977899
201286,"(cumulative_confirmed_per_100k_bins_nan, cumulative_persons_fully_vaccinated_per_100k_pct_change_bins_0% - 10%)","(new_confirmed_per_100k_pct_change_bins_nan, cumulative_deceased_per_100k_bins_nan, maximum_temperature_celsius_bins_25-35°C, new_confirmed_per_100k_bins_<50, minimum_temperature_celsius_bins_15-25°C, cumulative_confirmed_per_100k_pct_change_bins_nan, new_deceased_per_100k_pct_change_bins_nan, cumulative_deceased_per_100k_pct_change_bins_nan)",0.108788,0.064847,0.054906,0.504702,7.783004,0.047851,1.888063,0.977899
201287,"(minimum_temperature_celsius_bins_15-25°C, cumulative_confirmed_per_100k_pct_change_bins_nan)","(new_confirmed_per_100k_pct_change_bins_nan, cumulative_deceased_per_100k_bins_nan, maximum_temperature_celsius_bins_25-35°C, new_confirmed_per_100k_bins_<50, cumulative_persons_fully_vaccinated_per_100k_pct_change_bins_0% - 10%, new_deceased_per_100k_pct_change_bins_nan, cumulative_confirmed_per_100k_bins_nan, cumulative_deceased_per_100k_pct_change_bins_nan)",0.077942,0.079613,0.054906,0.704441,8.848286,0.048700,3.114055,0.961961
201288,"(cumulative_confirmed_per_100k_bins_nan, minimum_temperature_celsius_bins_15-25°C)","(new_confirmed_per_100k_pct_change_bins_nan, cumulative_deceased_per_100k_bins_nan, maximum_temperature_celsius_bins_25-35°C, new_confirmed_per_100k_bins_<50, cumulative_persons_fully_vaccinated_per_100k_pct_change_bins_0% - 10%, cumulative_confirmed_per_100k_pct_change_bins_nan, new_deceased_per_100k_pct_change_bins_nan, cumulative_deceased_per_100k_pct_change_bins_nan)",0.074566,0.079613,0.054906,0.736337,9.248918,0.048969,3.490764,0.963742
