In [None]:
import pandas as pd
import os

# Pivot Input Data

In [None]:
# Name of the identifier column that is kept as-is while reshaping
county_id_column = 'CountyID'

# Load both wide-format inputs from the Data folder under the current working directory
deaths_df = pd.read_csv(os.getcwd() + '/Data/deaths.csv')
cases_df = pd.read_csv(os.getcwd() + '/Data/cases.csv')

# Reshape wide -> long: every column other than the ID column becomes one row,
# with the old column header stored in 'Date' and the cell value in 'Deaths' / 'Cases'
melted_deaths_df = deaths_df.melt(id_vars=county_id_column, var_name='Date', value_name='Deaths')
melted_cases_df = cases_df.melt(id_vars=county_id_column, var_name='Date', value_name='Cases')

# Persist both long-format tables (row index omitted)
melted_deaths_df.to_csv(os.getcwd() + "/Data/melted_deaths_df.csv", index=False)
melted_cases_df.to_csv(os.getcwd() + "/Data/melted_cases_df.csv", index=False)

print(melted_deaths_df, melted_cases_df)

# Join Data

In [None]:
# Key columns present in both long-format frames, in melted_cases_df column order
common_columns = [column for column in melted_cases_df.columns if column in melted_deaths_df.columns]

# Join on the shared key columns instead of concatenating side by side:
# pd.concat(axis=1) pairs rows purely by position, so any difference in row
# order or row count between the two inputs would silently attach deaths to
# the wrong county/date. how='left' keeps every cases row in its original
# order; validate= raises if a key combination is repeated in either frame
# rather than multiplying rows. melted_deaths_df itself is left unmodified.
combined_df = pd.merge(
    melted_cases_df,
    melted_deaths_df,
    on=common_columns,
    how='left',
    validate='one_to_one',
)
combined_df

# Add State Data

In [None]:
import pandas as pd

def convert_county_to_state(county_fips):
    """Return the state FIPS code embedded in a county FIPS code.

    A county FIPS code is the state code followed by a three-digit county
    code (e.g. 12001 -> state 12, 1001 -> state 1), so the state part is
    everything above the last three digits.

    Parameters
    ----------
    county_fips : int, float or str
        County code, e.g. 1001, 1001.0 or "01001".

    Returns
    -------
    int
        The state FIPS code.

    Raises
    ------
    ValueError
        If the value is non-numeric text or NaN.
    """
    # The previous slice, str(county_fips)[:-5], only produced a state code when
    # the ID happened to print as a float such as '1001.0'; for integer IDs it
    # left an empty string and int('') raised ValueError. Integer division
    # gives the same answer for float-formatted IDs and also works for ints
    # and zero-padded strings.
    county_code = int(float(county_fips))
    return county_code // 1000

# Derive the integer 'StateID' column from each row's 'CountyID'
state_ids = combined_df['CountyID'].map(convert_county_to_state)
combined_df['StateID'] = state_ids.astype(int)

# Write the frame, now including 'StateID', to Data/covid_df.csv (row index omitted)
combined_df.to_csv(os.getcwd() + "/Data/covid_df.csv", index=False)
combined_df

# Add Population

In [None]:
# Load the population workbook
population_df = pd.read_excel(os.getcwd() + '/Data/co-est2022-pop.xlsx')

# Load the geocodes workbook and discard the code columns that later cells do not use
fips_df = pd.read_excel(os.getcwd() + '/Data/all-geocodes-v2022.xlsx').drop(
    columns=[
        'Consolidated City FIPS Code',
        'County Subdivision FIPS Code',
        'Place FIPS Code',
        'Summary Level',
    ]
)
fips_df

In [None]:
# Split "Area Name" into county and state at the FIRST comma only (n=1), so a
# name that contains an additional comma cannot produce a third column and
# break the two-column assignment.
population_df[['County Name', 'State Name']] = population_df['Area Name'].str.split(',', n=1, expand=True)
population_df["County Name"] = population_df["County Name"].str.strip()
# regex=False makes "." a literal dot on every pandas version; the default for
# `regex` differs between pandas releases, and as a regular expression "."
# would match any character.
# NOTE(review): this removes every dot, including ones inside a name (e.g.
# "St."). If the names in fips_df keep their dots, those counties will not
# match in the later merge - confirm against the geocodes data.
population_df["County Name"] = population_df["County Name"].str.replace(".", "", regex=False)
population_df["State Name"] = population_df["State Name"].str.strip()

# NOTE(review): the renamed copy is stored in merged_df only; population_df
# keeps its 2021 column label. merged_df is not read again in the visible
# cells - kept so that any other consumer still finds it.
merged_df = population_df.rename(columns={2021: '2021 Population'})
population_df = population_df.drop(['Area Name'], axis=1)

population_df

In [None]:
# Build the two lookup tables from fips_df WITHOUT reassigning fips_df, so this
# cell does not fail with a KeyError on 'Area Name' / 'State Name' when it is
# run a second time. drop_duplicates() keeps one row per distinct combination:
# a left merge repeats a population row once per matching lookup row, so
# repeated (name, code) pairs in fips_df would otherwise duplicate counties.

# Merge "State FIPS Code" into population_df by matching on the state name
state_lookup = (
    fips_df.rename(columns={'Area Name': 'State Name'})[['State FIPS Code', 'State Name']]
    .drop_duplicates()
)
population_df = pd.merge(population_df, state_lookup, on='State Name', how='left')

# Merge "County FIPS Code" into population_df by matching on state code + county name
county_lookup = (
    fips_df.rename(columns={'Area Name': 'County Name'})[['State FIPS Code', 'County Name', 'County FIPS Code']]
    .drop_duplicates()
)
population_df = pd.merge(population_df, county_lookup, on=['State FIPS Code', 'County Name'], how='left')

# Create the complete FIPS column.
# Rows containing any NA are dropped first: rows that found no FIPS match in
# the merges above cannot be given a CountyID (and astype(int) would fail on NaN).
population_df = population_df.dropna()
# County part is zero-padded to three digits; the state part is not padded
# because the combined code is converted to an integer anyway.
county_code = population_df['County FIPS Code'].astype(int).astype(str).str.zfill(3)
state_code = population_df['State FIPS Code'].astype(int).astype(str)
population_df['CountyID'] = (state_code + county_code).astype(int)
population_df = population_df.drop(['State FIPS Code', 'County FIPS Code'], axis=1)

population_df.to_csv(os.getcwd() + '/Data/population_df.csv', index=False)


# Join and Format Population + Covid

In [None]:
# Make sure the key column is called 'CountyID' (this rename only has an effect
# when population_df still carries a 'FIPS Code' column)
population_df = population_df.rename(columns={'FIPS Code': 'CountyID'})

# Left-join population_df onto combined_df by county, label the 2021 column,
# and keep only the listed output columns in this order. Because the join is a
# left join, rows without a population match are kept and hold NaN in the
# population columns.
joined_df = (
    combined_df
    .merge(population_df, how='left', on='CountyID')
    .rename(columns={2021: '2021 Population'})
    .reindex(columns=['Date', 'CountyID', 'StateID', 'State Name', 'Cases', 'Deaths', '2021 Population'])
)

print(joined_df)


# Add Change in Deaths/Cases Columns

In [None]:
# Most recent 'Cases' value seen so far for each CountyID
cases_counts = {}
# One entry per joined_df row, in row order: change in 'Cases' relative to the
# previous row of the same county
change_in_cases = []
for county_key, current_cases in zip(joined_df['CountyID'], joined_df['Cases']):
    if county_key in cases_counts:
        delta = current_cases - cases_counts[county_key]
    else:
        # First row for this county: no earlier value, so the delta is the value itself
        delta = current_cases
    change_in_cases.append(delta)
    cases_counts[county_key] = current_cases

In [None]:
# Most recent 'Deaths' value seen so far for each CountyID
deaths_counts = {}
# One entry per joined_df row, in row order: change in 'Deaths' relative to the
# previous row of the same county
change_in_deaths = []
for county_key, current_deaths in zip(joined_df['CountyID'], joined_df['Deaths']):
    if county_key in deaths_counts:
        delta = current_deaths - deaths_counts[county_key]
    else:
        # First row for this county: no earlier value, so the delta is the value itself
        delta = current_deaths
    change_in_deaths.append(delta)
    deaths_counts[county_key] = current_deaths

In [None]:
# Attach the per-row deltas as two new columns (cases first, then deaths)
joined_df = joined_df.assign(
    **{
        'Cases Delta': change_in_cases,
        'Deaths Delta': change_in_deaths,
    }
)

In [None]:
# Write the final joined table to the Data folder under the current working directory (row index omitted)
joined_csv_path = os.getcwd() + '/Data/joined_df.csv'
joined_df.to_csv(joined_csv_path, index=False)

In [None]:
# Boolean frame with the same shape as joined_df: True wherever a value is missing
na_locations = pd.isna(joined_df)
print(na_locations)