In [114]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, when, coalesce, avg, lit, desc
import os

In [115]:
spark = SparkSession.builder \
    .appName("COVID-19 Analysis") \
    .getOrCreate()

In [116]:
Month_Mapping = {
    'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
    'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
    'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'
}

In [117]:
Month_Abbrevation = input("Enter the Month Abbreviation (e.g., 'Jan' for January,): ").strip().lower()
year = input("Enter the year (YYYY): ").strip()

Enter the Month Abbreviation (e.g., 'Jan' for January,): may
Enter the year (YYYY): 2021


In [118]:
Month_MM = Month_Mapping.get(Month_Abbrevation)
if not month_mm:
    print("Invalid month abbreviation.")
    exit()

In [119]:
folder_path = "csse_covid_19_daily_reports_us"

dataframes = []

for day in range(1, 32): 
    
    File_Name = f"{Month_MM}-{day:02d}-{year}.csv"
    File_Path = os.path.join(folder_path, File_Name)

In [120]:
if os.path.exists(File_Path):
        
        dataframe = spark.read.csv(File_Path, header=True, inferSchema=True)
        dataframe = dataframe.withColumn("IFR", (col("Deaths") / col("Confirmed")) * 100)
        dataframes.append(dataframe)

In [121]:
if dataframes:
    
    january_df = dataframes[0]
    for df in dataframes[1:]:
        january_df = january_df.union(dataframe)

   
    Aggregated_dataframe = january_df.groupby("Province_State") \
                               .agg(sum("Deaths").alias("Total_Deaths"), 
                                    sum("Confirmed").alias("Total_Confirmed"))

   
    Aggregated_dataframe = Aggregated_dataframe.withColumn("IFR", (col("Total_Deaths") / col("Total_Confirmed")) * 100)

   
    top_10_states = Aggregated_dataframe.orderBy(desc("IFR")).limit(10)
    print(f"Top 10 States with Highest Infection Fatality Ratio (IFR) in {Month_Abbrevation.capitalize()} {year}:")
    top_10_states.show()
else:
    print(f"No data available for {Month_Abbrevation.capitalize()} {year}.")

Top 10 States with Highest Infection Fatality Ratio (IFR) in May 2021:
+--------------------+------------+---------------+------------------+
|      Province_State|Total_Deaths|Total_Confirmed|               IFR|
+--------------------+------------+---------------+------------------+
|      Grand Princess|           3|            103| 2.912621359223301|
|          New Jersey|       26212|        1016332| 2.579078490099692|
|            New York|       53098|        2102404|2.5255849969844046|
|         Connecticut|        8238|         347341|2.3717326776856176|
|District of Columbia|        1132|          48898| 2.315023109329625|
|         Mississippi|        7316|         317713|2.3027071602358102|
|           Louisiana|       10576|         470685| 2.246937973379224|
|        Pennsylvania|       27214|        1215257| 2.239361715258583|
|          New Mexico|        4263|         202821|2.1018533583800494|
|            Maryland|        9614|         459894|2.0904817197006267|
+-----

In [122]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [123]:
states_to_color = [row['Province_State'] for row in top_10_states.collect()]

print("States to color:", states_to_color)
import plotly.graph_objects as go

States to color: ['Grand Princess', 'New Jersey', 'New York', 'Connecticut', 'District of Columbia', 'Mississippi', 'Louisiana', 'Pennsylvania', 'New Mexico', 'Maryland']


In [124]:
state_abbr = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

known_states_to_color = [state for state in states_to_color if state in state_abbr]

In [125]:
fig = go.Figure()


color_intensity = 1.2 
for state in known_states_to_color:
    fig.add_trace(go.Choropleth(
        locationmode='USA-states',
        locations=[state_abbr[state]],  
        z=[1],    
        colorscale = [[0, f'rgba(0, 255, 0, {color_intensity})'], [1, f'rgba(0, 255, 0, {color_intensity})']],
        showscale=False, 
    ))
    color_intensity -= 0.1

In [126]:
fig.update_layout(
    title_text='Map of US States',
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
    ),
)