In [None]:
# Set up Spark SQL by installing the PySpark package into this kernel's environment.
# Note: when running locally you also need a JVM installed:
#   https://www.oracle.com/java/technologies/downloads/
# Alternatively, consider running this notebook in https://colab.research.google.com/
%pip install pyspark

In [None]:
# Initialize Context - this is where you'd setup information about your Hadoop cluster if you had one!
from pyspark.sql import SparkSession


# Build (or reuse) the Spark session for this notebook under the app name "Covid".
spark = (
    SparkSession.builder
    .appName("Covid")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext and set its log level to WARN.
sc = spark.sparkContext
sc.setLogLevel("WARN")

In [None]:
# Download 100mb covid county data file
!curl "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv" > ./uscounties.csv

In [None]:
# Load the downloaded CSV into a Spark DataFrame.
# header=True treats the first line as column names; inferSchema=True asks Spark to
# infer column types from the data.
usCountiesFilePath = "./uscounties.csv"

df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(usCountiesFilePath)
)

# Print a preview of the DataFrame.
df.show()

In [None]:
# SparkSQL API
# Register the DataFrame as a temporary view named "covid" so it can be queried with SQL.
df.createOrReplaceTempView("covid")

print("Max deaths:")
# Sort by deaths, largest first, and keep only the top row.
max_deaths_query = """
    select county, state, deaths
    from covid
    order by deaths desc
    limit 1
  """
spark.sql(max_deaths_query).show()

In [None]:
# DataFrame style
from pyspark.sql.functions import col

print("Max deaths:")
# Order the rows by the "deaths" column, largest first; take(1) returns a list with the
# top row. To restrict the search to one county, filter before sorting, e.g.
#   df.where(col("county") == "New York City")
top_rows = df.orderBy(col("deaths").desc()).take(1)
print(top_rows)

In [None]:
# RDD MapReduce Style without key
# Work on the DataFrame's RDD so the rows can be combined with reduce() below.
rows = df.rdd


def getMax(cumm, other):
    """Reducer that keeps whichever of two rows has the larger "deaths" value.

    A missing (None) "deaths" value never beats a real number. The accumulated row
    may itself have deaths=None (for example when the first row handed to the
    reducer has no death count); comparing a number against None raises TypeError
    in Python 3, so that case is handled explicitly.

    Args:
        cumm: The best row seen so far; must support ``cumm["deaths"]``.
        other: The next row to compare; must support ``other["deaths"]``.

    Returns:
        ``other`` when it has a non-None death count and ``cumm`` either has none
        or has a smaller one; otherwise ``cumm`` (including on ties).
    """
    candidate = other["deaths"]
    if candidate is None:
        # Nothing to compare against: keep what we already have.
        return cumm

    best = cumm["deaths"]
    if best is None or candidate > best:
        return other
    return cumm


print("Max deaths:")
# Fold all rows through getMax and print the single row that remains.
max_row = rows.reduce(getMax)
print(max_row)

In [None]:
# RDD MapReduce Style with mapped tuples
# Start again from the DataFrame's RDD; it is mapped to tuples further down in this cell.
rows = df.rdd


def getMax(cumm, other):
    """Reducer for indexable pairs: return the one whose first element is larger.

    Args:
        cumm: The best item seen so far; compared on ``cumm[0]``.
        other: The next item to compare; compared on ``other[0]``.

    Returns:
        ``other`` if ``other[0]`` is strictly greater than ``cumm[0]``, else ``cumm``
        (so ties keep ``cumm``).
    """
    return other if other[0] > cumm[0] else cumm


def _deaths_and_place(r):
    """Map a row to a (deaths, "county,state") tuple, using 0 when deaths is missing."""
    return (r["deaths"] or 0, f"{r['county']},{r['state']}")


rows = rows.map(_deaths_and_place)
print("Max deaths:")
# Fold the tuples through getMax and print the one with the largest first element.
largest = rows.reduce(getMax)
print(largest)

In [None]:
# Write code to find the county with the most deaths

In [None]:
# Write code to find the county with the most cases

In [None]:
# Write code to find the total number of deaths in Utah County

In [None]:
# Write code to find the death rate for each state and sort the states by death rate descending

In [None]:
# Write code to do something else interesting with this data – your choice