In [31]:
// Load the raw USA death-causes CSV. The file's first line supplies the column
// names; no schema is given or inferred, so every column is read as a string.
val rawDeathCauses = spark.read
    .option("header", "true")
    .csv("/home/start2/git/data-prep-warmup/src/main/resources/death_causes_usa.csv")

rawDeathCauses: org.apache.spark.sql.DataFrame = [Year: string, 113 Cause Name: string ... 4 more fields]


In [32]:
// Preview the first three rows without truncating long cell values.
rawDeathCauses.show(numRows = 3, truncate = false)

+----+----------------------------------------------------+----------------------+-------+------+-----------------------+
|Year|113 Cause Name                                      |Cause Name            |State  |Deaths|Age-adjusted Death Rate|
+----+----------------------------------------------------+----------------------+-------+------+-----------------------+
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Unintentional injuries|Alabama|2755  |55.50                  |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Unintentional injuries|Alaska |439   |63.10                  |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Unintentional injuries|Arizona|4010  |54.20                  |
+----+----------------------------------------------------+----------------------+-------+------+-----------------------+
only showing top 3 rows



In [33]:
// Names of the 50 US states plus the District of Columbia. Rows whose "State"
// value is not in this list are discarded below.
val states = List(
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
    "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois",
    "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Montana", "Nebraska",
    "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina",
    "North Dakota", "Ohio", "Oklahoma", "Oregon", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "West Virginia", "Wisconsin", "Wyoming")

// Drop the two columns that the rest of the notebook does not use.
val columnReducedDeathCauses = rawDeathCauses
    .drop("Cause Name")
    .drop("Age-adjusted Death Rate")

// Keep only rows whose "State" value is one of the names listed in `states`.
val reducedDeathCauses = columnReducedDeathCauses
    .filter(col("State").isin(states: _*))

// The first count is taken on the raw input; it previously counted the
// column-reduced frame, so both lines always printed the same number.
println("Number of rows in dataset: " + rawDeathCauses.count())
println("Number of rows in column reduced dataset: " + columnReducedDeathCauses.count())

// Show the filtered result produced by this cell. The name shown here before
// (`columnCleanedDeathCauses`) is not defined anywhere in the visible notebook.
reducedDeathCauses.show(false)

Number of rows in dataset: 10296
Number of rows in column reduced dataset: 10296
+----+----------------------------------------------------+--------------------+------+
|Year|113 Cause Name                                      |State               |Deaths|
+----+----------------------------------------------------+--------------------+------+
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Alabama             |2755  |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Alaska              |439   |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Arizona             |4010  |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Arkansas            |1604  |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|California          |13213 |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Colorado            |2880  |
|2016|Accidents (unintentional injuries) (V01-X59,Y85-Y86)|Connecticut         |1978  |
|2016|Accidents (unintentional injuries

states: List[String] = List(Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut, Delaware, District of Columbia, Florida, Georgia, Hawaii, Idaho, Illinois, Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Montana, Nebraska, Nevada, New Hampshire, New Jersey, New Mexico, New York, North Carolina, North Dakota, Ohio, Oklahoma, Oregon, Maryland, Massachusetts, Michigan, Minnesota, Mississippi, Missouri, Pennsylvania, Rhode Island, South Carolina, South Dakota, Tennessee, Texas, Utah, Vermont, Virginia, Washington, West Virginia, Wisconsin, Wyoming)
columnReducedDeathCauses: org.apache.spark.sql.DataFrame = [Year: string, 113 Cause Name: string ... 2 more fields]
reducedDeathCauses: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Year: string, 113 Cause Name: s...

In [34]:
// Inspect the column types of the state-filtered death-causes frame.
// NOTE(review): this previously referenced `columnCleanedDeathCauses`, which is
// not defined in the visible notebook; `reducedDeathCauses` is the filtered
// frame built in the preceding cell — confirm this is the intended input.
reducedDeathCauses.printSchema()

root
 |-- Year: string (nullable = true)
 |-- 113 Cause Name: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Deaths: string (nullable = true)



In [54]:
// Typed version of the death-causes data: "Deaths" and "Year" are cast from
// string to integer, and "113 Cause Name" is renamed to "Death Cause".
// Values that cannot be parsed as integers become null after the cast.
// NOTE(review): this previously started from `columnCleanedDeathCauses`, which
// is not defined in the visible notebook; it now starts from the state-filtered
// `reducedDeathCauses` — confirm this is the intended input.
val deathCauses = reducedDeathCauses
    .withColumn("Deaths", $"Deaths".cast("Int"))
    .withColumn("Year", $"Year".cast("Int"))
    .withColumnRenamed("113 Cause Name", "Death Cause")

deathCauses.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Death Cause: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Deaths: integer (nullable = true)



deathCauses: org.apache.spark.sql.DataFrame = [Year: int, Death Cause: string ... 2 more fields]


In [36]:
// Load the 2010-2017 census CSV. The header row supplies the column names and
// every column is read as a string because no schema is provided or inferred.
val censusSecondDecade = spark.read
    .option("header", "true")
    .csv("/home/start2/git/data-prep-warmup/src/main/resources/census_2010_2017.csv")

censusSecondDecade.printSchema()

root
 |-- SUMLEV: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- DIVISION: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- ESTIMATESBASE2010: string (nullable = true)
 |-- POPESTIMATE2010: string (nullable = true)
 |-- POPESTIMATE2011: string (nullable = true)
 |-- POPESTIMATE2012: string (nullable = true)
 |-- POPESTIMATE2013: string (nullable = true)
 |-- POPESTIMATE2014: string (nullable = true)
 |-- POPESTIMATE2015: string (nullable = true)
 |-- POPESTIMATE2016: string (nullable = true)
 |-- POPESTIMATE2017: string (nullable = true)
 |-- NPOPCHG_2010: string (nullable = true)
 |-- NPOPCHG_2011: string (nullable = true)
 |-- NPOPCHG_2012: string (nullable = true)
 |-- NPOPCHG_2013: string (nullable = true)
 |-- NPOPCHG_2014: string (nullable = true)
 |-- NPOPCHG_2015: string (nullable = true)
 |-- NPOPCHG_2016: string (nullable = true)
 |-- NPOPCHG_2017: string (nullable = true)
 |-- PPOPCHG_2010: string (nu

censusSecondDecade: org.apache.spark.sql.DataFrame = [SUMLEV: string, REGION: string ... 53 more fields]


In [37]:
// Keep only the region name and the yearly population estimates for 2010-2017.
// The POPESTIMATE<year> column names are generated from the year range.
val statePopEst = {
    val estimateColumns = (2010 to 2017).map(year => s"POPESTIMATE$year")
    censusSecondDecade.select("NAME", estimateColumns: _*)
}

statePopEst: org.apache.spark.sql.DataFrame = [NAME: string, POPESTIMATE2010: string ... 7 more fields]


In [38]:
// Keep only census rows whose NAME is one of the entries in `states`.
val reducedStatePopEst = statePopEst.filter(col("NAME").isin(states: _*))
// Count the frame defined on the line above (the previous reference,
// `filteredStatePopEst`, is not defined in the visible notebook).
// The `- 1` presumably discounts the District of Columbia, which is in `states`
// but is not a state — TODO confirm.
print("Number of states in census 2010-2017 dataset: " + (reducedStatePopEst.count() - 1))

Number of states in census 2010-2017 dataset: 50

reducedStatePopEst: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [NAME: string, POPESTIMATE2010: string ... 7 more fields]


In [39]:
// Inspect the column types of the filtered population estimates.
reducedStatePopEst.printSchema()

root
 |-- NAME: string (nullable = true)
 |-- POPESTIMATE2010: string (nullable = true)
 |-- POPESTIMATE2011: string (nullable = true)
 |-- POPESTIMATE2012: string (nullable = true)
 |-- POPESTIMATE2013: string (nullable = true)
 |-- POPESTIMATE2014: string (nullable = true)
 |-- POPESTIMATE2015: string (nullable = true)
 |-- POPESTIMATE2016: string (nullable = true)
 |-- POPESTIMATE2017: string (nullable = true)



In [52]:
// Builds the per-year slice of the census estimates: selects NAME and that
// year's POPESTIMATE<year> column, renames the estimate to "Population", and
// adds the year as a literal integer column "Year".
def popEstimateFor(year: Int): org.apache.spark.sql.DataFrame = reducedStatePopEst
    .select("NAME", s"POPESTIMATE$year")
    .withColumnRenamed(s"POPESTIMATE$year", "Population")
    .withColumn("Year", lit(year))

// One frame per year; the individual names are kept so that other cells can
// still refer to them.
val pop2010 = popEstimateFor(2010)
val pop2011 = popEstimateFor(2011)
val pop2012 = popEstimateFor(2012)
val pop2013 = popEstimateFor(2013)
val pop2014 = popEstimateFor(2014)
val pop2015 = popEstimateFor(2015)
val pop2016 = popEstimateFor(2016)
val pop2017 = popEstimateFor(2017)

// Long-format population table with one row per (State, Year):
// cast "Population" from string to integer, stack the yearly frames by column
// position, and rename NAME to "State" so it lines up with the death-causes data.
val popPerYear = List(pop2010, pop2011, pop2012, pop2013, pop2014, pop2015, pop2016, pop2017)
    // Use the exact column name so the cast does not depend on Spark's
    // case-insensitive column resolution.
    .map(_.withColumn("Population", $"Population".cast("Int")))
    .reduce(_ union _)
    .withColumnRenamed("NAME", "State")

popPerYear.printSchema()
popPerYear.show()

root
 |-- State: string (nullable = true)
 |-- Population: integer (nullable = true)
 |-- Year: integer (nullable = false)

+--------------------+----------+----+
|               State|Population|Year|
+--------------------+----------+----+
|             Alabama|   4785579|2010|
|              Alaska|    714015|2010|
|             Arizona|   6407002|2010|
|            Arkansas|   2921737|2010|
|          California|  37327690|2010|
|            Colorado|   5048029|2010|
|         Connecticut|   3580171|2010|
|            Delaware|    899712|2010|
|District of Columbia|    605040|2010|
|             Florida|  18846461|2010|
|             Georgia|   9712696|2010|
|              Hawaii|   1363817|2010|
|               Idaho|   1570912|2010|
|            Illinois|  12841196|2010|
|             Indiana|   6490029|2010|
|                Iowa|   3050223|2010|
|              Kansas|   2858403|2010|
|            Kentucky|   4347948|2010|
|           Louisiana|   4544871|2010|
|               Ma

pop2010: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2011: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2012: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2013: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2014: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2015: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2016: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
pop2017: org.apache.spark.sql.DataFrame = [NAME: string, Population: string ... 1 more field]
popPerYear: org.apache.spark.sql.DataFrame = ...

In [55]:
// Preview the typed death-causes frame (default: first 20 rows, long values truncated).
deathCauses.show()

+----+--------------------+--------------------+------+
|Year|         Death Cause|               State|Deaths|
+----+--------------------+--------------------+------+
|2016|Accidents (uninte...|             Alabama|  2755|
|2016|Accidents (uninte...|              Alaska|   439|
|2016|Accidents (uninte...|             Arizona|  4010|
|2016|Accidents (uninte...|            Arkansas|  1604|
|2016|Accidents (uninte...|          California| 13213|
|2016|Accidents (uninte...|            Colorado|  2880|
|2016|Accidents (uninte...|         Connecticut|  1978|
|2016|Accidents (uninte...|            Delaware|   516|
|2016|Accidents (uninte...|District of Columbia|   401|
|2016|Accidents (uninte...|             Florida| 12561|
|2016|Accidents (uninte...|             Georgia|  4701|
|2016|Accidents (uninte...|              Hawaii|   577|
|2016|Accidents (uninte...|               Idaho|   849|
|2016|Accidents (uninte...|            Illinois|  5508|
|2016|Accidents (uninte...|             Indiana|

In [57]:
// Attach the population to each death-cause row by matching on Year and State.
// An inner join keeps only the (Year, State) pairs present in both frames.
val deathCausesPerStateYear = popPerYear.join(
    right = deathCauses,
    usingColumns = Seq("Year", "State"),
    joinType = "inner")

deathCausesPerStateYear: org.apache.spark.sql.DataFrame = [Year: int, State: string ... 3 more fields]


In [60]:
// Preview the joined result; 20 rows is the same number that show() prints by default.
deathCausesPerStateYear.show(numRows = 20)

+----+--------------------+----------+--------------------+------+
|Year|               State|Population|         Death Cause|Deaths|
+----+--------------------+----------+--------------------+------+
|2016|             Alabama|   4860545|Accidents (uninte...|  2755|
|2016|              Alaska|    741522|Accidents (uninte...|   439|
|2016|             Arizona|   6908642|Accidents (uninte...|  4010|
|2016|            Arkansas|   2988231|Accidents (uninte...|  1604|
|2016|          California|  39296476|Accidents (uninte...| 13213|
|2016|            Colorado|   5530105|Accidents (uninte...|  2880|
|2016|         Connecticut|   3587685|Accidents (uninte...|  1978|
|2016|            Delaware|    952698|Accidents (uninte...|   516|
|2016|District of Columbia|    684336|Accidents (uninte...|   401|
|2016|             Florida|  20656589|Accidents (uninte...| 12561|
|2016|             Georgia|  10313620|Accidents (uninte...|  4701|
|2016|              Hawaii|   1428683|Accidents (uninte...|   