# PySpark on Mac

In [1]:
# import SparkContext and SparkSession
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession used for every DataFrame read in this notebook.
# getOrCreate() hands back an already-running session when one exists, so
# re-running this cell is safe. `SparkSession` must be imported from
# `pyspark.sql` in the import cell, otherwise this line raises NameError on a
# fresh kernel.
sc_dataframe = (
    SparkSession.builder
    .appName("pysparkDataframes")
    .getOrCreate()
)

In [3]:
# Read the gzipped 2019 storm-events CSV into a DataFrame: the first row is
# used as the header, fields are comma-delimited, and column types are
# inferred by Spark rather than declared up front.
dfStorm2019 = (
    sc_dataframe.read
    .format('csv')
    .option('header', 'true')
    .option('delimiter', ',')
    .option('inferSchema', 'true')
    .load('../pyspark/data/StormEvents_details-ftp_v1.0_d2019_c20200317.csv.gz')
)

In [4]:
# Preview the first ten rows of the loaded frame.
dfStorm2019.show(n=10)

+---------------+---------+----------+-------------+-------+--------+----------+--------+--------------+----------+----+----------+--------------------+-------+-------+--------------------+---+------------------+-----------+------------------+---------------+-----------------+-------------+---------------+---------------+------------+--------------------+---------+--------------+-----------+--------+-----------+----------+---------+-------------+------------------+-----------------+-----------------+-----------+-------------+----------------+---------+-----------+----------------+---------+---------+-------+--------+--------------------+--------------------+-----------+
|BEGIN_YEARMONTH|BEGIN_DAY|BEGIN_TIME|END_YEARMONTH|END_DAY|END_TIME|EPISODE_ID|EVENT_ID|         STATE|STATE_FIPS|YEAR|MONTH_NAME|          EVENT_TYPE|CZ_TYPE|CZ_FIPS|             CZ_NAME|WFO|   BEGIN_DATE_TIME|CZ_TIMEZONE|     END_DATE_TIME|INJURIES_DIRECT|INJURIES_INDIRECT|DEATHS_DIRECT|DEATHS_INDIRECT|DAMAGE_PROPERT

In [5]:
# Total number of rows in the loaded frame (displayed as the cell result).
dfStorm2019.count()

67337

Checking for duplicate rows:

In [6]:
# Guard against duplicated records: the row count must not shrink once fully
# identical rows are collapsed by distinct(). Each count is computed once and
# kept so that a failure reports how many duplicates exist instead of a bare
# AssertionError.
total_rows = dfStorm2019.count()
distinct_rows = dfStorm2019.distinct().count()
assert total_rows == distinct_rows, (
    "%d duplicate rows found in dfStorm2019" % (total_rows - distinct_rows)
)

Displaying schema:

In [7]:
# Print each column's name, type and nullability; the types shown are the
# ones Spark inferred at load time (inferSchema was enabled on the reader).
dfStorm2019.printSchema()

root
 |-- BEGIN_YEARMONTH: integer (nullable = true)
 |-- BEGIN_DAY: integer (nullable = true)
 |-- BEGIN_TIME: integer (nullable = true)
 |-- END_YEARMONTH: integer (nullable = true)
 |-- END_DAY: integer (nullable = true)
 |-- END_TIME: integer (nullable = true)
 |-- EPISODE_ID: integer (nullable = true)
 |-- EVENT_ID: integer (nullable = true)
 |-- STATE: string (nullable = true)
 |-- STATE_FIPS: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH_NAME: string (nullable = true)
 |-- EVENT_TYPE: string (nullable = true)
 |-- CZ_TYPE: string (nullable = true)
 |-- CZ_FIPS: integer (nullable = true)
 |-- CZ_NAME: string (nullable = true)
 |-- WFO: string (nullable = true)
 |-- BEGIN_DATE_TIME: string (nullable = true)
 |-- CZ_TIMEZONE: string (nullable = true)
 |-- END_DATE_TIME: string (nullable = true)
 |-- INJURIES_DIRECT: integer (nullable = true)
 |-- INJURIES_INDIRECT: integer (nullable = true)
 |-- DEATHS_DIRECT: integer (nullable = true)
 |-- DEATHS_INDIRE

In [8]:
# Keep only the rows whose STATE column equals 'TEXAS' and preview ten of them.
(
    dfStorm2019
    .filter(dfStorm2019['STATE'] == 'TEXAS')
    .show(n=10)
)

+---------------+---------+----------+-------------+-------+--------+----------+--------+-----+----------+----+----------+-----------------+-------+-------+----------+---+------------------+-----------+------------------+---------------+-----------------+-------------+---------------+---------------+------------+--------------------+---------+--------------+-----------+--------+-----------+----------+---------+-------------+------------------+-----------------+-----------------+-----------+-------------+--------------+---------+-----------+-------------+---------+---------+-------+--------+--------------------+--------------------+-----------+
|BEGIN_YEARMONTH|BEGIN_DAY|BEGIN_TIME|END_YEARMONTH|END_DAY|END_TIME|EPISODE_ID|EVENT_ID|STATE|STATE_FIPS|YEAR|MONTH_NAME|       EVENT_TYPE|CZ_TYPE|CZ_FIPS|   CZ_NAME|WFO|   BEGIN_DATE_TIME|CZ_TIMEZONE|     END_DATE_TIME|INJURIES_DIRECT|INJURIES_INDIRECT|DEATHS_DIRECT|DEATHS_INDIRECT|DAMAGE_PROPERTY|DAMAGE_CROPS|              SOURCE|MAGNITUDE|MAG

In [9]:
# NOTE(review): summary statistics over every column are left disabled here —
# presumably because the output is too wide/slow for this frame; confirm, then
# either re-enable on a column subset or delete this cell.
#dfStorm2019.describe().show()

In [10]:
# Number of rows per STATE value; show up to 60 groups so the listing is not
# cut off at the default row limit.
(
    dfStorm2019
    .groupBy('STATE')
    .count()
    .show(n=60)
)

+--------------------+-----+
|               STATE|count|
+--------------------+-----+
|               TEXAS| 4338|
|           MINNESOTA| 2126|
|             VERMONT|  317|
|            NEW YORK| 2514|
|      ATLANTIC SOUTH|  519|
|             FLORIDA|  939|
|       WEST VIRGINIA|  967|
|            ARKANSAS| 1148|
|      GULF OF MEXICO|  907|
|             MONTANA| 1286|
|            MISSOURI| 2159|
|             GEORGIA| 1158|
|         CONNECTICUT|  240|
|             ALABAMA| 1116|
|                OHIO| 2279|
|       MASSACHUSETTS|  860|
|        RHODE ISLAND|   92|
|              KANSAS| 2672|
|                IOWA| 2276|
|            VIRGINIA| 2398|
|              ALASKA|  143|
|      ATLANTIC NORTH| 1054|
|            MARYLAND| 1208|
|          CALIFORNIA| 2643|
|        SOUTH DAKOTA| 2543|
|      NORTH CAROLINA| 1448|
|         MISSISSIPPI| 1140|
|DISTRICT OF COLUMBIA|   52|
|        NORTH DAKOTA| 1095|
|            COLORADO| 1776|
|            KENTUCKY| 1522|
|             

In [11]:
# Number of rows per EVENT_TYPE value, showing up to 50 groups.
(
    dfStorm2019
    .groupBy('EVENT_TYPE')
    .count()
    .show(n=50)
)

+--------------------+-----+
|          EVENT_TYPE|count|
+--------------------+-----+
|         Flash Flood| 4068|
|   Thunderstorm Wind|18617|
|             Tornado| 1727|
|                Hail| 9013|
|Marine Thundersto...| 2502|
|         Rip Current|   72|
|      Winter Weather| 3800|
|          Waterspout|  183|
|          Heavy Rain| 1416|
|             Drought| 1007|
|        Winter Storm| 3312|
|               Flood| 4943|
|           Lightning|  343|
|            Blizzard|  852|
|Extreme Cold/Wind...| 1065|
|          Heavy Snow| 2844|
|           High Wind| 3743|
|        Funnel Cloud|  348|
|       Coastal Flood|  240|
|      Excessive Heat|  827|
|                Heat| 1291|
|         Marine Hail|   32|
|     Cold/Wind Chill|  470|
|         Strong Wind| 1590|
|         Debris Flow|  184|
|           Dense Fog|  652|
|               Sleet|    1|
|        Frost/Freeze|  654|
|           Avalanche|   44|
|           Hurricane|   10|
|      Tropical Storm|  143|
|           Hi

In [12]:
# Row counts per (STATE, MONTH_NAME) pair, sorted by STATE ascending. Only
# STATE is a sort key, so the order of months within one state is not fixed.
(
    dfStorm2019
    .groupBy('STATE', 'MONTH_NAME')
    .count()
    .orderBy('STATE', ascending=True)
    .show(n=50)
)

+--------------+----------+-----+
|         STATE|MONTH_NAME|count|
+--------------+----------+-----+
|       ALABAMA|  February|  104|
|       ALABAMA|   October|   35|
|       ALABAMA| September|   41|
|       ALABAMA|       May|   73|
|       ALABAMA|      June|  168|
|       ALABAMA|   January|   32|
|       ALABAMA|      July|   63|
|       ALABAMA|    August|  126|
|       ALABAMA|     March|  148|
|       ALABAMA|  December|   92|
|       ALABAMA|  November|   77|
|       ALABAMA|     April|  157|
|        ALASKA|     April|   12|
|        ALASKA|   October|   19|
|        ALASKA|      June|    4|
|        ALASKA|     March|   17|
|        ALASKA|  February|   14|
|        ALASKA|   January|   10|
|        ALASKA|      July|    2|
|        ALASKA|       May|    2|
|        ALASKA|    August|   11|
|        ALASKA| September|    4|
|        ALASKA|  December|   32|
|        ALASKA|  November|   16|
|AMERICAN SAMOA|       May|    2|
|AMERICAN SAMOA| September|    3|
|AMERICAN SAMO

In [13]:
# Sum DEATHS_DIRECT over the whole frame: an empty groupBy() aggregates every
# row into a single group.
(
    dfStorm2019
    .groupBy()
    .sum('DEATHS_DIRECT')
    .show()
)

+------------------+
|sum(DEATHS_DIRECT)|
+------------------+
|               390|
+------------------+



In [14]:
# Sum DEATHS_INDIRECT over the whole frame: an empty groupBy() aggregates
# every row into a single group.
(
    dfStorm2019
    .groupBy()
    .sum('DEATHS_INDIRECT')
    .show()
)

+--------------------+
|sum(DEATHS_INDIRECT)|
+--------------------+
|                 160|
+--------------------+



In [16]:
# Preview five distinct EPISODE_NARRATIVE values; show() truncates long text,
# so entries sharing a prefix can look identical in the output.
(
    dfStorm2019
    .select('EPISODE_NARRATIVE')
    .distinct()
    .show(n=5)
)

+--------------------+
|   EPISODE_NARRATIVE|
+--------------------+
|Thunderstorms dev...|
|An area of low pr...|
|Thunderstorms eru...|
|A strong mid-leve...|
|A strong mid-leve...|
+--------------------+
only showing top 5 rows

