# 3. DataFrames in PySpark

DataFrames in PySpark can handle petabytes of data and share some characteristics with RDDs:
- Immutable
- Lazily evaluated
- Distributed

In [6]:
# import SparkContext and SparkSession
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [7]:
# SparkSession is the entry point used below to read CSV files into DataFrames.
# getOrCreate() reuses an already-running session instead of starting a new one.
# Requires `from pyspark.sql import SparkSession` in the import cell.
sc_dataframe = (
    SparkSession.builder
    .appName("pysparkDataframes")
    .getOrCreate()
)

## Dataset

This analysis studies storm events in the US during 2019, as recorded by NOAA's National Weather Service (source: [NCDC](https://www.ncdc.noaa.gov/stormevents/ftp.jsp)). The folder **data** contains three CSV files with event details, fatalities, and locations.

In [8]:
# Load the 2019 storm-event details file: first row is the header, fields are
# comma-separated, and column types are inferred from the data.
dfStorm2019 = (
    sc_dataframe.read
    .format('csv')
    .options(header='true', delimiter=',', inferSchema='true')
    .load('../pyspark/data/StormEvents_details-ftp_v1.0_d2019_c20200317.csv.gz')
)

In [24]:
# Load the 2019 fatalities file: first row is the header, fields are
# comma-separated, and column types are inferred from the data.
dfFatalities2019 = (
    sc_dataframe.read
    .format('csv')
    .options(header='true', delimiter=',', inferSchema='true')
    .load('../pyspark/data/StormEvents_fatalities-ftp_v1.0_d2019_c20200317.csv.gz')
)

In [25]:
# Load the 2019 locations file: first row is the header, fields are
# comma-separated, and column types are inferred from the data.
dfLocations2019 = (
    sc_dataframe.read
    .format('csv')
    .options(header='true', delimiter=',', inferSchema='true')
    .load('../pyspark/data/StormEvents_locations-ftp_v1.0_d2019_c20200317.csv.gz')
)

In [9]:
# Confirm that the reader returned a Spark DataFrame.
type(dfStorm2019)

pyspark.sql.dataframe.DataFrame

Displaying schema:

In [7]:
# Print each column's name, inferred type and nullability as a tree.
dfStorm2019.printSchema()

root
 |-- BEGIN_YEARMONTH: integer (nullable = true)
 |-- BEGIN_DAY: integer (nullable = true)
 |-- BEGIN_TIME: integer (nullable = true)
 |-- END_YEARMONTH: integer (nullable = true)
 |-- END_DAY: integer (nullable = true)
 |-- END_TIME: integer (nullable = true)
 |-- EPISODE_ID: integer (nullable = true)
 |-- EVENT_ID: integer (nullable = true)
 |-- STATE: string (nullable = true)
 |-- STATE_FIPS: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH_NAME: string (nullable = true)
 |-- EVENT_TYPE: string (nullable = true)
 |-- CZ_TYPE: string (nullable = true)
 |-- CZ_FIPS: integer (nullable = true)
 |-- CZ_NAME: string (nullable = true)
 |-- WFO: string (nullable = true)
 |-- BEGIN_DATE_TIME: string (nullable = true)
 |-- CZ_TIMEZONE: string (nullable = true)
 |-- END_DATE_TIME: string (nullable = true)
 |-- INJURIES_DIRECT: integer (nullable = true)
 |-- INJURIES_INDIRECT: integer (nullable = true)
 |-- DEATHS_DIRECT: integer (nullable = true)
 |-- DEATHS_INDIRE

In [21]:
# List the column names as plain Python strings.
dfStorm2019.columns

['BEGIN_YEARMONTH',
 'BEGIN_DAY',
 'BEGIN_TIME',
 'END_YEARMONTH',
 'END_DAY',
 'END_TIME',
 'EPISODE_ID',
 'EVENT_ID',
 'STATE',
 'STATE_FIPS',
 'YEAR',
 'MONTH_NAME',
 'EVENT_TYPE',
 'CZ_TYPE',
 'CZ_FIPS',
 'CZ_NAME',
 'WFO',
 'BEGIN_DATE_TIME',
 'CZ_TIMEZONE',
 'END_DATE_TIME',
 'INJURIES_DIRECT',
 'INJURIES_INDIRECT',
 'DEATHS_DIRECT',
 'DEATHS_INDIRECT',
 'DAMAGE_PROPERTY',
 'DAMAGE_CROPS',
 'SOURCE',
 'MAGNITUDE',
 'MAGNITUDE_TYPE',
 'FLOOD_CAUSE',
 'CATEGORY',
 'TOR_F_SCALE',
 'TOR_LENGTH',
 'TOR_WIDTH',
 'TOR_OTHER_WFO',
 'TOR_OTHER_CZ_STATE',
 'TOR_OTHER_CZ_FIPS',
 'TOR_OTHER_CZ_NAME',
 'BEGIN_RANGE',
 'BEGIN_AZIMUTH',
 'BEGIN_LOCATION',
 'END_RANGE',
 'END_AZIMUTH',
 'END_LOCATION',
 'BEGIN_LAT',
 'BEGIN_LON',
 'END_LAT',
 'END_LON',
 'EPISODE_NARRATIVE',
 'EVENT_NARRATIVE',
 'DATA_SOURCE']

In [12]:
# Fetch the first record; the result is a list containing one Row.
dfStorm2019.take(1)

[Row(BEGIN_YEARMONTH=201905, BEGIN_DAY=9, BEGIN_TIME=1554, END_YEARMONTH=201905, END_DAY=9, END_TIME=1830, EPISODE_ID=137295, EVENT_ID=824116, STATE='TEXAS', STATE_FIPS=48, YEAR=2019, MONTH_NAME='May', EVENT_TYPE='Flash Flood', CZ_TYPE='C', CZ_FIPS=29, CZ_NAME='BEXAR', WFO='EWX', BEGIN_DATE_TIME='09-MAY-19 15:54:00', CZ_TIMEZONE='CST-6', END_DATE_TIME='09-MAY-19 18:30:00', INJURIES_DIRECT=0, INJURIES_INDIRECT=0, DEATHS_DIRECT=0, DEATHS_INDIRECT=0, DAMAGE_PROPERTY='0.00K', DAMAGE_CROPS='0.00K', SOURCE='Law Enforcement', MAGNITUDE=None, MAGNITUDE_TYPE=None, FLOOD_CAUSE='Heavy Rain', CATEGORY=None, TOR_F_SCALE=None, TOR_LENGTH=None, TOR_WIDTH=None, TOR_OTHER_WFO=None, TOR_OTHER_CZ_STATE=None, TOR_OTHER_CZ_FIPS=None, TOR_OTHER_CZ_NAME=None, BEGIN_RANGE=8, BEGIN_AZIMUTH='N', BEGIN_LOCATION='LEON SPGS', END_RANGE=7, END_AZIMUTH='NNE', END_LOCATION='SAN GERONIMO', BEGIN_LAT=29.7898, BEGIN_LON=-98.6406, END_LAT=29.7158, END_LON=-98.7744, EPISODE_NARRATIVE='Thunderstorms developed along a cold 

In [18]:
# head(1) yields the same one-Row list as take(1) above.
dfStorm2019.head(1)

[Row(BEGIN_YEARMONTH=201905, BEGIN_DAY=9, BEGIN_TIME=1554, END_YEARMONTH=201905, END_DAY=9, END_TIME=1830, EPISODE_ID=137295, EVENT_ID=824116, STATE='TEXAS', STATE_FIPS=48, YEAR=2019, MONTH_NAME='May', EVENT_TYPE='Flash Flood', CZ_TYPE='C', CZ_FIPS=29, CZ_NAME='BEXAR', WFO='EWX', BEGIN_DATE_TIME='09-MAY-19 15:54:00', CZ_TIMEZONE='CST-6', END_DATE_TIME='09-MAY-19 18:30:00', INJURIES_DIRECT=0, INJURIES_INDIRECT=0, DEATHS_DIRECT=0, DEATHS_INDIRECT=0, DAMAGE_PROPERTY='0.00K', DAMAGE_CROPS='0.00K', SOURCE='Law Enforcement', MAGNITUDE=None, MAGNITUDE_TYPE=None, FLOOD_CAUSE='Heavy Rain', CATEGORY=None, TOR_F_SCALE=None, TOR_LENGTH=None, TOR_WIDTH=None, TOR_OTHER_WFO=None, TOR_OTHER_CZ_STATE=None, TOR_OTHER_CZ_FIPS=None, TOR_OTHER_CZ_NAME=None, BEGIN_RANGE=8, BEGIN_AZIMUTH='N', BEGIN_LOCATION='LEON SPGS', END_RANGE=7, END_AZIMUTH='NNE', END_LOCATION='SAN GERONIMO', BEGIN_LAT=29.7898, BEGIN_LON=-98.6406, END_LAT=29.7158, END_LON=-98.7744, EPISODE_NARRATIVE='Thunderstorms developed along a cold 

In [13]:
# Total number of rows in the storm-details DataFrame.
dfStorm2019.count()

67337

Checking for duplicate rows:

In [6]:
# Sanity check: every row must be unique. The counts are kept in named variables
# so that a failure reports how many duplicate rows were found.
total_rows = dfStorm2019.count()
distinct_rows = dfStorm2019.distinct().count()
duplicate_rows = total_rows - distinct_rows
assert duplicate_rows == 0, \
    "dfStorm2019 contains %d duplicate rows" % duplicate_rows

In [22]:
# Summary statistics (count, mean, stddev, min, max) for indirect injuries.
(
    dfStorm2019
    .describe('INJURIES_INDIRECT')
    .show()
)

+-------+--------------------+
|summary|   INJURIES_INDIRECT|
+-------+--------------------+
|  count|               67337|
|   mean|0.007469890253501047|
| stddev|  0.4528571961352913|
|    min|                   0|
|    max|                  71|
+-------+--------------------+



In [23]:
# Summary statistics (count, mean, stddev, min, max) for direct injuries.
(
    dfStorm2019
    .describe('INJURIES_DIRECT')
    .show()
)

+-------+-------------------+
|summary|    INJURIES_DIRECT|
+-------+-------------------+
|  count|              67337|
|   mean|0.01834058541366559|
| stddev| 0.8481467662887455|
|    min|                  0|
|    max|                166|
+-------+-------------------+



In [16]:
# Keep only events recorded for Texas and fetch the first matching row.
(
    dfStorm2019
    .where(dfStorm2019['STATE'] == 'TEXAS')
    .take(1)
)

[Row(BEGIN_YEARMONTH=201905, BEGIN_DAY=9, BEGIN_TIME=1554, END_YEARMONTH=201905, END_DAY=9, END_TIME=1830, EPISODE_ID=137295, EVENT_ID=824116, STATE='TEXAS', STATE_FIPS=48, YEAR=2019, MONTH_NAME='May', EVENT_TYPE='Flash Flood', CZ_TYPE='C', CZ_FIPS=29, CZ_NAME='BEXAR', WFO='EWX', BEGIN_DATE_TIME='09-MAY-19 15:54:00', CZ_TIMEZONE='CST-6', END_DATE_TIME='09-MAY-19 18:30:00', INJURIES_DIRECT=0, INJURIES_INDIRECT=0, DEATHS_DIRECT=0, DEATHS_INDIRECT=0, DAMAGE_PROPERTY='0.00K', DAMAGE_CROPS='0.00K', SOURCE='Law Enforcement', MAGNITUDE=None, MAGNITUDE_TYPE=None, FLOOD_CAUSE='Heavy Rain', CATEGORY=None, TOR_F_SCALE=None, TOR_LENGTH=None, TOR_WIDTH=None, TOR_OTHER_WFO=None, TOR_OTHER_CZ_STATE=None, TOR_OTHER_CZ_FIPS=None, TOR_OTHER_CZ_NAME=None, BEGIN_RANGE=8, BEGIN_AZIMUTH='N', BEGIN_LOCATION='LEON SPGS', END_RANGE=7, END_AZIMUTH='NNE', END_LOCATION='SAN GERONIMO', BEGIN_LAT=29.7898, BEGIN_LON=-98.6406, END_LAT=29.7158, END_LON=-98.7744, EPISODE_NARRATIVE='Thunderstorms developed along a cold 

In [17]:
# NOTE(review): the full-table describe() below is left disabled — presumably
# because summarising every column is slow and very wide; confirm, then either
# re-enable it or delete this cell.
#dfStorm2019.describe().show()

In [10]:
# Number of events per state; show up to 60 rows.
(
    dfStorm2019
    .groupBy('STATE')
    .count()
    .show(n=60)
)

+--------------------+-----+
|               STATE|count|
+--------------------+-----+
|               TEXAS| 4338|
|           MINNESOTA| 2126|
|             VERMONT|  317|
|            NEW YORK| 2514|
|      ATLANTIC SOUTH|  519|
|             FLORIDA|  939|
|       WEST VIRGINIA|  967|
|            ARKANSAS| 1148|
|      GULF OF MEXICO|  907|
|             MONTANA| 1286|
|            MISSOURI| 2159|
|             GEORGIA| 1158|
|         CONNECTICUT|  240|
|             ALABAMA| 1116|
|                OHIO| 2279|
|       MASSACHUSETTS|  860|
|        RHODE ISLAND|   92|
|              KANSAS| 2672|
|                IOWA| 2276|
|            VIRGINIA| 2398|
|              ALASKA|  143|
|      ATLANTIC NORTH| 1054|
|            MARYLAND| 1208|
|          CALIFORNIA| 2643|
|        SOUTH DAKOTA| 2543|
|      NORTH CAROLINA| 1448|
|         MISSISSIPPI| 1140|
|DISTRICT OF COLUMBIA|   52|
|        NORTH DAKOTA| 1095|
|            COLORADO| 1776|
|            KENTUCKY| 1522|
|             

In [11]:
# Number of events per event type; show up to 50 rows.
(
    dfStorm2019
    .groupBy('EVENT_TYPE')
    .count()
    .show(n=50)
)

+--------------------+-----+
|          EVENT_TYPE|count|
+--------------------+-----+
|         Flash Flood| 4068|
|   Thunderstorm Wind|18617|
|             Tornado| 1727|
|                Hail| 9013|
|Marine Thundersto...| 2502|
|         Rip Current|   72|
|      Winter Weather| 3800|
|          Waterspout|  183|
|          Heavy Rain| 1416|
|             Drought| 1007|
|        Winter Storm| 3312|
|               Flood| 4943|
|           Lightning|  343|
|            Blizzard|  852|
|Extreme Cold/Wind...| 1065|
|          Heavy Snow| 2844|
|           High Wind| 3743|
|        Funnel Cloud|  348|
|       Coastal Flood|  240|
|      Excessive Heat|  827|
|                Heat| 1291|
|         Marine Hail|   32|
|     Cold/Wind Chill|  470|
|         Strong Wind| 1590|
|         Debris Flow|  184|
|           Dense Fog|  652|
|               Sleet|    1|
|        Frost/Freeze|  654|
|           Avalanche|   44|
|           Hurricane|   10|
|      Tropical Storm|  143|
|           Hi

In [12]:
# Number of events per (state, month), sorted alphabetically by state only —
# the order of months within a state is not controlled here.
(
    dfStorm2019
    .groupBy('STATE', 'MONTH_NAME')
    .count()
    .orderBy('STATE', ascending=True)
    .show(n=50)
)

+--------------+----------+-----+
|         STATE|MONTH_NAME|count|
+--------------+----------+-----+
|       ALABAMA|  February|  104|
|       ALABAMA|   October|   35|
|       ALABAMA| September|   41|
|       ALABAMA|       May|   73|
|       ALABAMA|      June|  168|
|       ALABAMA|   January|   32|
|       ALABAMA|      July|   63|
|       ALABAMA|    August|  126|
|       ALABAMA|     March|  148|
|       ALABAMA|  December|   92|
|       ALABAMA|  November|   77|
|       ALABAMA|     April|  157|
|        ALASKA|     April|   12|
|        ALASKA|   October|   19|
|        ALASKA|      June|    4|
|        ALASKA|     March|   17|
|        ALASKA|  February|   14|
|        ALASKA|   January|   10|
|        ALASKA|      July|    2|
|        ALASKA|       May|    2|
|        ALASKA|    August|   11|
|        ALASKA| September|    4|
|        ALASKA|  December|   32|
|        ALASKA|  November|   16|
|AMERICAN SAMOA|       May|    2|
|AMERICAN SAMOA| September|    3|
|AMERICAN SAMO

In [13]:
# Total direct deaths across all 2019 events.
(
    dfStorm2019
    .agg({'DEATHS_DIRECT': 'sum'})
    .show()
)

+------------------+
|sum(DEATHS_DIRECT)|
+------------------+
|               390|
+------------------+



In [14]:
# Total indirect deaths across all 2019 events.
(
    dfStorm2019
    .agg({'DEATHS_INDIRECT': 'sum'})
    .show()
)

+--------------------+
|sum(DEATHS_INDIRECT)|
+--------------------+
|                 160|
+--------------------+



In [16]:
# Preview five distinct episode narratives (long text is truncated by show()).
(
    dfStorm2019
    .select('EPISODE_NARRATIVE')
    .distinct()
    .show(n=5)
)

+--------------------+
|   EPISODE_NARRATIVE|
+--------------------+
|Thunderstorms dev...|
|An area of low pr...|
|Thunderstorms eru...|
|A strong mid-leve...|
|A strong mid-leve...|
+--------------------+
only showing top 5 rows



In [26]:
# Print the inferred schema of the fatalities DataFrame.
dfFatalities2019.printSchema()

root
 |-- FAT_YEARMONTH: integer (nullable = true)
 |-- FAT_DAY: integer (nullable = true)
 |-- FAT_TIME: integer (nullable = true)
 |-- FATALITY_ID: integer (nullable = true)
 |-- EVENT_ID: integer (nullable = true)
 |-- FATALITY_TYPE: string (nullable = true)
 |-- FATALITY_DATE: string (nullable = true)
 |-- FATALITY_AGE: integer (nullable = true)
 |-- FATALITY_SEX: string (nullable = true)
 |-- FATALITY_LOCATION: string (nullable = true)
 |-- EVENT_YEARMONTH: integer (nullable = true)



In [27]:
# Print the inferred schema of the locations DataFrame.
dfLocations2019.printSchema()

root
 |-- YEARMONTH: integer (nullable = true)
 |-- EPISODE_ID: integer (nullable = true)
 |-- EVENT_ID: integer (nullable = true)
 |-- LOCATION_INDEX: integer (nullable = true)
 |-- RANGE: double (nullable = true)
 |-- AZIMUTH: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- LAT2: integer (nullable = true)
 |-- LON2: integer (nullable = true)



In [38]:
# Contingency table of fatalities: one row per age, one column per sex.
dfAgeSex = dfFatalities2019.stat.crosstab(
    col1='FATALITY_AGE',
    col2='FATALITY_SEX',
)

In [49]:
# Display the age-by-sex contingency table (show() prints the first 20 rows by default).
dfAgeSex.show()

+-------------------------+---+---+----+
|FATALITY_AGE_FATALITY_SEX|  F|  M|null|
+-------------------------+---+---+----+
|                       69|  1|  7|   0|
|                        0|  0|  0|   2|
|                     null|  6| 24|  25|
|                        5|  1|  3|   0|
|                       10|  1|  1|   0|
|                       56|  3|  5|   0|
|                       42|  0|  2|   0|
|                       24|  2|  3|   0|
|                       37|  0|  4|   0|
|                       25|  1|  8|   0|
|                       52|  4| 13|   0|
|                       14|  1|  5|   0|
|                       20|  3|  2|   0|
|                       46|  2|  4|   0|
|                       57|  3|  3|   0|
|                       78|  4|  1|   0|
|                       29|  3|  5|   0|
|                       84|  0|  1|   0|
|                       61|  1|  7|   0|
|                       89|  0|  1|   0|
+-------------------------+---+---+----+
only showing top

In [62]:
# Contingency table of fatalities: one row per fatality location, one column per sex.
dfLocationSex = dfFatalities2019.stat.crosstab(
    col1='FATALITY_LOCATION',
    col2='FATALITY_SEX',
)

In [63]:
# Display the location-by-sex contingency table.
dfLocationSex.show()

+------------------------------+---+---+----+
|FATALITY_LOCATION_FATALITY_SEX|  F|  M|null|
+------------------------------+---+---+----+
|                         Other|  2|  5|   0|
|          Heavy Equipment/C...|  0|  2|   0|
|                    Ball Field|  0|  1|   0|
|                      In Water| 12| 97|   1|
|                       Boating|  2| 15|   0|
|                Permanent Home| 15| 21|   2|
|                    Under Tree|  5|  7|   1|
|          Vehicle/Towed Tra...| 55|100|  13|
|                       Camping|  1|  2|   0|
|           Mobile/Trailer Home| 17| 13|   2|
|            Outside/Open Areas| 25|102|   6|
|                       Unknown|  3| 15|   4|
|           Permanent Structure|  5|  4|   0|
+------------------------------+---+---+----+



In [None]:
# Contingency table of fatalities: one row per fatality location, one column per fatality type.
dfLocationType = dfFatalities2019.stat.crosstab(
    col1='FATALITY_LOCATION',
    col2='FATALITY_TYPE',
)

In [None]:
# Display the location-by-fatality-type contingency table.
dfLocationType.show()

In [50]:
# Collect the Spark age-by-sex crosstab to the driver as a pandas DataFrame.
AgeSex_df = dfAgeSex.toPandas()

In [51]:
# Preview the first five rows of the pandas copy.
AgeSex_df.head()

Unnamed: 0,FATALITY_AGE_FATALITY_SEX,F,M,null
0,69.0,1,7,0
1,0.0,0,0,2
2,,6,24,25
3,5.0,1,3,0
4,10.0,1,1,0
