In [34]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [33]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit, col, column, expr, desc, asc

In [31]:
# ! pip3 install matplotlib
# ! pip3 install seaborn

In [26]:
# build our own SparkSession
myspark = SparkSession\
    .builder\
    .appName("AWS-Spark")\
    .config("spark.sql.shuffle.partitions",6)\
    .config("spark.sql.repl.eagereval.enabled",True)\
    .getOrCreate()

In [2]:
myspark

In [5]:
# ! head noaa.csv
# noaa_data.show(10)
noaa_data = myspark.read.load("noaa.csv", format="csv", sep=",", header=True, inferSchema=True)
noaa_data.count()

In [24]:
# noaa_data.show(10)

In [23]:
temp_filt = (noaa_data.filter(noaa_data.ELEVATION <= 5))
# temp_filt.count()

In [22]:
latitude_order = noaa_data.orderBy("LATITUDE", ascending=False)
# latitude_order.show(10)

In [25]:
noaa_data.printSchema()

root
 |-- STATION: integer (nullable = true)
 |-- DATE: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- NAME: string (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- TEMP_ATTRIBUTES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- DEWP_ATTRIBUTES: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- SLP_ATTRIBUTES: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- STP_ATTRIBUTES: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- VISIB_ATTRIBUTES: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- WDSP_ATTRIBUTES: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- GUST: double (nullable = true)
 |-- MAX: double (nullable = true)
 |-- MAX_ATTRIBUTES: string (nullable = true)
 |-- MIN: double (nullable = true)
 |-- MIN_ATTRIBUTES: string (nullable = true)
 |-- PRCP: double (nullable = 

In [29]:
noaa_data.select("TEMP", "ELEVATION", "VISIB").show(10)

+----+---------+-----+
|TEMP|ELEVATION|VISIB|
+----+---------+-----+
|23.4|      9.0| 28.0|
|31.5|      9.0|  4.9|
|35.0|      9.0|  3.1|
|35.4|      9.0|  6.1|
|27.5|      9.0| 13.5|
|23.6|      9.0|  8.6|
|18.7|      9.0|  4.1|
|26.9|      9.0|999.9|
|31.7|      9.0|  5.9|
|32.1|      9.0|  5.4|
+----+---------+-----+
only showing top 10 rows



In [35]:
columns = noaa_data.columns

for cl in columns:
    noaa_data.describe(cl).show()

for cl in columns:
    noaa_data.select(cl).distinct().show(10)

+-------+--------------------+
|summary|             STATION|
+-------+--------------------+
|  count|               22601|
|   mean| 1.074592154214371E9|
| stddev|3.9437903110216826E7|
|    min|          1001099999|
|    max|          1144099999|
+-------+--------------------+

+-------+----------+
|summary|      DATE|
+-------+----------+
|  count|     22680|
|   mean|      null|
| stddev|      null|
|    min|2021-01-01|
|    max|      DATE|
+-------+----------+

+-------+-----------------+
|summary|         LATITUDE|
+-------+-----------------+
|  count|            22601|
|   mean|68.25025800955247|
| stddev|4.006228418150994|
|    min|       58.3666666|
|    max|            80.65|
+-------+-----------------+

+-------+------------------+
|summary|         LONGITUDE|
+-------+------------------+
|  count|             22601|
|   mean|17.273892601748127|
| stddev| 8.726129473421302|
|    min|        -8.6666667|
|    max|              31.5|
+-------+------------------+

+-------+------

+----+
|TEMP|
+----+
|32.1|
|26.5|
|36.0|
|36.1|
|30.3|
|29.6|
|16.4|
|33.8|
|21.6|
|27.4|
+----+
only showing top 10 rows

+---------------+
|TEMP_ATTRIBUTES|
+---------------+
|           22.0|
|           17.0|
|           13.0|
|           19.0|
|           null|
|           24.0|
|           10.0|
|            5.0|
|           15.0|
|            9.0|
+---------------+
only showing top 10 rows

+----+
|DEWP|
+----+
|16.2|
|30.3|
|22.0|
|25.3|
|19.9|
|26.2|
|23.1|
|23.5|
|33.7|
|19.0|
+----+
only showing top 10 rows

+---------------+
|DEWP_ATTRIBUTES|
+---------------+
|           22.0|
|           17.0|
|           13.0|
|           19.0|
|           null|
|           24.0|
|           10.0|
|            5.0|
|           15.0|
|            9.0|
+---------------+
only showing top 10 rows

+------+
|   SLP|
+------+
|1040.8|
| 999.0|
| 987.6|
| 986.3|
| 981.7|
| 988.6|
| 995.6|
|1011.6|
|1008.1|
| 989.6|
+------+
only showing top 10 rows

+--------------+
|SLP_ATTRIBUTES|
+---------

In [36]:
cols_to_drop = ["STATION", "NAME", ""]
cols_interest = [x for x in columns if x not in cols_to_drop]
df_interest_cols = noaa_data.select(cols_interest).show(10)

+----------+----------+----------+---------+----+---------------+----+---------------+------+--------------+-----+--------------+-----+----------------+----+---------------+-----+----+----+--------------+----+--------------+-----+---------------+-----+------+
|      DATE|  LATITUDE| LONGITUDE|ELEVATION|TEMP|TEMP_ATTRIBUTES|DEWP|DEWP_ATTRIBUTES|   SLP|SLP_ATTRIBUTES|  STP|STP_ATTRIBUTES|VISIB|VISIB_ATTRIBUTES|WDSP|WDSP_ATTRIBUTES|MXSPD|GUST| MAX|MAX_ATTRIBUTES| MIN|MIN_ATTRIBUTES| PRCP|PRCP_ATTRIBUTES| SNDP|FRSHTT|
+----------+----------+----------+---------+----+---------------+----+---------------+------+--------------+-----+--------------+-----+----------------+----+---------------+-----+----+----+--------------+----+--------------+-----+---------------+-----+------+
|2021-01-01|70.9333333|-8.6666667|      9.0|23.4|           24.0|15.7|           24.0|1017.7|          24.0| 16.5|          24.0| 28.0|             6.0|10.9|           24.0| 22.1|39.4|26.2|              |20.1|           