In [2]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nbimporter
import Useful_Visualization_Functions
warnings.filterwarnings("ignore")

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit, col, column, expr, desc, asc

In [4]:
# ! pip install matplotlib
# ! pip install seaborn
# ! pip install ipynb
# ! pip install nbimporter

In [5]:
# build our own SparkSession
myspark = SparkSession\
    .builder\
    .appName("AWS-Spark")\
    .config("spark.driver.memory", "15g") \
    .config("spark.sql.shuffle.partitions",6)\
    .config("spark.sql.repl.eagereval.enabled",True)\
    .getOrCreate()

22/05/17 11:43:24 WARN Utils: Your hostname, nuno-g14 resolves to a loopback address: 127.0.1.1; using 192.168.1.225 instead (on interface wlp2s0)
22/05/17 11:43:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/17 11:43:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
myspark

In [7]:
# ! head noaa.csv
# noaa_data.show(10)
noaa_data = myspark.read.load("noaa.csv", format="csv", sep=",", header=True, inferSchema=True)
noaa_data.count()

                                                                                

19329172

In [8]:
# noaa_data.show(10)

In [9]:
# temp_filt = (noaa_data.filter(noaa_data.ELEVATION <= 5))
# temp_filt.count()

In [10]:
#latitude_order = noaa_data.orderBy("LATITUDE", ascending=False)
#latitude_order.show(10)

In [11]:
noaa_data.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- NAME: string (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- TEMP_ATTRIBUTES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- DEWP_ATTRIBUTES: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- SLP_ATTRIBUTES: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- STP_ATTRIBUTES: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- VISIB_ATTRIBUTES: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- WDSP_ATTRIBUTES: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- GUST: double (nullable = true)
 |-- MAX: double (nullable = true)
 |-- MAX_ATTRIBUTES: string (nullable = true)
 |-- MIN: double (nullable = true)
 |-- MIN_ATTRIBUTES: string (nullable = true)
 |-- PRCP: double (nullable = t

In [12]:
noaa_data.select("TEMP", "ELEVATION", "VISIB").show(10)

+----+---------+-----+
|TEMP|ELEVATION|VISIB|
+----+---------+-----+
|76.7|    93.87| 26.0|
|75.8|    93.87| 19.9|
|74.0|    93.87| 10.5|
|76.0|    93.87| 16.5|
|76.3|    93.87| 25.9|
|76.0|    93.87| 20.6|
|77.5|    93.87| 19.9|
|76.7|    93.87| 14.3|
|77.9|    93.87| 17.0|
|77.9|    93.87| 19.9|
+----+---------+-----+
only showing top 10 rows



In [13]:

columns = noaa_data.columns
"""
for cl in columns:
    noaa_data.describe(cl).show()

for cl in columns:
    noaa_data.select(cl).distinct().show(10)
"""

'\nfor cl in columns:\n    noaa_data.describe(cl).show()\n\nfor cl in columns:\n    noaa_data.select(cl).distinct().show(10)\n'

In [14]:
cols_to_drop = ["STATION", "DATE", "LATITUDE", "LONGITUDE", "ELEVATION", "NAME", "TEMP_ATTRIBUTES", "DEWP_ATTRIBUTES",
               "SLP_ATTRIBUTES", "STP_ATTRIBUTES", "VISIB_ATTRIBUTES", "WDSP_ATTRIBUTES", "MAX_ATTRIBUTES",
               "MIN_ATTRIBUTES", "PRCP_ATTRIBUTES"]

"""
Mean temperature (.1 Fahrenheit) - TEMP
Mean dew point (.1 Fahrenheit) - DEWP
Mean sea level pressure (.1 mb) - SLP
Mean station pressure (.1 mb) - STP
Mean visibility (.1 miles) - VISIB
Mean wind speed (.1 knots) - WDSP
Maximum sustained wind speed (.1 knots) - MXSPD 
Maximum wind gust (.1 knots) - GUST
Maximum temperature (.1 Fahrenheit) - MAX
Minimum temperature (.1 Fahrenheit) - MIN
Precipitation amount (.01 inches) - PRCP
Snow depth (.1 inches) - SNDP
Indicator for occurrence of: Fog, Rain or Drizzle, Snow or Ice Pellets, Hail, Thunder, Tornado/Funnel Cloud. - FRSHTT
"""

cols_interest = [x for x in columns if x not in cols_to_drop]
df_interest_cols = noaa_data.select(cols_interest)

### Data cleansing

In [None]:
df_interest_cols.printSchema()
df_clean = df_interest_cols.dropna()
[df_interest_cols.count(), df_clean.count()]
columns = df_clean.columns

for cl in columns:
    df_clean.describe(cl).show()
"""



for cl in columns:
    df_clean.select(cl).distinct().show(10)
    
for cl in df_clean.columns: 
    df_clean.select(cl).summary().show()
"""


root
 |-- TEMP: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- GUST: double (nullable = true)
 |-- MAX: double (nullable = true)
 |-- MIN: double (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- SNDP: double (nullable = true)
 |-- FRSHTT: integer (nullable = true)



                                                                                

+-------+------------------+
|summary|              TEMP|
+-------+------------------+
|  count|          19329172|
|   mean| 55.01233707268976|
| stddev|22.768293098866074|
|    min|            -114.7|
|    max|             110.0|
+-------+------------------+



                                                                                

+-------+-----------------+
|summary|             DEWP|
+-------+-----------------+
|  count|         19329172|
|   mean|486.8437543374978|
| stddev|2053.069124349846|
|    min|           -120.9|
|    max|           9999.9|
+-------+-----------------+



                                                                                

+-------+-----------------+
|summary|              SLP|
+-------+-----------------+
|  count|         19329172|
|   mean|4146.266166776542|
| stddev|4281.330679825313|
|    min|            904.4|
|    max|           9999.9|
+-------+-----------------+



                                                                                

+-------+-----------------+
|summary|              STP|
+-------+-----------------+
|  count|         19329172|
|   mean|703.9547709374414|
| stddev| 430.617274739191|
|    min|              0.0|
|    max|            999.9|
+-------+-----------------+



                                                                                

+-------+------------------+
|summary|             VISIB|
+-------+------------------+
|  count|          19329172|
|   mean|260.36188831581364|
| stddev|430.60288398229255|
|    min|               0.0|
|    max|             999.9|
+-------+------------------+



                                                                                

+-------+-----------------+
|summary|             WDSP|
+-------+-----------------+
|  count|         19329172|
|   mean|44.46515733314976|
| stddev|190.7991395190877|
|    min|              0.0|
|    max|            999.9|
+-------+-----------------+



[Stage 30:>                                                       (0 + 16) / 34]

In [None]:
# df_plot = df_clean.select('TEMP', 'PRCP').toPandas()
# plotHistogram(df_plot, 'TEMP', 'PRCP')
# df_plot_mini = df_plot.sample(n=1000)
# Useful_Visualization_Functions.plotScatterMatrix(df_clean, 'PRCP')
# Useful_Visualization_Functions.plotHistogram(df_plot_mini, 'TEMP', 'PRCP')