In [37]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nbimporter
import Useful_Visualization_Functions
from pyspark.ml import *
from pyspark.sql import *
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import col, explode, array, lit
warnings.filterwarnings("ignore")
import pyspark.sql.functions as F

In [38]:
# from pyspark.sql import SparkSession
# from pyspark.sql import Row
# from pyspark.sql.functions import lit, col, column, expr, desc, asc

In [39]:
# ! pip install matplotlib
# ! pip install seaborn
# ! pip install ipynb
# ! pip install nbimporter

In [40]:
# build our own SparkSession
myspark = SparkSession\
    .builder\
    .appName("AWS-Spark")\
    .config("spark.driver.memory", "15g") \
    .config("spark.sql.shuffle.partitions",6)\
    .config("spark.sql.repl.eagereval.enabled",True)\
    .getOrCreate()

In [41]:
myspark

In [46]:
# ! head noaa.csv
# noaa_data.show(10)
noaa_data = myspark.read.load("../Desktop/noaa.csv", format="csv", sep=",", header=True, inferSchema=True)
noaa_data.count()

                                                                                

25933550

In [47]:
noaa_data.show(10)

+-----------+----------+----------+----------+---------+--------------------+----+---------------+----+---------------+------+--------------+----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+----+---------------+-----+------+
|    STATION|      DATE|  LATITUDE| LONGITUDE|ELEVATION|                NAME|TEMP|TEMP_ATTRIBUTES|DEWP|DEWP_ATTRIBUTES|   SLP|SLP_ATTRIBUTES| STP|STP_ATTRIBUTES|VISIB|VISIB_ATTRIBUTES|WDSP|WDSP_ATTRIBUTES|MXSPD| GUST| MAX|MAX_ATTRIBUTES| MIN|MIN_ATTRIBUTES|PRCP|PRCP_ATTRIBUTES| SNDP|FRSHTT|
+-----------+----------+----------+----------+---------+--------------------+----+---------------+----+---------------+------+--------------+----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+----+---------------+-----+------+
|01001099999|2018-01-01|70.9333333|-8.6666667|      9.0|JAN MAYEN NOR NAV...|22.5|           24.0|15.5|           24.0|1006.

In [29]:
noaa_data.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- NAME: string (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- TEMP_ATTRIBUTES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- DEWP_ATTRIBUTES: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- SLP_ATTRIBUTES: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- STP_ATTRIBUTES: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- VISIB_ATTRIBUTES: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- WDSP_ATTRIBUTES: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- GUST: double (nullable = true)
 |-- MAX: double (nullable = true)
 |-- MAX_ATTRIBUTES: string (nullable = true)
 |-- MIN: double (nullable = true)
 |-- MIN_ATTRIBUTES: string (nullable = true)
 |-- PRCP: double (nullable = t

In [30]:
noaa_data.select("TEMP", "ELEVATION", "VISIB").show(10)

+----+---------+-----+
|TEMP|ELEVATION|VISIB|
+----+---------+-----+
|22.5|      9.0|  9.8|
|21.0|      9.0| 14.0|
|21.6|      9.0| 11.9|
|19.5|      9.0|  7.3|
|11.4|      9.0|  0.7|
|12.8|      9.0|  4.8|
|12.1|      9.0|  3.9|
|25.8|      9.0|  6.2|
|29.9|      9.0|  4.4|
|35.8|      9.0|  5.4|
+----+---------+-----+
only showing top 10 rows



In [48]:
noaa_data = noaa_data.withColumn("ItRained", when((F.length(noaa_data["FRSHTT"]) <= 4), lit(0)) \
                    .when(F.length(noaa_data["FRSHTT"]) == 5, lit(1)) \
                    .otherwise(lit(substring('FRSHTT', 2, 1).cast(IntegerType()))) \
)

noaa_data.show(10)

+-----------+----------+----------+----------+---------+--------------------+----+---------------+----+---------------+------+--------------+----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+----+---------------+-----+------+--------+
|    STATION|      DATE|  LATITUDE| LONGITUDE|ELEVATION|                NAME|TEMP|TEMP_ATTRIBUTES|DEWP|DEWP_ATTRIBUTES|   SLP|SLP_ATTRIBUTES| STP|STP_ATTRIBUTES|VISIB|VISIB_ATTRIBUTES|WDSP|WDSP_ATTRIBUTES|MXSPD| GUST| MAX|MAX_ATTRIBUTES| MIN|MIN_ATTRIBUTES|PRCP|PRCP_ATTRIBUTES| SNDP|FRSHTT|ItRained|
+-----------+----------+----------+----------+---------+--------------------+----+---------------+----+---------------+------+--------------+----+--------------+-----+----------------+----+---------------+-----+-----+----+--------------+----+--------------+----+---------------+-----+------+--------+
|01001099999|2018-01-01|70.9333333|-8.6666667|      9.0|JAN MAYEN NOR NAV...|22.5|           24.0

In [49]:

columns = noaa_data.columns

for cl in columns:
    noaa_data.describe(cl).show()

for cl in columns:
    noaa_data.select(cl).distinct().show(10)

                                                                                

+-------+--------------------+
|summary|             STATION|
+-------+--------------------+
|  count|            25933550|
|   mean|5.306301503662034...|
| stddev|3.060898676315181...|
|    min|         00702699999|
|    max|         A5125600451|
+-------+--------------------+



                                                                                

+-------+----------+
|summary|      DATE|
+-------+----------+
|  count|  25933550|
|   mean|      null|
| stddev|      null|
|    min|2016-01-01|
|    max|2022-05-02|
+-------+----------+



                                                                                

+-------+-----------------+
|summary|         LATITUDE|
+-------+-----------------+
|  count|         25855512|
|   mean|32.34458031097024|
| stddev|28.03616478322267|
|    min|            -90.0|
|    max|            83.65|
+-------+-----------------+



                                                                                

+-------+-------------------+
|summary|          LONGITUDE|
+-------+-------------------+
|  count|           25855512|
|   mean|-0.8675453553730872|
| stddev|   85.4985819331213|
|    min|       -179.9833333|
|    max|             179.75|
+-------+-------------------+



                                                                                

+-------+------------------+
|summary|         ELEVATION|
+-------+------------------+
|  count|          25855045|
|   mean|363.52407377514294|
| stddev| 546.4576349838859|
|    min|            -999.9|
|    max|            7026.0|
+-------+------------------+



                                                                                

+-------+--------------------+
|summary|                NAME|
+-------+--------------------+
|  count|            25855518|
|   mean|                null|
| stddev|                null|
|    min|068 BAFFIN BAY PO...|
|    max|        ZYRYANKA, RS|
+-------+--------------------+



                                                                                

+-------+------------------+
|summary|              TEMP|
+-------+------------------+
|  count|          25933550|
|   mean| 54.79614306178668|
| stddev|23.268999499093304|
|    min|            -114.7|
|    max|             110.0|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|   TEMP_ATTRIBUTES|
+-------+------------------+
|  count|          25933550|
|   mean|18.128243800019664|
| stddev| 7.476889945115773|
|    min|               4.0|
|    max|              24.0|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|              DEWP|
+-------+------------------+
|  count|          25933550|
|   mean|488.62406074386394|
| stddev|2057.4176930485223|
|    min|            -120.9|
|    max|            9999.9|
+-------+------------------+



                                                                                

+-------+-----------------+
|summary|  DEWP_ATTRIBUTES|
+-------+-----------------+
|  count|         25933550|
|   mean|17.12054631934309|
| stddev|8.220488344276054|
|    min|              0.0|
|    max|             24.0|
+-------+-----------------+



                                                                                

+-------+-----------------+
|summary|              SLP|
+-------+-----------------+
|  count|         25933550|
|   mean|4163.444167471172|
| stddev|4286.966231240549|
|    min|            904.4|
|    max|           9999.9|
+-------+-----------------+



                                                                                

+-------+------------------+
|summary|    SLP_ATTRIBUTES|
+-------+------------------+
|  count|          25933550|
|   mean|10.169011955555641|
| stddev| 9.768325313957282|
|    min|               0.0|
|    max|              24.0|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|               STP|
+-------+------------------+
|  count|          25933550|
|   mean| 698.8149438683114|
| stddev|432.57394120621365|
|    min|               0.0|
|    max|             999.9|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|    STP_ATTRIBUTES|
+-------+------------------+
|  count|          25933550|
|   mean|11.292145734000936|
| stddev| 9.954688583635043|
|    min|               0.0|
|    max|              24.0|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|             VISIB|
+-------+------------------+
|  count|          25933550|
|   mean|279.09605141250455|
| stddev|440.60251196673966|
|    min|               0.0|
|    max|             999.9|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|  VISIB_ATTRIBUTES|
+-------+------------------+
|  count|          25933550|
|   mean|12.187231674799632|
| stddev|10.040637320406029|
|    min|               0.0|
|    max|              24.0|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|              WDSP|
+-------+------------------+
|  count|          25933550|
|   mean| 46.66762993881708|
| stddev|196.08742453616355|
|    min|               0.0|
|    max|             999.9|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|   WDSP_ATTRIBUTES|
+-------+------------------+
|  count|          25933550|
|   mean|17.174985067605476|
| stddev|  8.20162579982544|
|    min|               0.0|
|    max|              24.0|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|             MXSPD|
+-------+------------------+
|  count|          25933550|
|   mean| 68.24044310553055|
| stddev|229.42500830443115|
|    min|               0.2|
|    max|             999.9|
+-------+------------------+



                                                                                

+-------+-----------------+
|summary|             GUST|
+-------+-----------------+
|  count|         25933550|
|   mean|735.8077882579081|
| stddev|433.9757345652434|
|    min|              9.7|
|    max|            999.9|
+-------+-----------------+



                                                                                

+-------+-----------------+
|summary|              MAX|
+-------+-----------------+
|  count|         25933550|
|   mean|70.61048029675804|
| stddev|252.2792810021753|
|    min|           -112.4|
|    max|           9999.9|
+-------+-----------------+



                                                                                

+-------+--------------+
|summary|MAX_ATTRIBUTES|
+-------+--------------+
|  count|      25933550|
|   mean|          null|
| stddev|          null|
|    min|              |
|    max|             *|
+-------+--------------+



                                                                                

+-------+------------------+
|summary|               MIN|
+-------+------------------+
|  count|          25933550|
|   mean|51.935805815247065|
| stddev|  249.204030036391|
|    min|            -117.4|
|    max|            9999.9|
+-------+------------------+



                                                                                

+-------+--------------+
|summary|MIN_ATTRIBUTES|
+-------+--------------+
|  count|      25933550|
|   mean|          null|
| stddev|          null|
|    min|              |
|    max|             *|
+-------+--------------+



                                                                                

+-------+------------------+
|summary|              PRCP|
+-------+------------------+
|  count|          25933550|
|   mean| 7.615574490963987|
| stddev|26.393258596339738|
|    min|               0.0|
|    max|             99.99|
+-------+------------------+



                                                                                

+-------+---------------+
|summary|PRCP_ATTRIBUTES|
+-------+---------------+
|  count|       25933550|
|   mean|           null|
| stddev|           null|
|    min|               |
|    max|              I|
+-------+---------------+



                                                                                

+-------+------------------+
|summary|              SNDP|
+-------+------------------+
|  count|          25933550|
|   mean| 926.5820928782955|
| stddev|259.24233782375796|
|    min|               0.4|
|    max|             999.9|
+-------+------------------+



                                                                                

+-------+------------------+
|summary|            FRSHTT|
+-------+------------------+
|  count|          25933550|
|   mean| 8127.262761326544|
| stddev|23948.355432848646|
|    min|                 0|
|    max|            111111|
+-------+------------------+



                                                                                

+-------+-------------------+
|summary|           ItRained|
+-------+-------------------+
|  count|           25933550|
|   mean|0.23984209643492696|
| stddev| 0.4269869696405484|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



                                                                                

+-----------+
|    STATION|
+-----------+
|01001099999|
|01001499999|
|01007099999|
|01010099999|
|01020099999|
|01023099999|
|01035099999|
|01045099999|
|01059099999|
|01086099999|
+-----------+
only showing top 10 rows



                                                                                

+----------+
|      DATE|
+----------+
|2018-01-01|
|2018-01-03|
|2018-01-08|
|2018-01-15|
|2018-01-19|
|2018-01-27|
|2018-02-11|
|2018-02-15|
|2018-02-22|
|2018-02-26|
+----------+
only showing top 10 rows



                                                                                

+----------+
|  LATITUDE|
+----------+
|     78.25|
|     69.65|
|69.0166667|
|71.0166667|
|70.3666666|
|67.5166667|
| 65.461111|
|      66.9|
| 65.783997|
|66.1166666|
+----------+
only showing top 10 rows



                                                                                

+----------+
| LONGITUDE|
+----------+
|      15.5|
|22.8166666|
| 18.918919|
|18.9333333|
|      19.5|
| 22.139744|
|23.0666667|
| 23.371667|
|23.6833333|
| 24.973489|
+----------+
only showing top 10 rows



                                                                                

+---------+
|ELEVATION|
+---------+
|    438.0|
|     4.87|
|    14.22|
|     7.62|
|     13.0|
|      9.1|
|    319.0|
|     19.0|
|    200.0|
|     17.0|
+---------+
only showing top 10 rows



                                                                                

+--------------------+
|                NAME|
+--------------------+
|      NY ALESUND, SV|
|        LONGYEAR, SV|
|        KONGSOYA, NO|
|         DRAUGEN, NO|
|       PYRAMIDEN, NO|
| BJORNOYA ISLAND, NO|
|       SORKJOSEN, NO|
|        KARASJOK, NO|
|           NYRUD, NO|
|SNORRE B OIL PLAT...|
+--------------------+
only showing top 10 rows



                                                                                

+----+
|TEMP|
+----+
|21.6|
|19.5|
|36.1|
|32.4|
|30.3|
|32.1|
|34.6|
|30.6|
|34.7|
|37.5|
+----+
only showing top 10 rows



                                                                                

+---------------+
|TEMP_ATTRIBUTES|
+---------------+
|           22.0|
|           19.0|
|           17.0|
|           13.0|
|           24.0|
|           15.0|
|            9.0|
|           10.0|
|            5.0|
|           16.0|
+---------------+
only showing top 10 rows



                                                                                

+----+
|DEWP|
+----+
|15.5|
|15.1|
| 4.4|
|26.6|
|32.1|
|29.6|
|21.6|
|26.7|
|27.4|
|21.7|
+----+
only showing top 10 rows



                                                                                

+---------------+
|DEWP_ATTRIBUTES|
+---------------+
|           22.0|
|           19.0|
|           17.0|
|           13.0|
|           24.0|
|           15.0|
|            9.0|
|           10.0|
|            5.0|
|            0.0|
+---------------+
only showing top 10 rows



                                                                                

+------+
|   SLP|
+------+
|1009.3|
| 961.8|
| 995.2|
|1012.8|
|1006.0|
| 984.5|
| 991.2|
| 988.5|
|1009.7|
|1015.7|
+------+
only showing top 10 rows



                                                                                

+--------------+
|SLP_ATTRIBUTES|
+--------------+
|          19.0|
|          22.0|
|          17.0|
|          13.0|
|          24.0|
|          15.0|
|           0.0|
|           9.0|
|           5.0|
|          10.0|
+--------------+
only showing top 10 rows



                                                                                

+-----+
|  STP|
+-----+
| 20.6|
|968.8|
|997.9|
|  1.3|
| 13.0|
| 15.1|
|993.6|
| 22.8|
| 21.5|
| 13.8|
+-----+
only showing top 10 rows



                                                                                

+--------------+
|STP_ATTRIBUTES|
+--------------+
|          19.0|
|          22.0|
|          17.0|
|          13.0|
|          24.0|
|          15.0|
|           0.0|
|           9.0|
|           5.0|
|          10.0|
+--------------+
only showing top 10 rows



                                                                                

+-----+
|VISIB|
+-----+
|  3.9|
|  4.4|
| 15.5|
| 10.9|
|  9.9|
| 11.1|
|  7.5|
|  2.9|
| 10.5|
|  8.2|
+-----+
only showing top 10 rows



                                                                                

+----------------+
|VISIB_ATTRIBUTES|
+----------------+
|            13.0|
|            19.0|
|            17.0|
|            22.0|
|             5.0|
|             9.0|
|            15.0|
|            10.0|
|             0.0|
|            24.0|
+----------------+
only showing top 10 rows



                                                                                

+----+
|WDSP|
+----+
|13.2|
|14.9|
|21.6|
| 4.4|
|16.4|
|14.6|
|15.1|
|19.9|
|17.0|
|16.2|
+----+
only showing top 10 rows



                                                                                

+---------------+
|WDSP_ATTRIBUTES|
+---------------+
|           22.0|
|           19.0|
|           17.0|
|           13.0|
|           24.0|
|           15.0|
|            9.0|
|           10.0|
|            5.0|
|            0.0|
+---------------+
only showing top 10 rows



                                                                                

+-----+
|MXSPD|
+-----+
| 33.0|
| 15.5|
| 13.6|
| 25.3|
|  9.9|
| 22.0|
| 19.0|
| 26.2|
| 22.9|
|  3.9|
+-----+
only showing top 10 rows



                                                                                

+----+
|GUST|
+----+
|21.6|
|25.3|
|32.4|
|13.0|
|20.6|
|53.2|
|37.5|
|10.9|
|33.8|
|32.1|
+----+
only showing top 10 rows



                                                                                

+----+
| MAX|
+----+
|22.8|
|14.7|
|36.1|
|33.8|
|32.4|
|34.7|
|32.9|
|37.9|
|38.1|
|26.6|
+----+
only showing top 10 rows



                                                                                

+--------------+
|MAX_ATTRIBUTES|
+--------------+
|             *|
|              |
+--------------+



                                                                                

+----+
| MIN|
+----+
| 8.2|
|32.4|
|33.8|
|26.6|
|23.5|
|32.9|
|25.3|
|26.2|
|30.6|
|21.6|
+----+
only showing top 10 rows



                                                                                

+--------------+
|MIN_ATTRIBUTES|
+--------------+
|             *|
|              |
+--------------+



                                                                                

+----+
|PRCP|
+----+
|0.28|
|0.07|
|0.08|
|4.77|
|3.78|
|0.27|
|5.51|
|0.75|
|0.57|
|2.33|
+----+
only showing top 10 rows



                                                                                

+---------------+
|PRCP_ATTRIBUTES|
+---------------+
|              G|
|              H|
|              I|
|              E|
|              C|
|              A|
|              D|
|              B|
|               |
|              F|
+---------------+



                                                                                

+----+
|SNDP|
+----+
| 3.9|
| 5.9|
| 4.3|
|13.8|
|14.6|
| 9.1|
|13.0|
| 7.5|
|14.2|
|21.7|
+----+
only showing top 10 rows



                                                                                

+------+
|FRSHTT|
+------+
|110000|
|101000|
|    10|
|100010|
| 11001|
|  1000|
| 11100|
| 10010|
|101001|
|110100|
+------+
only showing top 10 rows





+--------+
|ItRained|
+--------+
|       1|
|       0|
+--------+



                                                                                

In [50]:
# Save File as parquet and create a new notebook

### Data cleansing

In [33]:
cols_to_drop = ["STATION", "DATE", "LATITUDE", "LONGITUDE", "ELEVATION", "NAME", "TEMP_ATTRIBUTES", "DEWP_ATTRIBUTES",
               "SLP_ATTRIBUTES", "STP_ATTRIBUTES", "VISIB_ATTRIBUTES", "WDSP_ATTRIBUTES", "MAX_ATTRIBUTES",
               "MIN_ATTRIBUTES", "PRCP_ATTRIBUTES", "GUST"]

cols_interest = [x for x in columns if x not in cols_to_drop]
df_interest_cols = noaa_data.select(cols_interest)

In [34]:
df_interest_cols.printSchema()
df_clean = df_interest_cols.dropna()
[df_interest_cols.count(), df_clean.count()]

columns = df_clean.columns

# df_clean.select("GUST").summary().show()

"""
for cl in df_clean.columns: 
    print(cl)
    df_clean.select(cl).summary().show()
df_clean.select("ItRained").summary().show()
"""  

    
"""
for cl in columns:
    df_clean.describe(cl).show()


for cl in columns:
    df_clean.select(cl).distinct().show(10)
"""


root
 |-- TEMP: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- MAX: double (nullable = true)
 |-- MIN: double (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- SNDP: double (nullable = true)
 |-- FRSHTT: integer (nullable = true)
 |-- ItRained: integer (nullable = true)



                                                                                

'\nfor cl in columns:\n    df_clean.describe(cl).show()\n\n\nfor cl in columns:\n    df_clean.select(cl).distinct().show(10)\n'

In [35]:
df_clean = df_clean.filter(df_clean.TEMP > -10)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.DEWP < 100)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.SLP < 4000) #tira ~10M de rows
print(df_clean.count())
df_clean = df_clean.filter(df_clean.STP < 100) # tira ~10M de rows
print(df_clean.count())
df_clean = df_clean.filter(df_clean.VISIB < 100)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.WDSP < 100)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.MXSPD < 100)
print(df_clean.count())
# df_clean = df_clean.filter(df_clean.GUST < 100)
df_clean = df_clean.filter(df_clean.MAX < 100)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.MIN < 100)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.PRCP < 100)
print(df_clean.count())
df_clean = df_clean.filter(df_clean.SNDP < 100)
print(df_clean.count())


"""
temp_median = df_clean_pd['TEMP'].quantile(0.50)
df_clean_pd['TEMP'] = np.where(df_clean_pd['TEMP'] < -10, temp_median, df_clean_pd['TEMP'])
plt.boxplot(df_clean_pd["TEMP"])
plt.show()

dewp_median = df_clean_pd['DEWP'].quantile(0.50)
df_clean_pd['DEWP'] = np.where(df_clean_pd['DEWP'] > 100, dewp_median, df_clean_pd['DEWP'])
plt.boxplot(df_clean_pd["DEWP"])
plt.show()

df_clean_slp_filter = df_clean.filter(df_clean.SLP < 4000).toPandas()
slp_median = df_clean_slp_filter['SLP'].quantile(0.50)
df_clean_pd['SLP'] = np.where(df_clean_pd['SLP'] > 4000, slp_median, df_clean_pd['SLP'])
plt.boxplot(df_clean_pd["SLP"])
plt.show()

df_clean_stp_filter = df_clean.filter(df_clean.STP < 100).toPandas()
stp_median = df_clean_stp_filter['STP'].quantile(0.50)
df_clean_pd['STP'] = np.where(df_clean_pd['STP'] > 100, stp_median, df_clean_pd['STP'])
plt.boxplot(df_clean_pd["STP"])
plt.show()

df_clean_visib_filter = df_clean.filter(df_clean.VISIB < 100).toPandas()
visib_median = df_clean_visib_filter['VISIB'].quantile(0.50)
df_clean_pd['VISIB'] = np.where(df_clean_pd['VISIB'] > 100, visib_median, df_clean_pd['VISIB'])
plt.boxplot(df_clean_pd["VISIB"])
plt.show()

df_clean_wdsp_filter = df_clean.filter(df_clean.WDSP < 100).toPandas()
wdsp_median = df_clean_wdsp_filter['WDSP'].quantile(0.50)
df_clean_pd['WDSP'] = np.where(df_clean_pd['WDSP'] > 100, wdsp_median, df_clean_pd['WDSP'])
plt.boxplot(df_clean_pd["WDSP"])
plt.show()

df_clean_mxspd_filter = df_clean.filter(df_clean.MXSPD < 100).toPandas()
mxspd_median = df_clean_mxspd_filter['MXSPD'].quantile(0.50)
df_clean_pd['MXSPD'] = np.where(df_clean_pd['MXSPD'] > 100, mxspd_median, df_clean_pd['MXSPD'])
plt.boxplot(df_clean_pd["MXSPD"])
plt.show()

df_clean_gust_filter = df_clean.filter(df_clean.GUST < 100).toPandas()
gust_median = df_clean_gust_filter['GUST'].quantile(0.50)
df_clean_pd['GUST'] = np.where(df_clean_pd['GUST'] > 100, gust_median, df_clean_pd['GUST'])
plt.boxplot(df_clean_pd["GUST"])
plt.show()

df_clean_max_filter = df_clean.filter(df_clean.MAX < 100).toPandas()
max_median = df_clean_max_filter['MAX'].quantile(0.50)
df_clean_pd['MAX'] = np.where((df_clean_pd['MAX'] < 100) & (df_clean_pd['MAX'] > -10), df_clean_pd['MAX'], max_median)
plt.boxplot(df_clean_pd["MAX"])
plt.show()

df_clean_min_filter = df_clean.filter(df_clean.MIN < 100).toPandas()
min_median = df_clean_max_filter['MIN'].quantile(0.50)
df_clean_pd['MIN'] = np.where((df_clean_pd['MIN'] < 100) & (df_clean_pd['MIN'] > -10), df_clean_pd['MIN'], min_median)
plt.boxplot(df_clean_pd["MIN"])
plt.show()

df_clean_prcp_filter = df_clean.filter(df_clean.PRCP < 100).toPandas()
prcp_median = df_clean_prcp_filter['PRCP'].quantile(0.50)
df_clean_pd['PRCP'] = np.where(df_clean_pd['PRCP'] > 50, prcp_median, df_clean_pd['PRCP'])
plt.boxplot(df_clean_pd["PRCP"])
plt.show()

df_clean_sndp_filter = df_clean.filter(df_clean.SNDP < 100).toPandas()
sndp_median = df_clean_sndp_filter['SNDP'].quantile(0.50)
df_clean_pd['SNDP'] = np.where(df_clean_pd['SNDP'] > 200, sndp_median, df_clean_pd['SNDP'])
plt.boxplot(df_clean_pd["SNDP"])
plt.show()
"""

                                                                                

25570167


                                                                                

24451437


                                                                                

15944455


                                                                                

6454723


                                                                                

4889339


                                                                                

4812399


                                                                                

4777308


                                                                                

4725430


                                                                                

4724993


                                                                                

4724993




415129


                                                                                

'\ntemp_median = df_clean_pd[\'TEMP\'].quantile(0.50)\ndf_clean_pd[\'TEMP\'] = np.where(df_clean_pd[\'TEMP\'] < -10, temp_median, df_clean_pd[\'TEMP\'])\nplt.boxplot(df_clean_pd["TEMP"])\nplt.show()\n\ndewp_median = df_clean_pd[\'DEWP\'].quantile(0.50)\ndf_clean_pd[\'DEWP\'] = np.where(df_clean_pd[\'DEWP\'] > 100, dewp_median, df_clean_pd[\'DEWP\'])\nplt.boxplot(df_clean_pd["DEWP"])\nplt.show()\n\ndf_clean_slp_filter = df_clean.filter(df_clean.SLP < 4000).toPandas()\nslp_median = df_clean_slp_filter[\'SLP\'].quantile(0.50)\ndf_clean_pd[\'SLP\'] = np.where(df_clean_pd[\'SLP\'] > 4000, slp_median, df_clean_pd[\'SLP\'])\nplt.boxplot(df_clean_pd["SLP"])\nplt.show()\n\ndf_clean_stp_filter = df_clean.filter(df_clean.STP < 100).toPandas()\nstp_median = df_clean_stp_filter[\'STP\'].quantile(0.50)\ndf_clean_pd[\'STP\'] = np.where(df_clean_pd[\'STP\'] > 100, stp_median, df_clean_pd[\'STP\'])\nplt.boxplot(df_clean_pd["STP"])\nplt.show()\n\ndf_clean_visib_filter = df_clean.filter(df_clean.VISIB < 

In [17]:
cleanfilename = "clean-noaa"
df_clean.write.mode("overwrite").parquet(cleanfilename)

# check in your running directory if that was accomplished
! ls -la



total 5857048
drwxrwxr-x  5 nuno nuno       4096 mai 23 16:03 .
drwxr-xr-x 33 nuno nuno       4096 mai 23 15:56 ..
-rw-rw-r--  1 nuno nuno      17410 mai 23 16:02 AWS-Notebook1.ipynb
drwxr-xr-x  2 nuno nuno      12288 mai 23 16:03 clean-noaa
-rw-rw-r--  1 nuno nuno    1645585 mai 23 15:59 Download-AWS-Data.ipynb
drwxrwxr-x  8 nuno nuno       4096 mai 23 15:59 .git
-rw-rw-r--  1 nuno nuno         66 mai 10 15:50 .gitattributes
-rw-rw-r--  1 nuno nuno         16 mai 23 15:59 .gitignore
drwxrwxr-x  2 nuno nuno       4096 mai 17 12:02 .ipynb_checkpoints
-rw-rw-r--  1 nuno nuno 5995896920 mai 18 11:51 noaa.csv
-rw-rw-r--  1 nuno nuno         62 mai 10 15:50 README.md
-rw-rw-r--  1 nuno nuno       3929 mai 17 12:38 Useful_Visualization_Functions.ipynb


                                                                                

In [None]:
# df = spark.createDataFrame([['a',1],['b',1],['c',1],['d',1], ['e',1], ['f',1], ['x', 0], ['y', 0]], ['feature', 'label'])
# df.show()
major_df = df_clean.filter(col("ItRained") == 0)
minor_df = df_clean.filter(col("ItRained") == 1)
ratio = major_df.count()/minor_df.count()
sampled_majority_df = major_df.sample(False, 1/ratio)
df_clean = sampled_majority_df.unionAll(minor_df)
df_clean.select("ItRained").summary().show()



+-------+------------------+
|summary|          ItRained|
+-------+------------------+
|  count|             93537|
|   mean|0.5003902199129756|
| stddev|0.5000025204881431|
|    min|                 0|
|    25%|                 0|
|    50%|                 1|
|    75%|                 1|
|    max|                 1|
+-------+------------------+



                                                                                

In [19]:
# df_clean = myspark.createDataFrame(df_clean_pd)
df_train, df_test = df_clean.randomSplit([0.8,0.2], seed = 42)
df_train.cache()
print(f"There are {df_train.count()} rows in the training set and {df_test.count()} in the test set")



There are 74913 rows in the training set and 18624 in the test set


                                                                                

In [20]:
vec_assembler = VectorAssembler(inputCols=['TEMP', 'DEWP','SLP', 'STP', 'VISIB', 'WDSP', 'MXSPD', 'MAX', 'MIN'], outputCol="features")
vec_df_train = vec_assembler.transform(df_train)

# show the content of the columns bedrooms, features and price
# vec_df_train.select("TEMP","DEWP","features").show(200)

lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol="ItRained")
pipeline = Pipeline(stages=[vec_assembler, lsvc])
pipeline_model = pipeline.fit(df_train)
df_prediction = pipeline_model.transform(df_test)
# df_prediction.select("features", "ItRained", "prediction").sort("prediction", ascending=False).show(200)

prediction_label = df_prediction.select("prediction", "ItRained")  

# supports metricName="areaUnderROC" (default) and "areaUnderPR"
# it relates sensitivity (TP rate) and specificity (FP rate)

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='ItRained', )

# print("areaUnderROC = " + str(evaluator.evaluate(prediction_label)))

n = df_prediction.count()
tp = df_prediction.filter(expr("prediction > 0") & expr("ItRained == prediction")).count()
tn = df_prediction.filter(expr("prediction <= 0") & expr("ItRained == prediction")).count()
fp = df_prediction.filter(expr("prediction > 0") & expr("ItRained != prediction")).count()
fn = n - tp - tn - fp
print("True Positive: ",tp/n * 100, 2,"%", "\nTrue Negative: ", tn/n * 100,"%",
      "\nFalse Positive: ", fp/n * 100 ,"%", "\nFalse Negative: ", fn/n * 100,"%", 
      "\nPrediction count:", n)

"""
prediction_label = df_prediction.select("prediction", "ItRained")

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')

print("areaUnderROC = " + str(evaluator.evaluate(prediction_label)))

lr = LinearRegression(featuresCol="features", labelCol="ItRained")
lr_model = lr.fit(vec_df_train)
avg_ItRained = float(df_train.select(avg("ItRained")).first()[0])
df_pred = df_train.withColumn("avg_ItRained_prediction", lit(avg_ItRained))
avg_ItRained
evaluator = RegressionEvaluator(predictionCol="avg_ItRained_prediction", labelCol="ItRained", metricName="rmse")
print(f"The RMSE for predicting the average frshtt is: {evaluator.evaluate(df_pred):.2f}")
pipeline = Pipeline(stages=[vec_assembler, lr_model])

# get the model (as transformer)
pipeline_model = pipeline.fit(df_train)
df_prediction = pipeline_model.transform(df_test)

# show the columns worth to be looked at
df_prediction.select("features","ItRained","prediction").sample(False, 0.1).sort("ItRained", ascending=False).show(200)

df_prediction.columns
"""



True Positive:  44.979596219931274 2 % 
True Negative:  36.02341065292096 % 
False Positive:  13.799398625429554 % 
False Negative:  5.197594501718213 % 
Prediction count: 18624


                                                                                

'\nprediction_label = df_prediction.select("prediction", "ItRained")\n\nevaluator = BinaryClassificationEvaluator(rawPredictionCol=\'prediction\')\n\nprint("areaUnderROC = " + str(evaluator.evaluate(prediction_label)))\n\nlr = LinearRegression(featuresCol="features", labelCol="ItRained")\nlr_model = lr.fit(vec_df_train)\navg_ItRained = float(df_train.select(avg("ItRained")).first()[0])\ndf_pred = df_train.withColumn("avg_ItRained_prediction", lit(avg_ItRained))\navg_ItRained\nevaluator = RegressionEvaluator(predictionCol="avg_ItRained_prediction", labelCol="ItRained", metricName="rmse")\nprint(f"The RMSE for predicting the average frshtt is: {evaluator.evaluate(df_pred):.2f}")\npipeline = Pipeline(stages=[vec_assembler, lr_model])\n\n# get the model (as transformer)\npipeline_model = pipeline.fit(df_train)\ndf_prediction = pipeline_model.transform(df_test)\n\n# show the columns worth to be looked at\ndf_prediction.select("features","ItRained","prediction").sample(False, 0.1).sort("ItRa