In [57]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nbimporter
import Useful_Visualization_Functions
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.sql.functions import *
warnings.filterwarnings("ignore")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import lit, col, column, expr, desc, asc

In [None]:
# ! pip install matplotlib
# ! pip install seaborn
# ! pip install ipynb
# ! pip install nbimporter

In [None]:
# build our own SparkSession
myspark = SparkSession\
    .builder\
    .appName("AWS-Spark")\
    .config("spark.driver.memory", "15g") \
    .config("spark.sql.shuffle.partitions",6)\
    .config("spark.sql.repl.eagereval.enabled",True)\
    .getOrCreate()

In [None]:
myspark

In [None]:
# ! head noaa.csv
# noaa_data.show(10)
noaa_data = myspark.read.load("noaa.csv", format="csv", sep=",", header=True, inferSchema=True)
noaa_data.count()

In [None]:
# noaa_data.show(10)

In [None]:
# temp_filt = (noaa_data.filter(noaa_data.ELEVATION <= 5))
# temp_filt.count()

In [None]:
#latitude_order = noaa_data.orderBy("LATITUDE", ascending=False)
#latitude_order.show(10)

In [None]:
noaa_data.printSchema()

In [None]:
noaa_data.select("TEMP", "ELEVATION", "VISIB").show(10)

In [None]:

columns = noaa_data.columns
"""
for cl in columns:
    noaa_data.describe(cl).show()

for cl in columns:
    noaa_data.select(cl).distinct().show(10)
"""

In [None]:
cols_to_drop = ["STATION", "DATE", "LATITUDE", "LONGITUDE", "ELEVATION", "NAME", "TEMP_ATTRIBUTES", "DEWP_ATTRIBUTES",
               "SLP_ATTRIBUTES", "STP_ATTRIBUTES", "VISIB_ATTRIBUTES", "WDSP_ATTRIBUTES", "MAX_ATTRIBUTES",
               "MIN_ATTRIBUTES", "PRCP_ATTRIBUTES"]

cols_interest = [x for x in columns if x not in cols_to_drop]
df_interest_cols = noaa_data.select(cols_interest)

### Data cleansing

In [None]:
df_interest_cols.printSchema()
df_clean = df_interest_cols.dropna()
[df_interest_cols.count(), df_clean.count()]

columns = df_clean.columns
    
for cl in df_clean.columns: 
    df_clean.select(cl).summary().show()
    
"""
for cl in columns:
    df_clean.describe(cl).show()


for cl in columns:
    df_clean.select(cl).distinct().show(10)
"""


In [None]:
df_clean_pd = df_clean.toPandas()

temp_median = df_clean_pd['TEMP'].quantile(0.50)
df_clean_pd['TEMP'] = np.where(df_clean_pd['TEMP'] < -10, temp_median, df_clean_pd['TEMP'])
plt.boxplot(df_clean_pd["TEMP"])
plt.show()

dewp_median = df_clean_pd['DEWP'].quantile(0.50)
df_clean_pd['DEWP'] = np.where(df_clean_pd['DEWP'] > 100, dewp_median, df_clean_pd['DEWP'])
plt.boxplot(df_clean_pd["DEWP"])
plt.show()

df_clean_slp_filter = df_clean.filter(df_clean.SLP < 4000).toPandas()
slp_median = df_clean_slp_filter['SLP'].quantile(0.50)
df_clean_pd['SLP'] = np.where(df_clean_pd['SLP'] > 4000, slp_median, df_clean_pd['SLP'])
plt.boxplot(df_clean_pd["SLP"])
plt.show()

df_clean_stp_filter = df_clean.filter(df_clean.STP < 100).toPandas()
stp_median = df_clean_stp_filter['STP'].quantile(0.50)
df_clean_pd['STP'] = np.where(df_clean_pd['STP'] > 100, stp_median, df_clean_pd['STP'])
plt.boxplot(df_clean_pd["STP"])
plt.show()

df_clean_visib_filter = df_clean.filter(df_clean.VISIB < 100).toPandas()
visib_median = df_clean_visib_filter['VISIB'].quantile(0.50)
df_clean_pd['VISIB'] = np.where(df_clean_pd['VISIB'] > 100, visib_median, df_clean_pd['VISIB'])
plt.boxplot(df_clean_pd["VISIB"])
plt.show()

df_clean_wdsp_filter = df_clean.filter(df_clean.WDSP < 100).toPandas()
wdsp_median = df_clean_wdsp_filter['WDSP'].quantile(0.50)
df_clean_pd['WDSP'] = np.where(df_clean_pd['WDSP'] > 100, wdsp_median, df_clean_pd['WDSP'])
plt.boxplot(df_clean_pd["WDSP"])
plt.show()

df_clean_mxspd_filter = df_clean.filter(df_clean.MXSPD < 100).toPandas()
mxspd_median = df_clean_mxspd_filter['MXSPD'].quantile(0.50)
df_clean_pd['MXSPD'] = np.where(df_clean_pd['MXSPD'] > 100, mxspd_median, df_clean_pd['MXSPD'])
plt.boxplot(df_clean_pd["MXSPD"])
plt.show()

df_clean_gust_filter = df_clean.filter(df_clean.GUST < 100).toPandas()
gust_median = df_clean_gust_filter['GUST'].quantile(0.50)
df_clean_pd['GUST'] = np.where(df_clean_pd['GUST'] > 100, gust_median, df_clean_pd['GUST'])
plt.boxplot(df_clean_pd["GUST"])
plt.show()

df_clean_max_filter = df_clean.filter(df_clean.MAX < 100).toPandas()
max_median = df_clean_max_filter['MAX'].quantile(0.50)
df_clean_pd['MAX'] = np.where((df_clean_pd['MAX'] < 100) & (df_clean_pd['MAX'] > -10), df_clean_pd['MAX'], max_median)
plt.boxplot(df_clean_pd["MAX"])
plt.show()

df_clean_min_filter = df_clean.filter(df_clean.MIN < 100).toPandas()
min_median = df_clean_max_filter['MIN'].quantile(0.50)
df_clean_pd['MIN'] = np.where((df_clean_pd['MIN'] < 100) & (df_clean_pd['MIN'] > -10), df_clean_pd['MIN'], min_median)
plt.boxplot(df_clean_pd["MIN"])
plt.show()

df_clean_prcp_filter = df_clean.filter(df_clean.PRCP < 100).toPandas()
prcp_median = df_clean_prcp_filter['PRCP'].quantile(0.50)
df_clean_pd['PRCP'] = np.where(df_clean_pd['PRCP'] > 50, prcp_median, df_clean_pd['PRCP'])
plt.boxplot(df_clean_pd["PRCP"])
plt.show()

df_clean_sndp_filter = df_clean.filter(df_clean.SNDP < 100).toPandas()
sndp_median = df_clean_sndp_filter['SNDP'].quantile(0.50)
df_clean_pd['SNDP'] = np.where(df_clean_pd['SNDP'] > 200, sndp_median, df_clean_pd['SNDP'])
plt.boxplot(df_clean_pd["SNDP"])
plt.show()

In [67]:
df_clean = myspark.createDataFrame(df_clean_pd)
df_train, df_test = df_clean.randomSplit([0.8,0.2], seed = 42)
df_train.cache()
print(f"There are {df_train.count()} rows in the training set and {df_test.count()} in the test set")

                                                                                

There are 18133 rows in the training set and 4468 in the test set


In [68]:
vec_assembler = VectorAssembler(inputCols=['TEMP', 'DEWP', 'SLP', 'STP', 'VISIB', 'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN'], outputCol="features")
vec_df_train = vec_assembler.transform(df_train)

# show the content of the columns bedrooms, features and price
vec_df_train.select("TEMP","DEWP","features").show(10)

lr = LinearRegression(featuresCol="features", labelCol="PRCP")
lr_model = lr.fit(vec_df_train)

+----+-----+--------------------+
|TEMP| DEWP|            features|
+----+-----+--------------------+
|-7.7|-12.3|[-7.7,-12.3,998.4...|
|-5.6|-12.5|[-5.6,-12.5,1010....|
|-4.1|-14.6|[-4.1,-14.6,1016....|
|-3.4| 34.0|[-3.4,34.0,1023.0...|
|-3.1|-10.4|[-3.1,-10.4,1010....|
|-2.8| -7.8|[-2.8,-7.8,1013.4...|
|-2.3| -5.6|[-2.3,-5.6,1010.8...|
|-2.1|-16.1|[-2.1,-16.1,1010....|
|-1.8|-13.5|[-1.8,-13.5,1008....|
|-1.4|-15.0|[-1.4,-15.0,1010....|
+----+-----+--------------------+
only showing top 10 rows



22/05/17 15:52:51 WARN Instrumentation: [1a1630b2] regParam is zero, which might cause numerical instability and overfitting.


In [69]:
m =  np.around(lr_model.coefficients, decimals=2)
b =  np.around(lr_model.intercept, decimals= 2)

# print the linear regression line

print(f"The formula for the linear regression line is: \tPRCP = {m[0]} * [INPUTCOLS] + {b}")

The formula for the linear regression line is: 	PRCP = -0.0 * [INPUTCOLS] + 0.93


In [70]:
avg_prcp = float(df_train.select(avg("PRCP")).first()[0])
df_pred = df_train.withColumn("avg_prcp_prediction", lit(avg_prcp))
avg_prcp

0.03605415540726854

In [71]:
evaluator = RegressionEvaluator(predictionCol="avg_prcp_prediction", labelCol="PRCP", metricName="rmse")

print(f"The RMSE for predicting the average price is: {evaluator.evaluate(df_pred):.2f}")

The RMSE for predicting the average price is: 0.21


In [72]:
pipeline = Pipeline(stages=[vec_assembler, lr_model])

# get the model (as transformer)
pipeline_model = pipeline.fit(df_train)

In [74]:
df_prediction = pipeline_model.transform(df_test)

# show the columns worth to be looked at
df_prediction.select("features","PRCP","prediction").sample(False, 0.1).sort("PRCP", ascending=False).show(200)

df_prediction.columns

+--------------------+----+--------------------+
|            features|PRCP|          prediction|
+--------------------+----+--------------------+
|[33.8,22.7,1010.8...|2.76| 0.07207634870316681|
|[61.8,45.3,1020.7...|1.57|  0.0632703377349173|
|[38.5,23.5,1004.7...|1.57| 0.07179576368099228|
|[37.3,27.3,1006.8...|1.18| 0.05567252949824186|
|[50.1,47.0,1007.2...|0.75| 0.05182540963201432|
|[30.0,26.2,979.0,...|0.45| 0.09484696716043806|
|[40.4,34.3,1016.3...|0.41|0.038091194162264363|
|[29.1,25.4,1010.8...|0.39| 0.05046234134859362|
|[33.4,26.8,1003.3...|0.27|0.039482139837274555|
|[44.0,41.8,1015.2...|0.19| 0.00913385700596403|
|[42.6,37.4,1014.8...|0.19| 0.14521492763753774|
|[35.1,33.5,1010.8...|0.17| 0.02624101924591904|
|[45.4,38.7,1010.8...|0.16|0.029902359900101705|
|[38.4,35.0,1010.8...|0.16| 0.03463889992621105|
|[41.0,30.5,1003.9...|0.15|  0.0791146998574882|
|[58.4,48.9,1004.5...|0.15| 0.08425758610305101|
|[33.8,28.6,1013.5...|0.14|  0.0513826680940056|
|[27.2,22.9,1010.8..

['TEMP',
 'DEWP',
 'SLP',
 'STP',
 'VISIB',
 'WDSP',
 'MXSPD',
 'GUST',
 'MAX',
 'MIN',
 'PRCP',
 'SNDP',
 'FRSHTT',
 'features',
 'prediction']