In [1]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nbimporter
import Useful_Visualization_Functions
from pyspark.ml import *
from pyspark.sql import *
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import col, explode, array, lit

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

warnings.filterwarnings("ignore")
import pyspark.sql.functions as F

### Build Spark session

In [2]:
# Create (or reuse) the SparkSession used by every later cell.
# NOTE(review): 200g of driver memory is unusually large — confirm it matches the host.
myspark = (
    SparkSession.builder
    .appName("AWS-Spark")
    .config("spark.driver.memory", "200g")
    .config("spark.sql.shuffle.partitions", 6)
    # Conf keys are case-sensitive; the documented key is "eagerEval".
    # The all-lowercase spelling is stored but never read, so eager
    # DataFrame rendering was silently left disabled.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

22/05/24 17:18:15 WARN Utils: Your hostname, nuno-g14 resolves to a loopback address: 127.0.1.1; using 10.15.55.168 instead (on interface wlp2s0)
22/05/24 17:18:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/24 17:18:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/24 17:18:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Import parquet files

In [3]:
# Load the cleaned NOAA dataset from the "clean-noaa" Parquet directory
# (the path is resolved relative to the current working directory).
noaa_parquet_path = "clean-noaa"
df_clean = myspark.read.parquet(noaa_parquet_path)
# df_clean.show(10)

                                                                                

### Undersampling

In [4]:
# Balance the two ItRained classes by randomly undersampling the larger one.
zero_df = df_clean.filter(col("ItRained") == 0)
one_df = df_clean.filter(col("ItRained") == 1)

# Every count() triggers a Spark job, so count each class once and reuse the numbers.
zero_count = zero_df.count()
one_count = one_df.count()

if zero_count > one_count:
    major_df, minor_df = zero_df, one_df
    major_count, minor_count = zero_count, one_count
else:
    major_df, minor_df = one_df, zero_df
    major_count, minor_count = one_count, zero_count

# Fail with a clear message instead of a bare ZeroDivisionError below.
if minor_count == 0:
    raise ValueError("Cannot undersample: one of the ItRained classes has no rows")

ratio = major_count / minor_count
# Keep roughly minor_count rows of the majority class. The fixed seed makes the
# balanced dataset (and everything trained on it) reproducible across runs.
sampled_majority_df = major_df.sample(withReplacement=False, fraction=1 / ratio, seed=1)
# NOTE: df_clean is rebound to the balanced frame, as before; re-running this
# cell therefore operates on the already-balanced data.
df_clean = sampled_majority_df.unionAll(minor_df)
df_clean.select("ItRained").summary().show()

+-------+------------------+
|summary|          ItRained|
+-------+------------------+
|  count|            302295|
|   mean|0.4994822937858714|
| stddev|0.5000005589885445|
|    min|                 0|
|    25%|                 0|
|    50%|                 0|
|    75%|                 1|
|    max|                 1|
+-------+------------------+



In [50]:
# Measurement columns fed to the clustering step; the ItRained label is
# selected into its own frame so it is not part of the features.
cols_interest = ["TEMP", "DEWP", "SLP", "VISIB", "WDSP", "MXSPD", "MAX", "MIN"]
non_labeled_df = df_clean.select(*cols_interest)
labels_col = df_clean.select("ItRained")

In [51]:
# Pack the measurement columns into the single vector column that KMeans reads.
assembler = VectorAssembler(inputCols=cols_interest, outputCol="features")
# Re-select the raw columns before transforming: non_labeled_df is rebound to the
# assembled frame, so without this a second run of the cell would fail because
# the "features" output column already exists.
non_labeled_df = assembler.transform(non_labeled_df.select(cols_interest))

# Train a 2-cluster k-means model (seeded for reproducible initialisation).
kmeans = KMeans(featuresCol="features").setK(2).setSeed(1)
model = kmeans.fit(non_labeled_df)

# Assign every row to its nearest cluster (adds the "prediction" column).
predictions = model.transform(non_labeled_df)

# Evaluate clustering by computing the Silhouette score.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Show the fitted cluster centres, one per line, in cols_interest order.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

                                                                                

Silhouette with squared euclidean distance = 0.6367633410107271
Cluster Centers: 
[  34.06326043   29.03794858 1010.76606115   10.75443248    7.12901242
   11.46170338   39.68588165   27.8144322 ]
[ 9.90349044e+00  3.38431328e+00  1.02009727e+03  1.40831367e+01
  6.12271038e+00  9.61559101e+00  1.83445110e+01 -3.93025874e-01]


In [49]:
# predictions.orderBy('prediction', ascending=True).show()
# Bring a ~10% random sample of the clustered rows to the driver as a pandas
# DataFrame for plotting. The fixed seed keeps the sampled rows (and thus any
# figure drawn from them) the same on every run.
predictions_df_sampled = predictions.sample(withReplacement=False, fraction=0.1, seed=1).toPandas()
# Useful_Visualization_Functions.plotScatter(predictions_df_sampled,"TEMP", "DEWP", "prediction")

#Useful_Visualization_Functions.plotScatterMatrix(new, "prediction")

+----+-----+------+-----+----+-----+----+----+--------------------+----------+
|TEMP| DEWP|   SLP|VISIB|WDSP|MXSPD| MAX| MIN|            features|prediction|
+----+-----+------+-----+----+-----+----+----+--------------------+----------+
|30.5| 25.7| 998.3|  8.5|12.6| 23.3|35.6|25.7|[30.5,25.7,998.3,...|       0.0|
| 7.7| -3.4|1001.5| 31.1| 7.2|  9.7|10.2| 4.1|[7.7,-3.4,1001.5,...|       1.0|
| 9.6|  2.6| 996.2| 29.3| 7.6|  9.7|14.9| 4.5|[9.6,2.6,996.2,29...|       1.0|
|30.6| 23.7|1028.9| 31.1| 6.5| 15.5|34.9|28.9|[30.6,23.7,1028.9...|       0.0|
|22.1| 15.1|1007.7| 18.8|11.1| 19.4|28.0|14.2|[22.1,15.1,1007.7...|       0.0|
|20.2| 15.7| 985.0| 13.2|14.7| 27.2|25.0|13.5|[20.2,15.7,985.0,...|       0.0|
| 0.6| -6.8|1020.0| 27.6| 5.5|  7.8| 3.7|-1.7|[0.6,-6.8,1020.0,...|       1.0|
|11.0|  3.5|1013.0| 11.4|10.6| 19.4|17.2|-6.0|[11.0,3.5,1013.0,...|       1.0|
|12.3|  4.4|1016.4| 22.4| 8.3| 19.4|14.9| 9.7|[12.3,4.4,1016.4,...|       1.0|
|10.0| -0.4|1004.7| 28.7| 6.6| 13.6|19.4|-4.7|[10.0,

TypeError: 'ListedColormap' object is not iterable