In [1]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nbimporter
import Useful_Visualization_Functions
from pyspark.ml import *
from pyspark.sql import *
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import col, explode, array, lit

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

### Build the Spark session

In [2]:
# Create (or reuse) the SparkSession used by every cell in this notebook.
myspark = (
    SparkSession.builder
    .appName("AWS-Spark")
    .config("spark.driver.memory", "15g")
    .config("spark.sql.shuffle.partitions", 6)
    # Spark config keys are case-sensitive. The previous spelling
    # "spark.sql.repl.eagereval.enabled" matched no known option, so eager
    # evaluation of DataFrames in the notebook was never actually enabled.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

22/05/24 13:13:25 WARN Utils: Your hostname, saltedcookie-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.199 instead (on interface enp42s0)
22/05/24 13:13:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/24 13:13:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/24 13:13:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/24 13:13:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/24 13:13:26 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


### Import parquet files

In [3]:
# Read the cleaned dataset from the "clean-noaa" parquet directory
# (path is relative to the notebook's working directory).
clean_noaa_path = "clean-noaa"
df_clean = myspark.read.parquet(clean_noaa_path)

# Current Day

### Undersampling

In [19]:
# Balance the two "ItRained" classes by randomly undersampling the larger one.
zero_df_cur = df_clean.filter(col("ItRained") == 0)
one_df_cur = df_clean.filter(col("ItRained") == 1)

# Count each class exactly once: every count() launches a Spark job, and the
# class sizes are needed both to pick the majority class and for the ratio.
zero_count_cur = zero_df_cur.count()
one_count_cur = one_df_cur.count()

# On a tie the "1" class is treated as the majority (same as before).
if zero_count_cur > one_count_cur:
    major_df_cur, minor_df_cur = zero_df_cur, one_df_cur
    major_count_cur, minor_count_cur = zero_count_cur, one_count_cur
else:
    major_df_cur, minor_df_cur = one_df_cur, zero_df_cur
    major_count_cur, minor_count_cur = one_count_cur, zero_count_cur

# Fail with a clear message instead of a bare ZeroDivisionError below.
if minor_count_cur == 0:
    raise ValueError("Cannot undersample: one of the ItRained classes is empty")

ratio_cur = major_count_cur / minor_count_cur

# Seed the sampler so the balanced dataset (and therefore the seeded
# train/test split built from it) is the same on every run.
sampled_majority_df_cur = major_df_cur.sample(
    withReplacement=False, fraction=1 / ratio_cur, seed=42
)

# union() is the non-deprecated spelling of unionAll(); both keep duplicates.
df_cur = sampled_majority_df_cur.union(minor_df_cur)
df_cur.select("ItRained").summary().show()

+-------+------------------+
|summary|          ItRained|
+-------+------------------+
|  count|            301999|
|   mean|0.4999718542114378|
| stddev|0.5000008270271799|
|    min|                 0|
|    25%|                 0|
|    50%|                 0|
|    75%|                 1|
|    max|                 1|
+-------+------------------+



### Split into training and testing data

In [20]:
# Seeded 80/20 split of the balanced current-day data.
split_weights_cur = [0.8, 0.2]
df_train_cur, df_test_cur = df_cur.randomSplit(split_weights_cur, seed=42)

# Mark the training rows for caching; they are counted here and used again
# when the model is fitted.
df_train_cur.cache()

print(f"There are {df_train_cur.count()} rows in the training set and {df_test_cur.count()} in the test set")

There are 241772 rows in the training set and 60227 in the test set


### Create model

In [21]:
# Weather measurements that are packed into the single "features" vector.
feature_cols_cur = ['TEMP', 'DEWP', 'SLP', 'VISIB', 'WDSP', 'MXSPD', 'MAX', 'MIN']
vec_assembler_cur = VectorAssembler(inputCols=feature_cols_cur, outputCol="features")

# Assembled view of the training rows, kept for ad-hoc inspection, e.g.
# vec_df_train_cur.select("TEMP", "DEWP", "features").show(200)
# The pipeline below is fitted on df_train_cur and runs the assembler itself.
vec_df_train_cur = vec_assembler_cur.transform(df_train_cur)

# Linear SVM predicting the "ItRained" label from the assembled features.
lsvc_cur = LinearSVC(labelCol="ItRained", maxIter=10, regParam=0.1)

pipeline_cur = Pipeline(stages=[vec_assembler_cur, lsvc_cur])
pipeline_model_cur = pipeline_cur.fit(df_train_cur)

### Test model

In [32]:
# Score the held-out current-day rows with the fitted pipeline.
df_prediction_cur = pipeline_model_cur.transform(df_test_cur)

# Keep the continuous SVM score next to the hard 0/1 prediction so the
# evaluator below has something to rank rows by.
prediction_label_cur = df_prediction_cur.select("prediction", "ItRained", "rawPrediction")

# Supports metricName="areaUnderROC" (default) and "areaUnderPR".
# The ROC curve relates sensitivity (TP rate) to the FP rate (1 - specificity).
# It has to read "rawPrediction": feeding it the thresholded "prediction"
# column collapses the curve to a single operating point.
evaluator_cur = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='ItRained')

# To report the AUC:
# print("areaUnderROC = " + str(evaluator_cur.evaluate(prediction_label_cur)))

# Confusion-matrix cells; the last one is derived from the other three.
n_cur = df_prediction_cur.count()
tp_cur = df_prediction_cur.filter(expr("prediction > 0") & expr("ItRained == prediction")).count()
tn_cur = df_prediction_cur.filter(expr("prediction <= 0") & expr("ItRained == prediction")).count()
fp_cur = df_prediction_cur.filter(expr("prediction > 0") & expr("ItRained != prediction")).count()
fn_cur = n_cur - tp_cur - tn_cur - fp_cur

# Each figure is a percentage of all test rows (not a per-class rate).
# A stray literal "2" used to be passed to print() after the first value and
# showed up in the output; it has been removed. Note that the builtin round()
# is shadowed by the star import of pyspark.sql.functions in this notebook.
print("True Positive: ", tp_cur/n_cur * 100, "%", "\nTrue Negative: ", tn_cur/n_cur * 100, "%",
      "\nFalse Positive: ", fp_cur/n_cur * 100, "%", "\nFalse Negative: ", fn_cur/n_cur * 100, "%",
      "\nPrediction count:", n_cur)
print("Accuracy: ", (tp_cur/n_cur * 100) + (tn_cur/n_cur * 100))

True Positive:  45.08110980125193 2 % 
True Negative:  36.04031414481877 % 
False Positive:  13.930629119829977 % 
False Negative:  4.947946934099325 % 
Prediction count: 60227
Accuracy:  81.1214239460707


# Next Day

### Undersampling

In [23]:
# Balance the two "NextDay" classes by randomly undersampling the larger one.
zero_df = df_clean.filter(col("NextDay") == 0)
one_df = df_clean.filter(col("NextDay") == 1)

# Count each class exactly once: every count() launches a Spark job, and the
# class sizes are needed both to pick the majority class and for the ratio.
zero_count = zero_df.count()
one_count = one_df.count()

# On a tie the "1" class is treated as the majority (same as before).
if zero_count > one_count:
    major_df, minor_df = zero_df, one_df
    major_count, minor_count = zero_count, one_count
else:
    major_df, minor_df = one_df, zero_df
    major_count, minor_count = one_count, zero_count

# Fail with a clear message instead of a bare ZeroDivisionError below.
if minor_count == 0:
    raise ValueError("Cannot undersample: one of the NextDay classes is empty")

ratio = major_count / minor_count

# Seed the sampler so the balanced dataset (and therefore the seeded
# train/test split built from it) is the same on every run.
sampled_majority_df = major_df.sample(
    withReplacement=False, fraction=1 / ratio, seed=42
)

# union() is the non-deprecated spelling of unionAll(); both keep duplicates.
df_next = sampled_majority_df.union(minor_df)
df_next.select("NextDay").summary().show()

+-------+------------------+
|summary|           NextDay|
+-------+------------------+
|  count|            293588|
|   mean|0.4997036663623854|
| stddev|0.5000007637218347|
|    min|                 0|
|    25%|                 0|
|    50%|                 0|
|    75%|                 1|
|    max|                 1|
+-------+------------------+



### Split into training and testing data

In [24]:
# Seeded 80/20 split of the balanced next-day data.
split_weights = [0.8, 0.2]
df_train, df_test = df_next.randomSplit(split_weights, seed=42)

# Mark the training rows for caching; they are counted here and used again
# when the model is fitted.
df_train.cache()

print(f"There are {df_train.count()} rows in the training set and {df_test.count()} in the test set")

There are 234992 rows in the training set and 58596 in the test set


### Create model

In [25]:
# Weather measurements that are packed into the single "features" vector.
feature_cols = ['TEMP', 'DEWP', 'SLP', 'VISIB', 'WDSP', 'MXSPD', 'MAX', 'MIN']
vec_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Assembled view of the training rows, kept for ad-hoc inspection, e.g.
# vec_df_train.select("TEMP", "DEWP", "features").show(200)
# The pipeline below is fitted on df_train and runs the assembler itself.
vec_df_train = vec_assembler.transform(df_train)

# Linear SVM predicting the "NextDay" label from the assembled features.
lsvc = LinearSVC(labelCol="NextDay", maxIter=10, regParam=0.1)

pipeline = Pipeline(stages=[vec_assembler, lsvc])
pipeline_model_next = pipeline.fit(df_train)

### Test model

In [27]:
# Score the held-out next-day rows with the fitted pipeline.
df_prediction = pipeline_model_next.transform(df_test)

# Keep the continuous SVM score next to the hard 0/1 prediction so the
# evaluator below has something to rank rows by.
prediction_label = df_prediction.select("prediction", "NextDay", "rawPrediction")

# Supports metricName="areaUnderROC" (default) and "areaUnderPR".
# The ROC curve relates sensitivity (TP rate) to the FP rate (1 - specificity).
# It has to read "rawPrediction": feeding it the thresholded "prediction"
# column collapses the curve to a single operating point.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='NextDay')

# To report the AUC:
# print("areaUnderROC = " + str(evaluator.evaluate(prediction_label)))

# Confusion-matrix cells; the last one is derived from the other three.
n = df_prediction.count()
tp = df_prediction.filter(expr("prediction > 0") & expr("NextDay == prediction")).count()
tn = df_prediction.filter(expr("prediction <= 0") & expr("NextDay == prediction")).count()
fp = df_prediction.filter(expr("prediction > 0") & expr("NextDay != prediction")).count()
fn = n - tp - tn - fp

# Each figure is a percentage of all test rows (not a per-class rate).
print("True Positive: ",tp/n * 100,"%", "\nTrue Negative: ", tn/n * 100,"%",
      "\nFalse Positive: ", fp/n * 100 ,"%", "\nFalse Negative: ", fn/n * 100,"%", 
      "\nPrediction count:", n)
print("Accuracy: ", (tp/n * 100) + (tn/n * 100))

True Positive:  42.460236193596835 % 
True Negative:  31.00894258993788 % 
False Positive:  18.934739572667077 % 
False Negative:  7.596081643798211 % 
Prediction count: 58596
Accuracy:  73.46917878353472
