In [24]:
import os 

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nbimporter
import Useful_Visualization_Functions
from pyspark.ml import *
from pyspark.sql import *
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
#from pyspark.sql.functions import *
from pyspark.sql.functions import col, explode, array, lit, expr

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

### Build spark session

In [14]:
myspark = SparkSession\
    .builder\
    .appName("AWS-Spark")\
    .config("spark.driver.memory", "15g") \
    .config("spark.sql.shuffle.partitions",6)\
    .config("spark.sql.repl.eagereval.enabled",True)\
    .getOrCreate()

### Import parquet files

In [15]:
df_clean = myspark.read.parquet("clean-noaa")

### Choice of target column

In [16]:
target_column = "ItRained"
#target_column = "ItRainedOrSnowed"
#target_column = "NextDayIR"
#target_column = "NextDayIROS"

### Undersampling

In [17]:
zero_df = df_clean.filter(col(target_column) == 0)
one_df = df_clean.filter(col(target_column) == 1)

major_df, minor_df = (zero_df, one_df) if zero_df.count() > one_df.count() else (one_df, zero_df)
    
ratio = major_df.count()/minor_df.count()
sampled_majority_df = major_df.sample(False, 1/ratio)
df_next = sampled_majority_df.unionAll(minor_df)
df_next.select(target_column).summary().show()

+-------+------------------+
|summary|          ItRained|
+-------+------------------+
|  count|            302327|
|   mean| 0.499429425754233|
| stddev|0.5000005013656338|
|    min|                 0|
|    25%|                 0|
|    50%|                 0|
|    75%|                 1|
|    max|                 1|
+-------+------------------+



### Split into training and testing data

In [18]:
df_train, df_test = df_next.randomSplit([0.8,0.2], seed = 42)
df_train.cache()
print(f"There are {df_train.count()} rows in the training set and {df_test.count()} in the test set")

                                                                                

There are 242044 rows in the training set and 60283 in the test set


### Create model

In [19]:
vec_assembler = VectorAssembler(inputCols=['TEMP', 'DEWP', 'SLP', 'VISIB', 'WDSP', 'MXSPD', 'MAX', 'MIN'], outputCol="features")
vec_df_train = vec_assembler.transform(df_train)

# show the content of the columns bedrooms, features and price
# vec_df_train.select("TEMP","DEWP","features").show(200)

lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol=target_column)
pipeline = Pipeline(stages=[vec_assembler, lsvc])
pipeline_model_next = pipeline.fit(df_train)

### Test model

In [25]:
df_prediction = pipeline_model_next.transform(df_test)
# df_prediction.select("features", "ItRained", "prediction").sort("prediction", ascending=False).show(200)

prediction_label = df_prediction.select("prediction", target_column)  

# supports metricName="areaUnderROC" (default) and "areaUnderPR"
# it relates sensitivity (TP rate) and specificity (FP rate)

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol=target_column, )

# print("areaUnderROC = " + str(evaluator.evaluate(prediction_label)))

n = df_prediction.count()
tp = df_prediction.filter(expr("prediction > 0") & expr(f"{target_column} == prediction")).count()
tn = df_prediction.filter(expr("prediction <= 0") & expr(f"{target_column} == prediction")).count()
fp = df_prediction.filter(expr("prediction > 0") & expr(f"{target_column} != prediction")).count()
fn = n - tp - tn - fp
true_positive_percentage = round(tp/n * 100, 2)
true_negative_percentage = round(tn/n * 100, 2)
false_postive_percentage = round(fp/n * 100, 2)
false_negative_percentage = round(fn/n * 100, 2)
accuracy = round((tp/n + tn/n) * 100, 2) 
print("True Positive: ",true_positive_percentage,"%", "\nTrue Negative: ", true_negative_percentage,"%",
      "\nFalse Positive: ", false_postive_percentage ,"%", "\nFalse Negative: ", false_negative_percentage,"%" 
      "\nPrediction count:", n)
print("Accuracy: ", accuracy, "%")

TypeError: Invalid argument, not a string or column: 45.01269014481694 of type <class 'float'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.