# HW07 - Spark ML

### Importing libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col, when
from statistics import mode as _mode
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

### Spark dataframe

In [2]:
spark = SparkSession.builder.getOrCreate()

### Load dataset and define schema

In [3]:
df = spark.read.csv('weatherAUS.csv', inferSchema = True, header = True)
df.toPandas()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44,W,...,71,22,1007.7,1007.1,8,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0,,,WNW,44,NNW,...,44,25,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0,,,WSW,46,W,...,38,30,1007.6,1008.7,,2,21,23.2,No,No
3,2008-12-04,Albury,9.2,28,0,,,NE,24,SE,...,45,16,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1,,,W,41,ENE,...,82,33,1010.8,1006,7,8,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0,,,E,31,SE,...,51,24,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0,,,NNW,22,SE,...,56,21,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0,,,N,37,SE,...,53,24,1021,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27,0,,,SE,28,SSE,...,51,24,1019.4,1016.5,3,2,15.1,26,No,No


In [4]:
df.show(5)

+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+------------+
|      Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RainTomorrow|
+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+------------+
|2008-12-01|  Albury|   13.4|   22.9|     0.6|         NA|      NA|          W|           44|         W|       WNW|          20|          24|         71|         22|     1007.7|     1007.1|       8|      NA|   16.9|   21.8|       No|          No|
|2008-12-02|

### Drop Columns from the Dataset with too many NA values
- Date
- Location
- Evaporation
- Sunshine
- WindGustDir
- Cloud9am
- Cloud3pm
- Temp9am
- Temp3pm

In [5]:
df = df.drop('Date', 'Location', 'Evaporation', 'Sunshine', 'WindGustDir', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm')

In [6]:
df.show(10)

+-------+-------+--------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+
|MinTemp|MaxTemp|Rainfall|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|RainToday|RainTomorrow|
+-------+-------+--------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+
|   13.4|   22.9|     0.6|           44|         W|       WNW|          20|          24|         71|         22|     1007.7|     1007.1|       No|          No|
|    7.4|   25.1|       0|           44|       NNW|       WSW|           4|          22|         44|         25|     1010.6|     1007.8|       No|          No|
|   12.9|   25.7|       0|           46|         W|       WSW|          19|          26|         38|         30|     1007.6|     1008.7|       No|          No|
|    9.2|     28|       0|           24|

In [7]:
df.columns

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'RainToday',
 'RainTomorrow']

### Print the Number of Missing Data in each Column - display the number of **null values in each column

In [8]:
def missing():
    
    for columns in df.columns:
        print(columns + ' has number of NULLs : ' + str(df[df[columns] == 'NA'].count()))

In [9]:
missing()

MinTemp has number of NULLs : 1485
MaxTemp has number of NULLs : 1261
Rainfall has number of NULLs : 3261
WindGustSpeed has number of NULLs : 10263
WindDir9am has number of NULLs : 10566
WindDir3pm has number of NULLs : 4228
WindSpeed9am has number of NULLs : 1767
WindSpeed3pm has number of NULLs : 3062
Humidity9am has number of NULLs : 2654
Humidity3pm has number of NULLs : 4507
Pressure9am has number of NULLs : 15065
Pressure3pm has number of NULLs : 15028
RainToday has number of NULLs : 3261
RainTomorrow has number of NULLs : 3267


### Fill the Missing Data with Average Value and Maximum Occurrence Value

To fill the missing values in the dataset. If it's a **numerical column**, then replace it with the **average of that column**. If it's a **categorical column**, replace it by the **maximum frequency value**.

In [10]:
for columns in df.columns:
    
    print('The datatype of ' + str(columns) + ' ' + str(type(columns)))

The datatype of MinTemp <class 'str'>
The datatype of MaxTemp <class 'str'>
The datatype of Rainfall <class 'str'>
The datatype of WindGustSpeed <class 'str'>
The datatype of WindDir9am <class 'str'>
The datatype of WindDir3pm <class 'str'>
The datatype of WindSpeed9am <class 'str'>
The datatype of WindSpeed3pm <class 'str'>
The datatype of Humidity9am <class 'str'>
The datatype of Humidity3pm <class 'str'>
The datatype of Pressure9am <class 'str'>
The datatype of Pressure3pm <class 'str'>
The datatype of RainToday <class 'str'>
The datatype of RainTomorrow <class 'str'>


In [11]:
for columns in df.columns:
    print(df.select(_mean(col(columns))).collect())

[Row(avg(MinTemp)=12.194034380968981)]
[Row(avg(MaxTemp)=23.221348275647017)]
[Row(avg(Rainfall)=2.36091814991655)]
[Row(avg(WindGustSpeed)=40.03523007167319)]
[Row(avg(WindDir9am)=None)]
[Row(avg(WindDir3pm)=None)]
[Row(avg(WindSpeed9am)=14.043425914971502)]
[Row(avg(WindSpeed3pm)=18.662656778887342)]
[Row(avg(Humidity9am)=68.88083133761887)]
[Row(avg(Humidity3pm)=51.5391158755046)]
[Row(avg(Pressure9am)=1017.6499397983008)]
[Row(avg(Pressure3pm)=1015.255888830957)]
[Row(avg(RainToday)=None)]
[Row(avg(RainTomorrow)=None)]


#### For Numerical Columns

First find the average of that column and then use that to replace it with the NA's in that column and used the `.describe()`, `.collect()`, `.select()`, `.withColumn()`, `when()` and `otherwise()` methods, in a chained format for replacement.

In [12]:
MinTemp_mean = df.select('MinTemp').describe().collect()[1][1]
df = df.withColumn('MinTemp', when(df['MinTemp'] == 'NA', MinTemp_mean).
                                                                     otherwise(df['MinTemp']))

In [13]:
MaxTemp_mean = df.select('MaxTemp').describe().collect()[1][1]
df = df.withColumn('MaxTemp', when(df['MaxTemp'] == 'NA', MaxTemp_mean).
                                                                     otherwise(df['MaxTemp']))

In [14]:
Rainfall_mean = df.select('Rainfall').describe().collect()[1][1]
df = df.withColumn('Rainfall', when(df['Rainfall'] == 'NA', Rainfall_mean).
                                                                     otherwise(df['Rainfall']))

In [15]:
WindGustSpeed_mean = df.select('WindGustSpeed').describe().collect()[1][1]
df = df.withColumn('WindGustSpeed', when(df['WindGustSpeed'] == 'NA', WindGustSpeed_mean).
                                                                     otherwise(df['WindGustSpeed']))

In [16]:
WindSpeed9am_mean = df.select('WindSpeed9am').describe().collect()[1][1]
df = df.withColumn('WindSpeed9am', when(df['WindSpeed9am'] == 'NA', WindSpeed9am_mean).
                                                                     otherwise(df['WindSpeed9am']))

In [17]:
WindSpeed3pm_mean = df.select('WindSpeed3pm').describe().collect()[1][1]
df = df.withColumn('WindSpeed3pm', when(df['WindSpeed3pm'] == 'NA', WindSpeed3pm_mean).
                                                                     otherwise(df['WindSpeed3pm']))

In [18]:
Humidity9am_mean = df.select('Humidity9am').describe().collect()[1][1]
df = df.withColumn('Humidity9am', when(df['Humidity9am'] == 'NA', Humidity9am_mean).
                                                                     otherwise(df['Humidity9am']))

In [19]:
Humidity3pm_mean = df.select('Humidity3pm').describe().collect()[1][1]
df = df.withColumn('Humidity3pm', when(df['Humidity3pm'] == 'NA', Humidity3pm_mean).
                                                                     otherwise(df['Humidity3pm']))

In [20]:
Pressure9am_mean = df.select('Pressure9am').describe().collect()[1][1]
df = df.withColumn('Pressure9am', when(df['Pressure9am'] == 'NA', Pressure9am_mean).
                                                                     otherwise(df['Pressure9am']))

In [21]:
Pressure3pm_mean = df.select('Pressure3pm').describe().collect()[1][1]
df = df.withColumn('Pressure3pm', when(df['Pressure3pm'] == 'NA', Pressure3pm_mean).
                                                                     otherwise(df['Pressure3pm']))

In [22]:
missing()

MinTemp has number of NULLs : 0
MaxTemp has number of NULLs : 0
Rainfall has number of NULLs : 0
WindGustSpeed has number of NULLs : 0
WindDir9am has number of NULLs : 10566
WindDir3pm has number of NULLs : 4228
WindSpeed9am has number of NULLs : 0
WindSpeed3pm has number of NULLs : 0
Humidity9am has number of NULLs : 0
Humidity3pm has number of NULLs : 0
Pressure9am has number of NULLs : 0
Pressure3pm has number of NULLs : 0
RainToday has number of NULLs : 3261
RainTomorrow has number of NULLs : 3267


#### For Categorical Columns
For categorical columns, use of the `.groupBy()`, `.count()`, `orderBy()` and `.sort()` methods.

In [23]:
df.groupBy('WindDir9am').count().orderBy('WindDir9am').sort('count', ascending = False).show()

+----------+-----+
|WindDir9am|count|
+----------+-----+
|         N|11758|
|        NA|10566|
|        SE| 9287|
|         E| 9176|
|       SSE| 9112|
|        NW| 8749|
|         S| 8659|
|         W| 8459|
|        SW| 8423|
|       NNE| 8129|
|       NNW| 7980|
|       ENE| 7836|
|        NE| 7671|
|       ESE| 7630|
|       SSW| 7587|
|       WNW| 7414|
|       WSW| 7024|
+----------+-----+



In [24]:
df.groupBy('WindDir3pm').count().orderBy('WindDir3pm').sort('count', ascending = False).show()

+----------+-----+
|WindDir3pm|count|
+----------+-----+
|        SE|10838|
|         W|10110|
|         S| 9926|
|       WSW| 9518|
|       SSE| 9399|
|        SW| 9354|
|         N| 8890|
|       WNW| 8874|
|        NW| 8610|
|       ESE| 8505|
|         E| 8472|
|        NE| 8263|
|       SSW| 8156|
|       NNW| 7870|
|       ENE| 7857|
|       NNE| 6590|
|        NA| 4228|
+----------+-----+



In [25]:
df.groupBy('RainToday').count().orderBy('RainToday').sort('count', ascending = False).show()

+---------+------+
|RainToday| count|
+---------+------+
|       No|110319|
|      Yes| 31880|
|       NA|  3261|
+---------+------+



In [26]:
df.groupBy('RainTomorrow').count().orderBy('RainTomorrow').sort('count', ascending = False).show()

+------------+------+
|RainTomorrow| count|
+------------+------+
|          No|110316|
|         Yes| 31877|
|          NA|  3267|
+------------+------+



For categorical columns, `.groupBy()`, `.count()`, `orderBy()` and `.sort()` methods were used on the identified dataset, and then changed their values in the dataset.

In [27]:
WindDir9am_mode = df.groupBy('WindDir9am').count().orderBy('WindDir9am').sort('count', ascending = False).collect()[0][0]
WindDir9am_mode

'N'

In [28]:
WindDir3pm_mode = df.groupBy('WindDir3pm').count().orderBy('WindDir3pm').sort('count', ascending = False).collect()[0][0]
WindDir3pm_mode

'SE'

In [29]:
RainToday_mode = df.groupBy('RainToday').count().orderBy('RainToday').sort('count', ascending = False).collect()[0][0]
RainToday_mode

'No'

In [30]:
RainTomorrow_mode = df.groupBy('RainTomorrow').count().orderBy('RainTomorrow').sort('count', ascending = False).collect()[0][0]
RainTomorrow_mode

'No'

In [31]:
df = df.withColumn('WindDir9am', when(df['WindDir9am'] == 'NA', WindDir9am_mode).
                                                                     otherwise(df['WindDir9am']))

In [32]:
df = df.withColumn('WindDir3pm', when(df['WindDir3pm'] == 'NA', WindDir3pm_mode).
                                                                     otherwise(df['WindDir3pm']))

In [33]:
df = df.withColumn('RainToday', when(df['RainToday'] == 'NA', RainToday_mode).
                                                                     otherwise(df['RainToday']))

In [34]:
df = df.withColumn('RainTomorrow', when(df['RainTomorrow'] == 'NA', RainTomorrow_mode).
                                                                     otherwise(df['RainTomorrow']))

Once again, let's call our great friend, the `missing()` to get the number of nulls in the dataset. Remember that all the numerical columns were without nulls in the starting of this part.

In [35]:
missing()

MinTemp has number of NULLs : 0
MaxTemp has number of NULLs : 0
Rainfall has number of NULLs : 0
WindGustSpeed has number of NULLs : 0
WindDir9am has number of NULLs : 0
WindDir3pm has number of NULLs : 0
WindSpeed9am has number of NULLs : 0
WindSpeed3pm has number of NULLs : 0
Humidity9am has number of NULLs : 0
Humidity3pm has number of NULLs : 0
Pressure9am has number of NULLs : 0
Pressure3pm has number of NULLs : 0
RainToday has number of NULLs : 0
RainTomorrow has number of NULLs : 0


### Data Transformation

Before running Spark ML algorithms and to avoid error,  the numerical columns were converted from **string** to **double** and categorical columns converted from **string** to **numbers**

In [36]:
numerical_list = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                  'Humidity3pm', 'Pressure9am', 'Pressure3pm']

In [37]:
for columns in numerical_list:
    df = df.withColumn(columns, df[columns].cast(DoubleType()))

In [38]:
df.printSchema()

root
 |-- MinTemp: double (nullable = true)
 |-- MaxTemp: double (nullable = true)
 |-- Rainfall: double (nullable = true)
 |-- WindGustSpeed: double (nullable = true)
 |-- WindDir9am: string (nullable = true)
 |-- WindDir3pm: string (nullable = true)
 |-- WindSpeed9am: double (nullable = true)
 |-- WindSpeed3pm: double (nullable = true)
 |-- Humidity9am: double (nullable = true)
 |-- Humidity3pm: double (nullable = true)
 |-- Pressure9am: double (nullable = true)
 |-- Pressure3pm: double (nullable = true)
 |-- RainToday: string (nullable = true)
 |-- RainTomorrow: string (nullable = true)



For categorical columns, list named `categorical_list` created, loop over it and use the `StringIndexer()` method, call the `Pipeline()` method on it and finally `.fit()` used to fit it to the dataset.

In [39]:
categorical_list = ['WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']

In [40]:
indexers = [StringIndexer(inputCol = column, outputCol = column + "_index").fit(df) for column in categorical_list]


pipeline = Pipeline(stages = indexers)
df_indexers = pipeline.fit(df).transform(df)

df_indexers.show(10)

+-------+-------+--------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+----------------+----------------+---------------+------------------+
|MinTemp|MaxTemp|Rainfall|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|RainToday|RainTomorrow|WindDir9am_index|WindDir3pm_index|RainToday_index|RainTomorrow_index|
+-------+-------+--------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+---------+------------+----------------+----------------+---------------+------------------+
|   13.4|   22.9|     0.6|         44.0|         W|       WNW|        20.0|        24.0|       71.0|       22.0|     1007.7|     1007.1|       No|          No|             6.0|             7.0|            0.0|               0.0|
|    7.4|   25.1|     0.0|         44.0|       NNW|       WSW|         4.0|        2

In [41]:
drop = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']

df_indexers = df_indexers.drop(*drop)

df_indexers.show(5)

+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+----------------+----------------+---------------+------------------+
|MinTemp|MaxTemp|Rainfall|WindGustSpeed|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|WindDir9am_index|WindDir3pm_index|RainToday_index|RainTomorrow_index|
+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+----------------+----------------+---------------+------------------+
|   13.4|   22.9|     0.6|         44.0|        20.0|        24.0|       71.0|       22.0|     1007.7|     1007.1|             6.0|             7.0|            0.0|               0.0|
|    7.4|   25.1|     0.0|         44.0|         4.0|        22.0|       44.0|       25.0|     1010.6|     1007.8|             9.0|             3.0|            0.0|               0.0|
|   12.9|   25.7|     0.0|         46.0|        19.0|        26.0|       38.0|  

### Create the Feature Vector and Divide the Dataset

In [42]:
target = ['RainTomorrow_index']

assembler = VectorAssembler(
    inputCols = [columns for columns in df_indexers.columns if columns not in target],
    outputCol = 'feature_vector')

df_indexers_data = assembler.transform(df_indexers) 

In [43]:
df_indexers_data.show(5)

+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+----------------+----------------+---------------+------------------+--------------------+
|MinTemp|MaxTemp|Rainfall|WindGustSpeed|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|WindDir9am_index|WindDir3pm_index|RainToday_index|RainTomorrow_index|      feature_vector|
+-------+-------+--------+-------------+------------+------------+-----------+-----------+-----------+-----------+----------------+----------------+---------------+------------------+--------------------+
|   13.4|   22.9|     0.6|         44.0|        20.0|        24.0|       71.0|       22.0|     1007.7|     1007.1|             6.0|             7.0|            0.0|               0.0|[13.4,22.9,0.6,44...|
|    7.4|   25.1|     0.0|         44.0|         4.0|        22.0|       44.0|       25.0|     1010.6|     1007.8|             9.0|             3.0|            0.0|               0

In [44]:
rain_test, rain_train = df_indexers_data.randomSplit([0.2, 0.8], seed = 12345)

In [45]:
rain_test.count()

29194

In [46]:
rain_train.count()

116266

### Decision Tree Classifier

Using the `DecisionTreeClassifier()` and `.fit()` method on the training data.

In [47]:
DT_Classifier = DecisionTreeClassifier(labelCol = 'RainTomorrow_index', featuresCol = 'feature_vector')
DT_Model = DT_Classifier.fit(rain_train)

In [48]:
DT_Prediction = DT_Model.transform(rain_test)

In [49]:
DT_Prediction.head(5)

[Row(MinTemp=-4.7, MaxTemp=8.3, Rainfall=0.0, WindGustSpeed=30.0, WindSpeed9am=2.0, WindSpeed3pm=15.0, Humidity9am=76.0, Humidity3pm=64.0, Pressure9am=1026.7, Pressure3pm=1025.4, WindDir9am_index=12.0, WindDir3pm_index=5.0, RainToday_index=0.0, RainTomorrow_index=1.0, feature_vector=DenseVector([-4.7, 8.3, 0.0, 30.0, 2.0, 15.0, 76.0, 64.0, 1026.7, 1025.4, 12.0, 5.0, 0.0]), rawPrediction=DenseVector([27576.0, 6012.0]), probability=DenseVector([0.821, 0.179]), prediction=0.0),
 Row(MinTemp=-3.2, MaxTemp=15.7, Rainfall=0.0, WindGustSpeed=20.0, WindSpeed9am=2.0, WindSpeed3pm=13.0, Humidity9am=71.0, Humidity3pm=36.0, Pressure9am=1026.2, Pressure3pm=1023.0, WindDir9am_index=10.0, WindDir3pm_index=3.0, RainToday_index=0.0, RainTomorrow_index=0.0, feature_vector=DenseVector([-3.2, 15.7, 0.0, 20.0, 2.0, 13.0, 71.0, 36.0, 1026.2, 1023.0, 10.0, 3.0, 0.0]), rawPrediction=DenseVector([50758.0, 4361.0]), probability=DenseVector([0.9209, 0.0791]), prediction=0.0),
 Row(MinTemp=-3.1, MaxTemp=16.9, Rai

In [50]:
DT_Evaluator = MulticlassClassificationEvaluator(labelCol =  'RainTomorrow_index', predictionCol = 'prediction')

In [51]:
DT_Accuracy = DT_Evaluator.evaluate(DT_Prediction)

In [52]:
print('Decision Tree Accuracy is : ' + str(DT_Accuracy * 100))
print('Test Error is : ' + str(1 - DT_Accuracy))

Decision Tree Accuracy is : 81.67422935736865
Test Error is : 0.18325770642631356


### Random Forest

Using the `RandomForestClassifier()` and `.fit()` method on the training data.

In [53]:
RF_Classifier = RandomForestClassifier(labelCol = 'RainTomorrow_index', featuresCol = 'feature_vector',  maxDepth = 5,
    maxBins = 32, numTrees = 500)

RF_Model = RF_Classifier.fit(rain_train)

In [54]:
RF_Prediction = RF_Model.transform(rain_test)

In [55]:
RF_Prediction.head(5)

[Row(MinTemp=-4.7, MaxTemp=8.3, Rainfall=0.0, WindGustSpeed=30.0, WindSpeed9am=2.0, WindSpeed3pm=15.0, Humidity9am=76.0, Humidity3pm=64.0, Pressure9am=1026.7, Pressure3pm=1025.4, WindDir9am_index=12.0, WindDir3pm_index=5.0, RainToday_index=0.0, RainTomorrow_index=1.0, feature_vector=DenseVector([-4.7, 8.3, 0.0, 30.0, 2.0, 15.0, 76.0, 64.0, 1026.7, 1025.4, 12.0, 5.0, 0.0]), rawPrediction=DenseVector([441.3777, 58.6223]), probability=DenseVector([0.8828, 0.1172]), prediction=0.0),
 Row(MinTemp=-3.2, MaxTemp=15.7, Rainfall=0.0, WindGustSpeed=20.0, WindSpeed9am=2.0, WindSpeed3pm=13.0, Humidity9am=71.0, Humidity3pm=36.0, Pressure9am=1026.2, Pressure3pm=1023.0, WindDir9am_index=10.0, WindDir3pm_index=3.0, RainToday_index=0.0, RainTomorrow_index=0.0, feature_vector=DenseVector([-3.2, 15.7, 0.0, 20.0, 2.0, 13.0, 71.0, 36.0, 1026.2, 1023.0, 10.0, 3.0, 0.0]), rawPrediction=DenseVector([447.5913, 52.4087]), probability=DenseVector([0.8952, 0.1048]), prediction=0.0),
 Row(MinTemp=-3.1, MaxTemp=16.

In [56]:
RF_Evaluator = MulticlassClassificationEvaluator(labelCol = 'RainTomorrow_index', predictionCol = 'prediction')

In [57]:
RF_Accuracy = RF_Evaluator.evaluate(RF_Prediction)

In [58]:
print('Random Forest Accuracy is : ' + str(RF_Accuracy * 100))
print('Test Error is : ' + str(1 - RF_Accuracy))

Random Forest Accuracy is : 81.33219333347263
Test Error is : 0.18667806666527365


### Gradient Boosted Tree

Using the `GBTClassifier()` and `.fit()` method on the training data.

In [59]:
GBT_Classifier = GBTClassifier(labelCol = 'RainTomorrow_index', featuresCol = 'feature_vector', maxIter = 20, maxDepth = 5, 
                               maxBins = 32)

In [60]:
GBT_Model = GBT_Classifier.fit(rain_train)

In [61]:
GBT_Prediction = GBT_Model.transform(rain_test)

In [62]:
GBT_Prediction.head(5)

[Row(MinTemp=-4.7, MaxTemp=8.3, Rainfall=0.0, WindGustSpeed=30.0, WindSpeed9am=2.0, WindSpeed3pm=15.0, Humidity9am=76.0, Humidity3pm=64.0, Pressure9am=1026.7, Pressure3pm=1025.4, WindDir9am_index=12.0, WindDir3pm_index=5.0, RainToday_index=0.0, RainTomorrow_index=1.0, feature_vector=DenseVector([-4.7, 8.3, 0.0, 30.0, 2.0, 15.0, 76.0, 64.0, 1026.7, 1025.4, 12.0, 5.0, 0.0]), rawPrediction=DenseVector([1.0593, -1.0593]), probability=DenseVector([0.8927, 0.1073]), prediction=0.0),
 Row(MinTemp=-3.2, MaxTemp=15.7, Rainfall=0.0, WindGustSpeed=20.0, WindSpeed9am=2.0, WindSpeed3pm=13.0, Humidity9am=71.0, Humidity3pm=36.0, Pressure9am=1026.2, Pressure3pm=1023.0, WindDir9am_index=10.0, WindDir3pm_index=3.0, RainToday_index=0.0, RainTomorrow_index=0.0, feature_vector=DenseVector([-3.2, 15.7, 0.0, 20.0, 2.0, 13.0, 71.0, 36.0, 1026.2, 1023.0, 10.0, 3.0, 0.0]), rawPrediction=DenseVector([1.428, -1.428]), probability=DenseVector([0.9456, 0.0544]), prediction=0.0),
 Row(MinTemp=-3.1, MaxTemp=16.9, Rai

In [63]:
GBT_Evaluator = MulticlassClassificationEvaluator(labelCol = 'RainTomorrow_index', predictionCol = 'prediction')

In [64]:
GBT_Accuracy = GBT_Evaluator.evaluate(GBT_Prediction)

In [65]:
print('GBT Accuracy is : ' + str(GBT_Accuracy * 100))
print('Test Error is : ' + str(1 - GBT_Accuracy))

GBT Accuracy is : 82.78054327096406
Test Error is : 0.17219456729035942


### Logistic Regression

Using the `LogisticRegression()` and `.fit()` method on the training data.

In [66]:
LogReg_Classifier = LogisticRegression(regParam = 0.3, labelCol = "RainTomorrow_index", featuresCol = 'feature_vector', \
                                       maxIter = 20,  elasticNetParam = 0.8)

In [67]:
LogReg_Model = LogReg_Classifier.fit(rain_train)

In [68]:
LogReg_Prediction = LogReg_Model.transform(rain_test)

In [69]:
LogReg_Prediction.head(5)

[Row(MinTemp=-4.7, MaxTemp=8.3, Rainfall=0.0, WindGustSpeed=30.0, WindSpeed9am=2.0, WindSpeed3pm=15.0, Humidity9am=76.0, Humidity3pm=64.0, Pressure9am=1026.7, Pressure3pm=1025.4, WindDir9am_index=12.0, WindDir3pm_index=5.0, RainToday_index=0.0, RainTomorrow_index=1.0, feature_vector=DenseVector([-4.7, 8.3, 0.0, 30.0, 2.0, 15.0, 76.0, 64.0, 1026.7, 1025.4, 12.0, 5.0, 0.0]), rawPrediction=DenseVector([1.2715, -1.2715]), probability=DenseVector([0.781, 0.219]), prediction=0.0),
 Row(MinTemp=-3.2, MaxTemp=15.7, Rainfall=0.0, WindGustSpeed=20.0, WindSpeed9am=2.0, WindSpeed3pm=13.0, Humidity9am=71.0, Humidity3pm=36.0, Pressure9am=1026.2, Pressure3pm=1023.0, WindDir9am_index=10.0, WindDir3pm_index=3.0, RainToday_index=0.0, RainTomorrow_index=0.0, feature_vector=DenseVector([-3.2, 15.7, 0.0, 20.0, 2.0, 13.0, 71.0, 36.0, 1026.2, 1023.0, 10.0, 3.0, 0.0]), rawPrediction=DenseVector([1.2715, -1.2715]), probability=DenseVector([0.781, 0.219]), prediction=0.0),
 Row(MinTemp=-3.1, MaxTemp=16.9, Rainf

In [70]:
LogReg_Evaluator = MulticlassClassificationEvaluator(labelCol = "RainTomorrow_index", predictionCol = "prediction", \
                                                     metricName = "accuracy")

In [71]:
LogReg_Accuracy = LogReg_Evaluator.evaluate(LogReg_Prediction)

In [72]:
print('Logistic Regression Accuracy is : ' + str(LogReg_Accuracy * 100))
print('Test Error is : ' + str(1 - LogReg_Accuracy))

Logistic Regression Accuracy is : 78.02973213674042
Test Error is : 0.21970267863259574


### Calculate the confusion matrix and find the precision-recall of each classification algorithm

### Gradient Boosted Tree Model Performance

In [73]:
GBT_Prediction_Labels = GBT_Prediction.select(['prediction', 'RainTomorrow_index'])

GBT_KPI = MulticlassMetrics(GBT_Prediction_Labels.rdd.map(list))



In [74]:
GBT_confusion_matrix = GBT_KPI.confusionMatrix().toArray()

print('GBT Confusion Matrix')

print(GBT_confusion_matrix)

GBT Confusion Matrix
[[21685.  1095.]
 [ 3513.  2901.]]


In [75]:
GBT_precision = (GBT_confusion_matrix[0][0]) / (GBT_confusion_matrix[0][0] + GBT_confusion_matrix[1][0])
print('GBT Precision = ' + str(GBT_precision))

GBT Precision = 0.860584173347091


In [76]:
GBT_recall = (GBT_confusion_matrix[0][0]) / (GBT_confusion_matrix[0][0] + GBT_confusion_matrix[0][1])
print('GBT Recall = ' + str(GBT_recall))

GBT Recall = 0.9519315188762072


In [77]:
GBT_f1Score = (GBT_precision * GBT_recall) / (GBT_precision + GBT_recall) * 2
print('GBT F1 Score = ' + str(GBT_f1Score))

GBT F1 Score = 0.9039559798240862


### Decision Trees Model Performance

In [78]:
DT_Prediction_Labels = DT_Prediction.select(['prediction', 'RainTomorrow_index'])

DT_KPI = MulticlassMetrics(DT_Prediction_Labels.rdd.map(list))

In [79]:
DT_confusion_matrix = DT_KPI.confusionMatrix().toArray()

print('DT Confusion Matrix')

print(DT_confusion_matrix)

DT Confusion Matrix
[[21868.   912.]
 [ 3868.  2546.]]


In [80]:
DT_precision = (DT_confusion_matrix[0][0]) / (DT_confusion_matrix[0][0] + DT_confusion_matrix[1][0])
print('DT Precision = ' + str(DT_precision))

DT Precision = 0.8497046938141125


In [81]:
DT_recall = (DT_confusion_matrix[0][0]) / (DT_confusion_matrix[0][0] + DT_confusion_matrix[0][1])
print('DT Recall = ' + str(DT_recall))

DT Recall = 0.959964881474978


In [82]:
DT_f1Score = (DT_precision * DT_recall) / (DT_precision + DT_recall) * 2
print('DT F1 Score = ' + str(DT_f1Score))

DT F1 Score = 0.9014758017973452


### Random Forest Model Performance

In [83]:
RF_Prediction_Labels = RF_Prediction.select(['prediction', 'RainTomorrow_index'])

RF_KPI = MulticlassMetrics(RF_Prediction_Labels.rdd.map(list))

In [84]:
RF_confusion_matrix = RF_KPI.confusionMatrix().toArray()

print('RF Confusion Matrix')

print(RF_confusion_matrix)

RF Confusion Matrix
[[22068.   712.]
 [ 4059.  2355.]]


In [85]:
RF_precision = (RF_confusion_matrix[0][0]) / (RF_confusion_matrix[0][0] + RF_confusion_matrix[1][0])
print('RF Precision = ' + str(RF_precision))

RF Precision = 0.8446434722700654


In [86]:
RF_recall = (RF_confusion_matrix[0][0]) / (RF_confusion_matrix[0][0] + RF_confusion_matrix[0][1])
print('RF Recall = ' + str(RF_recall))

RF Recall = 0.9687445127304654


In [87]:
RF_f1Score = (RF_precision * RF_recall) / (RF_precision + RF_recall) * 2
print('RF F1 Score = ' + str(RF_f1Score))

RF F1 Score = 0.9024475024025191


### Logistic Regression Model Performance

In [88]:
LogReg_Prediction_Labels = LogReg_Prediction.select(['prediction', 'RainTomorrow_index'])

LogReg_KPI = MulticlassMetrics(LogReg_Prediction_Labels.rdd.map(list))

In [89]:
LogReg_confusion_matrix = LogReg_KPI.confusionMatrix().toArray()

print('LogReg Confusion Matrix')

print(LogReg_confusion_matrix)

LogReg Confusion Matrix
[[22780.     0.]
 [ 6414.     0.]]


In [90]:
LogReg_precision = (LogReg_confusion_matrix[0][0]) / (LogReg_confusion_matrix[0][0] + LogReg_confusion_matrix[1][0])
print('LogReg Precision = ' + str(LogReg_precision))

LogReg Precision = 0.7802973213674043


In [91]:
LogReg_recall = (LogReg_confusion_matrix[0][0]) / (LogReg_confusion_matrix[0][0] + LogReg_confusion_matrix[0][1])
print('LogReg Recall = ' + str(LogReg_recall))

LogReg Recall = 1.0


In [92]:
LogReg_f1Score = (LogReg_precision * LogReg_recall) / (LogReg_precision + LogReg_recall) * 2
print('LogReg F1 Score = ' + str(LogReg_f1Score))

LogReg F1 Score = 0.8765921422249586


## Even though all the Spark ML algorithms works good to predict RainTomorrow, Gradient Boosted Tree Model performs better both in Accuracy and Precision 