In [8]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.ml.feature import Normalizer, StandardScaler
# import random
# import logging
import time
import json
from datetime import datetime
from pymongo import MongoClient



schema = StructType([ \
    StructField("RGIID",StringType(),True), 
    StructField("Time",StringType(),True), 
    StructField("Area",FloatType(),True), 
    StructField("dh",FloatType(),True), 
    StructField("err_dh",FloatType(),True),                  
  ])

client = MongoClient('localhost', 27017)
db = client.ICS5114

iceland_collection = db.glacial_iceland_collection
europe_collection = db.glacial_europe_collection

def collection_getter(cursor):
    temp_collection = []
    for document in cursor:  
        try:
            temp_collection.append((document["rgiid"],document["time"],document["area"],document["dh"],document["err_dh"]))
        except Exception as e: print()
    return temp_collection

def initial_df(collection, time_format):
    df = spark.createDataFrame(data=collection,schema=schema)
    if(time_format == "/"):
        time_style = "dd/MM/yyyy"
    else:
        time_style = "yyyy-MM-dd"
    df = df.withColumn("Time_formatted", f.to_date(f.col("Time"), time_style))     
    df = df.sort("RGIID","Time_formatted")
    return df

temp_iceland_collection = collection_getter(iceland_collection.find({}))
temp_europe_collection = collection_getter(europe_collection.find({}))

iceland_df = initial_df(temp_iceland_collection, "/")
europe_df = initial_df(temp_europe_collection, "-")

# iceland_df

print("Iceland dataframe")
iceland_df.printSchema()
iceland_df.show()

print("Europe dataframe")
europe_df.printSchema()
europe_df.show()

Iceland dataframe
root
 |-- RGIID: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Area: float (nullable = true)
 |-- dh: float (nullable = true)
 |-- err_dh: float (nullable = true)
 |-- Time_formatted: date (nullable = true)

+--------------+----------+---------+------+------+--------------+
|         RGIID|      Time|     Area|    dh|err_dh|Time_formatted|
+--------------+----------+---------+------+------+--------------+
|RGI60-06.00001|01/01/2000|4903000.0|   0.0| 2.342|    2000-01-01|
|RGI60-06.00001|31/01/2000|4903000.0|-0.064|  2.32|    2000-01-31|
|RGI60-06.00001|02/03/2000|4903000.0|-0.143| 2.298|    2000-03-02|
|RGI60-06.00001|01/04/2000|4903000.0|-0.247| 2.275|    2000-04-01|
|RGI60-06.00001|02/05/2000|4903000.0|-0.412| 2.251|    2000-05-02|
|RGI60-06.00001|01/06/2000|4903000.0|-0.656| 2.227|    2000-06-01|
|RGI60-06.00001|02/07/2000|4903000.0|-0.931| 2.205|    2000-07-02|
|RGI60-06.00001|01/08/2000|4903000.0|-1.152| 2.196|    2000-08-01|
|RGI60-06.00001|0

In [9]:
print("Iceland dataframe")
iceland_df.describe().show()

print("Europe dataframe")
europe_df.describe().show()


Iceland dataframe
+-------+--------------+----------+--------------------+-----------------+------------------+
|summary|         RGIID|      Time|                Area|               dh|            err_dh|
+-------+--------------+----------+--------------------+-----------------+------------------+
|  count|        136887|    136887|              136887|           136887|            136887|
|   mean|          null|      null|1.9471436196570896E7|-4.64186489664232| 2.572899574157931|
| stddev|          null|      null|1.1690602907993454E8|7.298624478050692|0.9519735023456869|
|    min|RGI60-06.00001|01/01/2000|             44000.0|          -91.909|             0.413|
|    max|RGI60-06.00568|31/12/2016|        1.56121805E9|           34.833|             12.39|
+-------+--------------+----------+--------------------+-----------------+------------------+

Europe dataframe
+-------+--------------+----------+-----------------+------------------+-----------------+
|summary|         RGIID|   

In [10]:
# count null values
import pyspark.sql.functions as f
print("Iceland dataframe")
nulls = iceland_df.agg(*[f.count(f.when(f.isnull(c), c)).alias(c) for c in iceland_df.columns])
nulls.show()
print("Europe dataframe")
nulls = europe_df.agg(*[f.count(f.when(f.isnull(c), c)).alias(c) for c in europe_df.columns])
nulls.show()

Iceland dataframe
+-----+----+----+---+------+--------------+
|RGIID|Time|Area| dh|err_dh|Time_formatted|
+-----+----+----+---+------+--------------+
|    0|   0|   0|  0|     0|             0|
+-----+----+----+---+------+--------------+

Europe dataframe
+-----+----+----+------+------+--------------+
|RGIID|Time|Area|    dh|err_dh|Time_formatted|
+-----+----+----+------+------+--------------+
|    0|   0|   0|124115|123874|             0|
+-----+----+----+------+------+--------------+



In [14]:
# pearson correlation
from pyspark.sql.functions import * 
print("Iceland - Pearrson correlation between Area and Glacier elavation change")
print(iceland_df.stat.corr("Area", "dh"))
print("Europe - Pearrson correlation between Area and Glacier elavation change")
print(europe_df.stat.corr("Area", "dh"))


Iceland - Pearrson correlation between Area and Glacier elavation change
-0.1457903292574538
Europe - Pearrson correlation between Area and Glacier elavation change
-0.10496186240891502


In [12]:
iceland_df.columns

['RGIID', 'Time', 'Area', 'dh', 'err_dh', 'Time_formatted']

In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [16]:
print("Iceland - Number of Distinct Glacials:", iceland_df.select('RGIID').distinct().count())
print("Europe - Number of Distinct Glacials:", europe_df.select('RGIID').distinct().count())


Iceland - Number of Distinct Glacials: 568
Europe - Number of Distinct Glacials: 3927


In [17]:
from pyspark.sql.window import Window

def window_function(df):
    window_days = 3
    window = (
        Window
        .partitionBy(f.col("RGIID"))
        .orderBy(f.col("Time_formatted").cast("timestamp").cast("long"))
        .rowsBetween(-window_days, Window.currentRow-1)
    )

    new_all_data = (df
        .withColumn("sum",f.sum(f.col("dh")).over(window))
        .withColumn("mean",f.avg(f.col("dh")).over(window))
        .withColumn("min",f.min(f.col("dh")).over(window))
        .withColumn("max",f.max(f.col("dh")).over(window)))
    #     .withColumn("stddev",f.stddev(f.col("dh")).over(window)))

    w = (
        Window
        .partitionBy(f.col("RGIID"))
        .orderBy(f.col("Time_formatted").cast("timestamp").cast("long"))
    ) 

    # creating difference between last known dh and first known dh value in rolling window.
    df_lagger = new_all_data.withColumn('diff', f.lag(f.col("dh"),1).over(w) - f.lag(f.col("dh"),window_days).over(w))
    # df_lagger.show()

    # now including sign 
    return df_lagger.withColumn("Trend", f.signum(f.col("diff")))

iceland_df_trend = window_function(iceland_df)
europe_df_trend = window_function(europe_df)

print("Iceland dataframe with Trends")
iceland_df_trend.show()
print("Europe dataframe with Trends")
europe_df_trend.show()


Iceland dataframe with Trends
+--------------+----------+---------+------+------+--------------+--------------------+--------------------+------+------+------------+-----+
|         RGIID|      Time|     Area|    dh|err_dh|Time_formatted|                 sum|                mean|   min|   max|        diff|Trend|
+--------------+----------+---------+------+------+--------------+--------------------+--------------------+------+------+------------+-----+
|RGI60-06.00001|01/01/2000|4903000.0|   0.0| 2.342|    2000-01-01|                null|                null|  null|  null|        null| null|
|RGI60-06.00001|31/01/2000|4903000.0|-0.064|  2.32|    2000-01-31|                 0.0|                 0.0|   0.0|   0.0|        null| null|
|RGI60-06.00001|02/03/2000|4903000.0|-0.143| 2.298|    2000-03-02|-0.06400000303983688|-0.03200000151991844|-0.064|   0.0|        null| null|
|RGI60-06.00001|01/04/2000|4903000.0|-0.247| 2.275|    2000-04-01| -0.2070000097155571|-0.06900000323851903|-0.143|   

In [22]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

def feature_vector_adder(df):
    
    formatted_df = df.withColumn('Year', f.year(f.col('Time_formatted')))
    # vec_assembler = VectorAssembler(inputCols = ["Area", "dh","sum", "min", "max", "mean"], outputCol='features')
    vec_assembler = VectorAssembler(inputCols = ["Area","mean", "diff"], outputCol='features')
    output_df = vec_assembler.transform(formatted_df.dropna()) 
    output_df.printSchema()
    output_df.show()
    return output_df

print("Iceland df - with features")
iceland_df_trend_features = feature_vector_adder(iceland_df_trend)

print("Europe df - with features")
europe_df_trend_features = feature_vector_adder(europe_df_trend)

Iceland df - with features
root
 |-- RGIID: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Area: float (nullable = true)
 |-- dh: float (nullable = true)
 |-- err_dh: float (nullable = true)
 |-- Time_formatted: date (nullable = true)
 |-- sum: double (nullable = true)
 |-- mean: double (nullable = true)
 |-- min: float (nullable = true)
 |-- max: float (nullable = true)
 |-- diff: float (nullable = true)
 |-- Trend: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- features: vector (nullable = true)

+--------------+----------+---------+------+------+--------------+--------------------+--------------------+------+------+------------+-----+----+--------------------+
|         RGIID|      Time|     Area|    dh|err_dh|Time_formatted|                 sum|                mean|   min|   max|        diff|Trend|Year|            features|
+--------------+----------+---------+------+------+--------------+--------------------+--------------------+------+------

In [23]:
def feature_df_prep(df):
    final_data = df.select('features','Trend')
    final_data = final_data.withColumn("Trend",f.when(f.col("Trend") <= 0, 0).otherwise(f.col("Trend")))
    final_data.show()
    return final_data

print("Iceland features df")
iceland_feat_df = feature_df_prep(iceland_df_trend_features)
print("Europe features df")
europe_feat_df = feature_df_prep(europe_df_trend_features)


Iceland features df
+--------------------+-----+
|            features|Trend|
+--------------------+-----+
|[4903000.0,-0.069...|  0.0|
|[4903000.0,-0.151...|  0.0|
|[4903000.0,-0.267...|  0.0|
|[4903000.0,-0.438...|  0.0|
|[4903000.0,-0.666...|  0.0|
|[4903000.0,-0.912...|  0.0|
|[4903000.0,-1.117...|  0.0|
|[4903000.0,-1.238...|  0.0|
|[4903000.0,-1.279...|  0.0|
|[4903000.0,-1.279...|  1.0|
|[4903000.0,-1.279...|  0.0|
|[4903000.0,-1.304...|  0.0|
|[4903000.0,-1.355...|  0.0|
|[4903000.0,-1.428...|  0.0|
|[4903000.0,-1.535...|  0.0|
|[4903000.0,-1.695...|  0.0|
|[4903000.0,-1.912...|  0.0|
|[4903000.0,-2.147...|  0.0|
|[4903000.0,-2.341...|  0.0|
|[4903000.0,-2.450...|  0.0|
+--------------------+-----+
only showing top 20 rows

Europe features df
+--------------------+-----+
|            features|Trend|
+--------------------+-----+
|[908000.0,0.06433...|  1.0|
|[908000.0,0.10433...|  1.0|
|[908000.0,0.12166...|  1.0|
|[908000.0,0.13066...|  1.0|
|[908000.0,0.12099...|  0.0|
|[90800

In [24]:
from pyspark.sql import Row

def train_test_df_split(df):
    train_df = df.where((f.col('Year') != '2020') & (col('Trend').isNotNull()))
    test_df = df.where((f.col('Year') == '2020') & (col('Trend').isNotNull()))
    return train_df, test_df

iceland_train_df, iceland_test_df = train_test_df_split(iceland_feat_df)
europe_train_df, europe_test_df = train_test_df_split(europe_feat_df)


print("Training data - Iceland")
iceland_train_df.show()
print("Testing data - Iceland")
iceland_test_df.show()
print("----------------------------")
print("Training data - Europe")
europe_train_df.show()
print("Testing data - Europe")
europe_test_df.show()

Training data - Iceland
+--------------------+-----+
|            features|Trend|
+--------------------+-----+
|[4903000.0,-0.069...|  0.0|
|[4903000.0,-0.151...|  0.0|
|[4903000.0,-0.267...|  0.0|
|[4903000.0,-0.438...|  0.0|
|[4903000.0,-0.666...|  0.0|
|[4903000.0,-0.912...|  0.0|
|[4903000.0,-1.117...|  0.0|
|[4903000.0,-1.238...|  0.0|
|[4903000.0,-1.279...|  0.0|
|[4903000.0,-1.279...|  1.0|
|[4903000.0,-1.279...|  0.0|
|[4903000.0,-1.304...|  0.0|
|[4903000.0,-1.355...|  0.0|
|[4903000.0,-1.428...|  0.0|
|[4903000.0,-1.535...|  0.0|
|[4903000.0,-1.695...|  0.0|
|[4903000.0,-1.912...|  0.0|
|[4903000.0,-2.147...|  0.0|
|[4903000.0,-2.341...|  0.0|
|[4903000.0,-2.450...|  0.0|
+--------------------+-----+
only showing top 20 rows

Testing data - Iceland
+--------------------+-----+
|            features|Trend|
+--------------------+-----+
|[4903000.0,-6.634...|  1.0|
|[67000.0,1.772000...|  1.0|
|[2712000.0,-10.75...|  1.0|
|[448000.0,-4.6656...|  0.0|
|[77000.0,-1.45099...|  0.0|

In [25]:
from pyspark.ml.classification import *

def LR_prep(train_df, test_df):
    LR_model = LogisticRegression(labelCol = 'Trend')
    LR_model = LR_model.fit(train_df)
    predictions = LR_model.evaluate(test_df)
    predictions = predictions.predictions
    predictions.show()
    return predictions

print("Iceland log regression results")
iceland_lr_predictions = LR_prep(iceland_train_df, iceland_test_df)

print("Europe log regression results")
europe_lr_predictions = LR_prep(europe_train_df, europe_test_df)

Iceland log regression results




+--------------------+-----+--------------------+--------------------+----------+
|            features|Trend|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[4903000.0,-6.634...|  1.0|[-8352.5905563867...|           [0.0,1.0]|       1.0|
|[67000.0,1.772000...|  1.0|[-2024.2199083947...|           [0.0,1.0]|       1.0|
|[2712000.0,-10.75...|  1.0|[-4667.4727496981...|           [0.0,1.0]|       1.0|
|[448000.0,-4.6656...|  0.0|[1830.52009536162...|           [1.0,0.0]|       0.0|
|[77000.0,-1.45099...|  0.0|[1660.93936765431...|           [1.0,0.0]|       0.0|
|[119000.0,-7.2006...|  1.0|[-2873.1858082007...|           [0.0,1.0]|       1.0|
|[165000.0,-4.5836...|  1.0|[-7770.6435292382...|           [0.0,1.0]|       1.0|
|[86000.0,0.340666...|  0.0|[1491.29897469272...|           [1.0,0.0]|       0.0|
|[6608000.0,-8.237...|  1.0|[-3067.1454253699...|           [0.0,1.0]|       1.0|
|[277000.0,-2.04

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# evaluating predictions
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol="Trend")

print("Iceland predictions evaluation")
print(evaluator.evaluate(iceland_lr_predictions))

print("Europe predictions evaluation")
print(evaluator.evaluate(europe_lr_predictions))


Iceland predictions evaluation
Europe predictions evaluation


1.0

In [None]:
# https://www.youtube.com/watch?v=oDTJxEl95Go  - used as guide for spark.ml library

In [27]:
from pyspark.ml.classification import LinearSVC

def SVM_prep(train_df, test_df):
    svm = LinearSVC(maxIter=10, regParam=0.1 ,labelCol = 'Trend')
    SVM_model = svm.fit(train_df)
    svm_predictions = SVM_model.evaluate(test_df)
    svm_predictions = svm_predictions.predictions
    svm_predictions.show()
    return svm_predictions

print("Iceland svm results")
iceland_svm_predictions = SVM_prep(iceland_train_df, iceland_test_df)

print("Europe svm results")
europe_svm_predictions = SVM_prep(europe_train_df, europe_test_df)

Iceland svm results
+--------------------+-----+--------------------+----------+
|            features|Trend|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|[4903000.0,-6.634...|  1.0|[-1.0030995043271...|       1.0|
|[67000.0,1.772000...|  1.0|[-0.2574118189273...|       1.0|
|[2712000.0,-10.75...|  1.0|[-0.4241631559555...|       1.0|
|[448000.0,-4.6656...|  0.0|[0.38219824201411...|       0.0|
|[77000.0,-1.45099...|  0.0|[0.30701235046642...|       0.0|
|[119000.0,-7.2006...|  1.0|[-0.2314579126406...|       1.0|
|[165000.0,-4.5836...|  1.0|[-0.9548093217350...|       1.0|
|[86000.0,0.340666...|  0.0|[0.25464478883786...|       0.0|
|[6608000.0,-8.237...|  1.0|[-0.2421486505337...|       1.0|
|[277000.0,-2.0480...|  0.0|[0.09059244811404...|       0.0|
|[167000.0,3.46999...|  1.0|[-0.0114574573179...|       1.0|
|[128000.0,-11.118...|  1.0|[-0.1449565637031...|       1.0|
|[101000.0,-1.5883...|  1.0|[-1.8225191705240...|       1.0|
|[16

In [30]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

svm_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol="Trend")

print("Iceland svm predictions evaluation")
print(svm_evaluator.evaluate(iceland_svm_predictions))

print("Europe svm predictions evaluation")
print(svm_evaluator.evaluate(europe_svm_predictions))

Iceland svm predictions evaluation
0.9393203883495146
Europe svm predictions evaluation
0.7323434932366915


In [32]:
print("Null Counter")

print("Iceland dataframe")
nulls = iceland_df_trend.agg(*[f.count(f.when(f.isnull(c), c)).alias(c) for c in iceland_df_trend.columns])
nulls.show()
print("Europe dataframe")
nulls = europe_df_trend.agg(*[f.count(f.when(f.isnull(c), c)).alias(c) for c in europe_df_trend.columns])
nulls.show()

Null Counter
Iceland dataframe
+-----+----+----+---+------+--------------+---+----+---+---+----+-----+
|RGIID|Time|Area| dh|err_dh|Time_formatted|sum|mean|min|max|diff|Trend|
+-----+----+----+---+------+--------------+---+----+---+---+----+-----+
|    0|   0|   0|  0|     0|             0|568| 568|568|568|1704| 1704|
+-----+----+----+---+------+--------------+---+----+---+---+----+-----+

Europe dataframe
+-----+----+----+------+------+--------------+------+------+------+------+------+------+
|RGIID|Time|Area|    dh|err_dh|Time_formatted|   sum|  mean|   min|   max|  diff| Trend|
+-----+----+----+------+------+--------------+------+------+------+------+------+------+
|    0|   0|   0|124115|123874|             0|127527|127527|127527|127527|134351|134351|
+-----+----+----+------+------+--------------+------+------+------+------+------+------+



In [34]:
# # vec_assembler = VectorAssembler(inputCols = ["Area", "dh","sum", "min", "max", "mean"], outputCol='features')
# vec_assembler = VectorAssembler(inputCols = ["Area","mean"], outputCol='features')

# # very important to drop any null values here as the feature vectors must have the same shape
# output_df = vec_assembler.transform(formatted_df.dropna())
# output_df.printSchema()

# final_data = output_df.select('features','dh')
# # final_data = final_data.withColumn("dh",f.when(f.col("Trend") <= 0, 0).otherwise(f.col("Trend")))

# train_df = final_data.where(f.col('Year') != '2020')
# test_df = final_data.where(f.col('Year') == '2020')

# train_df = train_df.where((col('dh').isNotNull()))
# test_df = test_df.where((col('dh').isNotNull()))

def lin_reg_prep(train_df, test_df):
    lm = LinearRegression(labelCol="dh")
    LR_model = lm.fit(train_df)

    LR_predictions_res = LR_model.evaluate(test_df)
    LR_predictions = LR_predictions_res.predictions
    LR_predictions.show()
    return LR_predictions


print("Iceland linear regression predictions")
iceland_lin_reg_pred = lin_reg_prep(iceland_train_df, iceland_test_df)

print("Europe linear regression predictions")
europe_lin_reg_pred = lin_reg_prep(europe_train_df, europe_test_df)




root
 |-- RGIID: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Area: float (nullable = true)
 |-- dh: float (nullable = true)
 |-- err_dh: float (nullable = true)
 |-- Time_formatted: date (nullable = true)
 |-- sum: double (nullable = true)
 |-- mean: double (nullable = true)
 |-- min: float (nullable = true)
 |-- max: float (nullable = true)
 |-- diff: float (nullable = true)
 |-- Trend: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- features: vector (nullable = true)

+--------------------+-------+--------------------+
|            features|     dh|          prediction|
+--------------------+-------+--------------------+
|[4903000.0,-6.634...| -6.326|  -6.713959201897589|
|[67000.0,1.772000...|  1.904|  1.7382289042337273|
|[2712000.0,-10.75...|-10.577| -10.855493572890087|
|[448000.0,-4.6656...|  -4.67|  -4.733976758754863|
|[77000.0,-1.45099...| -1.519| -1.5020455518562374|
|[119000.0,-7.2006...| -7.132|  -7.282531219014964|
|[165000.0,-4.58

In [36]:
print("Linear Regression - Further Analysis")
print("RMSE -> ",LR_predictions_res.rootMeanSquaredError)
print("R2 -> ",LR_predictions_res.r2)
print("MSE -> ",LR_predictions_res.meanSquaredError)
print("MAE -> ",LR_predictions_res.meanAbsoluteError)

Linear Regression - Further Analysis
RMSE ->  0.4843833827312313
R2 ->  0.9976002182314097
MSE ->  0.23462726146615054
MAE ->  0.33434976223643426
