## Setup and Functions

In [0]:
from pyspark.sql import Row,SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import time
from pyspark.sql.functions import *
import numpy as np
import random
import datetime
import pandas as pd
import xgboost as xgb
import mlflow.xgboost
import math
import itertools

from sparkdl.xgboost import XgboostRegressor,XgboostClassifier
from sklearn.model_selection import RandomizedSearchCV

from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, udf

In [0]:
# Azure Blob Storage coordinates (both created in https://portal.azure.com).
storage_account = "apatel"  # Name of the Storage account
blob_container = "team06"  # Name of the container inside that Storage account

# Databricks secret scope and key, created on a local machine with the Databricks CLI.
secret_scope = "team06"
secret_key = "team06"

# wasbs:// URL of the container; the cells below read and write parquet tables under it.
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

# Mount path (not referenced by any of the cells visible in this part of the notebook).
mount_path = "/mnt/mids-w261"

In [0]:
# this is not used; it was for an alternate strategy we did not choose

def find_optimal_threshold_df(df, fold_num, search_center=0.6, search_bounds=0.2, granularity=5, times_to_zoom=4):
    """Find the F1-optimal threshold for one fold and append re-thresholded predictions.

    For fold ``fold_num`` the columns ``probability_<fold_num>``,
    ``prediction_<fold_num>`` and ``label_<fold_num>`` are read from ``df``.
    The threshold is located with a coarse-to-fine grid search: every level
    scores ``granularity`` evenly spaced thresholds inside
    ``[search_center - search_bounds, search_center + search_bounds]``, then
    re-centres on the best threshold seen so far and divides the bounds by 4.
    Within a level the scan stops early as soon as the F1 score drops below
    the previous threshold's score without being a new level best.

    Args:
        df: Spark DataFrame with an ``_id`` column plus the three fold columns.
        fold_num: Fold identifier used to build the column names.
        search_center: Centre of the first search window.
        search_bounds: Half-width of the first search window.
        granularity: Number of thresholds scored per level.
        times_to_zoom: Number of search levels.

    Returns:
        ``df`` inner-joined on ``_id`` with a new ``rev_pred_<fold_num>``
        column that is 1.0 where the class-1 probability is >= the chosen
        threshold and 0.0 otherwise. If no threshold ever scores above zero
        the sentinel threshold -1 is used.
    """

    prob_name = "probability_" + str(fold_num)
    pred_name = "prediction_" + str(fold_num)
    rev_pred_name = "rev_pred_" + str(fold_num)
    label_name = "label_" + str(fold_num)

    def ith_(v, i):
        # A null vector raises TypeError and a bad index raises IndexError;
        # turn both into a null probability rather than failing the Spark task.
        try:
            return float(v[i])
        except (TypeError, IndexError, ValueError):
            return None

    ith = udf(ith_, DoubleType())

    # Project once to the columns the search needs and cache that projection,
    # so every threshold evaluation reuses it instead of re-running the UDF.
    scored = df.select("_id", label_name, ith(prob_name, lit(1)).alias("del_prob"))

    def f1_at(threshold):
        # MulticlassMetrics documents its input as (prediction, label) pairs.
        flagged = scored.select(
            when(col('del_prob') >= lit(threshold), 1.0).otherwise(0.0).alias('prediction'),
            col(label_name).alias('label'),
        )
        return MulticlassMetrics(flagged.rdd).fMeasure(1.0, 1.0)

    best_score = 0
    best_thresh = -1
    scored.cache()
    try:
        for _ in range(times_to_zoom):
            search_space = np.linspace(search_center - search_bounds, search_center + search_bounds, granularity)
            level_score = 0
            level_thresh = -1
            prior_score = 0
            for threshold in search_space:
                threshold = float(threshold)
                f1_score = f1_at(threshold)
                print("threshold:", threshold, "f1 score:", f1_score)
                if f1_score > level_score:
                    level_score = f1_score
                    level_thresh = threshold
                elif f1_score < prior_score:
                    # F1 has started to fall; skip the rest of this level.
                    break
                prior_score = f1_score
            print("="*45)
            print("best score this level:", level_score, "at threshold", level_thresh)
            print("="*45)
            # Keep the best result across all levels; an early break can make
            # a finer level score worse than a coarser one.
            if level_score > best_score:
                best_score = level_score
                best_thresh = level_thresh
            if best_score <= 0:
                # Nothing scored above zero: do not re-centre on the sentinel.
                break
            search_center = best_thresh
            search_bounds = search_bounds / 4

        new_f1 = f1_at(best_thresh)
    finally:
        scored.unpersist()

    join_df = scored.select(
        '_id',
        when(col('del_prob') >= lit(best_thresh), 1.0).otherwise(0.0).alias(rev_pred_name),
    )

    # F1 of the predictions already stored in df, for comparison.
    score_metrics = df.select(col(pred_name).alias("prediction"), col(label_name).alias("label"))
    metrics = MulticlassMetrics(score_metrics.rdd)
    orig_f1 = metrics.fMeasure(1.0, 1.0)
    print("="*45)
    print("For fold", fold_num)
    print("="*45)
    print("Original f1 score:", orig_f1)
    print("New f1 score:", new_f1)

    final_df = df.join(join_df, ['_id'])

    return final_df

In [0]:
def find_optimal_threshold(df, search_center=0.6, search_bounds=0.2, granularity=5, times_to_zoom=4):
    """Find the probability threshold that maximises the F1 score of class 1.0.

    Uses a coarse-to-fine grid search: every level scores ``granularity``
    evenly spaced thresholds inside
    ``[search_center - search_bounds, search_center + search_bounds]``, then
    re-centres on the best threshold seen so far and divides the bounds by 4.
    Within a level the scan stops early as soon as the F1 score drops below
    the previous threshold's score without being a new level best.

    Args:
        df: Spark DataFrame with a ``label`` column and a ``probability``
            vector column whose element 1 is read as the class-1 probability.
        search_center: Centre of the first search window.
        search_bounds: Half-width of the first search window.
        granularity: Number of thresholds scored per level.
        times_to_zoom: Number of search levels.

    Returns:
        The best threshold found across all levels, or -1 if no threshold
        produced an F1 score above zero.
    """

    def ith_(v, i):
        # A null vector raises TypeError and a bad index raises IndexError;
        # turn both into a null probability rather than failing the Spark task.
        try:
            return float(v[i])
        except (TypeError, IndexError, ValueError):
            return None

    ith = udf(ith_, DoubleType())

    # Project once to the columns the search needs and cache that projection,
    # so every threshold evaluation reuses it instead of re-running the UDF.
    scored = df.select('label', ith("probability", lit(1)).alias('del_prob'))

    def f1_at(threshold):
        # MulticlassMetrics documents its input as (prediction, label) pairs.
        flagged = scored.select(
            when(col('del_prob') >= lit(threshold), 1.0).otherwise(0.0).alias('prediction'),
            'label',
        )
        return MulticlassMetrics(flagged.rdd).fMeasure(1.0, 1.0)

    best_score = 0
    best_thresh = -1
    scored.cache()
    try:
        for _ in range(times_to_zoom):
            search_space = np.linspace(search_center - search_bounds, search_center + search_bounds, granularity)
            level_score = 0
            level_thresh = -1
            prior_score = 0
            for threshold in search_space:
                threshold = float(threshold)
                f1_score = f1_at(threshold)
                print("threshold:", threshold, "f1 score:", f1_score)
                if f1_score > level_score:
                    level_score = f1_score
                    level_thresh = threshold
                elif f1_score < prior_score:
                    # F1 has started to fall; skip the rest of this level.
                    break
                prior_score = f1_score
            print("="*45)
            print("best score this level:", level_score, "at threshold", level_thresh)
            print("="*45)
            # Keep the best result across all levels; an early break can make
            # a finer level score worse than a coarser one.
            if level_score > best_score:
                best_score = level_score
                best_thresh = level_thresh
            if best_score <= 0:
                # Nothing scored above zero: do not re-centre on the sentinel.
                break
            search_center = best_thresh
            search_bounds = search_bounds / 4
    finally:
        scored.unpersist()

    print("overall best threshold:", best_thresh, "with f1 score", best_score)
    return best_thresh

In [0]:
# file names for reference
# val_1 = spark.read.parquet(f'{blob_url}/RF_val_pred_table_cvgroup1_0410_xgbm')
# val_2 = spark.read.parquet(f'{blob_url}/RF_val_pred_table_cvgroup2_0410_xgbm')
# val_3 = spark.read.parquet(f'{blob_url}/RF_val_pred_table_cvgroup3_0410_xgbm')
# val_4 = spark.read.parquet(f'{blob_url}/RF_val_pred_table_cvgroup4_0410_xgbm')
# val_5 = spark.read.parquet(f'{blob_url}/RF_val_pred_table_cvgroup5_0410_xgbm')

# preds_1 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup1_0410_xgbm')
# preds_2 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup2_0410_xgbm')
# preds_3 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup3_0410_xgbm')
# preds_4 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup4_0410_xgbm')
# preds_5 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup5_0410_xgbm')

UniqueID,label,probability,prediction,features
G4220NVINDPIE2019-02-032019-02-03 12:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7220805551053295, 0.2779194448946705))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 4, 8, 9, 10, 11, 12, 17, 22, 24, 26, 29, 31, 32, 33, 37, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 9.0, 0.17, 107.0, 21.0, 0.09, 1.0, 1.0, 1.0, 1.0, 9.0, 0.17, 0.09, 0.005237982757867136, 0.006267708678759388, 1.0, 10162.0, 26.0, 1676.0, 11265.0, 28.0, 17.0, 10159.0, 9866.0, 1676.0, 0.068, 1.0, 47.0, 45.0, 1.0, 356.94, 336.64, 364.35, 3.73, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-042019-02-04 11:30:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7929360925542948, 0.20706390744570513))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 17, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 57.0, 1.0, 57.0, 1.0, 0.22, 133.0, 23.0, 0.07, 1.0, 1.0, 1.0, 1.0, 57.0, 1.0, 57.0, 0.22, 1.0, 0.07, 0.005237982757867136, 0.006267708678759388, 10116.0, 26.0, 5946.596622889306, 16000.0, 61.0, 50.0, 10190.973555337903, 9828.0, 1197.9019407558733, 0.056, 1.0, 47.0, 45.0, 1.0, 655.0, 465.83, 421.16, 375.05, 4.224299065420561, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-072019-02-07 12:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.790316318708831, 0.20968368129116904))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 7, 8, 9, 10, 11, 12, 17, 22, 27, 29, 31, 32, 33, 37, 42, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 0.22, 127.0, 22.0, 0.11, 1.0, 1.0, 1.0, 1.0, 0.22, 0.11, 0.005237982757867136, 0.006267708678759388, 1.0, 1.0, 10122.0, 26.0, 61.0, 402.0, 67.0, 67.0, 10122.0, 9830.0, 61.0, 0.081, 0.11, 6.745901639344262, 47.0, 45.0, 1.0, 1065.0, 678.43, 297.89, 1263.6, 6.745901639344262, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4220NVINDPIE2019-02-082019-02-08 11:30:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.773555280126357, 0.22644471987364306))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 17, 22, 24, 27, 28, 29, 30, 31, 32, 33, 45, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 16.0, 1.0, 0.26, 134.0, 23.0, 0.19, 1.0, 1.0, 1.0, 1.0, 1.0, 16.0, 0.26, 1.0, 0.19, 0.005237982757867136, 0.006267708678759388, 1.0, 10247.0, 72.0, 5946.596622889306, 14000.0, -78.0, -122.0, 10190.973555337903, 9953.0, 1197.9019407558733, 0.077, 1.0, 47.0, 45.0, 1.0, 994.0, 678.27, 300.41, 850.77, 14.251968503937007, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-092019-02-09 14:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7849111844742793, 0.21508881552572068))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 7, 8, 9, 10, 11, 12, 17, 22, 24, 27, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 0.17, 102.0, 21.0, 0.15, 1.0, 1.0, 1.0, 1.0, 1.0, 0.17, 0.15, 0.005237982757867136, 0.006267708678759388, 10455.0, 46.0, 22000.0, 16000.0, -117.0, -156.0, 10190.973555337903, 10145.0, 1197.9019407558733, 0.137, 0.15, 14.075187969924812, 47.0, 45.0, 1.0, 695.0, 389.06, 418.29, 305.17, 14.075187969924812, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-102019-02-10 12:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8113224989635686, 0.18867750103643147))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 7, 8, 9, 10, 11, 12, 17, 22, 24, 27, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 0.15, 120.0, 23.0, 0.11, 1.0, 1.0, 1.0, 1.0, 1.0, 0.15, 0.11, 0.005237982757867136, 0.006267708678759388, 10337.0, 36.0, 22000.0, 16093.0, -67.0, -94.0, 10325.0, 10028.0, 1197.9019407558733, 0.095, 1.0, 47.0, 45.0, 1.0, 1010.0, 355.99, 412.5, 360.29, 9.392156862745098, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-112019-02-11 11:30:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7741837821307982, 0.2258162178692018))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 17, 22, 25, 26, 27, 28, 29, 31, 32, 33, 35, 43, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 19.0, 10.0, 1.0, 0.23, 130.0, 22.0, 0.2, 1.0, 1.0, 1.0, 1.0, 19.0, 1.0, 10.0, 0.23, 0.2, 0.005237982757867136, 0.006267708678759388, 3.0, 1.0, 10207.0, 15.0, 5946.596622889306, 400.0, 6.0, 6.0, 10190.973555337903, 9910.0, 1197.9019407558733, 0.067, 1.0, 47.0, 45.0, 1.0, 702.0, 413.68, 380.14, 330.35, 16.808333333333334, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4220NVINDPIE2019-02-132019-02-13 14:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6876366866371065, 0.31236331336289347))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 12, 17, 22, 29, 31, 32, 33, 45, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 0.08, 128.0, 18.0, 0.27, 1.0, 1.0, 1.0, 0.08, 0.27, 0.005237982757867136, 0.006267708678759388, 1.0, 10158.0, 93.0, 5946.596622889306, 14000.0, -61.0, -100.0, 10190.973555337903, 9861.0, 1197.9019407558733, 0.198, 0.27, 18.946902654867255, 47.0, 45.0, 1.0, 31.0, 497.87, 452.1, 599.0, 18.946902654867255, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4240NVINDPIE2019-02-142019-02-14 14:17:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7668109464851913, 0.2331890535148087))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 12, 17, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 0.16, 137.0, 24.0, 0.14, 1.0, 1.0, 1.0, 0.16, 0.14, 0.005237982757867136, 0.006267708678759388, 10112.0, 41.0, 5946.596622889306, 16000.0, 33.0, -33.0, 10190.973555337903, 9821.0, 1197.9019407558733, 0.159, 1.0, 47.0, 45.0, 1.0, 635.0, 1013.08, 290.81, 1411.89, 11.846153846153847, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-152019-02-15 16:36:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7256945591173379, 0.274305440882662))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 12, 17, 22, 24, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 0.25, 139.0, 24.0, 0.15, 1.0, 1.0, 1.0, 1.0, 0.25, 0.15, 0.005237982757867136, 0.006267708678759388, 10205.04435483871, 62.0, 7010.0, 16093.0, -39.0, -78.0, 10119.0, 9827.0, 549.0, 0.213, 0.15, 8.792592592592593, 47.0, 45.0, 1.0, 64.0, 507.22, 278.89, 339.58, 8.792592592592593, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"


In [0]:
# Load the LR prediction table for CV group 1 and render it in the notebook
# so its columns can be inspected.
LR_preds_1 = spark.read.format("parquet").load(f'{blob_url}/LR_pred_table_cvgroup1_0410_xgbm')

display(LR_preds_1)

UniqueID,label,probability,prediction,vectorized_features
G4220NVINDPIE2019-02-032019-02-03 12:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7870047635406735, 0.2129952364593265))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 4, 8, 9, 10, 11, 12, 17, 22, 24, 26, 29, 31, 32, 33, 37, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 9.0, 0.17, 107.0, 21.0, 0.09, 1.0, 1.0, 1.0, 1.0, 9.0, 0.17, 0.09, 0.005237982757867136, 0.006267708678759388, 1.0, 10162.0, 26.0, 1676.0, 11265.0, 28.0, 17.0, 10159.0, 9866.0, 1676.0, 0.068, 1.0, 47.0, 45.0, 1.0, 356.94, 336.64, 364.35, 3.73, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-042019-02-04 11:30:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7010566973591937, 0.29894330264080626))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 17, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 57.0, 1.0, 57.0, 1.0, 0.22, 133.0, 23.0, 0.07, 1.0, 1.0, 1.0, 1.0, 57.0, 1.0, 57.0, 0.22, 1.0, 0.07, 0.005237982757867136, 0.006267708678759388, 10116.0, 26.0, 5946.596622889306, 16000.0, 61.0, 50.0, 10190.973555337903, 9828.0, 1197.9019407558733, 0.056, 1.0, 47.0, 45.0, 1.0, 655.0, 465.83, 421.16, 375.05, 4.224299065420561, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-072019-02-07 12:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8431653987276045, 0.15683460127239546))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 7, 8, 9, 10, 11, 12, 17, 22, 27, 29, 31, 32, 33, 37, 42, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 0.22, 127.0, 22.0, 0.11, 1.0, 1.0, 1.0, 1.0, 0.22, 0.11, 0.005237982757867136, 0.006267708678759388, 1.0, 1.0, 10122.0, 26.0, 61.0, 402.0, 67.0, 67.0, 10122.0, 9830.0, 61.0, 0.081, 0.11, 6.745901639344262, 47.0, 45.0, 1.0, 1065.0, 678.43, 297.89, 1263.6, 6.745901639344262, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4220NVINDPIE2019-02-082019-02-08 11:30:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8648728125936691, 0.1351271874063309))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 17, 22, 24, 27, 28, 29, 30, 31, 32, 33, 45, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 16.0, 1.0, 0.26, 134.0, 23.0, 0.19, 1.0, 1.0, 1.0, 1.0, 1.0, 16.0, 0.26, 1.0, 0.19, 0.005237982757867136, 0.006267708678759388, 1.0, 10247.0, 72.0, 5946.596622889306, 14000.0, -78.0, -122.0, 10190.973555337903, 9953.0, 1197.9019407558733, 0.077, 1.0, 47.0, 45.0, 1.0, 994.0, 678.27, 300.41, 850.77, 14.251968503937007, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-092019-02-09 14:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.8886042541062512, 0.11139574589374879))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 7, 8, 9, 10, 11, 12, 17, 22, 24, 27, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 0.17, 102.0, 21.0, 0.15, 1.0, 1.0, 1.0, 1.0, 1.0, 0.17, 0.15, 0.005237982757867136, 0.006267708678759388, 10455.0, 46.0, 22000.0, 16000.0, -117.0, -156.0, 10190.973555337903, 10145.0, 1197.9019407558733, 0.137, 0.15, 14.075187969924812, 47.0, 45.0, 1.0, 695.0, 389.06, 418.29, 305.17, 14.075187969924812, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-102019-02-10 12:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9096173922052806, 0.0903826077947194))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 7, 8, 9, 10, 11, 12, 17, 22, 24, 27, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 0.15, 120.0, 23.0, 0.11, 1.0, 1.0, 1.0, 1.0, 1.0, 0.15, 0.11, 0.005237982757867136, 0.006267708678759388, 10337.0, 36.0, 22000.0, 16093.0, -67.0, -94.0, 10325.0, 10028.0, 1197.9019407558733, 0.095, 1.0, 47.0, 45.0, 1.0, 1010.0, 355.99, 412.5, 360.29, 9.392156862745098, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-112019-02-11 11:30:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.5264794862372242, 0.4735205137627758))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 17, 22, 25, 26, 27, 28, 29, 31, 32, 33, 35, 43, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 1.0, 19.0, 10.0, 1.0, 0.23, 130.0, 22.0, 0.2, 1.0, 1.0, 1.0, 1.0, 19.0, 1.0, 10.0, 0.23, 0.2, 0.005237982757867136, 0.006267708678759388, 3.0, 1.0, 10207.0, 15.0, 5946.596622889306, 400.0, 6.0, 6.0, 10190.973555337903, 9910.0, 1197.9019407558733, 0.067, 1.0, 47.0, 45.0, 1.0, 702.0, 413.68, 380.14, 330.35, 16.808333333333334, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4220NVINDPIE2019-02-132019-02-13 14:00:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.6438718163706759, 0.3561281836293241))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 12, 17, 22, 29, 31, 32, 33, 45, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 0.08, 128.0, 18.0, 0.27, 1.0, 1.0, 1.0, 0.08, 0.27, 0.005237982757867136, 0.006267708678759388, 1.0, 10158.0, 93.0, 5946.596622889306, 14000.0, -61.0, -100.0, 10190.973555337903, 9861.0, 1197.9019407558733, 0.198, 0.27, 18.946902654867255, 47.0, 45.0, 1.0, 31.0, 497.87, 452.1, 599.0, 18.946902654867255, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4240NVINDPIE2019-02-142019-02-14 14:17:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7944646961836941, 0.20553530381630591))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 12, 17, 22, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 0.16, 137.0, 24.0, 0.14, 1.0, 1.0, 1.0, 0.16, 0.14, 0.005237982757867136, 0.006267708678759388, 10112.0, 41.0, 5946.596622889306, 16000.0, 33.0, -33.0, 10190.973555337903, 9821.0, 1197.9019407558733, 0.159, 1.0, 47.0, 45.0, 1.0, 635.0, 1013.08, 290.81, 1411.89, 11.846153846153847, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"
G4219NVINDPIE2019-02-152019-02-15 16:36:00,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.7331829373713256, 0.26681706262867444))",0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 8, 9, 10, 11, 12, 17, 22, 24, 29, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(840.0, 797.0, 11.0, 0.25, 139.0, 24.0, 0.15, 1.0, 1.0, 1.0, 1.0, 0.25, 0.15, 0.005237982757867136, 0.006267708678759388, 10205.04435483871, 62.0, 7010.0, 16093.0, -39.0, -78.0, 10119.0, 9827.0, 549.0, 0.213, 0.15, 8.792592592592593, 47.0, 45.0, 1.0, 64.0, 507.22, 278.89, 339.58, 8.792592592592593, 0.18112659953388155, 13.758893628248538, 6.0, 6.0, 0.15870820792320833, 11.600504352070283, 151.0, 200.0, 0.10975609756097561, 13.073170731707316, 2815.0, 1947.0))"


In [0]:
# Append a re-thresholded prediction column for each of the five folds.
# find_optimal_threshold_df is the variant that takes fold_num and returns a
# DataFrame; find_optimal_threshold has no fold_num parameter and returns a
# bare threshold, so calling it here raised a TypeError.
# NOTE(review): preds_df is not defined in the visible cells - confirm it
# carries the probability_<n>/prediction_<n>/label_<n> columns per fold.
for i in range(1,6):
    preds_df = find_optimal_threshold_df(preds_df, fold_num=i, search_center=0.6, search_bounds=0.2, granularity=5, times_to_zoom=4)

In [0]:
def _tune_validation_threshold(table_name):
    """Load one validation prediction table and search for its best threshold.

    The table is cached only for the duration of the search. Returns the
    (already unpersisted) DataFrame and the threshold reported by
    find_optimal_threshold.
    """
    val_df = spark.read.parquet(f'{blob_url}/{table_name}')
    val_df.cache()
    best = find_optimal_threshold(val_df)
    val_df.unpersist()
    return val_df, best


# One threshold per cross-validation group, each tuned on that group's
# validation predictions.
preds_1, preds_1_thresh = _tune_validation_threshold('RF_val_pred_table_cvgroup1_0410_xgbm')
preds_2, preds_2_thresh = _tune_validation_threshold('RF_val_pred_table_cvgroup2_0410_xgbm')
preds_3, preds_3_thresh = _tune_validation_threshold('RF_val_pred_table_cvgroup3_0410_xgbm')
preds_4, preds_4_thresh = _tune_validation_threshold('RF_val_pred_table_cvgroup4_0410_xgbm')
preds_5, preds_5_thresh = _tune_validation_threshold('RF_val_pred_table_cvgroup5_0410_xgbm')

## Re-Labeling on new thresholds

In [0]:
def infer_new_labels(df, threshold, name):
    """Re-derive binary predictions from class-1 probabilities at a new threshold.

    Prints the F1 score (for class 1.0) of the predictions already stored in
    ``df`` and of the re-thresholded predictions.

    Args:
        df: Spark DataFrame with ``features``, ``label``, ``probability``
            (vector; element 1 is read as the class-1 probability) and
            ``prediction`` columns.
        threshold: Rows whose class-1 probability is >= this value are
            predicted 1.0; all other rows (including rows whose probability
            cannot be read) are predicted 0.0.
        name: Label printed with the scores to identify the run.

    Returns:
        A DataFrame with columns ``features``, ``label`` and the new
        ``prediction``.
    """
    def ith_(v, i):
        # A null vector raises TypeError and a bad index raises IndexError;
        # turn both into a null probability rather than failing the Spark task.
        try:
            return float(v[i])
        except (TypeError, IndexError, ValueError):
            return None

    ith = udf(ith_, DoubleType())
    output = df.withColumn("del_prob", ith("probability", lit(1)))

    # MulticlassMetrics documents its input as (prediction, label) pairs.
    metrics = MulticlassMetrics(df.select('prediction', 'label').rdd)
    orig_f1 = metrics.fMeasure(1.0, 1.0)
    print("="*45)
    print("For", name)
    print("Original f1 score:", orig_f1)

    test_df = output.select('features', 'label', 'del_prob')
    test_df = test_df.withColumn('prediction', when((col('del_prob') >= lit(threshold)), 1.0).otherwise(0.0))
    test_df = test_df.select('features', 'label', 'prediction')

    # Only one action runs on test_df here, so caching it (and unpersisting it
    # again before returning) would add memory pressure with no reuse.
    test_metrics = MulticlassMetrics(test_df.select('prediction', 'label').rdd)
    new_f1 = test_metrics.fMeasure(1.0, 1.0)

    print("New f1 score:", new_f1)

    return test_df

### Apply new labels and save out files

In [0]:
# For each of the five CV groups: load the RF prediction table, re-label it
# with the threshold found for that group (preds_N_thresh, set in the
# validation cell above), and write the result to blob storage as parquet
# under RF_reinf_4_10_<CV_name>.
# NOTE(review): no save mode is passed to write.parquet, so Spark's default
# applies - confirm the target paths do not exist before re-running.

# --- fold 1 ---
CV_name = 'fold_1'
save_name = 'RF_reinf_4_10_'+CV_name
preds_1 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup1_0410_xgbm')
preds_1.cache()
preds_1_reinf = infer_new_labels(df=preds_1, threshold=preds_1_thresh, name=CV_name)
preds_1.unpersist()
preds_1_reinf.cache()
preds_1_reinf.write.parquet(f"{blob_url}/{save_name}")
preds_1_reinf.unpersist()

# --- fold 2 ---
CV_name = 'fold_2'
save_name = 'RF_reinf_4_10_'+CV_name
preds_2 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup2_0410_xgbm')
preds_2.cache()
preds_2_reinf = infer_new_labels(df=preds_2, threshold=preds_2_thresh, name=CV_name)
preds_2.unpersist()
preds_2_reinf.cache()
preds_2_reinf.write.parquet(f"{blob_url}/{save_name}")
preds_2_reinf.unpersist()

# --- fold 3 ---
CV_name = 'fold_3'
save_name = 'RF_reinf_4_10_'+CV_name
preds_3 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup3_0410_xgbm')
preds_3.cache()
preds_3_reinf = infer_new_labels(df=preds_3, threshold=preds_3_thresh, name=CV_name)
preds_3.unpersist()
preds_3_reinf.cache()
preds_3_reinf.write.parquet(f"{blob_url}/{save_name}")
preds_3_reinf.unpersist()

# --- fold 4 ---
CV_name = 'fold_4'
save_name = 'RF_reinf_4_10_'+CV_name
preds_4 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup4_0410_xgbm')
preds_4.cache()
preds_4_reinf = infer_new_labels(df=preds_4, threshold=preds_4_thresh, name=CV_name)
preds_4.unpersist()
preds_4_reinf.cache()
preds_4_reinf.write.parquet(f"{blob_url}/{save_name}")
preds_4_reinf.unpersist()

# --- fold 5 ---
CV_name = 'fold_5'
save_name = 'RF_reinf_4_10_'+CV_name
preds_5 = spark.read.parquet(f'{blob_url}/RF_pred_table_cvgroup5_0410_xgbm')
preds_5.cache()
preds_5_reinf = infer_new_labels(df=preds_5, threshold=preds_5_thresh, name=CV_name)
preds_5.unpersist()
preds_5_reinf.cache()
preds_5_reinf.write.parquet(f"{blob_url}/{save_name}")
preds_5_reinf.unpersist()

#Voting and Test Set Performance

In [0]:
# NOTE: after running inference we found that the row order had become scrambled
# between the folds. Not all was lost: after much testing we determined that only
# 24 rows had duplicate features, and only 18 of those also had duplicate labels.
# Because a fold's prediction is always the same for the same features, fold 1 is
# held in place and the other folds are joined to it based on features.

# Read the re-labeled predictions back in, one frame per fold.
(
    preds_1_reinf,
    preds_2_reinf,
    preds_3_reinf,
    preds_4_reinf,
    preds_5_reinf,
) = [
    spark.read.parquet(f'{blob_url}/RF_reinf_4_10_fold_{fold_number}')
    for fold_number in (1, 2, 3, 4, 5)
]

# Verify lengths: print each fold's row count, in fold order.
for fold_preds in (
    preds_1_reinf,
    preds_2_reinf,
    preds_3_reinf,
    preds_4_reinf,
    preds_5_reinf,
):
    print(fold_preds.count())

In [0]:
# Fold 1 is the anchor frame; its prediction column gets a fold-specific name.
all_preds = preds_1_reinf.withColumnRenamed("prediction","prediction_1").select("features","prediction_1","label")

# Keeps the first two rows of every (features, label) group, ordered by prediction_1.
dedup_sql = (
    "SELECT {columns} FROM (SELECT P.*, ROW_NUMBER() OVER(PARTITION BY features, label "
    "ORDER BY prediction_1 ASC) as rownum FROM Preds P) a WHERE a.rownum < 3"
)

folds_to_join = [
    (2, preds_2_reinf),
    (3, preds_3_reinf),
    (4, preds_4_reinf),
    (5, preds_5_reinf),
]
joined_pred_cols = ["prediction_1"]

for fold_number, fold_preds in folds_to_join:
    fold_pred_col = f"prediction_{fold_number}"
    joined_pred_cols.append(fold_pred_col)

    # Attach this fold's prediction by matching on both features and label.
    right_side = (
        fold_preds.alias("R")
        .select("features", "prediction", "label")
        .withColumnRenamed("prediction", fold_pred_col)
    )
    all_preds = all_preds.alias("L").join(right_side, ['features', 'label'])

    # drop duplicates with more than 2 matches
    all_preds.createOrReplaceTempView("Preds")

    # Only the last step emits label before features; earlier steps emit
    # features first.
    key_cols = "label, features" if fold_number == 5 else "features, label"
    all_preds = spark.sql(
        dedup_sql.format(columns=key_cols + ", " + ", ".join(joined_pred_cols))
    )

In [0]:
# verify join results
# Sanity check of the five-fold join: inspect the rows and the total row count.

# Mark the joined frame for caching before the two actions below run on it.
all_preds.cache()

# Render the joined frame in the notebook (label, features, prediction_1..5).
display(all_preds)
# Last expression of the cell, so the row count is shown as the cell's result.
all_preds.count()

label,features,prediction_1,prediction_2,prediction_3,prediction_4,prediction_5
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 49, 50, 51, 52, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(1554.0, 20.0, 487.0, 1.0, 159.0, 1.0, 156.0, 1.0, 0.33, 401.0, 199.0, 0.21, 1.0, 1.0, 1.0, 1.0, 159.0, 1.0, 156.0, 0.33, 1.0, 0.21, 0.009816910506378085, 0.01025791928499623, 118.0, 10100.0, 82.0, 22000.0, 16093.0, -83.0, 10102.0, 10091.0, 1524.0, 0.111, 1.0, 26.0, 21.0, 1.0, 619.0, 253.07, 347.06, 251.83, 13.962666666666667, 0.20807924111123, 11.206206982632574, 3.0, 10.0, 0.19555506837302347, 13.98635279932038, 52.0, 125.0, 0.08695652173913043, 6.6521739130434785, 3091.0, 4412.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 21, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 81, 83, 84), values -> List(1212.0, 728.0, 607.0, 1.0, 24.0, 1.0, 17.0, 1.0, 0.17, 65.0, 684.0, 0.18, 1.0, 1.0, 1.0, 1.0, 1.0, 24.0, 1.0, 17.0, 0.17, 1.0, 0.18, 0.0034340306679597715, 0.004106753071779026, 10159.0, 51.0, 22000.0, 16000.0, -50.0, -100.0, 10170.035815268615, 9881.0, 936.1568998109641, 0.071, 1.0, 72.0, 61.0, 1.0, 488.0, 364.77, 244.85, 344.29, 16.705128205128204, 0.16533909524420967, 11.721165724304962, 8.0, 9.0, 0.13805686483199026, 10.893720541280219, 226.0, 230.0, 0.45454545454545453, 3391.0, 5258.0))",0.0,0.0,0.0,0.0,0.0
1.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(1999.0, 146.0, 1135.0, 1.0, 24.0, 1.0, 18.0, 1.0, 0.17, 241.0, 481.0, 0.14, 1.0, 1.0, 1.0, 1.0, 24.0, 1.0, 18.0, 0.17, 1.0, 0.14, 0.00930196273177531, 0.008631014346010992, 10173.0, 36.0, 7620.0, 16093.0, -122.0, 10173.0, 10116.0, 5791.0, 0.054, 1.5, 4.0, 28.0, 29.0, 1.0, 397.0, 329.88, 335.07, 373.94, 9.17117117117117, 0.20807924111123, 11.206206982632574, 3.0, 10.0, 0.18663221228016044, 11.848426411601357, 65.0, 188.0, 0.1391509433962264, 9.617924528301886, 2270.0, 3279.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(364.0, 36.0, 76.0, 1.0, 103.0, 1.0, 106.0, 1.0, 0.36, 324.0, 38.0, 0.31, 1.0, 1.0, 1.0, 103.0, 1.0, 106.0, 0.36, 1.0, 0.31, 0.008701554732167022, 0.009101861508585643, 10026.0, 36.0, 366.0, 12875.0, 256.0, 228.0, 10027.0, 10016.0, 244.0, 0.273, 1.333, 3.0, 31.0, 26.0, 1.0, 939.0, 228.69, 282.13, 181.05, 25.75811209439528, 0.16280382878038288, 12.261440632394063, 14.0, 11.0, 0.14269574059428314, 10.450193951719204, 230.0, 268.0, 0.12149532710280374, 5.841121495327103, 2734.0, 4785.0))",0.0,1.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(587.0, 146.0, 599.0, 1.0, 21.0, 1.0, 23.0, 1.0, 0.24, 300.0, 220.0, 0.4, 1.0, 1.0, 1.0, 21.0, 1.0, 23.0, 0.24, 1.0, 0.4, 0.00948494656296053, 0.008076466597018312, 10148.0, 7620.0, 16093.0, 228.0, 206.0, 10149.0, 10093.0, 4267.0, 0.123, 0.2, 51.6, 5.0, 26.0, 29.0, 1.0, 437.0, 219.98, 302.39, 260.44, 62.15613382899628, 0.2398833534977601, 14.220697917069211, 5.0, 14.0, 0.22719161699381146, 14.414062796613386, 53.0, 213.0, 0.22287968441814596, 13.303747534516765, 1434.0, 3169.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 21, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(479.0, 650.0, 96.0, 1.0, 95.0, 1.0, 89.0, 1.0, 0.36, 59.0, 376.0, 0.25, 1.0, 1.0, 1.0, 1.0, 95.0, 1.0, 89.0, 0.36, 1.0, 0.25, 0.0034362139165875897, 0.003369378573457475, 10162.0, 22000.0, 16093.0, 194.0, 172.0, 10169.0, 9940.0, 2814.7330729166665, 0.084, 0.333, 3.0, 67.0, 73.0, 1.0, 436.0, 255.22, 317.53, 347.05, 22.545454545454547, 0.20844771293625383, 11.74154436237994, 3.0, 14.0, 0.16388358778625955, 12.013120229007633, 172.0, 218.0, 0.14427860696517414, 9.124378109452737, 2354.0, 3841.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 21, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(1083.0, 650.0, 5431.0, 1.0, 20.0, 1.0, 16.0, 1.0, 0.22, 53.0, 653.0, 0.23, 1.0, 1.0, 1.0, 1.0, 20.0, 1.0, 16.0, 0.22, 1.0, 0.23, 0.0034362139165875897, 0.003369378573457475, 10220.0, 22000.0, 16093.0, 161.0, 133.0, 10227.0, 9996.0, 7620.0, 0.056, 2.0, 67.0, 73.0, 1.0, 510.0, 979.97, 296.98, 290.42, 14.5, 0.2020475085979365, 19.965618918126317, 1.0, 5.0, 2.0, 0.16388358778625955, 12.013120229007633, 172.0, 218.0, 0.2, 7.672727272727273, 1316.0, 4339.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 21, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(1888.0, 26.0, 4227.0, 1.0, 59.0, 1.0, 73.0, 1.0, 0.1, 235.0, 314.0, 0.47, 1.0, 1.0, 1.0, 1.0, 59.0, 1.0, 73.0, 0.1, 1.0, 0.47, 0.008255730076970672, 0.008484659861292751, 10198.798345398138, 853.0, 14484.0, 178.0, 167.0, 10166.0, 10162.0, 213.0, 0.345, 1.0, 33.0, 27.0, 1.0, 441.0, 490.7, 266.75, 277.57, 32.86695278969957, 0.13976920811430713, 10.67751162437862, 16.0, 16.0, 0.17988804276564627, 12.854909476019971, 130.0, 203.0, 0.04411764705882353, 1.2058823529411764, 3282.0, 5227.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(192.0, 236.0, 1026.0, 1.0, 25.0, 1.0, 15.0, 1.0, 0.1, 22.0, 1060.0, 0.16, 1.0, 1.0, 1.0, 1.0, 25.0, 1.0, 15.0, 0.1, 1.0, 0.16, 0.0016240220751936492, 0.001479771782829008, 10155.0, 15.0, 1250.0, 16093.0, 67.0, 67.0, 10156.0, 10073.0, 122.0, 0.112, 6.0, 144.0, 155.0, 1.0, 963.0, 526.58, 412.0, 514.11, 8.894736842105264, 0.12154643832964007, 7.731726942191433, 16.0, 16.0, 0.13340623291416073, 11.787862219792236, 232.0, 190.0, 0.08925869894099848, 9.703479576399396, 3069.0, 3233.0))",0.0,0.0,0.0,0.0,0.0
0.0,"Map(vectorType -> sparse, length -> 85, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 22, 25, 26, 27, 28, 29, 30, 31, 32, 33, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 83, 84), values -> List(390.0, 896.0, 435.0, 1.0, 76.0, 1.0, 65.0, 1.0, 0.22, 132.0, 189.0, 0.19, 1.0, 1.0, 1.0, 1.0, 76.0, 1.0, 65.0, 0.22, 1.0, 0.19, 0.005605913353046195, 0.006235004123687768, 10314.0, 15.0, 22000.0, 14484.0, -94.0, -128.0, 10305.0, 9981.0, 2275.280155642023, 0.118, 1.0, 45.0, 44.0, 1.0, 527.0, 263.66, 288.42, 242.9, 18.585365853658537, 0.15813352609157041, 14.09230863874748, 15.0, 11.0, 0.17840490797546013, 14.714437627811861, 112.0, 152.0, 0.0847457627118644, 9.0, 3241.0, 3908.0))",0.0,0.0,0.0,0.0,0.0


In [0]:
# Vote weight given to each fold's prediction column.
fold_vote_weights = [
    ("prediction_1", 0.15),
    ("prediction_2", 0.15),
    ("prediction_3", 0.2),
    ("prediction_4", 0.3),
    ("prediction_5", 0.2),
]

# Build the weighted sum term by term, left to right, in fold order.
(lead_col, lead_weight), *remaining_folds = fold_vote_weights
weighted_vote = lit(lead_weight) * col(lead_col)
for fold_pred_col, fold_weight in remaining_folds:
    weighted_vote = weighted_vote + lit(fold_weight) * col(fold_pred_col)

all_preds_f = all_preds.withColumn("weighted_pred", weighted_vote)

# A row is predicted positive (1.0) when the weighted vote reaches 0.5, else 0.0.
meets_vote_threshold = col("weighted_pred") >= lit(0.5)
all_preds_f = all_preds_f.withColumn("prediction", meets_vote_threshold.cast('double')).cache()

# (prediction, label) pairs of the ensemble.
test_df = all_preds_f.select('prediction','label').cache()

In [0]:
# Persist the joined per-fold predictions together with the weighted vote and the
# final prediction; mode("overwrite") replaces any existing output at this path.
all_preds_f.write.mode("overwrite").parquet(f"{blob_url}/RF_all_folds_weighted_preds_4_10")
# Commented-out read of a different table (xgboost, 4_9), kept for reference:
# all_preds_f = spark.read.parquet(f'{blob_url}/xgboost_all_folds_weighted_preds_4_9')

In [0]:
# Frames to score, in reporting order: the five individual folds first, then the
# weighted-vote ensemble.
frames_to_score = [
    ("Fold 1", preds_1_reinf),
    ("Fold 2", preds_2_reinf),
    ("Fold 3", preds_3_reinf),
    ("Fold 4", preds_4_reinf),
    ("Fold 5", preds_5_reinf),
    ("All Folds weighted vote", all_preds_f),
]

for score_label, scored_frame in frames_to_score:
    # MulticlassMetrics is fed (prediction, label) pairs, in that column order.
    test_df = scored_frame.select('prediction','label')
    test_metrics = MulticlassMetrics(test_df.rdd)
    # F-measure for label 1.0 with beta = 1.0.
    f1_score = test_metrics.fMeasure(1.0,1.0)

    print(score_label + " test set (2019) f1 score:", f1_score)