In [18]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import when
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import ensemble
import importlib
importlib.reload(ensemble)
from ensemble import *

In [2]:
# Create (or reuse) the SparkSession used by every cell below.
# Disabled setting kept for reference:
#   .config("spark.executor.memoryOverhead", "64g")
spark = (
    SparkSession.builder
    .appName("ensemble")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.driver.maxResultSize", "64g")
    .config("spark.sql.crossJoin.enabled", True)
    .getOrCreate()
)
# SparkContext handle backing the session.
sc = spark.sparkContext

In [25]:
# Root directory that holds every model's prediction files and the label files.
base_dir = "/tmp/supersecret_ensemble/"


def _prediction_paths(mf_split, other_split):
    """Build the ``{engagement: {model_name: path}}`` mapping for one data split.

    Args:
        mf_split: split token used in the mf file names ("train" or "test").
        other_split: split token used in the rf and nncf file names
            ("ensembletrain" or "test").

    Returns:
        A dict keyed by engagement (like, reply, retweet, retweet_with_comment),
        each value a dict with the "mf", "rf" and "nncf" prediction file paths.
    """
    # Engagement key -> tag used inside the rf/nncf file names. The mf files use
    # the engagement key itself, so only "retweet_with_comment" differs.
    file_tags = (
        ("like", "like"),
        ("reply", "reply"),
        ("retweet", "retweet"),
        ("retweet_with_comment", "retweet_comment"),
    )
    return {
        engagement: {
            "mf": base_dir + "mf/{}_prediction_{}.tsv".format(mf_split, engagement),
            "rf": base_dir + "rf/rf_{}_out_{}_format.csv".format(tag, other_split),
            "nncf": base_dir + "nncf/{}.supersecret_{}5k_bootstrap.tsv".format(tag, other_split),
        }
        for engagement, tag in file_tags
    }


# Per-engagement prediction files of the three base models, for each split.
train_pred_files = _prediction_paths("train", "ensembletrain")
test_pred_files = _prediction_paths("test", "test")

# Label files for the two splits.
train_label_file = base_dir + "labels/train_labels.tsv"
test_label_file = base_dir + "labels/test_labels.tsv"
def _prediction_schema(first_id, second_id):
    """Return a three-column schema: two string id columns, then a double "prediction"."""
    return StructType([
        StructField(first_id, StringType()),
        StructField(second_id, StringType()),
        StructField("prediction", DoubleType()),
    ])


# Explicit schemas per model name; the two models list the id columns in
# opposite order. No entry is defined for "mf" here.
schemas = {
    "nncf": _prediction_schema("engaging_user_id", "tweet_id"),
    "rf": _prediction_schema("tweet_id", "engaging_user_id"),
}

# Instantiate the project's `ensemble` helper with the Spark session and context;
# its `train` / `test_evaluate` methods are used in the cells below.
ens = ensemble(spark, sc)

In [26]:
def file_exist(path):
    """Return whether `path` exists on the Hadoop filesystem that owns it.

    The filesystem is resolved from the path itself, so a path carrying an
    explicit scheme is checked on that filesystem, while a scheme-less path
    falls back to the default filesystem of the Hadoop configuration.

    Args:
        path: file or directory path as a string.

    Returns:
        The result of Hadoop's ``FileSystem.exists`` for that path.
    """
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_path = sc._jvm.org.apache.hadoop.fs.Path(path)
    return hadoop_path.getFileSystem(hadoop_conf).exists(hadoop_path)
# Sanity check: report every train/test prediction file that is missing.
# The message names the split and the path so the missing file can be identified.
for engagement in train_pred_files:
    for model_name in train_pred_files[engagement]:
        paths_to_check = (
            ("train", train_pred_files[engagement][model_name]),
            ("test", test_pred_files[engagement][model_name]),
        )
        for split_name, pred_path in paths_to_check:
            if not file_exist(pred_path):
                print("File does not exist for engagement {} and model_name {} ({} split): {}".format(
                    engagement, model_name, split_name, pred_path))

In [27]:
# Train one ensemble model per engagement, persist it, and evaluate it on the test split.
coef_list = []  # one coefficient vector per engagement, in train_pred_files order
for engagement in train_pred_files:
    lr_model = ens.train(train_pred_files[engagement], schemas, train_label_file, engagement)
    # Plain save() raises when the target path already exists, which breaks any
    # re-run of this cell; overwrite the previously saved model instead.
    # NOTE(review): assumes lr_model is a pyspark.ml model (MLWritable) - confirm in ensemble.train.
    lr_model.write().overwrite().save(base_dir + "model_" + engagement)
    coef_list.append(lr_model.coefficients)

    eval_df = ens.test_evaluate(lr_model, test_pred_files[engagement], schemas, test_label_file, engagement, thresholds=[0.1, 0.3, 0.5, 0.7, 0.9])
    # Written to the notebook's working directory, one CSV per engagement.
    eval_df.to_csv("ensemble_{}.csv".format(engagement), index=False)

root
 |-- tweet_id: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- like: string (nullable = true)
 |-- reply: string (nullable = true)
 |-- retweet: string (nullable = true)
 |-- retweet_with_comment: string (nullable = true)

root
 |-- tweet_id: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- like: string (nullable = true)
 |-- reply: string (nullable = true)
 |-- retweet: string (nullable = true)
 |-- retweet_with_comment: string (nullable = true)

root
 |-- tweet_id: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- like: string (nullable = true)
 |-- reply: string (nullable = true)
 |-- retweet: string (nullable = true)
 |-- retweet_with_comment: string (nullable = true)

root
 |-- tweet_id: string (nullable = true)
 |-- engaging_user_id: string (nullable = true)
 |-- like: string (nullable = true)
 |-- reply: string (nullable = true)
 |-- retweet: string (nullable = true)
 |-- retweet_with_com

In [32]:
# Show the coefficients collected for the first engagement as a plain Python list.
list(coef_list[0])

[11.647919876905869, 1.3963699553481372, 0.0697410798905738]

In [38]:
# Build one row per engagement: [engagement, coefficient_1, coefficient_2, ...].
# A new name is used instead of rebinding coef_list, so re-running this cell
# does not prepend the engagement a second time and break the column count.
coef_rows = [
    [engagement_name] + list(coef)
    for engagement_name, coef in zip(train_pred_files.keys(), coef_list)
]
# NOTE(review): the column labels assume the coefficients come in mf, rf, nncf order - confirm in ensemble.train.
pd.DataFrame(coef_rows, columns=["engagement", "mf", "rf", "nncf"]).to_csv("coefficients.csv", index=False)