In [24]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import when
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import ensemble
import importlib
importlib.reload(ensemble)
from ensemble import *

In [2]:
#.config("spark.executor.memoryOverhead", "64g")\
spark = SparkSession\
    .builder\
    .appName("ensemble")\
    .config("spark.executor.heartbeatInterval", "60s")\
    .config("spark.executor.memory", "32g")\
    .config("spark.driver.memory", "32g")\
    .config("spark.driver.maxResultSize", "64g")\
    .config("spark.sql.crossJoin.enabled", True)\
    .getOrCreate()
sc = spark.sparkContext

In [10]:
from twitter_preproc import *
df = twitter_preproc(spark, sc, "/tmp/traintweet_1000.tsv", MF=True).outputDF
df = df.select("tweet_id", "engaging_user_id", "like_timestamp")\
    .withColumn("label", F.when(F.col("like_timestamp").isNull(), F.lit(0)).otherwise(F.lit(1)))\
    .drop("like_timestamp")
df.show(5, False)

+--------------------------------+--------------------------------+-----+
|tweet_id                        |engaging_user_id                |label|
+--------------------------------+--------------------------------+-----+
|E7D6C5094767223F6F8789A87A1937AB|00000776B07587ECA9717BFC301F2D6E|0    |
|129F4A868712BA2B98D31AF98C3066E4|00000B85AAF7DE172876FD96718C4469|1    |
|04C6C2175852CDBBC23B2446C7E7C22D|00000E0C9B364891CDE89ECFC54771DE|1    |
|168157826315514C120494D4DF8E6216|00000F04EEDBCF3E1FB9A1948BF353B6|1    |
|B3E3673782A69D9D8A45D3B222F0B073|000010088197DA00D659853E06935B3E|1    |
+--------------------------------+--------------------------------+-----+
only showing top 5 rows



In [20]:
train_df, test_df = df.randomSplit([0.9, 0.1])
train_df.count(), test_df.count()

(890, 110)

In [21]:
train_df.repartition(1).write.csv("ensemble/train/like_labels", sep="\x01", header=True)
test_df.repartition(1).write.csv("ensemble/test/like_labels", sep="\x01", header=True)

In [22]:
model_names = ["model1", "model2", "model3"]
for model_name in model_names:
    train_pred = train_df.withColumn("prediction", F.rand(seed=19)).drop("label")
    test_pred = test_df.withColumn("prediction", F.rand(seed=20)).drop("label")
    
    train_pred.repartition(1).write.csv("ensemble/train/{}_predictions".format(model_name), sep="\x01", header=True)
    test_pred.repartition(1).write.csv("ensemble/test/{}_predictions".format(model_name), sep="\x01", header=True)

In [25]:
ens = ensemble(spark, sc)
train_pred_files = {"model1": "ensemble/lol/train/model1_predictions.tsv", 
                    "model2": "ensemble/lol/train/model2_predictions.tsv", 
                    "model3": "ensemble/lol/train/model3_predictions.tsv"}
test_pred_files = {"model1": "ensemble/lol/test/model1_predictions.tsv", 
                    "model2": "ensemble/lol/test/model2_predictions.tsv", 
                    "model3": "ensemble/lol/test/model3_predictions.tsv"}
train_label_file = "ensemble/lol/train/like_labels.tsv"
test_label_file = "ensemble/lol/test/like_labels.tsv"

In [26]:
lr_model = ens.train(train_pred_files, train_label_file)

RMSE: 0.49624805566552604


In [27]:
eval_df = ens.test_evaluate(lr_model, test_pred_files, test_label_file, thresholds=[0.1, 0.3, 0.5, 0.7, 0.9])
eval_df

Unnamed: 0,areaUnderPR-0.1,areaUnderPR-0.3,areaUnderPR-0.5,areaUnderPR-0.7,areaUnderPR-0.9,log_loss,model_name
0,0.366034,0.355153,0.34364,0.312833,0.379604,0.966158,model1
1,0.366034,0.355153,0.34364,0.312833,0.379604,0.966158,model2
2,0.366034,0.355153,0.34364,0.312833,0.379604,0.966158,model3
3,0.354545,0.354545,0.354545,0.354545,0.354545,0.666254,Ensemble
