In [1]:
import pyspark
import numpy as np

from pyspark import SparkContext, SQLContext, SparkConf

from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, IntegerType, StringType, FloatType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import StringIndexer, Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, CountVectorizer, IDF, ChiSqSelector, Normalizer
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
    

from pyspark.mllib.evaluation import BinaryClassificationMetrics

## Start Session for Assignment 3 - Evaluation (Token IDs and Hashtags)

In [2]:
# Set Spark Config
conf = SparkConf().setAppName("RecSys-Challenge-Train-Model").setMaster("yarn")
conf = (conf.set("deploy-mode","cluster")
       .set("spark.driver.memory","100g")
       .set("spark.executor.memory","100g")
       .set("spark.driver.cores","1")
       .set("spark.num.executors","50")
       .set("spark.executor.cores","5")
       .set("spark.driver.maxResultSize", "100g"))
sc = pyspark.SparkContext(conf=conf)
sql = SQLContext(sc)

## Load Test set

In [3]:
path = 'test.tsv'

In [4]:
df = (sql.read
    .format("csv")
    .option("header", "false")
    .option("sep", "\x01")
    .load(path,  inferSchema="true")
    .repartition(1000)
    .toDF("text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains","tweet_type", "language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count","engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",\
               "engaging_user_id", "engaging_user_follower_count", "engaging_user_following_count", "engaging_user_is_verified","engaging_user_account_creation", "engaged_follows_engaging", "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"))

In [5]:
df = df.select(['text_tokens', 'hashtags', 'reply_timestamp', "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"])

In [13]:
df.show(5)

+--------------------+--------------------+---------------+-----------------+------------------------------+--------------+
|         text_tokens|            hashtags|reply_timestamp|retweet_timestamp|retweet_with_comment_timestamp|like_timestamp|
+--------------------+--------------------+---------------+-----------------+------------------------------+--------------+
|101	56898	137	144...|CB6D1FF72208B50EA...|           null|             null|                          null|          null|
|101	146	10134	110...|                null|           null|             null|                          null|    1581464993|
|101	56898	137	469...|                null|           null|             null|                          null|          null|
|101	16986	12713	1...|                null|           null|             null|                          null|    1581472788|
|101	10057	7138	30...|6FF21860E3E64036C...|           null|             null|                          null|          null|
+-------

## Encode Engagements

In [6]:
response_cols = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

In [7]:
for col in response_cols:
    df = df.withColumn(
        col,
        F.when((F.col(col) >= 0), 1)\
        .otherwise(0)
        )

## Combine Token ID's with Hashtags

In [8]:
df = df.fillna({'hashtags':'nohashtag'})

In [9]:
df = df.withColumn(
    'token_and_hashtags',
    F.concat(F.col("text_tokens"), F.lit("\t"), F.col("hashtags")))

In [18]:
print(df)

DataFrame[text_tokens: string, hashtags: string, reply_timestamp: int, retweet_timestamp: int, retweet_with_comment_timestamp: int, like_timestamp: int, token_and_hashtags: string]


## Evaluation of all 4 models

### Evaluate 1st Model "reply_timestamp"

In [13]:
mPath_reply =  "model_reply_bestModel_big"

In [15]:
df = df.withColumnRenamed("reply_timestamp", "label")
persistedModel = PipelineModel.load(mPath_reply)

In [16]:
predictions = persistedModel.transform(df)

In [22]:
evaluator = BinaryClassificationEvaluator(labelCol='label')

In [24]:
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under PR Curve: {:.4f}".format(auprc))

Area under PR Curve: 0.3647


In [53]:
df = df.withColumnRenamed("label", "reply_timestamp")

In [26]:
df.show(5)

+--------------------+--------------------+---------------+-----------------+------------------------------+--------------+--------------------+
|         text_tokens|            hashtags|reply_timestamp|retweet_timestamp|retweet_with_comment_timestamp|like_timestamp|  token_and_hashtags|
+--------------------+--------------------+---------------+-----------------+------------------------------+--------------+--------------------+
|101	56898	137	146...|750431D16B3B4E656...|              0|                0|                             0|             0|101	56898	137	146...|
|101	56898	137	385...|           nohashtag|              0|                0|                             0|             0|101	56898	137	385...|
|101	56898	137	100...|           nohashtag|              0|                0|                             0|             0|101	56898	137	100...|
|101	1433	31287	20...|           nohashtag|              0|                0|                             0|             1|101	143

### Evaluate 2nd Model "retweet_timestamp"

In [27]:
mPath_retweet =  "model_retweet_bestModel_big"

In [28]:
df = df.withColumnRenamed("retweet_timestamp", "label")
persistedModel = PipelineModel.load(mPath_reply)
predictions = persistedModel.transform(df)

In [29]:
evaluator = BinaryClassificationEvaluator(labelCol='label')

In [31]:
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under PR Curve: {:.4f}".format(auprc))

Area under PR Curve: 0.5163


In [32]:
df = df.withColumnRenamed("label", "retweet_timestamp")

### Evaluate 3rd Model "retweet_with_comment_timestamp"

In [34]:
mPath_retweet_with_comment =  "model_retweet_with_comment_bestModel_big"

In [35]:
df = df.withColumnRenamed("retweet_with_comment_timestamp", "label")
persistedModel = PipelineModel.load(mPath_reply)
predictions = persistedModel.transform(df)

In [36]:
evaluator = BinaryClassificationEvaluator(labelCol='label')

In [46]:
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under PR Curve: {:.4f}".format(auprc))

Area under PR Curve: 0.1278


In [47]:
df = df.withColumnRenamed("label", "retweet_with_comment_timestamp")

### Evaluate 4th Model "like_timestamp"

In [48]:
mPath_like =  "model_like_bestModel_big"

In [49]:
df = df.withColumnRenamed("like_timestamp", "label")
persistedModel = PipelineModel.load(mPath_reply)
predictions = persistedModel.transform(df)

In [50]:
evaluator = BinaryClassificationEvaluator(labelCol='label')

In [51]:
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under PR Curve: {:.4f}".format(auprc))

Area under PR Curve: 0.3985


In [52]:
df = df.withColumnRenamed("label", "like_timestamp")

In [54]:
bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='label')

Exception ignored in: <bound method JavaModelWrapper.__del__ of <pyspark.mllib.evaluation.BinaryClassificationMetrics object at 0x7f80bd200e80>>
Traceback (most recent call last):
  File "/home/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py", line 142, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'BinaryClassificationMetrics' object has no attribute '_sc'


TypeError: __init__() got an unexpected keyword argument 'scoreCol'

In [None]:
bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')

# We still can get the same metrics as the evaluator...
print("Area under ROC Curve: {:.4f}".format(bcm.areaUnderROC))
print("Area under PR Curve: {:.4f}".format(bcm.areaUnderPR))

# But now we can PLOT both ROC and PR curves!
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
bcm.plot_roc_curve(ax=axs[0])
bcm.plot_pr_curve(ax=axs[1])