# Evaluate Models

This notebook is used to load the trained models and check their performance on a test dataset which was generated in the "Data Split" Notebook.

The models were trained by Python scripts that were submitted to the cluster with spark-submit.

In [1]:
import pyspark
from pyspark import SQLContext, SparkConf

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import  when, col, rand, isnan, split, array,udf
from pyspark.sql.types import FloatType
from pyspark.ml.recommendation import ALSModel

In [2]:
# Spark configuration for the evaluation run on the YARN cluster.
conf = SparkConf().setAppName("RecSys-Challenge-Evaluate-Model").setMaster("yarn")
conf = (conf
       # NOTE(review): "deploy-mode" is not a Spark property key (spark-submit's
       # --deploy-mode flag corresponds to "spark.submit.deployMode"), so this entry
       # presumably has no effect. It is left untouched because a notebook driver
       # normally has to run in client mode anyway -- confirm and drop if unneeded.
       .set("deploy-mode","cluster")
       .set("spark.driver.memory","100g")
       .set("spark.executor.memory","100g")
       .set("spark.driver.cores","1")
       # The documented key for the number of executors is "spark.executor.instances";
       # the previous "spark.num.executors" is not a Spark setting.
       .set("spark.executor.instances","100")
       .set("spark.executor.cores","4")
       .set("spark.driver.maxResultSize", "100g"))
# getOrCreate() reuses a context that is already running, so re-executing this
# cell in a live kernel no longer fails because a SparkContext already exists.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sql = SQLContext(sc)

In [3]:
# Location of the held-out test split on HDFS.
datafile = "hdfs:///user/e1553958/RSC20/test.tsv"

# The file is read without a header row, so the column names are assigned by
# hand (in file order) after the load and the repartitioning.
df = (
    sql.read
    .format("csv")
    .options(header="false", sep="\x01")
    .load(datafile, inferSchema="true")
    .repartition(500)
    .toDF(
        # tweet columns
        "text_tokens", "hashtags", "tweet_id", "present_media", "present_links",
        "present_domains", "tweet_type", "language", "tweet_timestamp",
        # columns describing the engaged-with user
        "engaged_with_user_id", "engaged_with_user_follower_count",
        "engaged_with_user_following_count", "engaged_with_user_is_verified",
        "engaged_with_user_account_creation",
        # columns describing the engaging user
        "engaging_user_id", "engaging_user_follower_count",
        "engaging_user_following_count", "engaging_user_is_verified",
        "engaging_user_account_creation", "engaged_follows_engaging",
        # engagement timestamps (used as labels further down)
        "reply_timestamp", "retweet_timestamp",
        "retweet_with_comment_timestamp", "like_timestamp",
    )
)

#### Data Preprocessing

In [4]:
# Cast the three flag columns to integers.
for flag_name in ("engaged_with_user_is_verified",
                  "engaging_user_is_verified",
                  "engaged_follows_engaging"):
    df = df.withColumn(flag_name, col(flag_name).cast("Integer"))

# Split the string representations of lists (tab-separated values).
## text_tokens becomes an array of token strings; no null guard is applied here.
split_text = split(col("text_tokens"), '\t')
df = df.withColumn("text_tokens", split_text)

## The remaining list-like columns become arrays of strings; a missing value
## is replaced by an empty array instead of null.
for list_name in ("present_media", "present_links", "hashtags", "present_domains"):
    split_text = split(col(list_name), '\t')
    df = df.withColumn(
        list_name,
        when(col(list_name).isNull(), array().cast("array<string>")).otherwise(split_text),
    )

In [5]:
def encode_response(x):
    '''
    Build a binary label expression from a response column.

    Parameter
    ---------
    x: str
        Name of the column to encode

    Return
    ------
    Column expression that is 0.0 where the column is null (no response)
    and 1.0 otherwise (response).
    '''
    no_response = col(x).isNull()
    return when(no_response, 0.0).otherwise(1.0)

def _second_element_as_float(v):
    """Return element 1 of `v` as a Python float."""
    return float(v[1])


# UDF used on the "<target>_proba" columns of the model output; it keeps only
# the element at index 1 (presumably the probability of the positive class).
get_probability = udf(_second_element_as_float, FloatType())

In [6]:
# Engagement columns that are evaluated one by one in the sections below.
target_cols = [
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]

# Collected results: model name -> {target column -> area under the PR curve}.
metrics = dict()

## Evaluation

### Random Forest without Tweet Features

Model trained with "Baseline_Approach_rf_train.py" and stored on the HDFS.

In [7]:
# Load the fitted random-forest pipeline that was persisted on HDFS.
rf_model = PipelineModel.load("hdfs:///user/e1553958/RecSys/datasplit/pipeline")

In [8]:
# Apply the loaded pipeline to the preprocessed test frame; the evaluation
# loop that follows reads the "<target>_proba" columns of the result.
rf_data = rf_model.transform(df)

In [9]:
metrics["Random Forest"] = {}

for target_col in target_cols:
    proba_col = target_col + "_proba"

    # Build a small two-column frame per target instead of overwriting `rf_data`.
    # - The cell can be re-executed safely: overwriting in place meant a second
    #   run applied the UDF to an already extracted float and re-encoded the
    #   labels (no longer null) to all 1.0.
    # - Only the score and the label are converted to a Python RDD, not every
    #   column of the transformed frame.
    scored = rf_data.select(
        get_probability(proba_col).alias("score"),
        encode_response(target_col).alias("label"),
    )

    # BinaryClassificationMetrics expects an RDD of (score, label) pairs.
    predictionAndLabels = scored.rdd.map(lambda r: (r["score"], r["label"]))
    metric = BinaryClassificationMetrics(predictionAndLabels)

    metrics["Random Forest"][target_col] = metric.areaUnderPR
    print("For {}: Area under PR = {}".format(target_col, metrics["Random Forest"][target_col]))

For reply_timestamp: Area under PR = 0.030799386073847527
For retweet_timestamp: Area under PR = 0.10276490043817094
For retweet_with_comment_timestamp: Area under PR = 0.007482488732274796
For like_timestamp: Area under PR = 0.5479100593045614


### Logistic Regression without Tweet Features

Model trained with "Baseline_Approach_logReg_train.py" and stored on the HDFS.

In [10]:
# Load the fitted logistic-regression pipeline that was persisted on HDFS.
logReg_model = PipelineModel.load("hdfs:///user/e1553958/RecSys/datasplit/pipeline_logReg")

In [11]:
# Apply the loaded pipeline to the preprocessed test frame; the evaluation
# loop that follows reads the "<target>_proba" columns of the result.
logReg_data = logReg_model.transform(df)

In [12]:
metrics["Logistic Regression"] = {}

for target_col in target_cols:
    proba_col = target_col + "_proba"

    # Build a small two-column frame per target instead of overwriting
    # `logReg_data`.
    # - The cell can be re-executed safely: overwriting in place meant a second
    #   run applied the UDF to an already extracted float and re-encoded the
    #   labels (no longer null) to all 1.0.
    # - Only the score and the label are converted to a Python RDD, not every
    #   column of the transformed frame.
    scored = logReg_data.select(
        get_probability(proba_col).alias("score"),
        encode_response(target_col).alias("label"),
    )

    # BinaryClassificationMetrics expects an RDD of (score, label) pairs.
    predictionAndLabels = scored.rdd.map(lambda r: (r["score"], r["label"]))
    metric = BinaryClassificationMetrics(predictionAndLabels)

    metrics["Logistic Regression"][target_col] = metric.areaUnderPR
    print("For {}: Area under PR = {}".format(target_col, metrics["Logistic Regression"][target_col]))

For reply_timestamp: Area under PR = 0.055998903236944404
For retweet_timestamp: Area under PR = 0.14223024178532603
For retweet_with_comment_timestamp: Area under PR = 0.012504503368450139
For like_timestamp: Area under PR = 0.5455455147453278


### ALS Matrix Factorization

Model trained with "Matrix_Factorization_train.py" and stored on the HDFS. Where the model returns no prediction (NaN) for a user–tweet pair, the fallback is simply a random number.

In [13]:
def create_index(old, new):
    """
    Build the final integer id column for ALS.

    If the id from training exists (column `old` is not null) it is used
    unchanged; otherwise the newly generated id (column `new`) is shifted
    past the largest id seen in training.

    Parameters
    ----------
    old: str
        Name of the column holding the training id; must be "user" or "tweet".
        It selects which global maximum (`max_user_id` / `max_tweet_id`) is
        used as the offset.
    new: str
        Name of the column holding the newly generated id.

    Returns
    -------
    Column expression with the resolved id.

    Raises
    ------
    ValueError
        If `old` is neither "user" nor "tweet".
    """
    if old == "user":
        max_val = max_user_id
    elif old == "tweet":
        max_val = max_tweet_id
    else:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError("old must be 'user' or 'tweet', got {!r}".format(old))
    # The "+ 1" keeps new ids strictly above every training id: without it a
    # new id of 0 would map to `max_val` itself, which is already a training id.
    return when(col(old).isNull(), col(new) + max_val + 1).otherwise(col(old))

In [14]:
mappings_path = "hdfs:///user/e1553958/RecSys/datasplit/mappings/"

# Lookup tables (id string -> integer id) that were written during training.
user2id = sql.read.parquet(mappings_path + "user2id")
tweet2id = sql.read.parquet(mappings_path + "tweet2id")

# Keep only the two id columns and the four engagement columns.
val_df = df.select(
    "tweet_id",
    "engaging_user_id",
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
)

# Give every distinct id string of this dataset a fresh unique number.
tweet2id_val = (
    val_df.select("tweet_id").rdd
    .map(lambda row: row[0])
    .distinct()
    .zipWithUniqueId()
    .toDF(["tweet_id_str_val", "tweet_new"])
)
user2id_val = (
    val_df.select("engaging_user_id").rdd
    .map(lambda row: row[0])
    .distinct()
    .zipWithUniqueId()
    .toDF(["user_id_str_val", "user_new"])
)

# Attach the fresh numbers first, then the ids known from training. Left outer
# joins keep every row; ids that are unknown to the training mapping end up
# with a null in "tweet" / "user".
val_df = (
    val_df
    .join(tweet2id_val, col("tweet_id") == col("tweet_id_str_val"), "left_outer")
    .join(user2id_val, col("engaging_user_id") == col("user_id_str_val"), "left_outer")
    .join(tweet2id, col("tweet_id") == col("tweet_id_str"), "left_outer")
    .join(user2id, col("engaging_user_id") == col("user_id_str"), "left_outer")
)

# Largest ids handed out during training; create_index reads these globals.
max_user_id = user2id.agg({"user": "max"}).first()[0]
max_tweet_id = tweet2id.agg({"tweet": "max"}).first()[0]

# Resolve the final ids: the training id where present, otherwise the new one.
val_df = (
    val_df
    .withColumn("user", create_index("user", "user_new"))
    .withColumn("tweet", create_index("tweet", "tweet_new"))
)

In [15]:
def fallback_prediction(x, seed=None):
    """
    Make a random guess where the model made no prediction.

    Parameters
    ----------
    x: str
        Name of the prediction column.
    seed: int, optional
        Seed handed to `rand`. With the default (None) a fresh random seed is
        used, exactly as before; pass a fixed value to make the fallback
        scores, and with them the reported metric, repeatable between runs
        (presumably only for an unchanged partitioning -- confirm).

    Returns
    -------
    Column expression that is a random number where `x` is NaN and the
    original value of `x` otherwise.
    """
    return when(isnan(x), rand(seed)).otherwise(col(x))


In [16]:
metrics["Matrix Factorization"] = {}

# The stored model directories are named after the target column without this
# suffix (e.g. "reply_als_model"); spelled out instead of a bare slice of 10.
timestamp_suffix = "_timestamp"

for target_col in target_cols:
    # Load model
    model_name = target_col[:-len(timestamp_suffix)]
    model = ALSModel.load("hdfs:///user/e1553958/RecSys/datasplit/models/" + model_name + "_als_model")
    # Get predictions of the model; NaN predictions are replaced by a random guess
    result_df = model.transform(val_df)
    result_df = result_df.withColumn("prediction", fallback_prediction("prediction"))
    result_df = result_df.withColumn(target_col, encode_response(target_col))
    # Project the two needed columns before going to the RDD API, so only the
    # score and the label are converted to Python objects rather than all columns.
    scored = result_df.select("prediction", target_col)
    predictionAndLabels = scored.rdd.map(lambda r: (r["prediction"], r[target_col]))
    metric = BinaryClassificationMetrics(predictionAndLabels)
    metrics["Matrix Factorization"][target_col] = metric.areaUnderPR
    print("For {}: Area under PR = {}".format(target_col, metrics["Matrix Factorization"][target_col]))

For reply_timestamp: Area under PR = 0.025947249265459268
For retweet_timestamp: Area under PR = 0.09975892602345017
For retweet_with_comment_timestamp: Area under PR = 0.007482329907789703
For like_timestamp: Area under PR = 0.3986788910896872
