# Baseline Approach

This notebook tries to implement the baseline approach described in the paper accompanying the exam:

- [Privacy-Preserving Recommender Systems Challenge on Twitter’s HomeTimeline](https://arxiv.org/pdf/2004.13715.pdf)

The baseline approach is described in chapter 5.3.

In [None]:
import findspark
findspark.init("/usr/spark-2.4.1")
import pyspark
from pyspark import SQLContext

import numpy as np

In [None]:
# Request 14g of memory per executor; set before the context is obtained so
# that a freshly created context picks it up.
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '14g')

# getOrCreate() returns the already-running context when this cell is executed
# again, instead of failing because a second SparkContext cannot be started.
sc = pyspark.SparkContext.getOrCreate()

# SQL entry point used below to read the data as a DataFrame.
sql = SQLContext(sc)

## Data Loading

Read Sample Data as Spark DataFrame and assign column names according to RecSys Challenge Documentation

In [None]:
# Path of the sampled training data
datafile = "data/training_sample.tsv"

# Column names in file order (the file itself has no header row)
column_names = [
    # tweet content
    "text_tokens", "hashtags", "tweet_id", "present_media", "present_links",
    "present_domains", "tweet_type", "language", "tweet_timestamp",
    # author of the tweet ("engaged with" user)
    "engaged_with_user_id", "engaged_with_user_follower_count",
    "engaged_with_user_following_count", "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    # user who may engage with the tweet ("engaging" user)
    "engaging_user_id", "engaging_user_follower_count",
    "engaging_user_following_count", "engaging_user_is_verified",
    "engaging_user_account_creation", "engaged_follows_engaging",
    # engagement timestamps (the prediction targets)
    "reply_timestamp", "retweet_timestamp",
    "retweet_with_comment_timestamp", "like_timestamp",
]

# CSV reader for a header-less file whose fields are separated by \x01
reader = (sql.read
    .format("csv")
    .option("header", "false")
    .option("sep", "\x01"))

# Load with schema inference and attach the column names
df = reader.load(datafile, inferSchema="true").toDF(*column_names)

In [None]:
# Echo the DataFrame object as the cell output (presumably a quick check of the assigned columns)
df

In [None]:
# Cast the boolean flag columns to integers (0 and 1)
from pyspark.sql.functions import col, split, when, array

for flag_name in ("engaged_with_user_is_verified",
                  "engaging_user_is_verified",
                  "engaged_follows_engaging"):
    # Replace each flag column in place with its integer representation
    df = df.withColumn(flag_name, col(flag_name).cast("Integer"))

In [None]:
# Split the tab-separated string representations of lists into arrays

## text_tokens: split into an array of token strings (no null handling for this column)
df = df.withColumn("text_tokens", pyspark.sql.functions.split(df['text_tokens'], '\t'))

## present_media, present_links, hashtags, present_domains:
## split into arrays of strings; a null value becomes an empty array<string>
for list_col in ("present_media", "present_links", "hashtags", "present_domains"):
    split_text = pyspark.sql.functions.split(df[list_col], '\t')
    empty_array = array().cast("array<string>")
    df = df.withColumn(list_col,
                       when(col(list_col).isNull(), empty_array).otherwise(split_text))


In [None]:
# Fetch the first row to inspect the columns after the casts and splits
df.take(1)

In [8]:
# Column groups; each group is encoded by its own kind of transformer further below.

# Numeric columns (counts and timestamps)
numeric_cols = [
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaged_with_user_account_creation',
    'engaging_user_follower_count',
    'engaging_user_following_count',
    'engaging_user_account_creation',
    'tweet_timestamp',
]

# Categorical columns (including the flags cast to integers)
categorical_cols = [
    'tweet_type',
    'language',
    'engaged_with_user_is_verified',
    'engaging_user_is_verified',
    'engaged_follows_engaging',
]

# Identifier columns
id_cols = [
    'tweet_id',
    'engaged_with_user_id',
    'engaging_user_id',
]

# Engagement timestamp columns (the responses)
response_cols = [
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]

# List-valued tweet content columns
tweet_feature_cols = [
    'text_tokens',
    'hashtags',
    'present_media',
    'present_links',
    'present_domains',
]

### Helper Functions

In [9]:
from pyspark.ml.feature import QuantileDiscretizer, StringIndexer, FeatureHasher, OneHotEncoderEstimator, CountVectorizer,PCA, VectorAssembler

# Shared size constant: passed as numBuckets by create_quantilesDiscretizer and
# also used as numFeatures by create_featureHasher.
nq = 50 # number of quantiles to use

def create_quantilesDiscretizer(input_col: str) -> QuantileDiscretizer:
    """
    Build a QuantileDiscretizer for one column.
    The output column is named <input_col>_discretized.

    Parameters
    ----------
    input_col: str
        Name of the column to discretize

    Return
    ------
    QuantileDiscretizer
        Unfitted discretizer using nq buckets
    """
    discretizer_params = dict(
        numBuckets=nq,
        relativeError=0.,       # no approximation error tolerated
        handleInvalid='keep',   # keep rows with invalid values instead of failing
        inputCol=input_col,
        outputCol=input_col + "_discretized",
    )
    return QuantileDiscretizer(**discretizer_params)

def create_stringIndexer(input_col):
    """
    Build a StringIndexer for one column.
    The output column is named <input_col>_indexed.

    Parameters
    ----------
    input_col: str
        Name of the column to index

    Return
    ------
    StringIndexer
        Unfitted indexer configured with handleInvalid='keep'
    """
    return StringIndexer(
        inputCol=input_col,
        outputCol=input_col + "_indexed",
        handleInvalid='keep',
    )


def create_featureHasher(input_col, num_features=None):
    """
    Create a Feature Hasher for a specified column
    Uses as output column the input + _oneHot (the column holds the hashed
    vector of the input value, not a true one-hot encoding: distinct values
    can share an index)

    Parameters
    ----------
    input_col: str
        Name of the Input Column
    num_features: int, optional
        Dimension of the hashed output vector. Defaults to the module-level
        ``nq`` so existing callers keep their behavior; pass a larger value to
        size the hash space independently of the quantile count.

    Return
    ------
    FeatureHasher
    """
    # Resolve the default at call time so a later change of nq is respected
    if num_features is None:
        num_features = nq
    output_col = input_col + "_oneHot"
    return FeatureHasher(numFeatures=num_features,
                         inputCols=[input_col],
                         outputCol=output_col)


def create_countVectorizer(input_col):
    """
    Build a CountVectorizer for one column.
    The output column is named <input_col>_vectorized.

    Parameters
    ----------
    input_col: str
        Name of the column to vectorize

    Return
    ------
    CountVectorizer
        Unfitted vectorizer with default settings
    """
    vectorizer = CountVectorizer(inputCol=input_col,
                                 outputCol=input_col + "_vectorized")
    return vectorizer


#### Create Transformer

In [10]:
# NOTE: the loop variable is called `name` (not `col`) so it cannot be confused
# with pyspark.sql.functions.col, which is imported in this notebook.

# Encode Numeric Features (5.3.1)
quantile_discretizers_numeric = [create_quantilesDiscretizer(name) for name in numeric_cols]

# Encode Categorical Features (5.3.2)
string_indexer_categorical = [create_stringIndexer(name) for name in categorical_cols]

# Encode ID Features (5.3.3)
id_feature_hashers = [create_featureHasher(name) for name in id_cols]

# Encode Tweet Features (5.3.4 + 5.3.5)
tweet_countVectorizers = [create_countVectorizer(name) for name in tweet_feature_cols]


# One-Hot-Encode Features
# Inputs: the discretized numeric columns followed by the indexed categorical columns
columns_to_encode = ([name + "_discretized" for name in numeric_cols]
                     + [name + "_indexed" for name in categorical_cols])
# Outputs: one "<column>_oneHot" per input, in the same order
# (categorical outputs previously carried the misspelled suffix "_onHot")
one_hot_columns = [name + "_oneHot" for name in numeric_cols + categorical_cols]

onHotEncoder = OneHotEncoderEstimator(inputCols=columns_to_encode,
                                      outputCols=one_hot_columns,
                                      dropLast=False,
                                      handleInvalid="keep")


# Add Vectors with VectorAssembler
# Build a new list instead of extending the one already handed to the encoder:
# one-hot encoded columns plus the hashed ID columns
encoded_columns = one_hot_columns + [name + "_oneHot" for name in id_cols]
num_cat_id_feature_assambler = VectorAssembler(inputCols=encoded_columns,
                                               outputCol="non_tweet_features")

tweet_features_encoded = [name + "_vectorized" for name in tweet_feature_cols]
tweet_feature_assambler = VectorAssembler(inputCols=tweet_features_encoded,
                                          outputCol="tweet_features")

# Perform Dimensionality Reduction (currently disabled)

#non_tweet_pca = PCA(k=16, 
#          inputCol="non_tweet_features", 
#          outputCol="non_tweet_features_reduced")

#tweet_pca = PCA(k=16, 
#          inputCol="tweet_features", 
#          outputCol="tweet_features_reduced")

# Concatenate both feature groups into the final "features" vector
features = ['non_tweet_features', 'tweet_features']
feature_assambler = VectorAssembler(inputCols=features,
                                    outputCol="features")

In [11]:
from pyspark.ml import Pipeline


# Ordered list of all pipeline stages. The one-hot encoder reads the columns
# produced by the discretizers and indexers, and the assemblers read the
# columns produced by the encoder, hashers and count vectorizers, so those
# stages must come first.
stages = (
    quantile_discretizers_numeric
    + string_indexer_categorical
    + id_feature_hashers
    + [onHotEncoder]
    + tweet_countVectorizers
    + [num_cat_id_feature_assambler, tweet_feature_assambler]
    # + [non_tweet_pca, tweet_pca]   (PCA stages are currently disabled)
    + [feature_assambler]
)

pipeline = Pipeline(stages=stages)

In [12]:
# Fit every stage on the full DataFrame, then apply the fitted pipeline to it.
# NOTE(review): `df` is overwritten with the transformed frame, so re-running
# this cell operates on already-transformed data; confirm whether that
# conflicts with the existing output columns.
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

In [14]:
# Fetch the assembled `features` vector of the first row to inspect the pipeline output
df.select('features').take(1)

[Row(features=SparseVector(93549, {8: 1.0, 78: 1.0, 128: 1.0, 174: 1.0, 250: 1.0, 303: 1.0, 349: 1.0, 358: 1.0, 382: 1.0, 422: 1.0, 426: 1.0, 430: 1.0, 437: 1.0, 508: 1.0, 540: 1.0, 584: 1.0, 585: 3.0, 586: 1.0, 587: 1.0, 588: 2.0, 590: 1.0, 591: 1.0, 592: 1.0, 593: 1.0, 596: 1.0, 616: 1.0, 624: 1.0, 639: 1.0, 648: 1.0, 651: 1.0, 655: 1.0, 663: 1.0, 671: 1.0, 680: 1.0, 840: 1.0, 955: 1.0, 975: 1.0, 1491: 1.0, 9901: 1.0, 36312: 2.0, 38498: 1.0, 56362: 1.0, 79119: 1.0}))]

In [18]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

# Project down to the single column that is needed before switching to the
# RDD API, so the remaining columns of every row are not carried into the
# Python-side map.
non_tweet_rows = df.select("non_tweet_features").rdd

# RowMatrix lives in the older mllib package, so each ml vector is converted
# with Vectors.fromML first.
mat = RowMatrix(non_tweet_rows.map(lambda row: Vectors.fromML(row.non_tweet_features)))

In [19]:
# Compute an SVD of the non-tweet feature matrix, requesting 16 components.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e. the
# saved run was aborted; `svd` is not referenced by the later cells shown here.
svd = mat.computeSVD(16)

KeyboardInterrupt: 

#### Encode Label Columns 

Create one column with an array of 0s and 1s

In [None]:
from pyspark.sql.functions import  lit, col

# Column expressions for all response columns
label_col = list(map(col, response_cols))

def encode_response(x):
    """
    Build a column expression that turns a response timestamp into a 0/1 flag.

    Parameters
    ----------
    x: str
        Name of the response column

    Return
    ------
    Column
        1.0 where the column has a value, 0.0 where it is null
    """
    has_engagement = col(x).isNotNull()
    return when(has_engagement, float(1)).otherwise(float(0))

# Overwrite every response column with its 0/1 encoding.
# NOTE: this is not idempotent - once encoded, no value is null any more, so
# running this cell a second time would set every response to 1.0.
for response_name in response_cols:
    df = df.withColumn(response_name, encode_response(response_name))


# Use the "like" engagement as the training label
df = df.withColumn("label", col("like_timestamp"))

In [None]:
# Keep only the model inputs, then split 80% / 20% with a fixed seed
model_data = df.select("features", "label")
splits = model_data.randomSplit([0.8, 0.2], seed=1234)
train, test = splits

In [None]:
# create the trainer and set its parameters
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Width of the input layer = length of the assembled feature vector. It depends
# on the vocabularies fitted by the CountVectorizers, so it is read from the
# data instead of being hard-coded (it was 93549 for the recorded run).
num_features = len(train.select("features").first()[0])

# The output layer needs one unit per class; the label is binary (0.0 / 1.0),
# so it must have 2 units (a single output unit cannot represent label 1).
num_classes = 2

layers = [num_features, 128, 64, 34, num_classes]

trainer = MultilayerPerceptronClassifier(
  layers=layers,
  blockSize=128,
  seed=1234,
  maxIter=100)

# train the model
model = trainer.fit(train)

In [None]:
# compute accuracy on the test set
# (this cell previously contained Scala syntax - `val` / `new` - which is a
# SyntaxError in a Python kernel)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Predict on the held-out split and keep only what the evaluator needs
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Fraction of test rows whose prediction equals the label; shown as the cell output
accuracy = evaluator.evaluate(predictionAndLabels)
accuracy