In [1]:
import pyspark
import numpy as np

from pyspark import SparkContext, SQLContext, SparkConf

from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, IntegerType, StringType, FloatType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, CountVectorizer, IDF, ChiSqSelector, Normalizer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

from pyspark.mllib.evaluation import BinaryClassificationMetrics

## Start Session for Assignment 3 - Pipeline (Token IDs and Hashtags)

In [2]:
# Spark configuration for the training job on YARN.
#
# Notes on the property names:
#   * The number of executors is requested through "spark.executor.instances".
#     The key "spark.num.executors" is not a Spark property, so a value set
#     under that name is never applied.
#   * No deploy mode is set here. "deploy-mode" is a spark-submit command-line
#     flag, not a configuration key, so setting it on SparkConf has no effect;
#     the SparkContext below is created from this notebook process, which is
#     therefore the driver.
conf = (
    SparkConf()
    .setAppName("RecSys-Challenge-Train-Model")
    .setMaster("yarn")
    .set("spark.driver.memory", "100g")
    .set("spark.executor.memory", "100g")
    .set("spark.driver.cores", "1")
    .set("spark.executor.instances", "50")
    .set("spark.executor.cores", "5")
    .set("spark.driver.maxResultSize", "100g")
)
sc = pyspark.SparkContext(conf=conf)
# SQLContext is kept because later cells read data through `sql`.
sql = SQLContext(sc)

## Load trainset of RecSys Challenge 2020

In [3]:
# Location of the raw training file, passed to the reader's load() call.
path = "training.tsv"

In [4]:
# The file is read without a header row, so column names are assigned
# positionally after loading, in the order listed here.
column_names = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engaged_follows_engaging",
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

# Fields are separated by the \x01 control character; column types are inferred.
reader = sql.read.format("csv").option("header", "false").option("sep", "\x01")
raw_frame = reader.load(path, inferSchema="true")

# Spread the data over 1000 partitions, then attach the column names.
train_val = raw_frame.repartition(1000).toDF(*column_names)

In [5]:
# Keep only the two text inputs and the four engagement timestamps.
train_val = train_val.select(
    "text_tokens",
    "hashtags",
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
)

In [6]:
# Preview the first 20 rows with long cell values truncated (show()'s defaults).
train_val.show(n=20, truncate=True)

+--------------------+--------------------+---------------+-----------------+------------------------------+--------------+
|         text_tokens|            hashtags|reply_timestamp|retweet_timestamp|retweet_with_comment_timestamp|like_timestamp|
+--------------------+--------------------+---------------+-----------------+------------------------------+--------------+
|101	56898	137	359...|                null|           null|       1581007862|                          null|          null|
|101	56898	137	141...|                null|           null|             null|                          null|    1580994716|
|101	89009	10133	1...|                null|           null|             null|                          null|          null|
|101	56898	137	160...|                null|           null|             null|                          null|    1580993877|
|101	2072	7731	203...|                null|           null|             null|                          null|    1581227780|
|101	568

## Encode Engagements

In [7]:
# The four engagement columns; each one becomes the label of its own model.
response_cols = [
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

In [8]:
# Replace every engagement timestamp by a binary indicator, in place:
# 1 when a timestamp >= 0 is present, 0 otherwise. Rows where the
# comparison is not true (which includes null timestamps) fall through
# to otherwise(0).
for response_col in response_cols:
    has_engagement = F.col(response_col) >= 0
    indicator = F.when(has_engagement, 1).otherwise(0)
    train_val = train_val.withColumn(response_col, indicator)

## Combine Token IDs with Hashtags

In [9]:
# Give tweets without hashtags a placeholder value so the concatenation
# in the next step has a non-null hashtags operand.
train_val = train_val.na.fill("nohashtag", subset=["hashtags"])

In [10]:
# Join the token IDs and the hashtags into one tab-separated string column.
combined_text = F.concat(F.col("text_tokens"), F.lit("\t"), F.col("hashtags"))
train_val = train_val.withColumn("token_and_hashtags", combined_text)

In [11]:
# Print the DataFrame's one-line representation (column names and types).
print(repr(train_val))

DataFrame[text_tokens: string, hashtags: string, reply_timestamp: int, retweet_timestamp: int, retweet_with_comment_timestamp: int, like_timestamp: int, token_and_hashtags: string]


## Build Pipeline to get TF-IDF, Top Features and Model Tuning from Token IDs & Hashtags

In [12]:
# Tokens to drop before vectorising: the IDs "101" and "102" plus Spark's
# default stop-word list. Building a set removes duplicates, so the
# resulting list has no guaranteed order.
# NOTE(review): 101/102 are presumably the BERT [CLS]/[SEP] marker IDs - confirm.
stopwordList = list({"101", "102", *StopWordsRemover().getStopWords()})

In [13]:
# ML pipeline with seven stages:
# regex tokenizer -> stop-word remover -> CountVectorizer -> IDF
# -> chi-square selector -> normalizer -> random forest classifier.

# Split the combined string on every character that is not a letter or a
# digit; RegexTokenizer lowercases the resulting tokens by default.
regexTokenizer = RegexTokenizer(
    inputCol="token_and_hashtags",
    outputCol="words",
    pattern="[^a-zA-Z0-9]",
)

# Drop every token contained in stopwordList.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stopwordList,
)

# Term counts over a vocabulary learned from the data.
cv = CountVectorizer(inputCol="filtered", outputCol="rawFeatures")

# Re-weight the term counts by inverse document frequency (TF-IDF).
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Chi-square feature selection against the label. The number of features
# kept is not set here; it is varied through selector.numTopFeatures in
# the parameter grid.
selector = ChiSqSelector(
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="label",
)

# L2-normalise the selected feature vectors.
normalizer = Normalizer(
    p=2.0,
    inputCol="selectedFeatures",
    outputCol="normalizedFeatures",
)

# Random forest on the normalised features, seeded for repeatability.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="normalizedFeatures",
    seed=1234,
)

# Variant without feature selection: remove `selector` from the stages and
# use inputCol="features" for the normalizer.
pipeline = Pipeline(
    stages=[regexTokenizer, remover, cv, idf, selector, normalizer, rf]
)

In [14]:
# Search grid: 2 values for each of 4 parameters = 16 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(selector.numTopFeatures, [50, 100])
    .addGrid(rf.numTrees, [10, 100])
    .addGrid(rf.maxDepth, [8, 12])
    .addGrid(rf.minInfoGain, [0.01, 0.1])
    .build()
)

In [15]:
# Candidates are ranked by area under the precision-recall curve.
pr_evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

# Single train/validation split: 75% of the data is used for training and
# 25% for validation; the split is seeded so it can be repeated.
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=pr_evaluator,
    trainRatio=0.75,
    seed=123,
)

## Training of all 4 models

### Train 1st Model "reply_timestamp"

In [18]:
# Fit on a renamed copy instead of renaming `train_val` and renaming it back.
# `train_val` is never modified, so this cell can be re-run (or fail half-way)
# without leaving a stray "label" column behind for the other model cells.
train_reply = train_val.withColumnRenamed("reply_timestamp", "label")
# Run TrainValidationSplit, and choose the best set of parameters.
model_reply = tvs.fit(train_reply)

In [19]:
# Persist the best pipeline found by the grid search, replacing any model
# already stored under this path.
mPath_reply = "model_reply_bestModel_big"
reply_writer = model_reply.bestModel.write().overwrite()
reply_writer.save(mPath_reply)

### Train 2nd Model "retweet_timestamp"

In [15]:
# Fit on a renamed copy instead of renaming `train_val` and renaming it back.
# `train_val` is never modified, so this cell can be re-run (or fail half-way)
# without leaving a stray "label" column behind for the other model cells.
train_retweet = train_val.withColumnRenamed("retweet_timestamp", "label")
# Run TrainValidationSplit, and choose the best set of parameters.
model_retweet = tvs.fit(train_retweet)

In [16]:
# Persist the best pipeline found by the grid search, replacing any model
# already stored under this path.
mPath_retweet = "model_retweet_bestModel_big"
retweet_writer = model_retweet.bestModel.write().overwrite()
retweet_writer.save(mPath_retweet)

### Train 3rd Model "retweet_with_comment_timestamp"

In [15]:
# Fit on a renamed copy instead of renaming `train_val` and renaming it back.
# `train_val` is never modified, so this cell can be re-run (or fail half-way)
# without leaving a stray "label" column behind for the other model cells.
train_retweet_with_comment = train_val.withColumnRenamed("retweet_with_comment_timestamp", "label")
# Run TrainValidationSplit, and choose the best set of parameters.
model_retweet_with_comment = tvs.fit(train_retweet_with_comment)

In [16]:
# Persist the best pipeline found by the grid search, replacing any model
# already stored under this path.
mPath_retweet_with_comment = "model_retweet_with_comment_bestModel_big"
retweet_with_comment_writer = model_retweet_with_comment.bestModel.write().overwrite()
retweet_with_comment_writer.save(mPath_retweet_with_comment)

### Train 4th Model "like_timestamp"

In [16]:
# Fit on a renamed copy instead of renaming `train_val` and renaming it back.
# `train_val` is never modified, so this cell can be re-run (or fail half-way)
# without leaving a stray "label" column behind for the other model cells.
train_like = train_val.withColumnRenamed("like_timestamp", "label")
# Run TrainValidationSplit, and choose the best set of parameters.
model_like = tvs.fit(train_like)

In [17]:
# Persist the best pipeline found by the grid search, replacing any model
# already stored under this path.
mPath_like = "model_like_bestModel_big"
like_writer = model_like.bestModel.write().overwrite()
like_writer.save(mPath_like)