# Twitter RecSys Challenge 2020

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from twitter_preproc import twitter_preproc

# Obtain the SparkSession for the application named "Pipeline" and keep a
# handle on its SparkContext for code that expects one.
spark = (
    SparkSession.builder
    .appName("Pipeline")
    .getOrCreate()
)
sc = spark.sparkContext

## Preprocessing Data Pipeline

In [125]:
%%file twitter_preproc.py
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import * 
from pyspark.ml.feature import RegexTokenizer, OneHotEncoderEstimator, StringIndexer

class twitter_preproc:
    """Read the RecSys Challenge 2020 tweet file and build a preprocessed DataFrame.

    The input is read as a header-less CSV whose fields are separated by the
    byte 0x01. Depending on ``MF`` the constructor either keeps only the id and
    engagement-timestamp columns (``MF=True``) or runs the feature pipeline in
    ``_preprocess``. The result is available through ``getDF()``.
    """

    def __init__(self, spark:SparkSession, sc:SparkContext, inputFile:str, colnames=None, SCHEMA=None, seed:int=123, MF:bool=False):
        """Load ``inputFile`` and immediately run the selected preprocessing.

        Args:
            spark: session used to read the input file.
            sc: Spark context; only stored on the instance.
            inputFile: path of the 0x01-separated tweet file.
            colnames: accepted for backward compatibility; not used.
            SCHEMA: ``StructType`` describing the input columns. When ``None``
                the built-in challenge schema from ``_default_schema`` is used.
            seed: forwarded to ``_preprocess`` (which currently does not use it).
            MF: when true, only select the id/timestamp columns instead of
                running the full feature pipeline.
        """
        self.sc = sc
        if SCHEMA is None:
            SCHEMA = self._default_schema()
        # Defined up front so explainOneHot() also works on the MF path,
        # where no one-hot encoding takes place.
        self.explainOneHotDF = None
        self.inputData = spark.read.csv(path=inputFile, sep="\x01", header=False, schema=SCHEMA)
        if MF:
            self._preprocessMF()
        else:
            self._preprocess(seed)

    @staticmethod
    def _default_schema():
        """Return the 24-column schema of the challenge training file."""
        return StructType([
            StructField("text_tokens", StringType()),
            StructField("hashtags", StringType()),
            StructField("tweet_id", StringType()),
            StructField("present_media", StringType()),
            StructField("present_links", StringType()),
            StructField("present_domains", StringType()),
            StructField("tweet_type", StringType()),
            StructField("language", StringType()),
            StructField("tweet_timestamp", LongType()),
            StructField("engaged_with_user_id", StringType()),
            StructField("engaged_with_user_follower_count", LongType()),
            StructField("engaged_with_user_following_count", LongType()),
            StructField("engaged_with_user_is_verified", BooleanType()),
            StructField("engaged_with_user_account_creation", LongType()),
            StructField("engaging_user_id", StringType()),
            StructField("engaging_user_follower_count", LongType()),
            StructField("engaging_user_following_count", LongType()),
            StructField("engaging_user_is_verified", BooleanType()),
            StructField("engaging_user_account_creation", LongType()),
            StructField("engaged_follows_engaging", BooleanType()),
            StructField("reply_timestamp", LongType()),
            StructField("retweet_timestamp", LongType()),
            StructField("retweet_with_comment_timestamp", LongType()),
            StructField("like_timestamp", LongType()),
        ])

    def getDF(self):
        """Return the DataFrame produced by the preprocessing step."""
        return self.outputDF

    def _preprocessMF(self):
        """Keep only the tweet/user ids and the four engagement timestamps."""
        self.outputDF = self.inputData.select([
            "tweet_id", "engaging_user_id", "engaged_with_user_id",
            "retweet_timestamp", "reply_timestamp",
            "retweet_with_comment_timestamp", "like_timestamp",
        ])

    def _preprocess(self, seed):
        """Build the classification features and binary labels.

        Steps: drop id/link columns, tokenize ``text_tokens``, one-hot encode
        ``tweet_type``, ``present_media`` and ``language``, and turn the four
        engagement timestamps into 0/1 label columns. ``seed`` is accepted but
        currently unused.
        """
        outputDF = self.inputData

        # Drop ids and link columns that are not used for classification.
        # NOTE(review): "engaged_user_id" is not a column of the built-in schema, so
        # that drop has no effect there and "engaging_user_id" stays in the output --
        # confirm whether "engaging_user_id" was meant.
        outputDF = outputDF.drop("tweet_id").drop("engaged_user_id").drop("engaged_with_user_id")\
                    .drop("present_links").drop("present_domains")

        # Split the tab-separated token string into an array; the tokenized column
        # takes over the name "text_tokens" (and ends up after the untouched columns).
        text_tokenizer = RegexTokenizer(inputCol="text_tokens", outputCol="vector", pattern="\t")
        outputDF = text_tokenizer.transform(outputDF)
        outputDF = outputDF.drop("text_tokens").withColumnRenamed("vector", "text_tokens")

        # Missing media becomes "none"; the tokenized media list is cast back to a
        # string so that it can serve as a single category for the StringIndexer.
        media_tokenizer = RegexTokenizer(inputCol="present_media", outputCol="media_list")
        outputDF = media_tokenizer.transform(outputDF.fillna("none", subset=["present_media"]))
        outputDF = outputDF.withColumn("present_media", outputDF["media_list"].cast(StringType()))\
                    .drop("media_list")

        # Index, then one-hot encode the categorical columns.
        ## TODO: user_id, engaged_user_id, ...
        categorical = ["tweet_type", "present_media", "language"]
        for name in categorical:
            indexer = StringIndexer(inputCol=name, outputCol=name + "_id")
            outputDF = indexer.fit(outputDF).transform(outputDF)

        encoder = OneHotEncoderEstimator(inputCols=[name + "_id" for name in categorical],
                                         outputCols=[name + "_onehot" for name in categorical])
        outputDF = encoder.fit(outputDF).transform(outputDF)

        # Saved for explainability: original value, index and one-hot vector per column.
        self.explainOneHotDF = outputDF.select("tweet_type", "tweet_type_id", "tweet_type_onehot",
                                               "present_media", "present_media_id", "present_media_onehot",
                                               "language", "language_id", "language_onehot")

        # Make the label columns binary: 1 if the engagement timestamp is present, else 0.
        labels = [("like", "like_timestamp"),
                  ("retweet", "retweet_timestamp"),
                  ("reply", "reply_timestamp"),
                  ("retweet_comment", "retweet_with_comment_timestamp")]
        for label, timestamp in labels:
            outputDF = outputDF.withColumn(label, when(outputDF[timestamp].isNull(), 0).otherwise(1))

        # Drop the intermediate columns.
        outputDF = outputDF.drop(*["like_timestamp","retweet_timestamp","reply_timestamp",
                                  "retweet_with_comment_timestamp","tweet_type","tweet_type_id",
                                 "language","language_id","present_media","present_media_id"])

        # TODO: the boolean columns might have to be turned into 0/1 columns.

        self.outputDF = outputDF

    def explainOneHot(self):
        """Return a small DataFrame that explains the values of the one-hot encoding step.

        It can be used to map the encodings back to the original values.
        Returns ``None`` when the instance was built with ``MF=True``, because
        no encoding is performed on that path.
        """
        return self.explainOneHotDF


Overwriting twitter_preproc.py


## Interactive code...

In [126]:
# Path of the training file handed to twitter_preproc; exactly one assignment
# is left active. The "traintweet_1000" files are the 1000-tweet sample;
# the last path is presumably the full training set (TODO confirm).
#train = "///user/e11920598/traintweet_1000.tsv"
train = "///tmp/traintweet_1000.tsv"
#train = "///user/pknees/RSC20/training.tsv"

In [127]:
from pyspark.sql.types import *

# Column layout of the challenge file: one (name, Spark type) pair per column,
# in file order. Both SCHEMA and column_names are derived from this table.
SCHEMA = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("text_tokens", StringType),
        ("hashtags", StringType),
        ("tweet_id", StringType),
        ("present_media", StringType),
        ("present_links", StringType),
        ("present_domains", StringType),
        ("tweet_type", StringType),
        ("language", StringType),
        ("tweet_timestamp", LongType),
        ("engaged_with_user_id", StringType),
        ("engaged_with_user_follower_count", LongType),
        ("engaged_with_user_following_count", LongType),
        ("engaged_with_user_is_verified", BooleanType),
        ("engaged_with_user_account_creation", LongType),
        ("engaging_user_id", StringType),
        ("engaging_user_follower_count", LongType),
        ("engaging_user_following_count", LongType),
        ("engaging_user_is_verified", BooleanType),
        ("engaging_user_account_creation", LongType),
        ("engaged_follows_engaging", BooleanType),
        ("reply_timestamp", LongType),
        ("retweet_timestamp", LongType),
        ("retweet_with_comment_timestamp", LongType),
        ("like_timestamp", LongType),
    ]
])

column_names = [field.name for field in SCHEMA.fields]

len(column_names)

24

In [128]:
# Pick up the twitter_preproc.py that the %%file cell has just rewritten.
# `import twitter_preproc` binds the name to the module so it can be reloaded;
# the wildcard import afterwards rebinds `twitter_preproc` to the class of the
# same name (together with every other public name of the module).
import twitter_preproc
import importlib
importlib.reload(twitter_preproc)
from twitter_preproc import *

In [129]:
# Run the preprocessing pipeline on the selected training file.
preproc = twitter_preproc(spark, sc, train, column_names, SCHEMA)
# show() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
preproc.getDF().show(5)
import pandas as pd
# Pull up to 1000 rows to the driver and render them as a pandas frame.
pd.DataFrame(preproc.getDF().take(1000), columns=preproc.getDF().columns)

+--------------------+---------------+--------------------------------+---------------------------------+-----------------------------+----------------------------------+--------------------+----------------------------+-----------------------------+-------------------------+------------------------------+------------------------+--------------------+-----------------+--------------------+---------------+----+-------+-----+---------------+
|            hashtags|tweet_timestamp|engaged_with_user_follower_count|engaged_with_user_following_count|engaged_with_user_is_verified|engaged_with_user_account_creation|    engaging_user_id|engaging_user_follower_count|engaging_user_following_count|engaging_user_is_verified|engaging_user_account_creation|engaged_follows_engaging|         text_tokens|tweet_type_onehot|present_media_onehot|language_onehot|like|retweet|reply|retweet_comment|
+--------------------+---------------+--------------------------------+---------------------------------+-------

Unnamed: 0,hashtags,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engaged_follows_engaging,text_tokens,tweet_type_onehot,present_media_onehot,language_onehot,like,retweet,reply,retweet_comment
0,,1581262691,986,1201,False,1274269909,00000776B07587ECA9717BFC301F2D6E,94,648,False,1478011810,False,"[101, 1942, 18628, 15752, 4458, 7697, 24309, 1...","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0
1,83D6C79F5FCEC8D1CAD9E82C2C261611\tFFAD2DCF664C...,1581497241,1225,677,False,1255778244,00000B85AAF7DE172876FD96718C4469,1139,46,False,1540395738,True,"[101, 56898, 137, 94836, 101481, 10245, 11166,...","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,1,0,0
2,,1580978528,3016,1623,False,1313450503,00000E0C9B364891CDE89ECFC54771DE,780,440,False,1432084055,True,"[101, 98377, 22627, 33499, 25053, 10898, 3793,...","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0
3,,1581321849,2121,16,False,1547717153,00000F04EEDBCF3E1FB9A1948BF353B6,1,45,False,1534313747,False,"[101, 56898, 137, 11161, 73421, 131, 137, 1188...","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0
4,,1580956787,813505,200,True,1476348838,000010088197DA00D659853E06935B3E,171,388,False,1490166885,False,"[101, 100, 119, 6694, 6546, 5621, 2446, 1975, ...","(1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0
5,D20331F40622336B266EC2BF3572F7E5,1581341389,47678,1430,False,1406547309,000012A6D58B300B1B4098C86223F76E,1927,1414,False,1368483885,True,"[101, 13498, 789, 45935, 19721, 10700, 93914, ...","(1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1,1,0,0
6,,1581004518,29358,18528,False,1539180042,000012D4971A83624EF9C6711AE5167D,929,928,False,1577292324,True,"[101, 48993, 10147, 183, 32095, 10104, 190, 13...","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,1,0
7,,1581020186,43097,25017,False,1291780525,000013E6563760E3916215D42BE0D406,286,524,False,1439811227,True,"[101, 31594, 10107, 10149, 53317, 11565, 18561...","(0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0
8,,1581005860,279,193,False,1423617911,00001607209C5774DF9207A2AC0EED5F,461,697,False,1396311956,True,"[101, 56898, 137, 73336, 13264, 29591, 10112, ...","(0.0, 1.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0
9,,1581103914,2287014,1010,True,1202872538,0000170273D2530A0DF580401CC32AE0,42,118,False,1295601797,False,"[101, 28009, 10142, 13867, 24858, 119, 100, 11...","(1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0


### Explanation

In [92]:
from pyspark.sql.functions import * 

# Demonstrates how the binary "like" label is derived from like_timestamp
# (null -> 0, otherwise 1) and prints the one-hot explanation frame.
# NOTE(review): the current twitter_preproc._preprocess drops like_timestamp from
# the frame returned by getDF(), so this cell presumably ran against an earlier
# version of the class -- confirm before re-running.
data = preproc.getDF()
data.select("like_timestamp").show()
foo = data.withColumn("like", when(data["like_timestamp"].isNull(), 0).otherwise(1))
foo.select("like_timestamp", "like").show()
#data = data.drop("text_tokens").withColumnRenamed("vector", "text_tokens")
print("### Tweet-Type OneHotEncodings:")
explainonehot = preproc.explainOneHot()
explainonehot.show()
#data.show()
#data.groupBy("engaging_user_is_verified").count().show()
#data = data.select("engaging_user_is_verified", "engaged_with_user_is_verified", "engaged_follows_engaging")\
#    .replace(["false","true"], ["0","1"])..show()


+--------------+
|like_timestamp|
+--------------+
|          null|
|    1581497622|
|    1581060554|
|    1581328518|
|    1580957807|
|    1581346588|
|          null|
|          null|
|    1581009248|
|          null|
|    1581189873|
|          null|
|    1581045318|
|    1581375276|
|    1581063697|
|          null|
|    1581017998|
|          null|
|          null|
|    1581260483|
+--------------+
only showing top 20 rows

+--------------+----+
|like_timestamp|like|
+--------------+----+
|          null|   0|
|    1581497622|   1|
|    1581060554|   1|
|    1581328518|   1|
|    1580957807|   1|
|    1581346588|   1|
|          null|   0|
|          null|   0|
|    1581009248|   1|
|          null|   0|
|    1581189873|   1|
|          null|   0|
|    1581045318|   1|
|    1581375276|   1|
|    1581063697|   1|
|          null|   0|
|    1581017998|   1|
|          null|   0|
|          null|   0|
|    1581260483|   1|
+--------------+----+
only showing top 20 rows

### Tweet-Ty

In [73]:
# Peek at present_media and try casting it to a plain string column.
# The withColumn result is not assigned, so `data` itself is left unchanged;
# as the last expression of the cell the new DataFrame is only displayed.
data.select("present_media").show(5)
data.withColumn("present_media2", data["present_media"].cast(StringType()))
#data.select("present_media").rdd.map(lambda x: str(x[0])).toDF(schema= StructType([
#                StructField("present_media", StringType())])).show(5)


+-------------+
|present_media|
+-------------+
|       [none]|
|       [none]|
|       [none]|
|       [none]|
|      [photo]|
+-------------+
only showing top 5 rows



DataFrame[hashtags: string, tweet_type: string, language: string, tweet_timestamp: bigint, engaged_with_user_follower_count: bigint, engaged_with_user_following_count: bigint, engaged_with_user_is_verified: boolean, engaged_with_user_account_creation: bigint, engaging_user_id: string, engaging_user_follower_count: bigint, engaging_user_following_count: bigint, engaging_user_is_verified: boolean, engaging_user_account_creation: bigint, engaged_follows_engaging: boolean, reply_timestamp: bigint, retweet_timestamp: bigint, retweet_with_comment_timestamp: bigint, like_timestamp: bigint, text_tokens: array<string>, present_media: string, tweet_type_id: double, present_media_id: double, tweet_type_onehot: vector, present_media_onehot: vector, present_media2: string]

In [21]:
# Apply print to every element of foo, then display foo's string representation.
# NOTE(review): the recorded output shows foo was a PythonRDD when this cell ran,
# whereas the "Explanation" cell binds foo to a DataFrame -- the cells were
# presumably executed in a different order than they appear; confirm.
foo.foreach(print)
str(foo)

'PythonRDD[63] at RDD at PythonRDD.scala:53'

In [18]:
from pyspark.ml.feature import OneHotEncoderEstimator

# Toy example: two columns of category indices are one-hot encoded side by side.
df = spark.createDataFrame(
    data=[
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0),
    ],
    schema=["categoryIndex1", "categoryIndex2"],
)

encoder = OneHotEncoderEstimator(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryVec1", "categoryVec2"],
)
# Fit on the frame to learn the category sizes, then encode that same frame.
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [None]:
from pyspark.ml.feature import RegexTokenizer

# Stand-alone experiment: split text_tokens on tabs into a "vector" column.
# NOTE(review): twitter_preproc._preprocess already applies this same tokenizer,
# so getDF() returns text_tokens as an array; this cell presumably predates that
# step and has no recorded execution -- confirm before running it.
data = preproc.getDF()

regexTokenizer = RegexTokenizer(inputCol="text_tokens",outputCol="vector", pattern="\t")
tokenized = regexTokenizer.transform(data)

tokenized.select("vector").show()

In [None]:
# Display the DataFrame produced by the tokenizer experiment cell.
tokenized

In [7]:
%%file demo.py


from twitter_preproc import twitter_preproc
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Stand-alone smoke test of the preprocessing pipeline, meant for spark-submit.
spark = SparkSession \
    .builder \
    .appName("Pipeline") \
    .getOrCreate()
sc = spark.sparkContext


# sample file with 1000 tweets for checking the pipeline
train = "///user/e11920598/traintweet_1000.tsv"

try:
    # twitter_preproc takes colnames and SCHEMA after the input path; calling it
    # with only three arguments raises a TypeError. None is passed for both
    # because the class supplies its own schema for the challenge file.
    preproc = twitter_preproc(spark, sc, train, None, None)
    # show() prints the rows itself and returns None, so it is not wrapped in print().
    preproc.getDF().show(5)
finally:
    # Release the Spark context even when preprocessing fails.
    sc.stop()


Overwriting demo.py


In [8]:
### rather use it on the command line than here
#! spark-submit --num-executors=4 --total-executor-cores 16 --executor-memory=8G demo.py 

20/06/06 16:26:47 INFO spark.SparkContext: Running Spark version 2.4.0-cdh6.3.2
20/06/06 16:26:47 INFO logging.DriverLogger: Added a local log appender at: /tmp/spark-95acd95f-2bb4-4950-bc77-662ca74baeab/__driver_logs__/driver.log
20/06/06 16:26:47 INFO spark.SparkContext: Submitted application: Pipeline
20/06/06 16:26:47 INFO spark.SecurityManager: Changing view acls to: e11920598
20/06/06 16:26:47 INFO spark.SecurityManager: Changing modify acls to: e11920598
20/06/06 16:26:47 INFO spark.SecurityManager: Changing view acls groups to: 
20/06/06 16:26:47 INFO spark.SecurityManager: Changing modify acls groups to: 
20/06/06 16:26:47 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(e11920598); groups with view permissions: Set(); users  with modify permissions: Set(e11920598); groups with modify permissions: Set()
20/06/06 16:26:47 INFO util.Utils: Successfully started service 'sparkDriver' on port 40223.
20/06/06 1