# Twitter RecSys Challenge 2020

In [7]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from twitter_preproc import twitter_preproc

#spark = SparkSession.builder.appName("ChiSquareSpark").getOrCreate()
spark = SparkSession \
    .builder \
    .appName("Pipeline") \
    .getOrCreate()
sc = spark.sparkContext

## Preproc Data Pipeline

In [8]:
%%file twitter_preproc.py
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import * 
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import RegexTokenizer, OneHotEncoderEstimator, StringIndexer, MinMaxScaler, VectorAssembler, HashingTF, IDF

class twitter_preproc:
    
    def __init__(self, spark:SparkSession, sc:SparkContext, inputFile:str, seed:int=123,
                 MF:bool=False, trainsplit:float=0.9):
        
        self.sc = sc
        #inputRDD = sc.textFile(inputFile)
        #self.inputData = spark.read.option("sep", "\x01").csv(inputFile)
        SCHEMA = StructType([
                StructField("text_tokens", StringType()),
                StructField("hashtags", StringType()),
                StructField("tweet_id", StringType()),
                StructField("present_media", StringType()),
                StructField("present_links", StringType()),
                StructField("present_domains", StringType()),
                StructField("tweet_type", StringType()),
                StructField("language", StringType()),
                StructField("tweet_timestamp", LongType()),
                StructField("engaged_with_user_id", StringType()),
                StructField("engaged_with_user_follower_count", LongType()),
                StructField("engaged_with_user_following_count", LongType()),
                StructField("engaged_with_user_is_verified", BooleanType()),
                StructField("engaged_with_user_account_creation", LongType()),
                StructField("engaging_user_id", StringType()),
                StructField("engaging_user_follower_count", LongType()),
                StructField("engaging_user_following_count", LongType()),
                StructField("engaging_user_is_verified", BooleanType()),
                StructField("engaging_user_account_creation", LongType()),
                StructField("engaged_follows_engaging", BooleanType()),
                StructField("reply_timestamp", LongType()),
                StructField("retweet_timestamp", LongType()),
                StructField("retweet_with_comment_timestamp", LongType()),
                StructField("like_timestamp", LongType())       
            ])
        self.inputData = spark.read.csv(path=inputFile, sep="\x01", header=False, schema=SCHEMA)
        if MF:
            self._preprocessMF()
        else:
            self._preprocess(trainsplit, seed)
        #self.inputData = spark.createDataFrame(inputRDD, sep="\x01", schema=SCHEMA)    
    
    def getDF(self):
        return self.outputDF
    
    def _preprocessMF(self):
        outputDF = self.inputData
        
        self.outputDF = outputDF.select(["tweet_id","engaging_user_id","engaged_with_user_id",
                                    "retweet_timestamp","reply_timestamp",
                                    "retweet_with_comment_timestamp","like_timestamp"])
    
    def _preprocess(self, trainsplit, seed):
        
        outputDF = self.inputData
        
        # Drop unnecessary cols
        ### drop ids for classification
        outputDF = outputDF.drop("tweet_id").drop("engaged_user_id").drop("engaged_with_user_id").drop("engaging_user_id")\
                    .drop("present_links").drop("present_domains")
        
        # Split the text tokens to valid format
        textTokenizer = RegexTokenizer(inputCol="text_tokens",outputCol="vector", pattern="\t")
        outputDF = textTokenizer.transform(outputDF)
        hashtagTokenizer = RegexTokenizer(inputCol="hashtags",outputCol="hashtag_tokens", pattern="\t")
        outputDF = hashtagTokenizer.transform(outputDF.fillna("none", subset=["hashtags"]))
        
        #self.tokenizerPipeline = Pipeline(stages=[textTokenizer, hashtagTokenizer])
        #outputDF = self.tokenizerPipeline.fit(outputDF).transform(outputDF)
        
        outputDF = outputDF.drop("text_tokens").withColumnRenamed("vector", "text_tokens")
        outputDF = outputDF.drop("hashtags").withColumnRenamed("hashtag_tokens", "hashtags")
        
        regexTokenizer = RegexTokenizer(inputCol="present_media", outputCol="media_list")
        outputDF = regexTokenizer.transform(outputDF.fillna("none", subset=["present_media"]))
        outputDF = outputDF.drop("present_media").withColumnRenamed("media_list", "present_media")
        outputDF = outputDF.withColumn("present_media2", outputDF["present_media"].cast(StringType()))
        outputDF = outputDF.drop("present_media").withColumnRenamed("present_media2", "present_media")

        # OneHotEncode tweet_type
        ## TODO: user_id, engaged_user_id, ...
        indexerTweetType = StringIndexer(inputCol="tweet_type", outputCol="tweet_type_id")
        #outputDF = indexerTweetType.fit(outputDF).transform(outputDF)
        indexerMedia = StringIndexer(inputCol="present_media", outputCol="present_media_id")
        #outputDF = indexerMedia.fit(outputDF).transform(outputDF)
        indexerLang = StringIndexer(inputCol="language", outputCol="language_id")
        #outputDF = indexerLang.fit(outputDF).transform(outputDF)
        
        self.indexerPipeline = Pipeline(stages=[indexerTweetType, indexerMedia, indexerLang]) 
        outputDF = self.indexerPipeline.fit(outputDF).transform(outputDF)
        
        # onehot
        encoder = OneHotEncoderEstimator(inputCols=["tweet_type_id", "present_media_id", "language_id"],
                                         outputCols=["tweet_type_onehot", "present_media_onehot", "language_onehot"])
        model = encoder.fit(outputDF)
        outputDF = model.transform(outputDF)
        
        
        
        # for explainability safe this
        self.explainOneHotDF = outputDF.select("tweet_type", "tweet_type_id", "tweet_type_onehot",
                                              "present_media", "present_media_id", "present_media_onehot",
                                               "language", "language_id", "language_onehot"
                                              )
        # make label columns binary
        outputDF = outputDF.withColumn("like", when(outputDF["like_timestamp"].isNull(), 0).otherwise(1))
        outputDF = outputDF.withColumn("retweet", when(outputDF["retweet_timestamp"].isNull(), 0).otherwise(1))
        outputDF = outputDF.withColumn("reply", when(outputDF["reply_timestamp"].isNull(), 0).otherwise(1))
        outputDF = outputDF.withColumn("retweet_comment", when(outputDF["retweet_with_comment_timestamp"].isNull(), 0).otherwise(1))
        
        # drop intermediate columns
        outputDF = outputDF.drop(*["like_timestamp","retweet_timestamp","reply_timestamp",
                                  "retweet_with_comment_timestamp","tweet_type","tweet_type_id",
                                 "language","language_id","present_media","present_media_id"])
        
        # TODO: Train/Test split and Scaling
        # create a train-test split
        #train, test = outputDF.randomSplit([trainsplit, 1-trainsplit], seed=seed)
        
        # scaling
        '''
        scalerTimestamp = MinMaxScaler(inputCol="tweet_timestamp",
                                       outputCol="tweet_timestamp_scaled")
        scalerEngagedAccountCreation = MinMaxScaler(inputCol="engaged_with_user_account_creation",
                                                   outputCol="engaged_with_user_account_creation_scaled")
        scalerEngagingAccountCreation = MinMaxScaler(inputCol="engaging_user_account_creation",
                                                    outputCol="engaging_user_account_creation_scaled")
        
        scalerEngagedFollowerCount = MinMaxScaler(inputCol="engaged_with_user_follower_count",
                                            outputCol="engaged_with_user_follower_count_scaled")
        scalerEngagedFollowingCount = MinMaxScaler(inputCol="engaged_with_user_following_count",
                                                  outputCol="engaged_with_user_following_count_scaled")
        scalerEngagingFollowerCount = MinMaxScaler(inputCol="engaging_user_follower_count",
                                           outputCol="engaging_user_follower_count_scaled")
        scalerEngagingFollowingCount = MinMaxScaler(inputCol="engaging_user_following_count",
                                                   outputCol="engaging_user_following_count_scaled")
        scalePipeline = Pipeline(stages=[scalerTimestamp, scalerEngagedAccountCreation,
                                         scalerEngagingAccountCreation, scalerEngagedFollowerCount,
                                        scalerEngagedFollowingCount, scalerEngagingFollowerCount,
                                        scalerEngagingFollowingCount])
        '''
        ## first vectorize for spark... meh
        assembler = VectorAssembler(inputCols=["tweet_timestamp", "engaged_with_user_account_creation",
                                   "engaging_user_account_creation", "engaged_with_user_follower_count",
                                  "engaged_with_user_following_count", "engaging_user_follower_count",
                                  "engaging_user_following_count"], outputCol="numeric_features")
        

        numericScaler = MinMaxScaler(inputCol="numeric_features", outputCol="numeric_scaled")
        self.scalePipeline = Pipeline(stages=[assembler, numericScaler])
        outputDF = self.scalePipeline.fit(outputDF).transform(outputDF)
        
        # drop numeric columns
        outputDF = outputDF.drop(*["tweet_timestamp", "engaged_with_user_account_creation",
                                   "engaging_user_account_creation", "engaged_with_user_follower_count",
                                  "engaged_with_user_following_count", "engaging_user_follower_count",
                                  "engaging_user_following_count", "numeric_features"])
        
        # tf/idf text + hashtags
        ### hashtags
        hashtagsTF = HashingTF(inputCol="hashtags", outputCol="hashtagsTF", numFeatures=2^10)
        outputDF = hashtagsTF.transform(outputDF)
        hashtagsIDF = IDF(inputCol="hashtagsTF", outputCol="hashtags_idf")
        outputDF = hashtagsIDF.fit(outputDF).transform(outputDF)
        
        TextTF = HashingTF(inputCol="text_tokens", outputCol="tweet_text_TF", numFeatures=2^14)
        outputDF = TextTF.transform(outputDF)
        TextIDF = IDF(inputCol="tweet_text_TF", outputCol="tweet_text_idf")
        outputDF = TextIDF.fit(outputDF).transform(outputDF)
        
        outputDF = outputDF.drop(*["hashtags", "hashtagsTF", "text_tokens", "tweet_text_TF"])
        
        self.outputDF = outputDF
        
        # might not need
        # transform boolean to 0-1 column... first one has to change the type in the schema though 
        #data = data.select("engaging_user_is_verified", "engaged_with_user_is_verified", "engaged_follows_engaging")\
        #    .replace(["false","true"], ["0","1"]).show()
        
        
        
    '''
        returns small dataframe that explains the values of the oneHotEncoder step, this might be needed
        for mapping the encodings back to the original values
    '''    
    def explainOneHot(self):
        return self.explainOneHotDF


Overwriting twitter_preproc.py


## Interactive code...

In [9]:
#train = "///user/e11920598/traintweet_1000.tsv"
train = "///tmp/traintweet_1000.tsv"
#train = "///user/pknees/RSC20/training.tsv"

In [10]:
import twitter_preproc
import importlib
importlib.reload(twitter_preproc)
from twitter_preproc import *

In [11]:
#    def __init__(self, spark:SparkSession, sc:SparkContext, inputFile:str, seed:int=123,
#                 MF:bool=False, trainsplit:float=0.9):
preproc = twitter_preproc(spark, sc, train)

print(preproc.getDF().show(5))
data = preproc.getDF()
#print(preproc.getDF().show(5))
pd.DataFrame(preproc.getDF().take(5), columns=preproc.getDF().columns)

+-----------------------------+-------------------------+------------------------+-----------------+--------------------+---------------+----+-------+-----+---------------+--------------------+--------------------+--------------------+
|engaged_with_user_is_verified|engaging_user_is_verified|engaged_follows_engaging|tweet_type_onehot|present_media_onehot|language_onehot|like|retweet|reply|retweet_comment|      numeric_scaled|        hashtags_idf|      tweet_text_idf|
+-----------------------------+-------------------------+------------------------+-----------------+--------------------+---------------+----+-------+-----+---------------+--------------------+--------------------+--------------------+
|                        false|                    false|                   false|    (2,[0],[1.0])|       (6,[0],[1.0])| (30,[1],[1.0])|   0|      0|    0|              0|[0.52188082629834...|(8,[7],[0.1646955...|(12,[0,1,2,3,4,5,...|
|                        false|                    false

Unnamed: 0,engaged_with_user_is_verified,engaging_user_is_verified,engaged_follows_engaging,tweet_type_onehot,present_media_onehot,language_onehot,like,retweet,reply,retweet_comment,numeric_scaled,hashtags_idf,tweet_text_idf
0,False,False,False,"(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0,"[0.5218808262983495, 0.26426815305087586, 0.73...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.13338868837882922, 0.5096270946152203, 0.20..."
1,False,False,True,"(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,1,0,0,"[0.9099009229435596, 0.21939052061710007, 0.89...","(0.0, 3.147554663621658, 0.0, 0.0, 0.0, 0.0, 9...","(0.6669434418941461, 0.8154033513843525, 0.402..."
2,False,False,True,"(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0,"[0.05178509096263063, 0.3593559829498602, 0.61...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.8003321302729753, 0.30577625676913217, 0.53..."
3,False,False,False,"(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0,"[0.6197469225564495, 0.927900384451402, 0.8824...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.6669434418941461, 0.10192541892304406, 0.20..."
4,True,False,False,"(1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0,"[0.015818580959801747, 0.7546958077954908, 0.7...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(1.3338868837882922, 1.528881283845661, 1.1413..."
5,False,False,True,"(1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1,1,0,0,"[0.6520722804266154, 0.5852936819308083, 0.442...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.6669434418941461, 0.20385083784608812, 0.13..."
6,False,False,True,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,1,0,"[0.09478079470089118, 0.9071815725161682, 0.99...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.13338868837882922, 0.10192541892304406, 0.2..."
7,False,False,True,"(0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0,"[0.12070063641582256, 0.30676486649106605, 0.6...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.40016606513648767, 0.40770167569217625, 0.2..."
8,False,False,True,"(0.0, 1.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,0,0,"[0.09700088836833042, 0.6267225203820391, 0.51...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.13338868837882922, 0.20385083784608812, 0.3..."
9,True,False,False,"(1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0,"[0.2592133060040134, 0.0909930600575773, 0.249...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16469559...","(0.26677737675765845, 0.10192541892304406, 0.2..."


In [29]:
from pyspark.ml.classification import RandomForestClassifier
data = preproc.getDF()
data.drop(*["retweet","reply","retweet_comment"])
cols = data.columns
assembler = VectorAssembler(inputCols=cols, outputCol="all_features")
data = assembler.transform(data)
train, test = data.randomSplit([0.8,0.2])
rf = RandomForestClassifier(labelCol="like", featuresCol="all_features", numTrees=10)
model = rf.fit(train)

test_y = test.select("like")
test = test.drop("like")
pred = model.transform(test)
pred = pred.withColumn("truth", lit(0))
#pred.show()
frame = pd.DataFrame(pred.show(), columns=pred.columns)
frame_y = pd.DataFrame(test_y.show())
frame["y"] = frame_y
display(frame)

+-----------------------------+-------------------------+------------------------+-----------------+--------------------+---------------+-------+-----+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+-----+
|engaged_with_user_is_verified|engaging_user_is_verified|engaged_follows_engaging|tweet_type_onehot|present_media_onehot|language_onehot|retweet|reply|retweet_comment|      numeric_scaled|        hashtags_idf|      tweet_text_idf|        all_features|       rawPrediction|         probability|prediction|truth|
+-----------------------------+-------------------------+------------------------+-----------------+--------------------+---------------+-------+-----+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+-----+
|                        false|                    false|          

ValueError: Wrong number of items passed 0, placement implies 1

### Explanation

In [92]:
from pyspark.sql.functions import * 

data = preproc.getDF()
data.select("like_timestamp").show()
foo = data.withColumn("like", when(data["like_timestamp"].isNull(), 0).otherwise(1))
foo.select("like_timestamp", "like").show()
#data = data.drop("text_tokens").withColumnRenamed("vector", "text_tokens")
print("### Tweet-Type OneHotEncodings:")
explainonehot = preproc.explainOneHot()
explainonehot.show()
#data.show()
#data.groupBy("engaging_user_is_verified").count().show()
#data = data.select("engaging_user_is_verified", "engaged_with_user_is_verified", "engaged_follows_engaging")\
#    .replace(["false","true"], ["0","1"])..show()


+--------------+
|like_timestamp|
+--------------+
|          null|
|    1581497622|
|    1581060554|
|    1581328518|
|    1580957807|
|    1581346588|
|          null|
|          null|
|    1581009248|
|          null|
|    1581189873|
|          null|
|    1581045318|
|    1581375276|
|    1581063697|
|          null|
|    1581017998|
|          null|
|          null|
|    1581260483|
+--------------+
only showing top 20 rows

+--------------+----+
|like_timestamp|like|
+--------------+----+
|          null|   0|
|    1581497622|   1|
|    1581060554|   1|
|    1581328518|   1|
|    1580957807|   1|
|    1581346588|   1|
|          null|   0|
|          null|   0|
|    1581009248|   1|
|          null|   0|
|    1581189873|   1|
|          null|   0|
|    1581045318|   1|
|    1581375276|   1|
|    1581063697|   1|
|          null|   0|
|    1581017998|   1|
|          null|   0|
|          null|   0|
|    1581260483|   1|
+--------------+----+
only showing top 20 rows

### Tweet-Ty

In [73]:
data.select("present_media").show(5)
data.withColumn("present_media2", data["present_media"].cast(StringType()))
#data.select("present_media").rdd.map(lambda x: str(x[0])).toDF(schema= StructType([
#                StructField("present_media", StringType())])).show(5)


+-------------+
|present_media|
+-------------+
|       [none]|
|       [none]|
|       [none]|
|       [none]|
|      [photo]|
+-------------+
only showing top 5 rows



DataFrame[hashtags: string, tweet_type: string, language: string, tweet_timestamp: bigint, engaged_with_user_follower_count: bigint, engaged_with_user_following_count: bigint, engaged_with_user_is_verified: boolean, engaged_with_user_account_creation: bigint, engaging_user_id: string, engaging_user_follower_count: bigint, engaging_user_following_count: bigint, engaging_user_is_verified: boolean, engaging_user_account_creation: bigint, engaged_follows_engaging: boolean, reply_timestamp: bigint, retweet_timestamp: bigint, retweet_with_comment_timestamp: bigint, like_timestamp: bigint, text_tokens: array<string>, present_media: string, tweet_type_id: double, present_media_id: double, tweet_type_onehot: vector, present_media_onehot: vector, present_media2: string]

In [21]:
foo.foreach(print)
str(foo)

'PythonRDD[63] at RDD at PythonRDD.scala:53'

In [18]:
from pyspark.ml.feature import OneHotEncoderEstimator

df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [None]:
from pyspark.ml.feature import RegexTokenizer

data = preproc.getDF()

regexTokenizer = RegexTokenizer(inputCol="text_tokens",outputCol="vector", pattern="\t")
tokenized = regexTokenizer.transform(data)

tokenized.select("vector").show()

In [None]:
tokenized

In [7]:
%%file demo.py


from twitter_preproc import twitter_preproc
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

#spark = SparkSession.builder.appName("ChiSquareSpark").getOrCreate()
spark = SparkSession \
    .builder \
    .appName("Pipeline") \
    .getOrCreate()
sc = spark.sparkContext


# sample file with 1000 tweets for checking the pipeline
train = "///user/e11920598/traintweet_1000.tsv"

preproc = twitter_preproc(spark, sc, train)
print(preproc.getDF().show(5))


sc.stop()


Overwriting demo.py


In [8]:
### rather use it on the command line than here
#! spark-submit --num-executors=4 --total-executor-cores 16 --executor-memory=8G demo.py 

20/06/06 16:26:47 INFO spark.SparkContext: Running Spark version 2.4.0-cdh6.3.2
20/06/06 16:26:47 INFO logging.DriverLogger: Added a local log appender at: /tmp/spark-95acd95f-2bb4-4950-bc77-662ca74baeab/__driver_logs__/driver.log
20/06/06 16:26:47 INFO spark.SparkContext: Submitted application: Pipeline
20/06/06 16:26:47 INFO spark.SecurityManager: Changing view acls to: e11920598
20/06/06 16:26:47 INFO spark.SecurityManager: Changing modify acls to: e11920598
20/06/06 16:26:47 INFO spark.SecurityManager: Changing view acls groups to: 
20/06/06 16:26:47 INFO spark.SecurityManager: Changing modify acls groups to: 
20/06/06 16:26:47 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(e11920598); groups with view permissions: Set(); users  with modify permissions: Set(e11920598); groups with modify permissions: Set()
20/06/06 16:26:47 INFO util.Utils: Successfully started service 'sparkDriver' on port 40223.
20/06/06 1

In [None]:
from pyspark.sql.types import *

column_names = ["text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains",\
                "tweet_type", "language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count",\
               "engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",\
               "engaging_user_id", "engaging_user_follower_count", "engaging_user_following_count", "engaging_user_is_verified",\
               "engaging_user_account_creation", "engaged_follows_engaging", "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"]

SCHEMA = StructType([
                StructField("text_tokens", StringType()),
                StructField("hashtags", StringType()),
                StructField("tweet_id", StringType()),
                StructField("present_media", StringType()),
                StructField("present_links", StringType()),
                StructField("present_domains", StringType()),
                StructField("tweet_type", StringType()),
                StructField("language", StringType()),
                StructField("tweet_timestamp", LongType()),
                StructField("engaged_with_user_id", StringType()),
                StructField("engaged_with_user_follower_count", LongType()),
                StructField("engaged_with_user_following_count", LongType()),
                StructField("engaged_with_user_is_verified", BooleanType()),
                StructField("engaged_with_user_account_creation", LongType()),
                StructField("engaging_user_id", StringType()),
                StructField("engaging_user_follower_count", LongType()),
                StructField("engaging_user_following_count", LongType()),
                StructField("engaging_user_is_verified", BooleanType()),
                StructField("engaging_user_account_creation", LongType()),
                StructField("engaged_follows_engaging", BooleanType()),
                StructField("reply_timestamp", LongType()),
                StructField("retweet_timestamp", LongType()),
                StructField("retweet_with_comment_timestamp", LongType()),
                StructField("like_timestamp", LongType())       
                                ])

len(column_names)