# Data Exploration

In [1]:
import findspark
findspark.init("/usr/spark-2.4.1")
import pyspark
from pyspark import SQLContext

import numpy as np

In [2]:
# Reuse the running SparkContext if one already exists. Only one SparkContext
# may be active per kernel, so calling the constructor directly makes this
# cell raise when it is re-run without restarting the kernel.
sc = pyspark.SparkContext.getOrCreate()
sql = SQLContext(sc)

## Data Loading

Read the sample data as a Spark DataFrame and assign column names according to the RecSys Challenge documentation.

In [3]:
# Location of the training-data sample read by this notebook.
datafile = "data/training_sample.tsv"

# The file has no header row and uses the \x01 control character as field
# separator. Column types are inferred by Spark; names are then assigned
# positionally through toDF.
df = (
    sql.read
    .format("csv")
    .options(header="false", sep="\x01", inferSchema="true")
    .load(datafile)
    .toDF(
        "text_tokens",
        "hashtags",
        "tweet_id",
        "present_media",
        "present_links",
        "present_domains",
        "tweet_type",
        "language",
        "tweet_timestamp",
        "engaged_with_user_id",
        "engaged_with_user_follower_count",
        "engaged_with_user_following_count",
        "engaged_with_user_is_verified",
        "engaged_with_user_account_creation",
        "engaging_user_id",
        "engaging_user_follower_count",
        "engaging_user_following_count",
        "engaging_user_is_verified",
        "engaging_user_account_creation",
        "engaged_follows_engaging",
        "reply_timestamp",
        "retweet_timestamp",
        "retweet_with_comment_timestamp",
        "like_timestamp",
    )
)

In [4]:
# Show the DataFrame repr (column names with their inferred types) as a quick schema check.
df

DataFrame[text_tokens: string, hashtags: string, tweet_id: string, present_media: string, present_links: string, present_domains: string, tweet_type: string, language: string, tweet_timestamp: int, engaged_with_user_id: string, engaged_with_user_follower_count: int, engaged_with_user_following_count: int, engaged_with_user_is_verified: boolean, engaged_with_user_account_creation: int, engaging_user_id: string, engaging_user_follower_count: int, engaging_user_following_count: int, engaging_user_is_verified: boolean, engaging_user_account_creation: int, engaged_follows_engaging: boolean, reply_timestamp: double, retweet_timestamp: double, retweet_with_comment_timestamp: double, like_timestamp: double]

In [5]:
from pyspark.sql.functions import col
# Replace each boolean flag column with its integer cast, keeping the column name.
for flag_name in ("engaged_with_user_is_verified",
                  "engaging_user_is_verified",
                  "engaged_follows_engaging"):
    df = df.withColumn(flag_name, col(flag_name).cast("Integer"))

In [6]:
# Column groups; each group is handled by a different kind of preprocessing stage.

# Numeric columns (follower/following counts and timestamps).
numeric_cols = [
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_account_creation",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_account_creation",
    "tweet_timestamp",
]

# Columns treated as categories, including the verified/follow flags.
categorical_cols = [
    "tweet_type",
    "language",
    "engaged_with_user_is_verified",
    "engaging_user_is_verified",
    "engaged_follows_engaging",
]

# Identifier columns for the tweet and the two users involved.
id_cols = [
    "tweet_id",
    "engaged_with_user_id",
    "engaging_user_id",
]

# Engagement timestamp columns, one per engagement type.
response_cols = [
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

# Columns describing the tweet content itself.
tweet_feature_cols = [
    "text_tokens",
    "hashtags",
    "present_media",
    "present_links",
    "present_domains",
]

In [7]:
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoder, StringIndexer, FeatureHasher

nq = 50 # number of quantile buckets per numeric column; create_featureHasher also reuses it as numFeatures

def create_quantilesDiscretizer(input_col):
    """Build a QuantileDiscretizer that bins `input_col` into `nq` buckets.

    The discretized values are written to ``<input_col>_discretized``.
    The stage is configured with relativeError=0. and handleInvalid='keep'.

    :param input_col: name of the numeric column to discretize.
    :return: an unfitted QuantileDiscretizer stage.
    """
    discretizer_params = dict(
        numBuckets=nq,
        relativeError=0.,
        handleInvalid='keep',
        inputCol=input_col,
        outputCol=input_col + "_discretized",
    )
    return QuantileDiscretizer(**discretizer_params)

def create_oneHotEncoder(input_col):
    """Build a OneHotEncoder for `input_col`.

    The encoded vector is written to ``<input_col>_oneHot``; dropLast=False
    is passed to the encoder.

    NOTE(review): this OneHotEncoder class appears to be deprecated since
    Spark 2.3 in favour of OneHotEncoderEstimator - confirm before upgrading.

    :param input_col: name of the index column to encode.
    :return: a OneHotEncoder stage.
    """
    encoder_params = {
        "inputCol": input_col,
        "outputCol": input_col + "_oneHot",
        "dropLast": False,
    }
    return OneHotEncoder(**encoder_params)

def create_stringIndexer(input_col):
    """Build a StringIndexer for `input_col`.

    The index values are written to ``<input_col>_indexed``.

    :param input_col: name of the categorical column to index.
    :return: an unfitted StringIndexer stage.
    """
    indexer_params = {
        "inputCol": input_col,
        "outputCol": input_col + "_indexed",
    }
    return StringIndexer(**indexer_params)


def create_featureHasher(input_col, num_features=None):
    """Build a FeatureHasher that hashes the single column `input_col`.

    The hashed vector is written to ``<input_col>_oneHot``. The suffix is
    kept as-is so existing column names do not change, even though the
    output is a hashed vector rather than a one-hot encoding.

    :param input_col: name of the column to hash.
    :param num_features: size of the hashed vector. When None (the default)
        the module-level `nq` is used, which matches the previous behaviour
        of tying the hash size to the number of quantile buckets.
        NOTE(review): the Spark ML documentation advises a power of two for
        numFeatures so that hashed values map evenly onto vector indices;
        the default of `nq` (50) is not one - consider passing e.g. 64.
    :return: a FeatureHasher stage.
    """
    if num_features is None:
        # Read `nq` at call time, exactly as the original implementation did.
        num_features = nq
    return FeatureHasher(numFeatures=num_features,
                         inputCols=[input_col],
                         outputCol=input_col + "_oneHot")


In [8]:
# The loop variable is called `col_name` rather than `col` so that it does not
# shadow the `col` function imported from pyspark.sql.functions (under
# Python 2 a comprehension variable overwrites the outer name).

# Numeric columns: discretize into quantile buckets, then one-hot encode the bucket index.
quantile_discretizers_numeric = [create_quantilesDiscretizer(col_name) for col_name in numeric_cols]
one_hot_encoders_numeric = [create_oneHotEncoder(col_name + "_discretized") for col_name in numeric_cols]

# Categorical columns: index the values, then one-hot encode the index.
string_indexer_categorical = [create_stringIndexer(col_name) for col_name in categorical_cols]
one_hot_encoder_categorical = [create_oneHotEncoder(col_name + "_indexed") for col_name in categorical_cols]

# Identifier columns: feature hashing.
id_feature_hashers = [create_featureHasher(col_name) for col_name in id_cols]

# Order matters: every one-hot encoder must come after the stage that
# produces its "_discretized" / "_indexed" input column.
stages = (
    quantile_discretizers_numeric
    + one_hot_encoders_numeric
    + string_indexer_categorical
    + one_hot_encoder_categorical
    + id_feature_hashers
)


In [9]:
from pyspark.ml import Pipeline

# Chain all preprocessing stages into a single Pipeline, in the order they appear in `stages`.
pipeline = Pipeline(stages=stages)

In [10]:
# Keep the fitted PipelineModel so the same fitted stages can be applied to
# other DataFrames (e.g. held-out data) without fitting again.
pipeline_model = pipeline.fit(df)
# NOTE: `df` is rebound to the transformed frame. Re-running this cell on the
# already-transformed `df` is expected to fail because the stages' output
# columns would already exist - re-run from the loading cell instead.
df = pipeline_model.transform(df)

In [11]:
# Fetch a single row to inspect the transformed output next to the original fields.
df.take(1)

[Row(text_tokens='101\t56898\t137\t174\t63247\t10526\t131\t3197\t8747\t8747\t6455\t3205\t14120\t131\t120\t120\t188\t119\t11170\t120\t176\t11090\t10305\t10686\t11211\t10116\t11127\t10237\t11779\t11517\t102', hashtags=None, tweet_id='3C21DCFB8E3FEC1CB3D2BFB413A78220', present_media='Video', present_links=None, present_domains=None, tweet_type='Retweet', language='76B8A9C3013AE6414A3E6012413CDC3B', tweet_timestamp=1581467323, engaged_with_user_id='D1AA2C85FA644D64346EDD88470525F2', engaged_with_user_follower_count=737, engaged_with_user_following_count=706, engaged_with_user_is_verified=0, engaged_with_user_account_creation=1403069820, engaging_user_id='000046C8606F1C3F5A7296222C88084B', engaging_user_follower_count=131, engaging_user_following_count=2105, engaging_user_is_verified=0, engaging_user_account_creation=1573978269, engaged_follows_engaging=0, reply_timestamp=None, retweet_timestamp=None, retweet_with_comment_timestamp=None, like_timestamp=None, engaged_with_user_follower_count