In [1]:
import pyspark
from pyspark import SQLContext, SparkContext, SparkConf
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window

In [2]:
# Set Spark Config
conf = SparkConf().setAppName("RecSys-Challenge-Data-Split").setMaster("yarn")
conf = (conf.set("deploy-mode","cluster")
       .set("spark.driver.memory","100g")
       .set("spark.executor.memory","100g")
       .set("spark.driver.cores","1")
       .set("spark.num.executors","50")
       .set("spark.executor.cores","5")
       .set("spark.driver.maxResultSize", "100g"))
sc = pyspark.SparkContext(conf=conf)
sql = SQLContext(sc)

In [9]:
datafile = "hdfs:///user/pknees/RSC20/training.tsv"

df = (sql.read
    .format("csv")
    .option("header", "false")
    .option("sep", "\x01")
    .load(datafile,  inferSchema="true")
    .toDF("text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains","tweet_type", "language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count","engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",\
               "engaging_user_id", "engaging_user_follower_count", "engaging_user_following_count", "engaging_user_is_verified","engaging_user_account_creation", "engaged_follows_engaging", "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"))

In [10]:
df = df.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("tweet_timestamp")))

In [11]:
train_df = df.where("rank <= .75").drop("rank")
test_df = df.where("rank > .8").drop("rank")

In [12]:
train_df.select("tweet_timestamp").show(10)

+---------------+
|tweet_timestamp|
+---------------+
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
|     1580947200|
+---------------+
only showing top 10 rows



In [13]:
test_df.select("tweet_timestamp").show(10)

+---------------+
|tweet_timestamp|
+---------------+
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
|     1581428809|
+---------------+
only showing top 10 rows



In [14]:
train_df.write.option("header", "false").option("sep", "\x01").csv("hdfs:///user/e1553958/RSC20/training.tsv")

AnalysisException: 'path hdfs://nameservice1/user/e1553958/RSC20/training.tsv already exists.;'

In [None]:
test_df.write.option("header", "false").option("sep", "\x01").csv("hdfs:///user/e1553958/RSC20/test.tsv")