In [1]:
import hsfs
import boto3
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
49,application_1646303173729_0008,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [27]:
# Number of partitions Spark SQL uses when it shuffles data (joins,
# aggregations, sorts).
# NOTE(review): presumably lowered to 10 to limit how many files each sorted
# dataset is written as — confirm the intent.
spark.conf.set("spark.sql.shuffle.partitions", 10)

In [2]:
# Open the hsfs connection and get a handle on the feature store.
connection = hsfs.connection()
fs = connection.get_feature_store()

# Storage connector to s3
# NOTE(review): `sc` is also the conventional name of the SparkContext in
# PySpark sessions; this assignment rebinds it to the storage connector —
# confirm nothing later expects `sc` to be the SparkContext.
sc = fs.get_storage_connector("experiment-s3")


Connected. Call `.close()` to terminate connection gracefully.

In [3]:
# NOTE(review): presumably configures the Spark session so it can access the
# connector's bucket through s3a:// paths — confirm against the hsfs docs.
sc.prepare_spark()

In [4]:
# Root s3a:// prefix (no trailing slash) below which `save` writes the sorted
# experiment output.
s3_directory = "s3a://" + sc.bucket + "/axel_experiments"

In [5]:
def save(experiment_id, direction: str, left: DataFrame, right: DataFrame):
    """Write both sides of one experiment as Parquet, replacing existing output.

    The files are stored as
    ``<s3_directory>/sorted-<direction>/<experiment_id>/left.parquet`` and
    ``.../right.parquet``.

    Args:
        experiment_id: Name of the experiment; used as the output sub-folder.
        direction: Label placed after ``sorted-`` in the output path.
        left: DataFrame written to ``left.parquet``.
        right: DataFrame written to ``right.parquet``.
    """
    target = "{}/sorted-{}/{}".format(s3_directory, direction, experiment_id)
    # Left is written before right, one Parquet dataset per side.
    for file_name, frame in (("left.parquet", left), ("right.parquet", right)):
        frame.write.parquet("{}/{}".format(target, file_name), mode="overwrite")

In [6]:
s3_raw_directory = "s3a://" + sc.bucket + "/axel_experiments/raw/"


def load_data(experiment_directory: str):
    """Read the raw left/right Parquet datasets of one experiment.

    Args:
        experiment_directory: Name of the experiment folder directly below
            ``s3_raw_directory``.

    Returns:
        A ``(left, right)`` tuple of Spark DataFrames read from
        ``<raw>/<experiment_directory>/left.parquet`` and ``.../right.parquet``.
    """
    # s3_raw_directory ends with "/"; strip it before joining so the resulting
    # path does not contain an empty "//" segment.
    experiment_root = "{}/{}".format(s3_raw_directory.rstrip("/"), experiment_directory)
    left_raw = experiment_root + "/left.parquet"
    right_raw = experiment_root + "/right.parquet"

    return spark.read.parquet(left_raw), spark.read.parquet(right_raw)

In [7]:
# boto3 S3 client authenticated with the storage connector's key pair.
# NOTE(review): `_access_key` / `_secret_key` are underscore-prefixed (private)
# attributes of the connector — check whether hsfs exposes public accessors.
client = boto3.client(
    "s3",
    aws_access_key_id=sc._access_key,
    aws_secret_access_key=sc._secret_key,
)

In [8]:
# List what sits directly below the raw prefix. With Delimiter='/', S3 rolls up
# keys that share a prefix up to the next "/" into "CommonPrefixes", i.e. one
# entry per experiment "folder".
# NOTE(review): a single list_objects call returns at most 1000 entries —
# confirm that is enough, otherwise paginate.
folders_results = client.list_objects(Bucket=sc.bucket, Prefix="axel_experiments/raw/", Delimiter='/')

In [9]:
# "CommonPrefixes" is absent from the response when nothing matches the prefix,
# so fall back to an empty list instead of iterating over None.
# Each prefix looks like "axel_experiments/raw/<experiment_id>/"; the experiment
# id is its last path segment.
experiment_ids = [
    folder["Prefix"].rstrip("/").split("/")[-1]
    for folder in (folders_results.get("CommonPrefixes") or [])
]
experiment_ids

['10000-1_year', '100000-1_year', '1000000-1_year', '10000000-1_year']

In [10]:
def sort(df: DataFrame):
    """Return two orderings of ``df`` by the ``id`` and ``ts`` columns.

    Args:
        df: DataFrame that has ``id`` and ``ts`` columns.

    Returns:
        A ``(ascending, descending)`` tuple: the first element is ordered by
        ``id`` then ``ts`` ascending, the second by both columns descending.
    """
    key_columns = ("id", "ts")
    ascending = df.orderBy(*[col(name) for name in key_columns])
    descending = df.orderBy(*[col(name).desc() for name in key_columns])
    return ascending, descending

In [11]:
def handle_experiments(ids):
    """Load, sort and save every experiment in ``ids``.

    For each experiment id the raw left/right datasets are loaded, each is
    sorted both ascending and descending, and the results are saved with the
    direction labels ``"asc"`` and ``"desc"``.

    Args:
        ids: Iterable of experiment ids, as accepted by ``load_data``.
    """
    for experiment_id in ids:
        # Each sort() call yields an (ascending, descending) pair for one side.
        left_sorted, right_sorted = (sort(frame) for frame in load_data(experiment_id))
        for direction, left, right in zip(("asc", "desc"), left_sorted, right_sorted):
            save(experiment_id, direction, left, right)
        

In [None]:
# Sort and save every experiment discovered under the raw prefix.
handle_experiments(experiment_ids)