## Testing Notebook

In [1]:
# Base directory under which the "s3logs" parquet dataset is read further down.
path = '/opt/spark-data'

In [31]:
from __future__ import print_function
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col, split, udf, size, element_at, explode, when, lit
import pyspark.sql.functions as F

In [3]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
# NOTE(review): "spark.num.executors" does not look like a documented Spark
# property (the documented key appears to be "spark.executor.instances") --
# confirm whether this setting has any effect on the cluster in use.
spark = (
    SparkSession.builder
    .appName("S3_Analysis")
    .master("spark://spark-master:7077")
    .config("spark.executor.cores", "2")
    .config("spark.num.executors", "6")
    .config("spark.executor.memory", "2g")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
## Open the parquet dataset stored under <path>/s3logs and have a look
s3_stats = spark.read.parquet(os.path.join(path, "s3logs"))

In [5]:
# Register the DataFrame as temp view "s3_stats" so it can be queried via spark.sql.
s3_stats.createOrReplaceTempView("s3_stats")

In [6]:
# List the distinct request hours present in the data.
spark.sql("SELECT distinct requesthour from s3_stats").collect()

[Row(requesthour=15), Row(requesthour=14)]

In [7]:
# Keep only the `key` column; the tree structure below is derived from it.
key_data = spark.sql("SELECT `key` FROM s3_stats")

In [10]:
# Check dates: fetch the earliest request timestamp in the dataset.
spark.sql("SELECT min(requesttimestamp) FROM s3_stats").collect()

[Row(min(requesttimestamp)=datetime.datetime(2020, 7, 14, 4, 0))]

We need to derive the tree structure (parent/child relationships) from each object key.

Scenarios:  
- same name different parent  
- same name same entity  

In [11]:
## Create the (parent, child) pairs we need to build our tree structure
def zip_pairs(value):
    """
    Pair every path component with the component directly above it.

    Args:
        value (list(str)): split up list of folder path
            ie [ db, table, partition, file.parquet ]
            None or an empty list is accepted.

    Returns:
        result (list((str, str))): one (parent, child) tuple per component,
            in path order; the first component has None as its parent.
            ie [ (None, db), (db, table), (table, partition),
                 (partition, file.parquet) ]
            An empty list is returned for None or empty input.

    """
    # A missing key arrives as None; yield no pairs instead of raising.
    if not value:
        return []

    # Slicing builds a shifted copy without mutating the caller's list and
    # without relying on list.copy(), which Python 2 lists do not have.
    parents = [None] + list(value[:-1])

    return list(zip(parents, value))

In [60]:
# Partition folders show up in the keys with an encoded "=" (for example
# "part_key%253D2019276"), so "%25" with a character on each side marks one.
expr = ".%25."
# zip_pairs returns (parent, child) tuples; the declared return type makes
# Spark expose each pair as a two-element string array.
pairZip = udf(zip_pairs, ArrayType(ArrayType(StringType())))

# Produce one row per (key, path component) and classify each component.
# NOTE(review): any component containing a "." is typed "file", which includes
# dot-prefixed names such as ".hive-staging_..." -- confirm that is intended.
df2 = (
    key_data.select("key")
    .withColumn("key_split", split(col("key"), "/"))
    .withColumn("depth", size(col("key_split")))
    # The last path component is the file name.
    .withColumn("file", element_at(col("key_split"), -1))
    .withColumn("pairs", pairZip(col("key_split")))
    # explode turns the array of pairs into one row per pair.
    .withColumn("_tmp", explode(col("pairs")))
    .withColumn("parent", col("_tmp")[0])
    .withColumn("object", col("_tmp")[1])
    # Raw string: "\." is a regex escape for a literal dot, not a Python escape.
    .withColumn("_object_split", split(col("object"), r"\."))
    .withColumn(
        "type",
        when(size("_object_split") > 1, "file")
        .when(col("object").rlike(expr), "partition")
        .otherwise("folder"),
    )
    .drop("_tmp")
    .drop("_object_split")
)


## Exploring Entities

In [61]:
# Distinct (parent, object, type) rows of the tree, also exposed to Spark SQL
# as the temp view "relationship_table".
relationship_table = (
    df2
    .select("parent", "object", "type")
    .distinct()
)
relationship_table.createOrReplaceTempView("relationship_table")
# Disabled: persist the rows as a Hive table.
#relationship_table.write.format("hive").saveAsTable("logging_demo.relationship_table")

In [45]:
# Number of distinct (parent, object, type) rows.
relationship_table.count()

1569470

In [51]:
# Per parent, count only the rows typed 'file' (parents without any report 0)
# and show the first 10 groups.
spark.sql("select parent, count(*) FILTER( WHERE type =='file' ) as `count(*)` from relationship_table group by parent").head(10)

[Row(parent='part_key%253D2019276', count(*)=251),
 Row(parent='part_key%253D202076', count(*)=430),
 Row(parent='part_key%253D202044_new', count(*)=120),
 Row(parent='attempt_20200713230910_1265_m_000000_0', count(*)=0),
 Row(parent='onboard_file_version_av', count(*)=148),
 Row(parent='task_20200706104331_0011_m_000010', count(*)=0),
 Row(parent='task_20200714000305_1106_m_000000', count(*)=1),
 Row(parent='int_flat_timeusage', count(*)=10),
 Row(parent='attempt_20200714000716_0417_m_000000_0', count(*)=1),
 Row(parent='.hive-staging_hive_2020-07-04_17-25-51_057_8557720520923397094-428', count(*)=0)]

In [None]:
## Exploring just the files
# df2 and relationship_table name the child column "object"; there is no
# "child" column, so every reference in this cell uses "object".
# A row describes the file itself when its object equals the key's last component.
storage_files = df2.select("key", "parent", "object", "file").filter(df2["object"] == df2["file"])
# NOTE(review): partition names appear encoded in the data (e.g. "%253D"), so a
# literal "=" may match nothing -- confirm the intended filter.
storage_files.filter(storage_files.parent.contains("=")).show(40,truncate=False)

## Exploring the folders in the tree
# Everything that is not the final component of its key.
folders = df2.select("key", "parent", "object", "file").filter(df2["object"] != df2["file"])
folders.select("parent", "object").show(10, truncate=False)

distinct_folders = folders.select("object").distinct()

# Every name that occurs as either a parent or an object (UNION de-duplicates).
entities = spark.sql(
    "SELECT parent as stage from relationship_table "
    "UNION SELECT object as stage from relationship_table"
)

## End

In [18]:
# Stop the Spark session now that the analysis is finished.
spark.stop()