In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_format, to_date, lit, date_diff,\
months_between, month, year, add_months, date_add, day, weekofyear, weekday, hour, minute, second, current_timestamp,\
to_timestamp
from pyspark.sql.types import *

spark = SparkSession.Builder().appName('Timestamp Function in PySpark').getOrCreate()

In [2]:
from pyspark.sql.functions import col, collect_list, transform

# Create a PySpark DataFrame from the input data
df = spark.createDataFrame([
    (1, "T1", 123),
    (1, "T2", 34),
    (2, "T1", 21),
    (2, "T2", 345),
    (2, "T3", 23)
], ["acc", "flag", "mdab"])

# Group the DataFrame by "acc" and transform the "mdab" values
result = (
    df
    .groupBy("acc")
    .agg(
        collect_list("mdab")
        .alias("mdab")
    )
    .select(
        col("acc"),
        transform("mdab", lambda x: x).alias("mdab")
    )
)

# Show the result
result.show(truncate=False)

                                                                                

+---+-------------+
|acc|mdab         |
+---+-------------+
|1  |[123, 34]    |
|2  |[21, 345, 23]|
+---+-------------+



In [35]:
from pyspark.sql.functions import col, collect_set, map_from_arrays, get

# Create a PySpark DataFrame from the input data
df = spark.createDataFrame([
    (1, "T1", 21),
    (1, "T2", 21),
    (2, "T1", 21),
    (2, "T2", 345),
    (2, "T3", 23)
], ["acc", "flag", "mdab"])

# Group the DataFrame by "acc" and transform the "mdab" values
result = (
    df
    .groupBy("acc")
    .agg(
        collect_list("flag").alias("flags"),
        collect_list("mdab").alias("mdabs")
    )
    .select(
        col("acc"),
        map_from_arrays("flags", "mdabs").alias("mdab"),
        map_from_arrays("flags", "mdabs").alias("mdab"),
        map_from_arrays("flags", "mdabs").alias("mdab"),
        map_from_arrays("flags", "mdabs").alias("mdab")
    )
)

# Show the result
result.show(truncate=False)

+---+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
|acc|mdab                           |mdab                           |mdab                           |mdab                           |
+---+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
|1  |{T1 -> 21, T2 -> 21}           |{T1 -> 21, T2 -> 21}           |{T1 -> 21, T2 -> 21}           |{T1 -> 21, T2 -> 21}           |
|2  |{T1 -> 21, T2 -> 345, T3 -> 23}|{T1 -> 21, T2 -> 345, T3 -> 23}|{T1 -> 21, T2 -> 345, T3 -> 23}|{T1 -> 21, T2 -> 345, T3 -> 23}|
+---+-------------------------------+-------------------------------+-------------------------------+-------------------------------+



In [26]:
result.createOrReplaceTempView('result')
result.cache()
result.show()

24/07/12 18:00:57 WARN CacheManager: Asked to cache already cached data.

+---+--------------------+
|acc|                mdab|
+---+--------------------+
|  1|{T1 -> 21, T2 -> 21}|
|  2|{T1 -> 21, T2 -> ...|
+---+--------------------+



                                                                                

In [31]:
spark.sql(
    """
    SELECT mdab['T1'] FROM result
    """
).show()

+--------+
|mdab[T1]|
+--------+
|      21|
|      21|
+--------+



In [16]:
spark.stop()