# ARRAY REPEAT

In [1]:
import findspark
# Locate the local Spark installation and put pyspark on sys.path.
# This runs before the first pyspark import, which happens in the following cell.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new
# session with the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 16:04:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Ten sample people, one tuple per person, in the column order declared by
# schema_people. birth_date is kept as an ISO-formatted string because the
# schema declares that column as STRING.
data_people = [
    (1, "Andrew James", "1987-12-25", True, 140000, "model"),
    (2, "Linda Scott", "1990-05-14", False, 95000, "engineer"),
    (3, "Michael Brown", "1985-11-07", True, 120000, "teacher"),
    (4, "Jessica Wilson", "1992-03-22", False, 75000, "designer"),
    (5, "John Davis", "1980-08-19", True, 160000, "lawyer"),
    (6, "Emily Clark", "1995-02-10", False, 98000, "architect"),
    (7, "David Evans", "1983-07-15", True, 130000, "doctor"),
    (8, "Sophia Harris", "1988-11-30", False, 110000, "nurse"),
    (9, "Daniel Martinez", "1991-06-01", True, 145000, "consultant"),
    (10, "Olivia Thompson", "1986-09-12", False, 102000, "scientist"),
]

# Spark DDL schema string, assembled from one "<column> <TYPE>" entry per line
# so each column is easy to find and edit.
schema_people = ", ".join(
    [
        "id INTEGER",
        "name STRING",
        "birth_date STRING",
        "married BOOLEAN",
        "salary INTEGER",
        "profession STRING",
    ]
)

In [4]:
# Turn the in-memory records into a Spark DataFrame typed by the DDL schema string.
df_people = spark.createDataFrame(
    data=data_people,
    schema=schema_people,
)
df_people.show()

                                                                                

+---+---------------+----------+-------+------+----------+
| id|           name|birth_date|married|salary|profession|
+---+---------------+----------+-------+------+----------+
|  1|   Andrew James|1987-12-25|   true|140000|     model|
|  2|    Linda Scott|1990-05-14|  false| 95000|  engineer|
|  3|  Michael Brown|1985-11-07|   true|120000|   teacher|
|  4| Jessica Wilson|1992-03-22|  false| 75000|  designer|
|  5|     John Davis|1980-08-19|   true|160000|    lawyer|
|  6|    Emily Clark|1995-02-10|  false| 98000| architect|
|  7|    David Evans|1983-07-15|   true|130000|    doctor|
|  8|  Sophia Harris|1988-11-30|  false|110000|     nurse|
|  9|Daniel Martinez|1991-06-01|   true|145000|consultant|
| 10|Olivia Thompson|1986-09-12|  false|102000| scientist|
+---+---------------+----------+-------+------+----------+



## Array repeat

In [6]:
from pyspark.sql.functions import array_repeat, explode, col

In [9]:
# Add an array column that holds the row's own id repeated 10 times.
df_people_repeat = df_people.withColumn(
    "key_col",
    array_repeat(col("id"), 10),
)
# truncate=False prints the whole array instead of shortening long cells.
df_people_repeat.show(truncate=False)

+---+---------------+----------+-------+------+----------+----------------------------------------+
|id |name           |birth_date|married|salary|profession|key_col                                 |
+---+---------------+----------+-------+------+----------+----------------------------------------+
|1  |Andrew James   |1987-12-25|true   |140000|model     |[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]          |
|2  |Linda Scott    |1990-05-14|false  |95000 |engineer  |[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]          |
|3  |Michael Brown  |1985-11-07|true   |120000|teacher   |[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]          |
|4  |Jessica Wilson |1992-03-22|false  |75000 |designer  |[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]          |
|5  |John Davis     |1980-08-19|true   |160000|lawyer    |[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]          |
|6  |Emily Clark    |1995-02-10|false  |98000 |architect |[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]          |
|7  |David Evans    |1983-07-15|true   |130000|doctor    |[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]          |


                                                                                

## Array repeat explode

In [10]:
# Exploding the 10-element array emits one output row per element, so every
# person is duplicated 10 times and key_col holds a single id value per row.
df_people_explode = df_people.withColumn(
    "key_col",
    explode(array_repeat(col("id"), 10)),
)
df_people_explode.show(truncate=False)

+---+------------+----------+-------+------+----------+-------+
|id |name        |birth_date|married|salary|profession|key_col|
+---+------------+----------+-------+------+----------+-------+
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|1  |Andrew James|1987-12-25|true   |140000|model     |1      |
|2  |Linda Scott |1990-05-14|false  |95000 |engineer  |2      |
|2  |Linda Scott |1990-05-14|false  |95000 |engineer  |2      |
|2  |Linda Scott |1990-05-14|false  |95000 |engineer  |2      |

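## Using a window function

After the explode, each person appears 10 times, so `id` alone no longer identifies a row. Numbering the copies with `row_number()` inside a per-`id` window and joining that number to the id yields a unique `id_new` such as `1_1` … `1_10`.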

In [21]:
from pyspark.sql.functions import lit, row_number, concat_ws
from pyspark.sql.window import Window

In [17]:
# One window per person id. row_number() requires an ordered window; the
# duplicated rows within a partition are identical, so a constant literal
# serves as a placeholder sort key.
window_function = Window.partitionBy("id").orderBy(lit("A"))

In [24]:
# Number the copies of each person 1..10 inside their id partition.
df_people_window = df_people_explode.withColumn(
    "row_n",
    row_number().over(window_function),
)

# Build the "<id>_<n>" key, then drop the helper columns it was derived from.
df_people_extended = (
    df_people_window
    .withColumn("id_new", concat_ws("_", col("id"), col("row_n")))
    .drop("id", "key_col", "row_n")
)

df_people_extended.show()

+------------+----------+-------+------+----------+------+
|        name|birth_date|married|salary|profession|id_new|
+------------+----------+-------+------+----------+------+
|Andrew James|1987-12-25|   true|140000|     model|   1_1|
|Andrew James|1987-12-25|   true|140000|     model|   1_2|
|Andrew James|1987-12-25|   true|140000|     model|   1_3|
|Andrew James|1987-12-25|   true|140000|     model|   1_4|
|Andrew James|1987-12-25|   true|140000|     model|   1_5|
|Andrew James|1987-12-25|   true|140000|     model|   1_6|
|Andrew James|1987-12-25|   true|140000|     model|   1_7|
|Andrew James|1987-12-25|   true|140000|     model|   1_8|
|Andrew James|1987-12-25|   true|140000|     model|   1_9|
|Andrew James|1987-12-25|   true|140000|     model|  1_10|
| Linda Scott|1990-05-14|  false| 95000|  engineer|   2_1|
| Linda Scott|1990-05-14|  false| 95000|  engineer|   2_2|
| Linda Scott|1990-05-14|  false| 95000|  engineer|   2_3|
| Linda Scott|1990-05-14|  false| 95000|  engineer|   2_4|

In [23]:
# Sanity check: 10 people x 10 repeats each should give 100 rows.
df_people_extended.count()

100