<h3 align="center"><b> PySpark RDDs </b></h3>

---

In [1]:
import re, numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions as F

In [2]:
conf = ( SparkConf()
         .setMaster("local")
         .setAppName("dev")
         .set("spark.executor.memory", "2g") )

sc = SparkContext(conf = conf)
spark = SparkSession(sparkContext=sc)

In [3]:
df = spark.read.csv("../data/random.csv", header=True, inferSchema=True)

print(f'''shape : {df.count()}, {len(df.columns)}''')
df.printSchema()
df.show(3)

shape : 819200, 6
root
 |-- iter: integer (nullable = true)
 |-- t1: integer (nullable = true)
 |-- t2: integer (nullable = true)
 |-- t3: integer (nullable = true)
 |-- t4: integer (nullable = true)
 |-- t5: integer (nullable = true)

+----+---+---+---+---+---+
|iter| t1| t2| t3| t4| t5|
+----+---+---+---+---+---+
|   1| 78| 79| 80| 81| 82|
|   1| 32| 33| 34| 35| 36|
|   1| 10| 11| 12| 13| 14|
+----+---+---+---+---+---+
only showing top 3 rows



In [4]:
np.unique(df.select('iter').rdd.map(lambda col: col[0]).collect())

array([1, 2, 3, 4])

In [5]:
df.select('iter').rdd.countByValue()

defaultdict(int,
            {Row(iter=1): 102400,
             Row(iter=2): 102400,
             Row(iter=3): 204800,
             Row(iter=4): 409600})

In [6]:
df.select('iter').rdd.countByKey()

defaultdict(int, {1: 102400, 2: 102400, 3: 204800, 4: 409600})

In [7]:
np.where(np.unique(df.select('iter').rdd.keys().collect()) > 3, '>3', '<=3' )


array(['<=3', '<=3', '<=3', '>3'], dtype='<U3')

In [8]:
np.sort(df.select('t1').rdd.flatMap(lambda col: col).distinct().collect())

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [9]:
one_row = sc.parallelize(df.rdd.take(1)).toDF()

one_row.show()

+----+---+---+---+---+---+
|iter| t1| t2| t3| t4| t5|
+----+---+---+---+---+---+
|   1| 78| 79| 80| 81| 82|
+----+---+---+---+---+---+



In [10]:
one_row.rdd.map(lambda x: np.sum(np.where(np.array(x) == 1, 1, 0))).collect() 

[1]

In [11]:
x = [1,2,3,4]
np.sum(np.where(np.array(x) == 1, 1, 0))

1

In [12]:
check = {}
for col in [i for i in df.columns if re.match('t', i)]:
    arr = df.select(col
            ).rdd.map(
                lambda col: col[0]
            ).map(
                lambda x: np.sum(np.where(np.array(x) == 1, 1, 0))
            ).collect()

    check.update({col: arr})

In [13]:
{k: np.sum(v) for k, v in check.items()}

{'t1': 8320, 't2': 0, 't3': 0, 't4': 0, 't5': 0}

In [14]:
df.rdd.map( lambda x: np.sum( np.where(np.array(x)>50, 1, 0) ) ).collect()

[5,
 0,
 0,
 3,
 0,
 0,
 5,
 5,
 5,
 0,
 5,
 0,
 0,
 5,
 0,
 5,
 0,
 0,
 1,
 5,
 5,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 0,
 5,
 5,
 5,
 5,
 5,
 5,
 0,
 5,
 5,
 0,
 5,
 5,
 0,
 0,
 5,
 0,
 5,
 5,
 5,
 0,
 0,
 5,
 5,
 0,
 5,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 5,
 3,
 0,
 5,
 5,
 5,
 0,
 5,
 5,
 0,
 5,
 5,
 0,
 5,
 2,
 5,
 0,
 5,
 0,
 0,
 5,
 5,
 5,
 0,
 5,
 0,
 0,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 5,
 5,
 2,
 5,
 0,
 0,
 0,
 0,
 5,
 0,
 5,
 0,
 5,
 5,
 0,
 5,
 0,
 5,
 5,
 5,
 0,
 0,
 0,
 5,
 5,
 0,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 5,
 0,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 0,
 5,
 0,
 5,
 0,
 5,
 5,
 5,
 5,
 0,
 5,
 0,
 5,
 5,
 5,
 0,
 4,
 5,
 5,
 0,
 5,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 5,
 5,
 0,
 0,
 0,
 4,
 5,
 5,
 0,
 5,
 0,
 5,
 0,
 5,
 5,
 0,
 5,
 0,
 5,
 5,
 5,
 0,
 0,
 5,
 5,
 0,
 0,
 0,
 2,
 0,
 5,
 5,
 0,
 0,
 0,
 5,
 5,
 5,
 0,
 5,
 5,
 5,
 0,
 5,
 0,
 0,
 5,
 0,
 0,
 5,
 5,
 0,
 5,
 0,
 0,
 5,
 0,
 0,
 5,
 0,
 0,
 5,
 0,
 5,
 5,
 0,
 5,
 5,
 5,
 5,


In [15]:
col_count = {}
for col in [i for i in df.columns if re.match('t', i)]:
    val = df.select(col).rdd.map( lambda x: np.where(np.array(x[0])>50, 1, 0)).sum()
    col_count.update({col: val})

col_count

{'t1': 410560, 't2': 418752, 't3': 426368, 't4': 435136, 't5': 443008}

In [16]:
df.rdd.map( lambda x: np.where(np.array(x[0])>50, 1, 0)).sum()

0

In [17]:
tst = df.select('*')

tst = tst.rdd.map(lambda row: np.where(np.array(row)>50, 1, 0)) #.toDF(tst.columns)
# tst.groupBy('iter').sum().show()
tst.collect()

[array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 1]),
 array([0, 1, 1, 1, 1, 1]),
 array([0, 1, 1, 1, 

In [52]:
def agg_func(df, converter=None, group_col='iter', iter_regex='t'):
    for col in [i for i in df.columns if re.match(iter_regex, i)]:
        
        if converter != None:
            df = df.withColumn(col, F.when(F.col(col) > 50, converter).otherwise(0))
        
        else:
            df = df.withColumn(col, F.when(F.col(col) > 50, 1).otherwise(0))

    return df.groupBy(group_col).sum()

In [54]:
agg_func(df).show()

+----+---------+-------+-------+-------+-------+-------+
|iter|sum(iter)|sum(t1)|sum(t2)|sum(t3)|sum(t4)|sum(t5)|
+----+---------+-------+-------+-------+-------+-------+
|   1|   102400|  51264|  52272|  53248|  54512|  55376|
|   3|   614400| 102640| 104688| 106592| 108784| 110752|
|   4|  1638400| 205280| 209376| 213184| 217568| 221504|
|   2|   204800|  51376|  52416|  53344|  54272|  55376|
+----+---------+-------+-------+-------+-------+-------+



In [45]:
super_df = df.select('*')
for _ in range(6):
    super_df = super_df.union(super_df)

In [46]:
agg_func(df=super_df).show()

+----+---------+--------+--------+--------+--------+--------+
|iter|sum(iter)| sum(t1)| sum(t2)| sum(t3)| sum(t4)| sum(t5)|
+----+---------+--------+--------+--------+--------+--------+
|   1|  6553600| 3280896| 3345408| 3407872| 3488768| 3544064|
|   3| 39321600| 6568960| 6700032| 6821888| 6962176| 7088128|
|   4|104857600|13137920|13400064|13643776|13924352|14176256|
|   2| 13107200| 3288064| 3354624| 3414016| 3473408| 3544064|
+----+---------+--------+--------+--------+--------+--------+



In [47]:
super_df.count()

52428800