### RDDs in PySpark

In [5]:
# Install PySpark (only needed the first time the notebook runs in a new environment).
# !pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l/

In [6]:
import pyspark
from pyspark.sql import SparkSession

In [8]:
# A SparkSession must exist before any PySpark work can be done.
# getOrCreate() hands back the already-running session when there is one,
# otherwise it builds a new local session with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("RDDExamples")
    .getOrCreate()
)

23/01/02 21:17:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [16]:
# Take the SparkContext from the session; it is used below to create RDDs.
sc = spark.sparkContext

In [17]:
# Base RDD used by the examples below: the integers 1 through 10.
numeros = sc.parallelize(list(range(1, 11)))

In [None]:
# take(n) returns the first n elements of the RDD as a Python list.
numeros.take(5)

In [21]:
# top(n) returns the n largest values, sorted in descending order.
numeros.top(5)

[10, 9, 8, 7, 6]

In [25]:
# collect() returns every element of the RDD as a Python list.
# Only safe on small RDDs, since the whole dataset is brought to the driver.
numeros.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [26]:
# count() returns the number of elements in the RDD.
numeros.count()

10

In [27]:
# Arithmetic mean of the elements.
numeros.mean()

5.5

In [28]:
# Sum of all the elements.
numeros.sum()

55

In [29]:
# Largest element.
numeros.max()

10

In [30]:
# Smallest element.
numeros.min()

1

In [31]:
# Standard deviation.
# stdev() is the population standard deviation (divides by N);
# sampleStdev() gives the sample estimate (divides by N - 1).
numeros.stdev()

2.8722813232690143

In [32]:
# Filter: keep only the elements greater than 2.
# The lambda argument has its own name so it does not shadow the resulting RDD.
filtro = numeros.filter(lambda valor: valor > 2)
filtro.collect()

[3, 4, 5, 6, 7, 8, 9, 10]

In [33]:
# Sample: draw a random subset of numeros.
# withReplacement=True allows an element to be picked more than once,
# fraction=0.5 is the expected number of times each element is chosen
# (so the result size is not fixed), and seed=1 makes the draw repeatable.
sample = numeros.sample(withReplacement=True, fraction=0.5, seed=1)
sample.collect()

[2, 3, 4, 5, 9, 10]

In [34]:
# Map: apply a function to every element (here, double each value).
# NOTE(review): the result name `map` shadows Python's builtin map() for the rest of
# the session; it is kept because other cells may refer to it. Consider renaming.
map = numeros.map(lambda valor: valor * 2)
map.collect()

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

In [35]:
# Second RDD (the integers 6 through 10) for the set-style operations below.
numeros2 = sc.parallelize(list(range(6, 11)))

In [37]:
# Union: concatenate both RDDs. Duplicates are kept (6-10 appear twice);
# chain .distinct() to remove them.
union = numeros.union(numeros2)
union.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10]

In [38]:
# Intersection: elements present in both RDDs.
# The order of the result is not guaranteed to follow the input order.
inter = numeros.intersection(numeros2)
inter.collect()

                                                                                

[6, 8, 10, 7, 9]

In [39]:
# Subtract: elements of numeros that are not in numeros2.
# The order of the result is not guaranteed to follow the input order.
sub = numeros.subtract(numeros2)
sub.collect()

                                                                                

[2, 4, 1, 3, 5]

In [40]:
# Cartesian product: every (a, b) pair with a from numeros and b from numeros2
# (10 x 5 = 50 tuples here). The result grows as len(a) * len(b), so
# collect() should only be used on small inputs.
cartesian = numeros.cartesian(numeros2)
cartesian.collect()

[(1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (1, 10),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (2, 10),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (3, 10),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (4, 10),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (5, 10),
 (6, 6),
 (6, 7),
 (6, 8),
 (6, 9),
 (6, 10),
 (7, 6),
 (7, 7),
 (7, 8),
 (7, 9),
 (7, 10),
 (8, 6),
 (8, 7),
 (8, 8),
 (8, 9),
 (8, 10),
 (9, 6),
 (9, 7),
 (9, 8),
 (9, 9),
 (9, 10),
 (10, 6),
 (10, 7),
 (10, 8),
 (10, 9),
 (10, 10)]

In [41]:
# Count by value: returns a dict-like mapping of each distinct element to the
# number of times it occurs. Every pair in the product is unique, so each count is 1.
cartesian.countByValue()

defaultdict(int,
            {(1, 6): 1,
             (1, 7): 1,
             (1, 8): 1,
             (1, 9): 1,
             (1, 10): 1,
             (2, 6): 1,
             (2, 7): 1,
             (2, 8): 1,
             (2, 9): 1,
             (2, 10): 1,
             (3, 6): 1,
             (3, 7): 1,
             (3, 8): 1,
             (3, 9): 1,
             (3, 10): 1,
             (4, 6): 1,
             (4, 7): 1,
             (4, 8): 1,
             (4, 9): 1,
             (4, 10): 1,
             (5, 6): 1,
             (5, 7): 1,
             (5, 8): 1,
             (5, 9): 1,
             (5, 10): 1,
             (6, 6): 1,
             (6, 7): 1,
             (6, 8): 1,
             (6, 9): 1,
             (6, 10): 1,
             (7, 6): 1,
             (7, 7): 1,
             (7, 8): 1,
             (7, 9): 1,
             (7, 10): 1,
             (8, 6): 1,
             (8, 7): 1,
             (8, 8): 1,
             (8, 9): 1,
             (8, 10): 1,
             (9