In [1]:
import os
import shutil

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

### 0. Create SparkContext

In [2]:
# Reuse the active SparkContext if one already exists. Calling SparkContext()
# a second time (e.g. when this cell is re-run) raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/06 11:48:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 1. Create RDDs

#### 1.1. Create from list

`sc.parallelize(data, numSlices=)`

In [70]:
# Distribute the integers 0..99 and explicitly request 4 partitions via numSlices,
# then check how many partitions the resulting RDD actually has.
RDD = sc.parallelize(range(100), numSlices=4)
RDD.getNumPartitions()

4

In [3]:
# Create RDD from list of words
RDD1 = sc.parallelize(['Nathan', 'Ngo', 'are', 'coding', 'with', 'PySpark'])

print(f"Type of RDD1 is {type(RDD1)}")

Type of RDD1 is <class 'pyspark.rdd.RDD'>


In [4]:
RDD1.take(0)

[]

In [5]:
RDD1.take(1)

['Nathan']

In [6]:
RDD1.take(3)

['Nathan', 'Ngo', 'are']

In [7]:
RDD1.take(7)

['Nathan', 'Ngo', 'are', 'coding', 'with', 'PySpark']

In [8]:
# out of list range
RDD1.take(100)

['Nathan', 'Ngo', 'are', 'coding', 'with', 'PySpark']

#### 1.2. Create from txt file

In [7]:
# Build the path to the sample data file relative to the notebook's working directory.
txt_file_path = os.path.join('..', 'src-data', '5000_points.txt')
# Each line of the text file becomes one string element of the RDD;
# minPartitions is the suggested minimum number of partitions.
RDD2 = sc.textFile(txt_file_path, minPartitions = 3)

In [8]:
RDD2.take(2)

['664159\t550946', '665845\t557965']

In [9]:
RDD2.getNumPartitions()

3

#### 1.3. Create from HDFS

*Note:* Please follow our tutorial in `setup-Hadoop/README.md`.

In [12]:
# NOTE(review): the URI hard-codes the NameNode host/port and the user directory,
# and contains a double slash after the port — confirm it matches the local HDFS setup.
hdfs_file_path = 'hdfs://localhost:9000//user/NathanNgo/5000_points.txt'
# Same call as for a local file; only the URI scheme differs.
RDD3 = sc.textFile(hdfs_file_path, minPartitions = 3)

In [14]:
RDD3.take(2)

['664159\t550946', '665845\t557965']

In [15]:
RDD3.take(3)

['664159\t550946', '665845\t557965', '597173\t575538']

In [19]:
RDD3.getNumPartitions()

3

#### 1.4. Create from an existing RDD

In [12]:
# Create RDD4 from the first 4 elements of RDD1.
# take() returns a plain Python list, which is then parallelized into a new RDD.
RDD4 = sc.parallelize(RDD1.take(4))

In [13]:
RDD4.take(4)

['Nathan', 'Ngo', 'are', 'coding']

### 2. RDD Operation - Transformation

In [14]:
# Base RDD for the transformation examples: the integers 1 through 7.
RDD = sc.parallelize(list(range(1, 8)))
RDD.take(8)

[1, 2, 3, 4, 5, 6, 7]

#### 2.1. Transformation - `map` method
`map(function)`: apply `function` to every element of the RDD and return a new RDD containing the results.

map with a single parameter --> use lambda

In [25]:
RDD_map_1 = RDD.map(lambda x: x ** 2)
RDD_map_1.take(8)

[1, 4, 9, 16, 25, 36, 49]

map with a multi-parameter function --> wrap the user-defined function in a lambda

In [26]:
# User Define Function
def power_n(x, n):
    """Raise ``x`` to the power ``n`` and return the result."""
    result = pow(x, n)
    return result

RDD_map_2 = RDD.map(lambda x: power_n(x, 3))
RDD_map_2.take(8)

[1, 8, 27, 64, 125, 216, 343]

#### 2.2. Transformation - `filter` method
`filter(condition)`: return a new RDD containing only the elements that satisfy `condition`.

In [28]:
RDD_filter_1 = RDD.filter(lambda x: x > 2)
RDD_filter_1.take(8)

[3, 4, 5, 6, 7]

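#### 2.3. Transformation - `flatMap()` method
`flatMap(function)`: apply `function` to every element of the original RDD and flatten the results, so each item of every returned iterable becomes an individual element of the new RDD.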

In [34]:
RDD_string_1 = sc.parallelize(['Black Pink', 'Nathan Ngo', 'FPT Software', 'International University'])
RDD_flatmap_1 = RDD_string_1.flatMap(lambda x: x.split(' '))
RDD_flatmap_1.take(10)

['Black',
 'Pink',
 'Nathan',
 'Ngo',
 'FPT',
 'Software',
 'International',
 'University']

In [55]:
RDD_string_2 = sc.parallelize(['Black Pink', 'FPT Software'])
RDD_flatmap_2 = RDD_string_2.flatMap(lambda x: x.upper())
RDD_flatmap_2.take(11)

['B', 'L', 'A', 'C', 'K', ' ', 'P', 'I', 'N', 'K', 'F']

In [53]:
RDD_string_3 = sc.parallelize(['Black Pink', 'FPT Software'])

# combine split and upper
def split_upper(x):
    """Split ``x`` on single spaces and return the pieces upper-cased, as a list."""
    return [word.upper() for word in x.split(' ')]
    
# split_upper already takes exactly one argument, so it can be handed to
# flatMap directly; each word of the returned list becomes its own element.
RDD_flatmap_3 = RDD_string_3.flatMap(split_upper)
RDD_flatmap_3.take(4)

['BLACK', 'PINK', 'FPT', 'SOFTWARE']

#### 2.4. Transformation - `union()` method
`RDD_1.union(RDD_2)`: return a new RDD which contains all elements from both `RDD_1` and `RDD_2`.

In [56]:
# Two RDDs holding 1..7 and 8..11 respectively.
RDD_1 = sc.parallelize(list(range(1, 8)))
RDD_2 = sc.parallelize(list(range(8, 12)))

# union() concatenates the elements of both RDDs into a new RDD.
RDD_3 = RDD_1.union(RDD_2)
RDD_3.take(20)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

### 3. RDD Operation - Action

`Action`: an operation that runs the computation on the RDD and returns a value.

#### 3.1. Action - `collect` method
`collect()`: return all elements of the RDD as a `list`.

In [57]:
RDD_1 = sc.parallelize([i for i in range(1, 8)])
print(type(RDD_1.collect()))
RDD_1.collect()

<class 'list'>


[1, 2, 3, 4, 5, 6, 7]

#### 3.2. Action - `take()` method
`take(n)`: return the first `n` elements of the RDD as a `list` (or all of them if the RDD has fewer than `n`).

In [59]:
print(type(RDD_1.take(2)))
RDD_1.take(2)

<class 'list'>


[1, 2]

#### 3.3. Action - `count()` method
`count()`: return the total number of elements in the RDD.

In [60]:
RDD_1.count()

7

#### 3.4. Action - `reduce()` method
`reduce(function)`: aggregate the elements of the RDD into a single value using a `function` that takes two arguments and returns one.

In [63]:
# Small RDD holding 1..5, used for the reduce() examples.
RDD_1 = sc.parallelize(list(range(1, 6)))
RDD_1.take(6)

[1, 2, 3, 4, 5]

In [66]:
print(1 + 2 + 3 + 4 + 5)
RDD_1.reduce(lambda x,y : x + y) # ~ 1 + 2 + 3 + 4 + 5

15


15

In [None]:
# Passing a function with 3 parameters raises an error:
# reduce() calls the function with exactly two arguments.
RDD_1.reduce(lambda x, y, z : x + y + z)

*Expected output:* `TypeError: <lambda>() missing 1 required positional argument: 'z'`

#### 3.5. Action - `saveAsTextFile` method
`saveAsTextFile(folder_name)`: save the RDD into `folder_name` as separate text files, one per partition.

In [71]:
RDD = sc.parallelize(range(100), numSlices=5)
RDD.getNumPartitions()

5

The number of partitions is 5 --> 5 part files (one per partition) will be saved.

In [72]:
# saveAsTextFile() refuses to write into a directory that already exists, so
# remove any output left over from a previous run to keep this cell re-runnable.
# NOTE(review): assumes the relative path resolves to the local filesystem
# (not HDFS) — confirm against the Spark/Hadoop default filesystem setting.
output_dir = '../src-data/temp-1'
shutil.rmtree(output_dir, ignore_errors=True)
RDD.saveAsTextFile(output_dir)

![plot](https://github.com/DatacollectorVN/PySpark-Tutorial/blob/master/public-imgs/RDD_save_temp_1.png?raw=true)