In [1]:
from pyspark import SparkConf, SparkContext

# RDD - Resilient Distributed Datasets
- **Resilient** (resiliente): capacidade de resistir a falhas
- **Distributed** (distribuído): distribuído em várias máquinas
- **Datasets** (conjuntos de dados): coleção de dados (arrays, tabelas, tuplas, ...)

In [2]:
# Cluster connection settings. For a single-threaded local run, use
# master = 'local[1]' instead of the standalone cluster URL.
master = 'spark://192.168.2.102:7077'  # Connect to remote server
appName = 'Spark Example'

# Only one SparkContext may be active per Python process, so calling the
# constructor again when this cell is re-run raises ValueError.
# getOrCreate() returns the already-running context if there is one (the
# configuration below is then ignored) and creates a new one otherwise.
conf = SparkConf().setMaster(master).setAppName(appName)
sc = SparkContext.getOrCreate(conf)

21/12/17 12:34:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Evaluate the SparkContext as the cell's last expression so the notebook displays it.
sc

In [4]:
# Turn a small local list into an RDD. take(5) asks for up to five elements;
# only four exist, so all of them come back.
rdd = sc.parallelize([1,2,3,4])
rdd.take(5)

                                                                                

[1, 2, 3, 4]

## Partição
Uma partição é uma divisão lógica do dataset, criada para que os dados possam ser distribuídos (e processados) em várias máquinas.

In [5]:
# Build an RDD of the integers 0..99 split into 10 partitions (numSlices),
# then fetch the first ten elements.
rdd = sc.parallelize(range(100), numSlices=10)
rdd.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## Transformações

### Map, flat map, filter, reduce
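Uma transformação gera um novo RDD aplicando uma função aos dados.

As transformações são avaliadas de forma preguiçosa (*lazy*): nada é executado até que uma ação seja chamada, por exemplo `rdd.take(5)`. Observação: `reduce` é, na verdade, uma ação (retorna um valor em vez de um novo RDD); as variantes por chave, como `reduceByKey`, são transformações.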

In [6]:
# Base RDD holding the integers 0..9; the map example in the next cell reads it.
rdd = sc.parallelize(range(10))

rdd.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
# map() applies the function to every element, producing exactly one output
# per input (here, the square of each number).
rdd_map = rdd.map(lambda x: x**2)

rdd_map.take(10)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [8]:
# flatMap() lets each input produce several outputs: every sentence is split
# on spaces and the resulting words end up in one flat RDD.
rdd = sc.parallelize(["hello world", "how are you"])
flatmap = rdd.flatMap(lambda x: x.split(" "))

flatmap.take(10)

['hello', 'world', 'how', 'are', 'you']

In [9]:
# Two independent RDDs of log lines, one per severity.
error_log = sc.parallelize(["[ERROR] Failed to create dataframe", "[ERROR] Failed to train"])
success_log = sc.parallelize(["[SUCCESS] Train successfully completed"])

# union() combines the elements of both RDDs into a single RDD.
log = error_log.union(success_log)

log.take(5)

['[ERROR] Failed to create dataframe',
 '[ERROR] Failed to train',
 '[SUCCESS] Train successfully completed']

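## Ações (Actions)

- `take(n)`: retorna os primeiros `n` elementos
- `collect()`: retorna todos os elementos em uma lista
- `first()`: retorna o primeiro elemento
- `count()`: retorna a quantidade de elementos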

In [10]:
# collect() returns every element of the RDD as a Python list.
log.collect()

['[ERROR] Failed to create dataframe',
 '[ERROR] Failed to train',
 '[SUCCESS] Train successfully completed']

In [11]:
# take(2) returns only the first two elements, as a list.
log.take(2)

['[ERROR] Failed to create dataframe', '[ERROR] Failed to train']

In [12]:
# first() returns the first element itself rather than a one-item list.
log.first()

'[ERROR] Failed to create dataframe'

In [13]:
# count() returns the number of elements in the RDD.
log.count()

3

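## Tuplas (pares chave-valor)

Um RDD de tuplas `(chave, valor)` permite usar operações por chave, como `reduceByKey`, `groupByKey` e `join`.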

In [14]:
# Raw records as "role,salary" comma-separated strings; they are parsed into
# (role, salary) pairs by split_columns.
salary = [
    'CargoA,2500',
    'CargoB,1800',
    'CargoA,2480',
    'CargoA,3500',
    'CargoC,1500',
    'CargoB,1500',
]


def split_columns(x: str):
    """Parse a comma-separated record into a ``(key, value)`` pair.

    The first field is returned unchanged as the key and the second field is
    converted to ``float``. Any fields after the second are ignored.

    Raises:
        IndexError: if the record contains no comma (there is no second field).
        ValueError: if the second field is not a valid float literal.
    """
    fields = x.split(',')
    key = fields[0]
    value = float(fields[1])
    return key, value


# Distribute the raw lines, then parse each one into a (role, salary) tuple.
rdd = sc.parallelize(salary)

# An RDD of 2-tuples is what the by-key operations used later operate on.
pair_rdd = rdd.map(split_columns)

pair_rdd.take(5)


[('CargoA', 2500.0),
 ('CargoB', 1800.0),
 ('CargoA', 2480.0),
 ('CargoA', 3500.0),
 ('CargoC', 1500.0)]

In [15]:
# reduceByKey() merges all values of the same key with the given function;
# here it sums the salaries of each role.
total_by_role = pair_rdd.reduceByKey(lambda x, y: x + y)
total_by_role.collect()

                                                                                

[('CargoA', 8480.0), ('CargoB', 3300.0), ('CargoC', 1500.0)]

In [16]:
# Sum the integers 1..9. reduce() folds the whole RDD into a single value
# with the given two-argument function and returns it directly.
rdd = sc.parallelize(range(1, 10))
rdd.reduce(lambda x, y: x + y)


45

In [17]:
# Sort the (role, total) pairs by key in descending order and show up to five.
total_by_role.sortByKey(ascending=False).take(5)

[('CargoC', 1500.0), ('CargoB', 3300.0), ('CargoA', 8480.0)]

In [18]:
# (category, product) pairs; the category acts as the key.
products = [
    ('CatA', 'Prod 1'),
    ('CatA', 'Prod 2'),
    ('CatC', 'Prod 4'),
    ('CatD', 'Prod 6'),
    ('CatB', 'Prod 3'),
    ('CatC', 'Prod 5'),
]


rdd = sc.parallelize(products)

# groupByKey() gathers all values of a key under a single entry. Each group
# comes back as a ResultIterable object, so collect() shows object reprs
# instead of the product names.
grouped_rdd = rdd.groupByKey()
grouped_rdd.collect()

[('CatB', <pyspark.resultiterable.ResultIterable at 0x7f290151d9d0>),
 ('CatC', <pyspark.resultiterable.ResultIterable at 0x7f290151da60>),
 ('CatD', <pyspark.resultiterable.ResultIterable at 0x7f290151daf0>),
 ('CatA', <pyspark.resultiterable.ResultIterable at 0x7f290151db50>)]

In [19]:
# Each group is a ResultIterable; convert it to a list so the product names
# themselves are printed, one category per line.
for category, items in grouped_rdd.collect():
    print(f'{category}: {list(items)}')

CatB: ['Prod 3']
CatC: ['Prod 4', 'Prod 5']
CatD: ['Prod 6']
CatA: ['Prod 1', 'Prod 2']


In [20]:
# Left side of the join: one (category, product name) pair per category.
products_a = sc.parallelize([
    ('CatA', 'Prod 1'),
    ('CatB', 'Prod 2'),
    ('CatC', 'Prod 3'),
])

# Right side: (category, numeric value) pairs; 'CatC' appears twice.
products_b = sc.parallelize([
    ('CatA', 5.8),
    ('CatB', 1.2),
    ('CatC', 4),
    ('CatC', 3.8),
])


# join() pairs up values that share a key, giving (key, (left, right)).
# Because 'CatC' has two entries on the right, it yields two joined rows.
products = products_a.join(products_b)
products.collect()

[('CatB', ('Prod 2', 1.2)),
 ('CatC', ('Prod 3', 4)),
 ('CatC', ('Prod 3', 3.8)),
 ('CatA', ('Prod 1', 5.8))]

In [21]:
def to_csv_line(record):
    """Format one joined ``(key, (left, right))`` record as ``key,left,right``."""
    key, (left, right) = record
    return f'{key},{left},{right}'


# Render every joined record as a comma-separated text line.
products_txt = products.map(to_csv_line)
products_txt.collect()

['CatB,Prod 2,1.2', 'CatC,Prod 3,4', 'CatC,Prod 3,3.8', 'CatA,Prod 1,5.8']

In [22]:
# Write the formatted lines out as text.
# NOTE(review): saveAsTextFile is expected to create a *directory* named
# 'products.txt' with one part file per partition, and to fail if the target
# already exists — confirm before re-running this cell.
products_txt.saveAsTextFile('products.txt')

In [23]:
# coalesce(1) reduces the RDD to a single partition before saving —
# presumably so the output lands in one part file; TODO confirm.
products_txt.coalesce(1).saveAsTextFile('products_2.txt')

In [24]:
# countByKey() returns a dict-like mapping of key -> number of pairs with that key.
products.countByKey()

defaultdict(int, {'CatB': 1, 'CatC': 2, 'CatA': 1})

In [25]:
# collectAsMap() returns the pairs as a plain dict. A dict holds one value per
# key, so duplicate keys are collapsed: 'CatC' has two pairs in `products`,
# but only one of them appears in the result.
products.collectAsMap()

{'CatB': ('Prod 2', 1.2), 'CatC': ('Prod 3', 3.8), 'CatA': ('Prod 1', 5.8)}

In [26]:
# Template for output locations; the {filename} placeholder is filled in with
# str.format(). Use the commented-out template below to write relative to
# the working directory instead of HDFS.
root_path = 'hdfs://192.168.2.102:9000/dataset/{filename}' # If you want to save in hdfs
# root_path = './{filename}'

In [27]:
# Resolve both output locations from the shared template first.
multi_part_target = root_path.format(filename='products.txt')
single_part_target = root_path.format(filename='products_2.txt')

# Save once with the RDD's current partitioning and once coalesced to a
# single partition.
products_txt.saveAsTextFile(multi_part_target)
products_txt.coalesce(1).saveAsTextFile(single_part_target)

                                                                                