# PySpark

In [1]:
import os
import pyspark
from pyspark import SparkContext
import sys

# Make the Spark workers and the driver use the same interpreter as this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)


In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# calling SparkContext("local", "test") a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local").setAppName("test")
)
sc

In [3]:
# Spread the integers 1..10 over 4 partitions. take(100) asks for more
# elements than the RDD holds, so all ten come back.
rdd = sc.parallelize(range(1, 11), numSlices=4)
rdd.take(100)


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

### Transform

map

In [4]:
# map: apply the function to every element, one output per input.
rdd = sc.parallelize(range(1, 11), numSlices=4)
rdd_map = rdd.map(lambda value: value * 2)
print(rdd.collect(), rdd_map.collect(), sep="\n")

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


flatMap

In [5]:
rdd2 = sc.parallelize(["hello SamShare", "hello PySpark"])
print(rdd2.collect())
# map keeps one list of words per input string ...
print(rdd2.map(lambda line: line.split(" ")).collect())
# ... while flatMap flattens those lists into a single sequence of words.
print(rdd2.flatMap(lambda line: line.split(" ")).collect())

['hello SamShare', 'hello PySpark']
[['hello', 'SamShare'], ['hello', 'PySpark']]
['hello', 'SamShare', 'hello', 'PySpark']


filter

In [6]:
# filter: keep only the elements for which the predicate is true (even numbers here).
rdd = sc.parallelize(range(1, 11), numSlices=4)
print(rdd.collect())
print(rdd.filter(lambda number: not number % 2).collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[2, 4, 6, 8, 10]


distinct

In [7]:
# distinct: drop duplicate elements.
rdd = sc.parallelize([2, 2, 4, 8, 8, 8, 8, 16, 32, 32])
print(rdd.collect(), rdd.distinct().collect(), sep="\n")

[2, 2, 4, 8, 8, 8, 8, 16, 32, 32]
[2, 4, 8, 16, 32]


reduceByKey

In [8]:
from operator import add

# reduceByKey: merge the values that share a key using the given binary function.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
print(rdd.collect(), rdd.reduceByKey(add).collect(), sep="\n")

[('a', 1), ('b', 1), ('a', 1)]
[('a', 2), ('b', 1)]


mapPartitions

In [9]:
def f(iterator):
    """Yield one value: the sum of all elements of a single partition."""
    partition_total = sum(iterator)
    yield partition_total


# mapPartitions calls f once per partition instead of once per element.
rdd = sc.parallelize([1, 2, 3, 4], numSlices=2)
print(rdd.collect())
print(rdd.mapPartitions(f).collect())

[1, 2, 3, 4]
[3, 7]


sortBy

In [10]:
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
# Sort by the first field of each pair, then by the second.
print(sc.parallelize(tmp).sortBy(keyfunc=lambda pair: pair[0]).collect())
print(sc.parallelize(tmp).sortBy(keyfunc=lambda pair: pair[1]).collect())

[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
[('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]


subtract

In [11]:
# subtract: elements of x that do not appear in y (whole pairs are compared).
x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)])
y = sc.parallelize([("a", 3), ("c", None)])
print(
    sorted(
        x.subtract(y).collect()
    )
)

[('a', 1), ('b', 4), ('b', 5)]


union

In [12]:
# union: concatenate two RDDs; duplicates are kept.
rdd = sc.parallelize([1, 1, 2, 3])
print(
    rdd.union(rdd).collect()
)

[1, 1, 2, 3, 1, 1, 2, 3]


intersection

In [13]:
# intersection: elements present in both RDDs, without duplicates.
rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5, 2, 3])
rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
print(
    rdd1.intersection(rdd2).collect()
)

[2, 1, 3]


cartesian

In [14]:
# cartesian: every ordered pair (a, b) with a from the first RDD and b from the second.
rdd = sc.parallelize([1, 2, 3])
print(
    sorted(
        rdd.cartesian(rdd).collect()
    )
)

[(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3)]


zip

In [15]:
# zip: pair up the elements of two RDDs position by position.
x = sc.parallelize(range(5))
y = sc.parallelize(range(1000, 1005))
print(
    x.zip(y).collect()
)

[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]


zipWithIndex

In [16]:
# zipWithIndex: attach each element's position as the second field of a pair.
rdd_name = sc.parallelize(
    ["LiLei", "Hanmeimei", "Lily", "Lucy", "Ann", "Dachui", "RuHua"]
)
rdd_index = rdd_name.zipWithIndex()
print(
    rdd_index.collect()
)

[('LiLei', 0), ('Hanmeimei', 1), ('Lily', 2), ('Lucy', 3), ('Ann', 4), ('Dachui', 5), ('RuHua', 6)]


groupByKey

In [18]:
rdd = sc.parallelize([("a", 3), ("b", 1), ("a", 1)])
# Group the values per key, then summarise each group three ways:
# number of values, the values themselves, and their sum.
for aggregate in (len, list, sum):
    print(sorted(rdd.groupByKey().mapValues(aggregate).collect()))

[('a', 2), ('b', 1)]
[('a', [3, 1]), ('b', [1])]
[('a', 4), ('b', 1)]


sortByKey

In [23]:
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
# Sort by key in descending order, using a single output partition.
print(
    sc.parallelize(tmp)
    .sortByKey(ascending=False, numPartitions=1)
    .collect()
)

[('d', 4), ('b', 2), ('a', 1), ('2', 5), ('1', 3)]


join

In [25]:
# join: inner join on the key; the value pair follows the order of the operands.
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
print(sorted(x.join(y).collect()), sorted(y.join(x).collect()), sep="\n")

[('a', (1, 2)), ('a', (1, 3))]
[('a', (2, 1)), ('a', (3, 1))]


leftOuterJoin/rightOuterJoin

In [27]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
# Left outer join keeps every key of x; keys missing from y get None.
print(sorted(x.leftOuterJoin(y).collect()))
# Right outer join keeps every key of y.
print(sorted(x.rightOuterJoin(y).collect()))

[('a', (1, 2)), ('b', (4, None))]
[('a', (1, 2))]


### Action

collect

In [28]:
# collect: bring every element of the RDD back to the driver as a list.
rdd = sc.parallelize(range(5))
rdd_collect = rdd.collect()
print(rdd_collect)

[0, 1, 2, 3, 4]


first

In [30]:
# first: return the first element of the RDD.
print(
    sc.parallelize([2, 3, 4]).first()
)

2


collectAsMap

In [31]:
# collectAsMap: return the key/value pairs to the driver as a dict.
print(
    sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
)

{1: 2, 3: 4}


reduce

In [37]:
# reduce: fold all elements into one value with a binary function.
rdd = sc.parallelize(range(10), numSlices=5)
print(rdd.reduce(lambda left, right: left + right))
# operator.add (imported in the reduceByKey cell) does the same job as the lambda.
print(rdd.reduce(add))

45
45


countByKey/countByValue

In [38]:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
# countByKey counts occurrences of each key; countByValue counts each whole element.
print(
    sorted(rdd.countByKey().items()),
    sorted(rdd.countByValue().items()),
    sep="\n",
)

[('a', 2), ('b', 1)]
[(('a', 1), 2), (('b', 1), 1)]


take

In [39]:
# take: return up to the requested number of elements (only three exist here).
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
print(
    rdd.take(5)
)

[('a', 1), ('b', 1), ('a', 1)]


takeSample

In [43]:
# takeSample: draw 5 elements with replacement, using a fixed seed.
rdd = sc.parallelize(range(100), numSlices=4)
print(
    rdd.takeSample(withReplacement=True, num=5, seed=42)
)

[59, 55, 27, 70, 98]


foreach

In [44]:
# foreach: run a function on every element for its side effect. Here each
# element is added to an accumulator whose total is read back on the driver.
rdd = sc.parallelize(range(10), numSlices=5)
accum = sc.accumulator(0)
rdd.foreach(lambda element: accum.add(element))
print(accum.value)

45


## SparkSQL

In [47]:
from pyspark.sql import SparkSession

# Obtain a SparkSession (created if none exists yet) ...
spark = (
    SparkSession.builder
    .appName('testsql')
    .getOrCreate()
)

# ... and rebind `sc` to the SparkContext that backs this session.
sc = spark.sparkContext

### SparkDataFrame

In [48]:
# Build a DataFrame from an RDD of tuples by supplying the column names.
rdd = sc.parallelize(
    [
        ("Sam", 28, 88),
        ("Flora", 28, 90),
        ("Run", 1, 60),
    ]
)
df = rdd.toDF(schema=["name", "age", "score"])
df.show()
df.printSchema()

+-----+---+-----+
| name|age|score|
+-----+---+-----+
|  Sam| 28|   88|
|Flora| 28|   90|
|  Run|  1|   60|
+-----+---+-----+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- score: long (nullable = true)



In [51]:
import pandas as pd
# Build the same table as a pandas DataFrame, then convert it to a Spark DataFrame.
# Note: `df` now refers to the pandas frame.
df = pd.DataFrame(
    data=[
        ['Sam', 28, 88],
        ['Flora', 28, 90],
        ['Run', 1, 60],
    ],
    columns=['name', 'age', 'score'],
)
print(df)
Spark_df = spark.createDataFrame(df)
Spark_df.show()

    name  age  score
0    Sam   28     88
1  Flora   28     90
2    Run    1     60
+-----+---+-----+
| name|age|score|
+-----+---+-----+
|  Sam| 28|   88|
|Flora| 28|   90|
|  Run|  1|   60|
+-----+---+-----+



In [52]:
# Build a Spark DataFrame directly from a list of rows plus the column names.
list_values = [
    ['Sam', 28, 88],
    ['Flora', 28, 90],
    ['Run', 1, 60],
]
Spark_df = spark.createDataFrame(list_values, schema=['name', 'age', 'score'])
Spark_df.show()

+-----+---+-----+
| name|age|score|
+-----+---+-----+
|  Sam| 28|   88|
|Flora| 28|   90|
|  Run|  1|   60|
+-----+---+-----+



In [54]:
# Export from the Spark frame rather than from `df`: `df` is rebound to a Spark
# DataFrame below, so calling df.to_csv here would fail when the cell is re-run.
# The pandas index is written as well, which is why the file has an unnamed
# first column that Spark reads back as `_c0`.
Spark_df.toPandas().to_csv('test_csv.csv')

# Read the file back, treating the first line as the header and letting Spark
# infer the column types.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv("test_csv.csv")
)
df.show(2)
df.printSchema()


+---+-----+---+-----+
|_c0| name|age|score|
+---+-----+---+-----+
|  0|  Sam| 28|   88|
|  1|Flora| 28|   90|
+---+-----+---+-----+
only showing top 2 rows

root
 |-- _c0: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- score: integer (nullable = true)



### SparkDataFrame API