In [1]:
!pip install pyspark
!pip install findspark




In [2]:
import findspark
findspark.init()

In [3]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [4]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

24/09/05 06:30:32 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
if 'spark' in locals() and isinstance(spark, SparkSession):
    print("SparkSession is active and ready to use.")
else:
    print("SparkSession is not active. Please create a SparkSession.")

SparkSession is active and ready to use.


In [6]:
data = range(1,30)
# print first element of iterator
print(data[0])
len(data)
xrangeRDD = sc.parallelize(data, 4)

# this will let us know that we created an RDD
xrangeRDD


1


PythonRDD[1] at RDD at PythonRDD.scala:53

In [7]:
subRDD = xrangeRDD.map(lambda x: x-1)
filteredRDD = subRDD.filter(lambda x : x<10)

In [8]:
print(filteredRDD.collect())
filteredRDD.count()

                                                                                

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


10

In [9]:
import time 

test = sc.parallelize(range(1,50000),4)
test.cache()

t1 = time.time()
# first count will trigger evaluation of count *and* cache
count1 = test.count()
dt1 = time.time() - t1
print("dt1: ", dt1)


t2 = time.time()
# second count operates on cached data only
count2 = test.count()
dt2 = time.time() - t2
print("dt2: ", dt2)

#test.count()

                                                                                

dt1:  0.8319389820098877
dt2:  0.22359561920166016


In [10]:
spark

In [11]:
# Download the data first into a local `people.json` file
!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/data/people.json >> people.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    73  100    73    0     0    839      0 --:--:-- --:--:-- --:--:--   839


In [12]:
# Read the dataset into a spark dataframe using the `read.json()` function
df = spark.read.json("people.json").cache()

print(df.collect())
print(df.count())

                                                                                

[Row(age=None, name='Michael'), Row(age=30, name='Andy'), Row(age=19, name='Justin'), Row(age=None, name='Michael'), Row(age=30, name='Andy'), Row(age=19, name='Justin')]
6


In [13]:
# Print the dataframe as well as the data schema
df.show()
df.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [14]:
# Register the DataFrame as a SQL temporary view
df.createTempView("people")

In [15]:
# Select and show basic data columns

df.select("name").show()
df.select(df["name"]).show()
spark.sql("SELECT name FROM people").show()


+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
|Michael|
|   Andy|
| Justin|
+-------+

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
|Michael|
|   Andy|
| Justin|
+-------+

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
|Michael|
|   Andy|
| Justin|
+-------+



In [16]:
# Perform basic filtering

df.filter(df["age"] > 21).show()
spark.sql("SELECT age, name FROM people WHERE age > 21").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
| 30|Andy|
+---+----+

+---+----+
|age|name|
+---+----+
| 30|Andy|
| 30|Andy|
+---+----+



In [17]:
# Perfom basic aggregation of data

df.groupBy("age").count().show()
spark.sql("SELECT age, COUNT(age) as count FROM people GROUP BY age").show()

                                                                                

+----+-----+
| age|count|
+----+-----+
|  19|    2|
|null|    2|
|  30|    2|
+----+-----+





+----+-----+
| age|count|
+----+-----+
|  19|    2|
|null|    0|
|  30|    2|
+----+-----+



                                                                                

In [18]:
spark


In [46]:
data = range(1, 50)

RDD = sc.parallelize(data)

# Filter the even numbers using modulus
Even = RDD.filter(lambda x: x % 2 == 0)

print(Even.collect())


[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48]


In [59]:
!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/people2.json >> people2.json

df = spark.read.json("people2.json")

df.cache()

print(df.collect(), df.count())

df.show()

df.createOrReplaceTempView("people2")

spark.sql("SELECT AVG(age) AS average_age FROM people2").show()




  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   136  100   136    0     0   1789      0 --:--:-- --:--:-- --:--:--  1789


24/09/05 07:40:53 WARN execution.CacheManager: Asked to cache already cached data.


[Row(age=25, name='Michael'), Row(age=24, name='Andy'), Row(age=19, name='Justin'), Row(age=26, name='George'), Row(age=30, name='Jeff'), Row(age=24, name='Andy'), Row(age=19, name='Justin'), Row(age=26, name='George'), Row(age=30, name='Jeff'), Row(age=24, name='Andy'), Row(age=19, name='Justin'), Row(age=26, name='George'), Row(age=30, name='Jeff'), Row(age=24, name='Andy'), Row(age=19, name='Justin'), Row(age=26, name='George'), Row(age=30, name='Jeff'), Row(age=24, name='Andy'), Row(age=19, name='Justin'), Row(age=26, name='George'), Row(age=30, name='Jeff'), Row(age=24, name='Andy'), Row(age=19, name='Justin'), Row(age=26, name='George'), Row(age=30, name='Jeff')] 25
+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
| 24|   Andy|
| 19| Justin|
| 26| Georg

In [61]:
spark.stop()

In [None]:
spark