# Install Java and Spark (pre-built for Hadoop 3)

In [None]:
# install java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# install spark (change the version number if needed)
!wget -q https://downloads.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
# unzip the spark file to the current folder
!tar xf spark-3.3.2-bin-hadoop3.tgz

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease [24.3 kB]
Get:11 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,681 kB]
Get:12 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1,343 kB]
Hit:13 http://ppa.launchpad.net/ubuntu

In [None]:
# Tell Spark where the Java installation and the unpacked Spark folder live.
import os
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.2-bin-hadoop3",
})


In [None]:
!pip install findspark
import findspark
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


# Creating a SparkSession in Python

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Create the session (or reuse the one that already exists) with a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Introduction to Spark")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Import necessary libraries
from pyspark.sql.functions import col, column, expr
from pyspark.sql import functions as f

# Answer the questions

0- Load the data files

In [None]:
!git clone https://github.com/nnthaofit/CSC14118.git

Cloning into 'CSC14118'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), 762.51 KiB | 1.48 MiB/s, done.


In [None]:
# Read the movies JSON file from the cloned CSC14118 folder into a DataFrame.
df = spark.read.format("json").load("CSC14118/movies.json")

1- Show the schema of the DataFrame that stores the movies dataset.

In [None]:
# The schema is returned as a StructType; being the last expression, its repr is the cell output.
df.schema

StructType([StructField('cast', ArrayType(StringType(), True), True), StructField('genres', ArrayType(StringType(), True), True), StructField('title', StringType(), True), StructField('year', LongType(), True)])

In [None]:
# Preview the movies, sorted from the most recent release year to the oldest
# (full column contents, no truncation).
df.sort(df.year.desc()).show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+---------------------------+----+
|cast                                                                                                                                                                  |genres                                   |title                      |year|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+---------------------------+----+
|[Lin Shaye, Angus Sampson, Leigh Whannell, Spencer Locke, Caitlin Gerard, Kirk Acevedo, Bruce Davison]                                                                |[Horror, Thriller]                       |Insidious: The Last Key    |2018|
|[James Corden, Domhnall

In [None]:
# 1. Show the number of distinct films in the dataset
# '*' makes the distinct count consider every column of a row together.
distinct_films = f.count_distinct('*')
df.select(distinct_films).show()

+-----------------------------------------+
|count(DISTINCT cast, genres, title, year)|
+-----------------------------------------+
|                                    28789|
+-----------------------------------------+



In [None]:
# 2. Count the number of movies released between the years 2012 and 2015 (inclusive)
released_2012_to_2015 = df.year.between(2012, 2015)  # between() includes both bounds
df.filter(released_2012_to_2015).count()

1015

In [None]:
# 3. Show the year in which the number of movies released is highest
# Count the movies per year, with the largest count first.
grouped_df = df.groupBy('year').count().sort(f.desc('count'))
# The first row holds the maximum count; keep every year that reaches it (ties included).
highest_freq = grouped_df.first()['count']
grouped_df.filter(grouped_df['count'] == highest_freq).show()

+----+-----+
|year|count|
+----+-----+
|1919|  634|
+----+-----+



In [None]:
# 4. Show the movies that have at least five actors/actresses and belong to at most two genres.
has_enough_cast = f.size('cast') >= 5
has_few_genres = f.size('genres') <= 2
df.filter(has_enough_cast & has_few_genres).show()

+--------------------+----------------+--------------------+----+
|                cast|          genres|               title|year|
+--------------------+----------------+--------------------+----+
|[Earle Foxe, Alie...|         [Drama]|  A Desperate Chance|1913|
|[Charlotte Burton...|         [Drama]|    The Archeologist|1914|
|[Charlotte Burton...|         [Drama]|At the Potter's W...|1914|
|[Herbert Tracey, ...|        [Comedy]|    Back to the Farm|1914|
|[Charlotte Burton...|              []|    The Beggar Child|1914|
|[William Garwood,...|              []|       Billy's Rival|1914|
|[B. Reeves Eason,...|         [Drama]| Break, Break, Break|1914|
|[Charlotte Burton...|              []|       The Butterfly|1914|
|[Charlotte Burton...|       [Western]|Calamity Anne's L...|1914|
|[Charlie Chaplin,...|        [Comedy]|    The Star Boarder|1914|
|[Sydney Ayres, Ja...|              []|A Story of Little...|1914|
|[Sydney Ayres, Pe...|              []|The Story of the ...|1914|
|[Charlott

In [None]:
# 5. Show the movies whose name is the longest
# Attach the title length to every title and put the longest titles first.
new_df = df.select('title', f.length('title').alias('length')).orderBy(f.col('length'), ascending = False)
# The first row carries the maximum length; keep every title of that length (ties included).
highest_len = new_df.first()['length']
# truncate=False prints the whole title; by default show() cuts strings to 20 characters,
# which hides most of a title that is the answer precisely because it is long.
new_df.where(f.col('length') == highest_len).show(truncate = False)

+--------------------+------+
|               title|length|
+--------------------+------+
|Cornell-Columbia-...|   110|
+--------------------+------+



In [None]:
# 6. Show the movies whose name contains the word "fighting" (case-insensitive)
# Lower-casing the title first makes the substring match ignore case.
mentions_fighting = f.lower(f.col('title')).contains('fighting')
df.filter(mentions_fighting).select('title').show()

+--------------------+
|               title|
+--------------------+
|  A Fighting Colleen|
|     Fighting Cressy|
|    Fighting Destiny|
|   Fighting for Gold|
|  The Fighting Heart|
|   The Fighting Line|
|  The Fighting Guide|
| The Fighting Streak|
|  The Fighting Blade|
| The Fighting Coward|
|       Fighting Fury|
|The Fighting Adve...|
|    The Fighting Sap|
|  The Fighting Demon|
|       Fighting Fate|
|  The Fighting Heart|
|       Fighting Luck|
|  The Fighting Smile|
| Fighting the Flames|
|      Fighting Youth|
+--------------------+
only showing top 20 rows



In [None]:
# 7. Show the list of distinct genres appearing in the dataset
# explode() yields one row per genre of each movie; distinct() removes the repeats.
genres_df = df.select(f.explode('genres').alias('genres')).distinct()
# show() prints only the first 20 rows by default, which would cut the list short;
# passing the row count prints every distinct genre.
genres_df.show(genres_df.count(), truncate = False)

+-------------+
|       genres|
+-------------+
|        Crime|
|      Romance|
|     Thriller|
|      Slasher|
|Found Footage|
|    Adventure|
|         Teen|
| Martial Arts|
|       Sports|
|        Drama|
|          War|
|  Documentary|
|       Family|
|      Fantasy|
|       Silent|
|     Disaster|
|        Legal|
|      Mystery|
| Supernatural|
|     Suspense|
+-------------+
only showing top 20 rows



In [None]:
# 8. List all movies in which the actor Harrison Ford has participated.
# Keep the rows whose cast array has an element equal to 'Harrison Ford'.
features_ford = f.array_contains('cast', 'Harrison Ford')
df.filter(features_ford).show(truncate = False)

+-------------------------------------------------+-----------------+-------------------------+----+
|cast                                             |genres           |title                    |year|
+-------------------------------------------------+-----------------+-------------------------+----+
|[Constance Talmadge, Harrison Ford]              |[Romance, Comedy]|Experimental Marriage    |1919|
|[Constance Talmadge, Harrison Ford]              |[Comedy]         |Happiness a la Mode      |1919|
|[Constance Talmadge, Harrison Ford]              |[Comedy]         |Romance and Arabella     |1919|
|[Vivian Martin, Harrison Ford]                   |[Comedy]         |The Third Kiss           |1919|
|[Harrison Ford, Constance Talmadge]              |[Comedy]         |The Veiled Adventure     |1919|
|[Constance Talmadge, Harrison Ford]              |[Comedy]         |Who Cares?               |1919|
|[Vivian Martin, Harrison Ford]                   |[Drama]          |You Never Saw Such a G

In [None]:
# 9. List all movies featuring actors/actresses whose names include the word "Lewis" (case-insensitive).
# Replace the cast array by one row per cast member, so each row is a single (actor, movie) pair.
newdf = df.withColumn('cast', f.explode('cast'))
# Compare in lower case so the match ignores case.
name_has_lewis = f.lower(f.col('cast')).contains('lewis')
newdf.filter(name_has_lewis).show()

+--------------+-----------+--------------------+----+
|          cast|     genres|               title|year|
+--------------+-----------+--------------------+----+
|     Ida Lewis|         []|       The Butterfly|1914|
| Sheldon Lewis|    [Drama]|The Exploits of E...|1914|
|     Ida Lewis|   [Comedy]| Mein Lieber Katrina|1914|
|   Ralph Lewis|    [Drama]|      Going Straight|1916|
|   Ralph Lewis|    [Drama]|Gretchen the Gree...|1916|
|     Ben Lewis|  [Western]|     A Sister of Six|1916|
| Lewis J. Cody|    [Drama]| The Bride's Silence|1917|
|Mitchell Lewis|    [Drama]|Nine-Tenths of th...|1918|
|Mitchell Lewis|    [Drama]|The Faith of the ...|1919|
|   Ralph Lewis|   [Comedy]|         The Hoodlum|1919|
|Mitchell Lewis|    [Drama]|Jacques of the Si...|1919|
|Mitchell Lewis|    [Drama]|The Last of His P...|1919|
|   Lewis Stone|    [Drama]|        Man's Desire|1919|
|    Vera Lewis|   [Comedy]|   Yvonne from Paris|1919|
|Mitchell Lewis|    [Drama]|Nine-Tenths of th...|1919|
|   Ralph 

In [None]:
# 10. Show top five actors/actresses that have participated in most movies.
# One row per (actor, movie) pair, so counting rows per actor counts their movies.
exploded_df = df.withColumn('cast', f.explode(df.cast))
# Keep the ranked DataFrame in res_df; show() only prints and returns None,
# so it must not be part of the assignment.
res_df = exploded_df.groupBy(exploded_df.cast).count().orderBy('count', ascending = False)
res_df.show(5)


+----------------+-----+
|            cast|count|
+----------------+-----+
|    Harold Lloyd|  190|
|     Hoot Gibson|  142|
|      John Wayne|  136|
|Charles Starrett|  116|
|    Bebe Daniels|  103|
+----------------+-----+
only showing top 5 rows



# B. RDD exercises

## 1. Given a string s that includes only alphabetical letters and spaces, check whether s is a palindrome.

In [None]:
# A palindrome reads the same backward as forward.
s = "race car"

def isPalindrome(s):
  """Return True when `s` is identical to its reverse (spaces and case are not ignored)."""
  # Comparing each character of the first half with its mirror covers the whole string.
  return all(s[i] == s[-1 - i] for i in range(len(s) // 2))

# Spaces are removed before the check so that only the letters are compared.
isPalindrome("".join(s.split(" ")))

True

In [None]:
s = 'race car'
# Distribute the characters of s as an RDD, then drop the spaces.
characters = spark.sparkContext.parallelize(list(s))
rdd = characters.filter(lambda ch: not ch == ' ')
rdd.collect()

['r', 'a', 'c', 'e', 'c', 'a', 'r']

In [None]:
# An RDD for the original series of letters, as (index, letter) pairs.
# zipWithIndex numbers the elements itself. Zipping with a separately built range RDD
# only works when both RDDs have the same number of elements in every partition,
# which is not guaranteed once Spark runs with more than one partition.
rddForward = rdd.zipWithIndex().map(lambda pair: (pair[1], pair[0]))
rddForward.collect()

[(0, 'r'), (1, 'a'), (2, 'c'), (3, 'e'), (4, 'c'), (5, 'a'), (6, 'r')]

In [None]:
# The same (index, letter) pairs, ordered from the last index down to the first.
rddBackward = rddForward.sortBy(lambda pair: pair[0], ascending=False)
rddBackward.collect()

[(6, 'r'), (5, 'a'), (4, 'c'), (3, 'e'), (2, 'c'), (1, 'a'), (0, 'r')]

In [None]:
# Pair every (index, letter) with the pair found at the mirrored position.
# The pairing joins on the index instead of using zip(): RDD.zip requires both RDDs to
# have the same number of partitions and of elements per partition, which is not
# guaranteed after sortBy when Spark runs with more than one partition.
lastIndex = rddForward.count() - 1
# Key each backward pair by the forward position it mirrors (index i <-> lastIndex - i).
mirrored = rddBackward.map(lambda pair: (lastIndex - pair[0], pair))
# join gives (index, (forward pair, backward pair)); sort by index and keep only the pairs.
rddCombined = rddForward.keyBy(lambda pair: pair[0]).join(mirrored).sortByKey().values()
rddCombined.collect()

[((0, 'r'), (6, 'r')),
 ((1, 'a'), (5, 'a')),
 ((2, 'c'), (4, 'c')),
 ((3, 'e'), (3, 'e')),
 ((4, 'c'), (2, 'c')),
 ((5, 'a'), (1, 'a')),
 ((6, 'r'), (0, 'r'))]

In [None]:
# Number of positions whose forward and backward letters differ; 0 means s is a palindrome.
rddCombined.map(lambda pair: int(pair[0][1] != pair[1][1])).sum()

0

## 2. Given a string s that includes only alphabetical letters and spaces, check whether s is a pangram.

In [None]:
s = "The quick brown fox jumps over the lazy dog"
s1 = "The quick brown fox jumps over the dog"
def isPangram(s):
  """Return True when `s` contains every letter from 'a' to 'z' at least once.

  The check ignores case, and characters other than the 26 English letters
  (spaces included) are not counted.
  """
  # Lower-case here so that 'T' and 't' count as the same letter whatever the caller passes.
  letters = spark.sparkContext.parallelize(s.lower()).filter(lambda ch: 'a' <= ch <= 'z')
  # Only a-z survive the filter, so 26 distinct values means the whole alphabet is present.
  return letters.distinct().count() == 26

isPangram(s.lower())

True

## 3. Given two strings, s1 and s2, that include only alphabetical letters and spaces, check whether s1 is an anagram of s2.

In [None]:
str1 = "listen" 
str2 = "silent"

def sortStr(str):
  """Return an RDD holding the non-space characters of `str` in ascending order.

  Note: the parameter name shadows the built-in `str` inside this function; it is
  kept as is so that existing calls remain valid.
  """
  # Build the RDD from the argument itself (not from a global variable), so that
  # each input string is processed independently.
  return spark.sparkContext.parallelize(str).filter(lambda ch: ch != ' ')\
         .sortBy(lambda ch: ch)

def isAnagram(str1, str2):
  """Return True when `str1` and `str2` use exactly the same letters the same number of times.

  Spaces and letter case are ignored.
  """
  # Two strings are anagrams exactly when their sorted letters are identical.
  # Comparing the collected lists also handles inputs of different lengths, for which
  # RDD.zip would raise instead of answering False.
  letters1 = sortStr(str1.lower()).collect()
  letters2 = sortStr(str2.lower()).collect()
  return letters1 == letters2

isAnagram(str1, str2)

True