In [1]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Waiting for headers] [1 InRelease 14.2 kB/88.7 kB 16%] [Connecting to cloud                                                                               Get:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
0% [Waiting for headers] [1 InRelease 14.2 kB/88.7 kB 16%] [Connecting to cloud                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease 88.7 kB/88.7 kB 100%] [Connected to cloud.r-project.org (108.15                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [4 InRelease 82.2 kB/88.7 kB 93%] [Connected 

In [2]:
 # Start Spark session
from pyspark.sql import SparkSession
# Reuse an existing session if there is one, otherwise create it under the app name "StopWords".
spark = (
    SparkSession.builder
    .appName("StopWords")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer

In [4]:
# Build a two-row DataFrame: an integer id plus an already-tokenized list of words
sentenceData = spark.createDataFrame(
    data=[
        (0, ["Big", "data", "is", "super", "powerful"]),
        (1, ["This", "is", "going", "to", "be", "epic"]),
    ],
    schema=["id", "raw"],
)

In [5]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover

In [6]:
# Configure the remover: read tokens from "raw", write the remaining tokens to "filtered"
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [7]:
# Transform and show data (columns are not truncated)
(
    remover
    .transform(sentenceData)
    .show(truncate=False)
)

+---+--------------------------------+----------------------------+
|id |raw                             |filtered                    |
+---+--------------------------------+----------------------------+
|0  |[Big, data, is, super, powerful]|[Big, data, super, powerful]|
|1  |[This, is, going, to, be, epic] |[going, epic]               |
+---+--------------------------------+----------------------------+



In [8]:
# Build a sample DataFrame of raw, untokenized sentences keyed by id
dataframe_mod = spark.createDataFrame(
    data=[
        (0, "Spark is great"),
        (1, "We are learning Spark"),
        (2, "Spark is better than hadoop no doubt"),
        (3, "Adding a line for skill drill practice"),
    ],
    schema=["id", "sentence"],
)
# Display the full sentences without truncation
dataframe_mod.show(truncate=False)

+---+--------------------------------------+
|id |sentence                              |
+---+--------------------------------------+
|0  |Spark is great                        |
|1  |We are learning Spark                 |
|2  |Spark is better than hadoop no doubt  |
|3  |Adding a line for skill drill practice|
+---+--------------------------------------+



In [9]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection of tokens, or ``None``.

    Returns:
        ``len(word_list)``, or ``None`` when ``word_list`` is ``None``.
    """
    # A null column value reaches a Python UDF as None; pass it through as
    # null instead of letting len(None) raise TypeError and fail the job.
    if word_list is None:
        return None
    return len(word_list)
# Wrap word_list_length as a Spark user defined function with an integer return type
count_tokens = udf(f=word_list_length, returnType=IntegerType())

In [10]:
# Create our Tokenizer
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Tokenize the sentences. Any existing "words" column is dropped first so this
# cell can be re-run: DataFrame.drop is a no-op when the column is absent,
# whereas Tokenizer.transform raises if its output column already exists.
dataframe_mod = tokenizer.transform(dataframe_mod.drop("words"))

# Add the per-row token count and show the result without truncating
dataframe_mod.withColumn("tokens", count_tokens(col("words"))).show(truncate=False)

+---+--------------------------------------+----------------------------------------------+------+
|id |sentence                              |words                                         |tokens|
+---+--------------------------------------+----------------------------------------------+------+
|0  |Spark is great                        |[spark, is, great]                            |3     |
|1  |We are learning Spark                 |[we, are, learning, spark]                    |4     |
|2  |Spark is better than hadoop no doubt  |[spark, is, better, than, hadoop, no, doubt]  |7     |
|3  |Adding a line for skill drill practice|[adding, a, line, for, skill, drill, practice]|7     |
+---+--------------------------------------+----------------------------------------------+------+



In [11]:
# Configure a remover that reads the "words" column and writes the remaining tokens to "filtered"
remover_mod = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

In [12]:
# Transform and show data (columns are not truncated)
(
    remover_mod
    .transform(dataframe_mod)
    .show(truncate=False)
)

+---+--------------------------------------+----------------------------------------------+--------------------------------------+
|id |sentence                              |words                                         |filtered                              |
+---+--------------------------------------+----------------------------------------------+--------------------------------------+
|0  |Spark is great                        |[spark, is, great]                            |[spark, great]                        |
|1  |We are learning Spark                 |[we, are, learning, spark]                    |[learning, spark]                     |
|2  |Spark is better than hadoop no doubt  |[spark, is, better, than, hadoop, no, doubt]  |[spark, better, hadoop, doubt]        |
|3  |Adding a line for skill drill practice|[adding, a, line, for, skill, drill, practice]|[adding, line, skill, drill, practice]|
+---+--------------------------------------+---------------------------------------