In [16]:
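# Install Spark and Java (uncomment these lines on a fresh Colab runtime).
# NOTE: the wget/tar lines expand $SPARK_VERSION, so that variable must already be set in the environment (the setup cell sets it via os.environ).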
# ! pip install pyspark
# !apt-get update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
# !tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
# !pip install -q findspark
# !pip install nltk
# !python -m nltk.downloader popular

In [12]:
import os
import findspark
from pyspark.sql import SparkSession

# Spark release to run against. Pick a current 3.x build from
# http://www.apache.org/dist/spark/ ; the name must match the unpacked
# "<version>-bin-hadoop2.7" directory that SPARK_HOME points at below.
spark_version = 'spark-3.2.1'

# Export everything findspark / the JVM need in one place.
os.environ.update({
    "SPARK_VERSION": spark_version,
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": f"/content/{spark_version}-bin-hadoop2.7",
})

# Make the Spark installation under SPARK_HOME importable, then open
# (or reuse) the session used by the rest of the notebook.
findspark.init()
spark = SparkSession.builder.appName("Spark_NLP").getOrCreate()

In [13]:
# Use NLTK (an NLP library) for tokenization and part-of-speech tagging
import nltk
from nltk import word_tokenize
# Split the sample sentence into word tokens.
text = word_tokenize("I enjoy biking on the trails")
# Tag every token with its part of speech; the result is a list of
# (token, tag) tuples, e.g. ('I', 'PRP').
output = nltk.pos_tag(text)
print(output)

[('I', 'PRP'), ('enjoy', 'VBP'), ('biking', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('trails', 'NNS')]


In [14]:
from pyspark.ml.feature import Tokenizer

In [15]:
# Build a small three-row demo DataFrame of (id, sentence) pairs
dataframe = spark.createDataFrame(
    data=[
        (0, "spark is great"),
        (1, "I like learning spark"),
        (2, "Why learn hadoop when you can learn spark"),
    ],
    schema=["id", "sentence"],
)
dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      spark is great|
|  1|I like learning s...|
|  2|Why learn hadoop ...|
+---+--------------------+



In [17]:
# Configure a Tokenizer that reads the `sentence` column and writes the
# resulting tokens to a new `words` column. Nothing is tokenized yet; that
# happens when .transform() is called on a DataFrame.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# Bare expression so the notebook displays the transformer's repr (its uid).
tokenizer

Tokenizer_221cb1b077fe

In [18]:
# Apply the tokenizer to the sample DataFrame; the result keeps `id` and
# `sentence` and gains the `words` column.
tokenized_df = tokenizer.transform(dataframe)
# truncate=False prints the full cell contents instead of shortening long values.
tokenized_df.show(truncate=False)

+---+-----------------------------------------+--------------------------------------------------+
|id |sentence                                 |words                                             |
+---+-----------------------------------------+--------------------------------------------------+
|0  |spark is great                           |[spark, is, great]                                |
|1  |I like learning spark                    |[i, like, learning, spark]                        |
|2  |Why learn hadoop when you can learn spark|[why, learn, hadoop, when, you, can, learn, spark]|
+---+-----------------------------------------+--------------------------------------------------+



In [19]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [20]:
def word_list_length(word_list):
    """Return the number of tokens in ``word_list``, duplicates included.

    Args:
        word_list: Sequence of tokens for one row, or ``None`` when the
            row's array column is SQL NULL.

    Returns:
        The token count as an ``int``, or ``None`` when ``word_list`` is
        ``None`` so that the null propagates to the output column.
    """
    # Spark passes SQL NULL to a Python UDF as None; len(None) would raise
    # TypeError and abort the whole job, so pass the null through instead.
    if word_list is None:
        return None
    return len(word_list)
def word_unique_list_length(word_list):
    """Return the number of distinct tokens in ``word_list``.

    Args:
        word_list: Sequence of hashable tokens for one row, or ``None``
            when the row's array column is SQL NULL.

    Returns:
        The count of unique tokens as an ``int``, or ``None`` when
        ``word_list`` is ``None`` so that the null propagates to the
        output column.
    """
    # Spark passes SQL NULL to a Python UDF as None; set(None) would raise
    # TypeError and abort the whole job, so pass the null through instead.
    if word_list is None:
        return None
    return len(set(word_list))
# Wrap the plain Python helpers as Spark user-defined functions so they can
# be applied to DataFrame columns; both yield an integer column.
count_tokens = udf(word_list_length, returnType=IntegerType())
count_unique_tokens = udf(word_unique_list_length, returnType=IntegerType())

In [21]:
# Append the total and distinct token counts for each row's `words` array.
tokenized_df = (
    tokenized_df
    .withColumn("tokens", count_tokens(col("words")))
    .withColumn("unique_tokens", count_unique_tokens(col("words")))
)
tokenized_df.show()

+---+--------------------+--------------------+------+-------------+
| id|            sentence|               words|tokens|unique_tokens|
+---+--------------------+--------------------+------+-------------+
|  0|      spark is great|  [spark, is, great]|     3|            3|
|  1|I like learning s...|[i, like, learnin...|     4|            4|
|  2|Why learn hadoop ...|[why, learn, hado...|     8|            7|
+---+--------------------+--------------------+------+-------------+



In [22]:
# Build a demo DataFrame whose sentences contain stop words
dataframe_stopwords = spark.createDataFrame(
    data=[
        (0, "I use spark with big data"),
        (1, "This stop words will be removed"),
        (2, "This is going to be cool"),
    ],
    schema=["id", "sentence"],
)
dataframe_stopwords.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|I use spark with ...|
|  1|This stop words w...|
|  2|This is going to ...|
+---+--------------------+



In [33]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover
# Configure the remover: it reads token arrays from the `words` column and
# writes the tokens that survive filtering to a new `filtered` column.
# Nothing is filtered here; that happens when .transform() is called.
# NOTE(review): no stop-word list is passed, so the library default applies.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

In [31]:
# Tokenize the stop-word sentences with the existing `tokenizer`, producing
# the `words` column that the remover is configured to read.
tokenized_dataframe_stopwords = tokenizer.transform(dataframe_stopwords)
# truncate=False prints the full cell contents instead of shortening long values.
tokenized_dataframe_stopwords.show(truncate=False)

+---+-------------------------------+--------------------------------------+
|id |sentence                       |words                                 |
+---+-------------------------------+--------------------------------------+
|0  |I use spark with big data      |[i, use, spark, with, big, data]      |
|1  |This stop words will be removed|[this, stop, words, will, be, removed]|
|2  |This is going to be cool       |[this, is, going, to, be, cool]       |
+---+-------------------------------+--------------------------------------+



In [34]:
# Strip stop words from the tokenized sentences, then preview the result
# (long cell values are shortened in the printed table).
filtered_dataframe_stopwords = remover.transform(tokenized_dataframe_stopwords)
filtered_dataframe_stopwords.show(truncate=True)

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|            filtered|
+---+--------------------+--------------------+--------------------+
|  0|I use spark with ...|[i, use, spark, w...|[use, spark, big,...|
|  1|This stop words w...|[this, stop, word...|[stop, words, rem...|
|  2|This is going to ...|[this, is, going,...|       [going, cool]|
+---+--------------------+--------------------+--------------------+

