# Import libraries and start the Spark session

In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook via getOrCreate(), registered
# under the application name "Natural Language Processing".
spark = (
    SparkSession.builder
    .appName("Natural Language Processing")
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql import functions as f 
from pyspark.sql.types import IntegerType

# Tokenizer - Sentence to words

In [3]:
# Three example sentences keyed by an integer id. The last sentence separates
# its words with ", " and ends with a space.
data = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic, regression, models, are, neat '),
    ],
    schema=['id', 'sentence'],
)

In [4]:
# Print the schema Spark inferred for `data` to confirm column names and types.
data.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)



In [5]:
# Preview the rows; with no arguments show() abbreviates long string values.
data.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic, regress...|
+---+--------------------+



In [6]:
# Basic tokenizer: reads the 'sentence' column and writes the resulting list
# of tokens to a new 'words' column.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

# Regex-driven tokenizer over the same columns. The pattern is the regex \W
# (a single non-word character); all other settings are left at their defaults.
regexTokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [7]:
def countTokens(words):
    """Return a Column holding the number of elements in an array column.

    Implemented with Spark's built-in ``size`` function instead of a Python
    UDF wrapping ``len``: the count is computed by Spark itself rather than
    by sending every row to a Python worker, and a null array no longer
    raises ``TypeError`` the way ``len(None)`` did inside the UDF.

    Args:
        words: Column (or column name) containing an array of tokens.

    Returns:
        An integer Column with the element count of ``words``.
    """
    return f.size(words)

In [8]:
# Run the basic Tokenizer over `data`; the result is a new DataFrame that
# carries the tokenizer's output column ('words').
tokenized = tokenizer.transform(data)

In [9]:
# Inspect the tokenized rows (long cells are abbreviated by the default show()).
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic, regress...|[logistic,, regre...|
+---+--------------------+--------------------+



In [10]:
# Append a 'tokens' column holding the number of entries in 'words' for the
# basic Tokenizer output, then display the result.
(
    tokenized
    .withColumn('tokens', countTokens(f.col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic, regress...|[logistic,, regre...|     5|
+---+--------------------+--------------------+------+



In [11]:
# Run the RegexTokenizer over the same `data` so its tokens can be compared
# with the basic Tokenizer's output.
regexTokenized = regexTokenizer.transform(data)

In [12]:
# Same token count as for the basic Tokenizer, applied to the RegexTokenizer
# output: add a 'tokens' column with the length of 'words' and display it.
(
    regexTokenized
    .withColumn('tokens', countTokens(f.col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic, regress...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



# Stop word removal

In [13]:
from pyspark.ml.feature import StopWordsRemover
# Two already-tokenized example sentences: each row pairs an id with a list
# of tokens, used as input for stop-word removal.
sentenceDF = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [14]:
# Preview the token lists before any stop words are removed.
sentenceDF.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [15]:
# Stop-word filter: reads the 'tokens' column and writes the result to
# 'filtered'. No stopWords argument is given, so the remover's default word
# list applies.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [16]:
# Apply the stop-word filter and display the original 'tokens' next to the
# resulting 'filtered' column.
remover.transform(sentenceDF).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



# N-gram

In [17]:
from pyspark.ml.feature import NGram

In [18]:
# Three pre-tokenized sentences (id, list of words) used as n-gram input.
wordDF = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=['id', 'words'],
)

# Preview the input rows.
wordDF.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|[Hi, I, heard, ab...|
|  1|[I, wish, java, c...|
|  2|[Logistic, regres...|
+---+--------------------+



In [19]:
# Bigram transformer (n=2): reads the 'words' column and writes the generated
# grams to a new 'grams' column.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [20]:
# Apply the bigram transformer and show 'words' alongside the new 'grams'
# column (long cells are abbreviated by the default show()).
ngram.transform(wordDF).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[Hi, I, heard, ab...|[Hi I, I heard, h...|
|  1|[I, wish, java, c...|[I wish, wish jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+



In [21]:
# Display only the 'grams' column with truncation turned off so every
# generated bigram is fully visible.
(
    ngram.transform(wordDF)
    .select('grams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

