In [1]:
import os

import findspark

# Locate the Spark installation. An explicitly configured SPARK_HOME wins so
# the notebook runs on machines other than the one it was authored on; when
# the variable is unset or empty we fall back to the original install path.
_spark_home = os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'

# init() puts that install's pyspark on sys.path, so it has to run before the
# pyspark imports below.
findspark.init(_spark_home)

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name 'NLPTools'.
spark = SparkSession.builder.appName('NLPTools').getOrCreate()

In [2]:
#tokenizer
from pyspark.ml.feature import Tokenizer,RegexTokenizer

In [3]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType


In [4]:
# Toy corpus of three sentences. Rows 1 and 2 are space-separated, row 3 is
# comma-separated with no spaces, which is what distinguishes the two
# tokenizers used further down.
my_dataFrame = spark.createDataFrame(
    data=[
        (1, 'Hi I heard About Spark is a Good Language'),
        (2, 'My name is Debabrata and I am pursing my Masters'),
        (3, 'Language,Java,C,Python,Spark,Good,Masters'),
    ],
    schema=['id', 'sentence'],
)

In [5]:
# Preview the raw corpus. With default arguments show() abbreviates long
# cell values, which is why the sentences end in "..." in the output.
my_dataFrame.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|Hi I heard About ...|
|  2|My name is Debabr...|
|  3|Language,Java,C,P...|
+---+--------------------+



In [6]:
# Plain tokenizer: reads the 'sentence' column and writes the resulting token
# array to a new 'Words' column.
tokenizer = Tokenizer(
    outputCol='Words',
    inputCol='sentence',
)

In [8]:
# Regex-driven tokenizer: reads 'sentence', uses the non-word-character
# pattern \W, and writes the token array to 'RegWords'.
req_tokenizer = RegexTokenizer(
    pattern='\\W',
    outputCol='RegWords',
    inputCol='sentence',
)

In [11]:
def _count_tokens(words):
    """Return how many tokens ``words`` holds.

    Args:
        words: The token array of one row, or None when the cell is null.

    Returns:
        The array length as an int, or None for a null cell so the result
        column is null instead of the job failing with a TypeError from
        ``len(None)``.
    """
    if words is None:
        return None
    return len(words)


# Spark UDF wrapper; the declared return type is a 32-bit integer column.
count_tokens = udf(_count_tokens, IntegerType())

In [12]:
# Run the plain tokenizer over the corpus; the result keeps 'id' and
# 'sentence' and gains the 'Words' array column.
tokenized_data = tokenizer.transform(dataset=my_dataFrame)

In [13]:
# Inspect the tokenized frame (long cells are abbreviated by show()).
tokenized_data.show()

+---+--------------------+--------------------+
| id|            sentence|               Words|
+---+--------------------+--------------------+
|  1|Hi I heard About ...|[hi, i, heard, ab...|
|  2|My name is Debabr...|[my, name, is, de...|
|  3|Language,Java,C,P...|[language,java,c,...|
+---+--------------------+--------------------+



In [14]:
# Append a per-row token count and display it. Row 3 comes out as a single
# token because its words are separated by commas only.
(
    tokenized_data
    .withColumn('tokens', count_tokens(col('Words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               Words|tokens|
+---+--------------------+--------------------+------+
|  1|Hi I heard About ...|[hi, i, heard, ab...|     9|
|  2|My name is Debabr...|[my, name, is, de...|    10|
|  3|Language,Java,C,P...|[language,java,c,...|     1|
+---+--------------------+--------------------+------+



In [15]:
# Run the regex tokenizer over the original corpus (not the output of the
# plain tokenizer); adds the 'RegWords' array column.
regtokenized_data = req_tokenizer.transform(dataset=my_dataFrame)

In [21]:
# Attach the token count of the regex-tokenized words as column 'tokens'.
reg_data = regtokenized_data.withColumn(
    'tokens',
    count_tokens(col('RegWords')),
)

In [23]:
# Display the regex-tokenized frame with its token counts; row 3 now yields
# 7 tokens instead of 1.
reg_data.show()

+---+--------------------+--------------------+------+
| id|            sentence|            RegWords|tokens|
+---+--------------------+--------------------+------+
|  1|Hi I heard About ...|[hi, i, heard, ab...|     9|
|  2|My name is Debabr...|[my, name, is, de...|    10|
|  3|Language,Java,C,P...|[language, java, ...|     7|
+---+--------------------+--------------------+------+



In [18]:
#StopWords Removal
from pyspark.ml.feature import StopWordsRemover

In [19]:
# Stop-word filter: reads the 'RegWords' token arrays and writes the filtered
# arrays to 'filteredWords'. No custom word list is supplied, so the
# transformer's default list applies.
remover = StopWordsRemover(
    outputCol='filteredWords',
    inputCol='RegWords',
)

In [29]:
# Drop stop words, then record how many tokens survive in 'Aftertokens' so it
# can be compared against the pre-filter 'tokens' column.
filtered_data = (
    remover
    .transform(reg_data)
    .withColumn('Aftertokens', count_tokens(col('filteredWords')))
)

In [27]:
#nGrams
from pyspark.ml.feature import NGram

In [28]:
# Bigram builder (n=2): reads 'filteredWords' and writes the n-gram array to
# the 'Ngram' column.
ngram = NGram(
    outputCol='Ngram',
    inputCol='filteredWords',
    n=2,
)

In [31]:
# Build the bigrams and print only that column, untruncated, so every pair is
# visible in full.
(
    ngram
    .transform(filtered_data)
    .select('Ngram')
    .show(truncate=False)
)

+-------------------------------------------------------------------------+
|Ngram                                                                    |
+-------------------------------------------------------------------------+
|[hi heard, heard spark, spark good, good language]                       |
|[name debabrata, debabrata pursing, pursing masters]                     |
|[language java, java c, c python, python spark, spark good, good masters]|
+-------------------------------------------------------------------------+



In [34]:
# Keep the bigram-augmented frame under a name so later stages can build on it.
ngram_data = ngram.transform(dataset=filtered_data)

In [35]:
# Print the column names and types accumulated so far (id, sentence, RegWords,
# tokens, filteredWords, Aftertokens, Ngram).
ngram_data.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- RegWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokens: integer (nullable = true)
 |-- filteredWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Aftertokens: integer (nullable = true)
 |-- Ngram: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [36]:
# Term frequency (HashingTF) and inverse document frequency (IDF)
from pyspark.ml.feature import HashingTF,IDF

In [37]:
# Term-frequency stage: hashes the tokens in 'filteredWords' into a sparse
# vector stored in 'HashingTF'. The feature count is left at its default.
hashing_tf = HashingTF(
    outputCol='HashingTF',
    inputCol='filteredWords',
)

In [38]:
# Add the hashed term-frequency vectors to the bigram-augmented frame.
featurizedData = hashing_tf.transform(dataset=ngram_data)

In [40]:
# Show each filtered token list next to its term-frequency vector.
(
    featurizedData
    .select('filteredWords', 'HashingTF')
    .show()
)

+--------------------+--------------------+
|       filteredWords|           HashingTF|
+--------------------+--------------------+
|[hi, heard, spark...|(262144,[49304,73...|
|[name, debabrata,...|(262144,[31617,35...|
|[language, java, ...|(262144,[28698,55...|
+--------------------+--------------------+



In [41]:
# IDF stage: reads the term-frequency vectors in 'HashingTF' and writes the
# rescaled vectors to the 'IDF' column.
idf = IDF(
    outputCol='IDF',
    inputCol='HashingTF',
)

In [43]:
# Fit the IDF stage on the term-frequency vectors of the corpus to obtain a
# model that can be applied to data.
idf_model = idf.fit(dataset=featurizedData)

In [44]:
# Apply the fitted model to the same frame it was fitted on; adds the 'IDF'
# column.
idf_data = idf_model.transform(dataset=featurizedData)

In [45]:
# Print the full (untruncated) IDF-weighted sparse vectors.
(
    idf_data
    .select('IDF')
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|IDF                                                                                                                                                                                               |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(262144,[49304,73197,113432,116836,234657],[0.6931471805599453,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.28768207245178085])                                                   |
|(262144,[31617,35119,79364,243021],[0.6931471805599453,0.6931471805599453,0.28768207245178085,0.6931471805599453])                                                                                |
|(262144,[28698

In [46]:
#count vectorizer
from pyspark.ml.feature import CountVectorizer

In [47]:
# Count-based vectorizer over 'filteredWords', writing to 'CountVectorized'.
# vocabSize=3 caps the learned vocabulary at three terms; minDF=2 restricts it
# to terms that occur in at least two documents.
cv = CountVectorizer(
    minDF=2,
    vocabSize=3,
    outputCol='CountVectorized',
    inputCol='filteredWords',
)

In [49]:
# Learn the vocabulary from the 'filteredWords' column of the corpus.
cv_model = cv.fit(dataset=idf_data)

In [50]:
# Encode every row against the learned vocabulary; adds 'CountVectorized'.
cv_data = cv_model.transform(dataset=idf_data)

In [52]:
# Show each token list beside its count vector, untruncated.
(
    cv_data
    .select('filteredWords', 'CountVectorized')
    .show(truncate=False)
)

+-------------------------------------------------+-------------------------+
|filteredWords                                    |CountVectorized          |
+-------------------------------------------------+-------------------------+
|[hi, heard, spark, good, language]               |(3,[0,1],[1.0,1.0])      |
|[name, debabrata, pursing, masters]              |(3,[2],[1.0])            |
|[language, java, c, python, spark, good, masters]|(3,[0,1,2],[1.0,1.0,1.0])|
+-------------------------------------------------+-------------------------+

