In [2]:
# Best-effort cleanup: stop a SparkContext left over from a previous run.
# `sc` is not defined in the cells shown here, so on a fresh kernel the name
# lookup fails and the message below is printed instead.
try:
    sc.stop()
except Exception:  # narrower than a bare except: Ctrl-C / SystemExit still propagate
    print("no sc to stop")

In [3]:

from pyspark.sql import SparkSession

In [6]:
# Obtain the SparkSession for this notebook (created if none is active yet),
# registered under the application name 'PysparkNLP'.
spark = (
    SparkSession.builder
    .appName('PysparkNLP')
    .getOrCreate()
)

In [7]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


In [10]:
# Toy corpus: three (id, sentence) rows used by the tokenizer examples below.
sen_df = spark.createDataFrame(
    data=[
        (0, "this is Spark session on NLP"),
        (1, "I wish we could have learned NLP with python."),
        (2, "for text analysis ml algorithms are logistic, naive bayes model"),
    ],
    schema=['id', 'sentence'],
)

sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|this is Spark ses...|
|  1|I wish we could h...|
|  2|for text analysis...|
+---+--------------------+



In [11]:
# Tokenizer & Lowercase

In [14]:
# Tokenizer turns each `sentence` into a `words` array column; the output
# below shows the tokens come back lower-cased.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

# UDF returning the length of a column value (here: number of tokens in the
# `words` array), declared as IntegerType so the new column is an integer.
count_tokens = udf(lambda w: len(w), IntegerType())

tokenized = tokenizer.transform(sen_df)

tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|this is Spark ses...|[this, is, spark,...|
|  1|I wish we could h...|[i, wish, we, cou...|
|  2|for text analysis...|[for, text, analy...|
+---+--------------------+--------------------+



In [15]:
# Add a `tokens` column with the per-row word count and display the result.
(
    tokenized
    .withColumn('tokens', count_tokens('words'))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|this is Spark ses...|[this, is, spark,...|     6|
|  1|I wish we could h...|[i, wish, we, cou...|     9|
|  2|for text analysis...|[for, text, analy...|    10|
+---+--------------------+--------------------+------+



In [20]:
# Tokenise with the regex \W (non-word characters) as the pattern, writing
# the tokens to `words`.
regex_tokenizer = RegexTokenizer(
    pattern=r'\W',
    inputCol='sentence',
    outputCol='words',
)
regex_tokenized = regex_tokenizer.transform(sen_df)
regex_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|this is Spark ses...|[this, is, spark,...|
|  1|I wish we could h...|[i, wish, we, cou...|
|  2|for text analysis...|[for, text, analy...|
+---+--------------------+--------------------+



In [25]:
from pyspark.ml.feature import StopWordsRemover

# NOTE: despite its name, `filtered_df` holds the StopWordsRemover
# transformer, not a DataFrame; the filtered rows come from transform().
filtered_df = StopWordsRemover(outputCol='filtered', inputCol='words')

# Show the original tokens next to the tokens that survive stop-word removal.
(
    filtered_df
    .transform(regex_tokenized)
    .select('words', 'filtered')
    .show(truncate=False)
)

+-------------------------------------------------------------------------+---------------------------------------------------------------+
|words                                                                    |filtered                                                       |
+-------------------------------------------------------------------------+---------------------------------------------------------------+
|[this, is, spark, session, on, nlp]                                      |[spark, session, nlp]                                          |
|[i, wish, we, could, have, learned, nlp, with, python]                   |[wish, learned, nlp, python]                                   |
|[for, text, analysis, ml, algorithms, are, logistic, naive, bayes, model]|[text, analysis, ml, algorithms, logistic, naive, bayes, model]|
+-------------------------------------------------------------------------+---------------------------------------------------------------+



In [26]:
"""
Review of Restaurants

sentiment analysis using twitter

negative/positive/neutral sentiment expressed on topic

(The ambience of the cafe was good but food was pathetic) -> neutral
(The ambience of the cafe was good) ->  positive
(but food was pathetic) -> negative


(the food was not good) - > neutral?
(the food) (was not) (good) 
(the food) (was) (not good) -> negative


(food not good)
(food not) (not good)

"""

'\nReview of Restaurants\n\nsentiment analysis using twitter\n\nnegative/positive/neutral sentiment expressed on topic\n\n(The ambience of the cafe was good but food was pathetic) -> neutral\n(The ambience of the cafe was good) ->  positive\n(but food was pathetic) -> negative\n\n\n(the food was not good) - > neutral?\n(the food) (was not) (good) \n(the food) (was) (not good) -> negative\n\n\n(food not good)\n(food not) (not good)\n\n'

In [30]:
from pyspark.ml.feature import NGram

# Pre-tokenised input for NGram: each row pairs an id with the list of words
# obtained from str.split() on the sentence.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I know about nlp".split()),
        (1, "I wish we could have learned nlp".split()),
        (2, "For text analysis ml algorithms are logistic, naive bayes model".split()),
    ],
    schema=['id', 'text'],
)

wordDataFrame.show()

+---+--------------------+
| id|                text|
+---+--------------------+
|  0|[Hi, I, know, abo...|
|  1|[I, wish, we, cou...|
|  2|[For, text, analy...|
+---+--------------------+



In [33]:
# Derive a `grams` column from the `text` token lists; the output below shows
# each gram is a pair of consecutive tokens joined by a space.
ngram = NGram(outputCol='grams', inputCol='text')
(
    ngram
    .transform(wordDataFrame)
    .show(truncate=False)
)

+---+--------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+
|id |text                                                                      |grams                                                                                                                          |
+---+--------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+
|0  |[Hi, I, know, about, nlp]                                                 |[Hi I, I know, know about, about nlp]                                                                                          |
|1  |[I, wish, we, could, have, learned, nlp]                                  |[I wish, wish we, we could, could have, have learned, learned nlp]                  

In [34]:
#### Term frequency (TF) and inverse document frequency (IDF)


In [35]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [39]:
# Labelled corpus for the TF-IDF example: (label, sentence) rows.
sentenceDF = spark.createDataFrame(
    data=[
        (0.0, "this is Spark session on NLP"),
        (0.0, "I wish we could have learned NLP with python."),
        (1.0, "for text analysis ml algorithms are logistic, naive bayes model"),
    ],
    schema=['label', 'sentence'],
)

sentenceDF.show(truncate=False)

+-----+---------------------------------------------------------------+
|label|sentence                                                       |
+-----+---------------------------------------------------------------+
|0.0  |this is Spark session on NLP                                   |
|0.0  |I wish we could have learned NLP with python.                  |
|1.0  |for text analysis ml algorithms are logistic, naive bayes model|
+-----+---------------------------------------------------------------+



In [40]:
# Rebuild the tokenizer and add a `words` column to the labelled sentences.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)
words_df = tokenizer.transform(sentenceDF)

words_df.show(truncate=False)

+-----+---------------------------------------------------------------+--------------------------------------------------------------------------+
|label|sentence                                                       |words                                                                     |
+-----+---------------------------------------------------------------+--------------------------------------------------------------------------+
|0.0  |this is Spark session on NLP                                   |[this, is, spark, session, on, nlp]                                       |
|0.0  |I wish we could have learned NLP with python.                  |[i, wish, we, could, have, learned, nlp, with, python.]                   |
|1.0  |for text analysis ml algorithms are logistic, naive bayes model|[for, text, analysis, ml, algorithms, are, logistic,, naive, bayes, model]|
+-----+---------------------------------------------------------------+-----------------------------------------------

In [43]:
# Term-frequency step: hash the tokens in `words` into a sparse count vector
# stored in `rawFeatures`.
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')

# Apply the hashing transformer to the tokenised sentences and inspect the
# resulting vectors.
featurized_df = hashing_tf.transform(words_df)
featurized_df.show(truncate=False)

+-----+---------------------------------------------------------------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|label|sentence                                                       |words                                                                     |rawFeatures                                                                                                           |
+-----+---------------------------------------------------------------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|0.0  |this is Spark session on NLP                                   |[this, is, spark, session, on, nlp]                                       |(262144,[9677,15889,100258,108541,116034,234657],[1.0,1.

In [44]:
featurized_df.select('rawFeatures').show(truncate=False)

# Each sparse vector prints as (vector_size, [indices], [values]):
# vector_size is the number of hash buckets (262144 here), indices are the
# buckets the row's terms were hashed to, and values are the term counts.


+----------------------------------------------------------------------------------------------------------------------+
|rawFeatures                                                                                                           |
+----------------------------------------------------------------------------------------------------------------------+
|(262144,[9677,15889,100258,108541,116034,234657],[1.0,1.0,1.0,1.0,1.0,1.0])                                           |
|(262144,[20719,24417,116034,126466,139934,142830,147489,147765,253475],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])         |
|(262144,[16332,25817,36578,92225,143985,150224,158432,167122,183588,254285],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+----------------------------------------------------------------------------------------------------------------------+



In [45]:
# Print the built-in documentation for HashingTF (interactive exploration aid).
help(HashingTF)

Help on class HashingTF in module pyspark.ml.feature:

class HashingTF(pyspark.ml.wrapper.JavaTransformer, pyspark.ml.param.shared.HasInputCol, pyspark.ml.param.shared.HasOutputCol, pyspark.ml.param.shared.HasNumFeatures, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  Maps a sequence of terms to their term frequencies using the hashing trick.
 |  Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
 |  to calculate the hash code value for the term object.
 |  Since a simple modulo is used to transform the hash function to a column index,
 |  it is advisable to use a power of two as the numFeatures parameter;
 |  otherwise the features will not be mapped evenly to the columns.
 |  
 |  >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])
 |  >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
 |  >>> hashingTF.transform(df).head().features
 |  SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})
 |  >>> hashing

In [47]:
# IDF is an estimator: fit it on the raw term-frequency vectors, then use the
# fitted model to write the rescaled vectors to `features`.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)

rescaled_df.show(truncate=False)

+-----+---------------------------------------------------------------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|sentence                                                       |words                                                                     |rawFeatures                                                                                                           |features                                                                                                                                                                                                

In [96]:
from pyspark.ml.feature import CountVectorizer

# Small corpus with repeated terms, used to demonstrate CountVectorizer.
sentenceDF = spark.createDataFrame(
    data=[
        (0, 'python spark hive spark hadoop'),
        (0, 'a b b b c'),
        (1, 'c hadoop hive spark hadoop hive'),
    ],
    schema=['label', 'sentence'],
)

# Reuses the `tokenizer` built in an earlier cell (sentence -> words).
wordsDF = tokenizer.transform(sentenceDF)
wordsDF.show()


+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|    0|python spark hive...|[python, spark, h...|
|    0|           a b b b c|     [a, b, b, b, c]|
|    1|c hadoop hive spa...|[c, hadoop, hive,...|
+-----+--------------------+--------------------+



In [97]:
# Alternative tried earlier: vocabSize=262144 with the same minDF.
# vocabSize : upper bound on the number of terms kept in the vocabulary.
# minDF     : minimum document frequency; with 2.0 a term must occur in at
#             least two documents to enter the vocabulary.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=6,
    minDF=2.0,
)

In [98]:
# Learn the vocabulary from the tokenised corpus.
cv_model = cv.fit(wordsDF)

# Encode every row as a sparse vector of per-term counts and display it.
cv_df = cv_model.transform(wordsDF)
cv_df.show(truncate=False)

+-----+-------------------------------+--------------------------------------+-------------------------------+
|label|sentence                       |words                                 |features                       |
+-----+-------------------------------+--------------------------------------+-------------------------------+
|0    |python spark hive spark hadoop |[python, spark, hive, spark, hadoop]  |(4,[0,1,2],[1.0,2.0,1.0])      |
|0    |a b b b c                      |[a, b, b, b, c]                       |(4,[3],[1.0])                  |
|1    |c hadoop hive spark hadoop hive|[c, hadoop, hive, spark, hadoop, hive]|(4,[0,1,2,3],[2.0,1.0,2.0,1.0])|
+-----+-------------------------------+--------------------------------------+-------------------------------+



In [99]:
# Print the built-in documentation for CountVectorizer (interactive exploration aid).
help(CountVectorizer)

Help on class CountVectorizer in module pyspark.ml.feature:

class CountVectorizer(pyspark.ml.wrapper.JavaEstimator, _CountVectorizerParams, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`.
 |  
 |  >>> df = spark.createDataFrame(
 |  ...    [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
 |  ...    ["label", "raw"])
 |  >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
 |  >>> model = cv.fit(df)
 |  >>> model.transform(df).show(truncate=False)
 |  +-----+---------------+-------------------------+
 |  |label|raw            |vectors                  |
 |  +-----+---------------+-------------------------+
 |  |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
 |  |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
 |  +-----+---------------+-------------------------+
 |  ...
 |  >>> sorted(model.vocabulary) == ['a', 'b', 'c']
 |  True
 |  >>> countVectorize

In [75]:
# Remove the `cv_df` name from the notebook namespace.
# NOTE(review): this cell's execution count (75) is lower than that of the
# cells above it, so it appears to have been run out of order — confirm it is
# still wanted at this position.
del cv_df