In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 45 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 53.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=9983c7b65b8d6c39f709f4da2330c281e69affa91a74c42dc001106c6bf5b5da
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
!pip install spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spark
  Downloading spark-0.2.1.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 74 kB/s 
[?25hBuilding wheels for collected packages: spark
  Building wheel for spark (setup.py) ... [?25l[?25hdone
  Created wheel for spark: filename=spark-0.2.1-py3-none-any.whl size=58762 sha256=0d473eddef8d39616ce54c38ea1a92f063f7be927c273cc909117a804d7601e8
  Stored in directory: /root/.cache/pip/wheels/4e/0e/f1/164619f9920fb447d294afaae11a7715bd442ded7225953d72
Successfully built spark
Installing collected packages: spark
Successfully installed spark-0.2.1


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('NLP').getOrCreate()

In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.types import IntegerType

In [5]:
sen_df = spark.createDataFrame([
                                (0, 'Hi I heard about Spark'),
                                (1, 'I wish java could use case classes'),
                                (2, 'Logistic,regression,models,are,neat')
], ['id','sentence'])

In [6]:
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [7]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

In [9]:
regex_tokenizer = RegexTokenizer(inputCol='sentence', outputCol='words',
                                 pattern='\\W')

In [12]:
from pyspark.sql.functions import col, udf


In [13]:
count_token = udf(lambda words:len(words),IntegerType())

In [14]:
tokenized = tokenizer.transform(sen_df)

In [17]:
tokenized.withColumn('tokens', count_token(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [15]:
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [18]:
rg_tokenized = regex_tokenizer.transform(sen_df)

In [19]:
rg_tokenized.withColumn('tokens', count_token(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [20]:
from pyspark.ml.feature import StopWordsRemover

In [21]:
sentenceDataFrame = spark.createDataFrame([
                                           (0, ['I', 'saw', 'the', 'green', 'horse']),
                                           (1, ['Mary', 'had', 'a', 'little', 'lamb'])
], ['id','tokens'])

In [23]:
sentenceDataFrame.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [24]:
remover = StopWordsRemover(inputCol='tokens',outputCol='filtered' )

In [25]:
remover.transform(sentenceDataFrame).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [29]:
#n-gram
from pyspark.ml.feature import NGram

In [42]:
wordDataFrame = spark.createDataFrame([
                                (0,[ 'Hi', 'I', 'heard', 'about', 'Spark']),
                                (1,[ 'I', 'wish', 'java', 'could', 'use','case', 'classes']),
                                (2, ['Logistic','regression','models','are','neat'])
], ['id','words'])

In [43]:
ngram = NGram(n=2, inputCol='words', outputCol='grams')

In [45]:
ngram.transform(wordDataFrame).select('grams').show(truncate=False)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [46]:
from pyspark.ml.feature import HashingTF, IDF

In [47]:
data = spark.createDataFrame([
                                (0, 'Hi I heard about Spark'),
                                (1, 'I wish java could use case classes'),
                                (2, 'Logistic regression models are neat')
], ['label','sentence'])

In [48]:
data.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|    0|Hi I heard about ...|
|    1|I wish java could...|
|    2|Logistic regressi...|
+-----+--------------------+



In [51]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

In [52]:
words_data = tokenizer.transform(data)

In [53]:
words_data.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|    0|Hi I heard about ...|[hi, i, heard, ab...|
|    1|I wish java could...|[i, wish, java, c...|
|    2|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [54]:
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')

In [55]:
featurized_data = hashing_tf.transform(words_data)

In [56]:
idf = IDF(inputCol='rawFeatures', outputCol='features')

In [57]:
idf_model = idf.fit(featurized_data)

In [58]:
rescaled_data = idf_model.transform(featurized_data)

In [59]:
rescaled_data.select('label','features').show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                      |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |(262144,[18700,19036,33808,66273,173558],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                   |
|1    |(262144,[19036,20719,55551,58672,98717,109547,192310],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])|
|2   

In [60]:
from pyspark.ml.feature import CountVectorizer

In [61]:
df = spark.createDataFrame([
                            (0, 'a b c'.split(' ')),
                            (1, 'a b b c a'.split(' '))
], ['id', 'words'])

In [62]:
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [63]:
cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=3, minDF=2.0)

In [64]:
model = cv.fit(df)

In [65]:
result = model.transform(df)

In [67]:
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

