In [None]:
!pip install -q findspark
!pip install -q pyspark

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()

[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[K     |████████████████████████████████| 198 kB 51.9 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
# Toy corpus: three (id, sentence) rows that the later cells tokenize and filter.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I am a student at aivancity"),
        (1, "aivancity is an AI school"),
        (2, "students develop key practical skills at the AI clinic"),
    ],
    schema=["id", "sentence"],
)

In [None]:
# Display the corpus; truncate=False keeps the long sentences from being cut off.
sentenceDataFrame.show(truncate=False)

+---+------------------------------------------------------+
|id |sentence                                              |
+---+------------------------------------------------------+
|0  |Hi I am a student at aivancity                        |
|1  |aivancity is an AI school                             |
|2  |students develop key practical skills at the AI clinic|
+---+------------------------------------------------------+



In [None]:
from pyspark.ml.feature import Tokenizer

In [None]:
# Tokenizer stage: reads the "sentence" column and writes its tokens to a new "words" column.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

In [None]:
# Apply the tokenizer; the result keeps "id" and "sentence" and gains the "words" array column.
tokenized = tokenizer.transform(sentenceDataFrame)
# Show all columns untruncated so each token list can be read in full.
tokenized.show(truncate=False)

+---+------------------------------------------------------+----------------------------------------------------------------+
|id |sentence                                              |words                                                           |
+---+------------------------------------------------------+----------------------------------------------------------------+
|0  |Hi I am a student at aivancity                        |[hi, i, am, a, student, at, aivancity]                          |
|1  |aivancity is an AI school                             |[aivancity, is, an, ai, school]                                 |
|2  |students develop key practical skills at the AI clinic|[students, develop, key, practical, skills, at, the, ai, clinic]|
+---+------------------------------------------------------+----------------------------------------------------------------+



In [None]:
from pyspark.ml.feature import RegexTokenizer

In [None]:
# Regex-based tokenizer over "sentence" -> "words", configured with the
# pattern \W (the regex class for a non-word character).
# NOTE(review): no visible cell calls regextokenizer.transform(...); the token
# counts shown further down are computed on the plain Tokenizer output.
# Confirm whether this stage is applied elsewhere or is left over.
regextokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# UDF returning the number of tokens in an array column as an integer.
# A null array cell reaches the Python function as None, and len(None) raises
# TypeError, so nulls are passed through as null instead of failing the job.
countTokens = udf(lambda words: len(words) if words is not None else None, IntegerType())

In [None]:
# Token count per sentence: keep the text and its tokens, then append the
# UDF-computed length of "words" as a "tokens" column and display the result.
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

+------------------------------------------------------+----------------------------------------------------------------+------+
|sentence                                              |words                                                           |tokens|
+------------------------------------------------------+----------------------------------------------------------------+------+
|Hi I am a student at aivancity                        |[hi, i, am, a, student, at, aivancity]                          |7     |
|aivancity is an AI school                             |[aivancity, is, an, ai, school]                                 |5     |
|students develop key practical skills at the AI clinic|[students, develop, key, practical, skills, at, the, ai, clinic]|9     |
+------------------------------------------------------+----------------------------------------------------------------+------+



In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Stop-word filter: reads "words" and writes the remaining tokens to "filtered".
# No stopWords argument is passed, so the remover's default list applies.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

In [None]:
# Drop stop words from the tokenized sentences (adds the "filtered" column).
tokenized_out_sw = remover.transform(tokenized)
# Display only the filtered token lists, untruncated.
tokenized_out_sw.select("filtered").show(truncate=False)

+-------------------------------------------------------+
|filtered                                               |
+-------------------------------------------------------+
|[hi, student, aivancity]                               |
|[aivancity, ai, school]                                |
|[students, develop, key, practical, skills, ai, clinic]|
+-------------------------------------------------------+



In [None]:
from pyspark.ml.feature import NGram

# Work with sequences of consecutive words instead of isolated words:
# n=2 turns each "filtered" token list into its bigrams, stored in "ngrams".
ngram = NGram(inputCol="filtered", outputCol="ngrams", n=2)

# Build the bigrams and show them next to the token lists they came from.
ngramDataFrame = ngram.transform(tokenized_out_sw)
(
    ngramDataFrame
    .select("filtered", "ngrams")
    .show(truncate=False)
)

+-------------------------------------------------------+--------------------------------------------------------------------------------------+
|filtered                                               |ngrams                                                                                |
+-------------------------------------------------------+--------------------------------------------------------------------------------------+
|[hi, student, aivancity]                               |[hi student, student aivancity]                                                       |
|[aivancity, ai, school]                                |[aivancity ai, ai school]                                                             |
|[students, develop, key, practical, skills, ai, clinic]|[students develop, develop key, key practical, practical skills, skills ai, ai clinic]|
+-------------------------------------------------------+-------------------------------------------------------------------------