# Start a Spark Session

In [None]:
!pip install -q findspark
!pip install -q pyspark

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [None]:
# Load the SMS spam dataset. Despite the .csv extension the file is read as
# tab-separated with no header row, so Spark assigns the default column
# names _c0 and _c1.
df = spark.read.csv(
    'SMSSpamCollection.csv',
    sep='\t',
    header=False,
    inferSchema=True,
)

# A bare `df.count()` in the middle of a cell is evaluated (triggering a full
# scan) but its value is never displayed; print it so the row count is visible.
print(df.count())
df.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [None]:
# Swap Spark's auto-generated column names for descriptive ones:
# _c0 holds the label and _c1 holds the message body.
df = (
    df
    .withColumnRenamed("_c0", "class")
    .withColumnRenamed("_c1", "text")
)

In [None]:
# Preview the DataFrame to confirm the column names; show() with no arguments
# prints only the first 20 rows and truncates long cell values.
df.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [None]:
from pyspark.ml.feature import Tokenizer

# Split each message in "text" into a list of lower-cased tokens stored in a
# new "words" column. Punctuation stays attached to the tokens (e.g. "lar...").
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
df = tokenizer.transform(df)

# truncate=False prints full cell contents instead of cutting them short.
df.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|class|text                                                                                                                                                                                                |words                                                                                                                                                                                                                                     |
+-----+-----------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.feature import RegexTokenizer

# Alternative tokenizer that splits on a regular expression; the pattern
# "\\W" matches any non-word character. It is only configured in this cell,
# not applied to df.
regextokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="text",
    outputCol="words",
)

In [None]:
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

# Token-counting UDF, kept under its original name in case other cells call
# it. It now returns None for a null array instead of raising TypeError from
# len(None) inside the Python worker.
countToken = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

# For the preview, count tokens with the built-in size() column function
# rather than the Python UDF: it produces the same integer length for array
# columns without shipping every row through a Python worker.
(
    df.select("text", "words")
    .withColumn("tokens", size(col("words")))
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|text                                                                                                                                                                                                |words                                                                                                                                                                                                                                     |tokens|
+---------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.feature import NGram

# Build bigrams (n=2) from the token lists in "words"; each bigram is two
# consecutive tokens joined by a space, e.g. "go until".
ngram = NGram(
    inputCol="words",
    outputCol="ngrams",
    n=2,
)
df = ngram.transform(df)

# Print the result without truncating cell contents.
df.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|class|text                                                                                                                                  

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Remove stop words from the single-token "words" column.
# The remover compares each array element against its stop-word list, so it
# must run on individual tokens. Feeding it the "ngrams" column (two-word
# strings such as "go until") never matches a stop word and leaves the data
# unchanged, which is why "StopWords" previously came out identical to
# "ngrams".
remover = StopWordsRemover(inputCol="words", outputCol="StopWords")
df = remover.transform(df)
df.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.feature import CountVectorizer

# Turn each token list in "words" into a sparse term-count vector stored in
# "features".
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
)

# CountVectorizer is an estimator: fit() yields the model that is then used
# to transform the same DataFrame.
model = cv.fit(df)
df = model.transform(df)

# Print the result without truncating cell contents.
df.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Step 1: hash the stop-word-filtered tokens into raw term-frequency vectors.
# These are plain counts, so they go into an intermediate "rawFeatures"
# column rather than a column named "IDF".
hashingTF = HashingTF(inputCol="StopWords", outputCol="rawFeatures")
tf_df = hashingTF.transform(df)

# Step 2: actually apply inverse-document-frequency weighting. IDF was
# imported but never used, so the "IDF" column used to contain unweighted
# term frequencies. IDF is an estimator and must be fitted on the corpus
# before it can rescale the vectors.
idf = IDF(inputCol="rawFeatures", outputCol="IDF")
idf_model = idf.fit(tf_df)

# Drop the intermediate column so the resulting schema is the same as
# before: the existing columns plus "IDF".
df = idf_model.transform(tf_df).drop("rawFeatures")
df.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Compact overview of every column built so far; the default show() truncates
# long cell values so the whole table fits on screen.
df.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|               words|              ngrams|           StopWords|            features|                 IDF|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|[go until, until ...|[go until, until ...|(13587,[8,42,52,6...|(262144,[1533,132...|
|  ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok lar..., lar.....|[ok lar..., lar.....|(13587,[5,75,411,...|(262144,[11490,38...|
| spam|Free entry in 2 a...|[free, entry, in,...|[free entry, entr...|[free entry, entr...|(13587,[0,3,8,20,...|(262144,[1834,166...|
|  ham|U dun say so earl...|[u, dun, say, so,...|[u dun, dun say, ...|[u dun, dun say, ...|(13587,[5,22,60,1...|(262144,[3476,543...|
|  ham|Nah I don't think...|[nah, i, don't, t...|[nah i, i don