In [None]:
pip install pyspark

In [None]:
# Importing Pyspark
import pyspark
from pyspark import SparkContext,SQLContext

In [None]:
import re
import pyspark
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import col, split
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, PCA ,StopWordsRemover,StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier,LogisticRegression,NaiveBayes,LinearSVC
from pyspark.sql import functions as F
from pyspark.ml.clustering import KMeans
from pyspark.sql.types import DoubleType

In [None]:
# Reuse the SparkContext that is already running when this cell is executed
# again: calling the SparkContext constructor a second time in the same kernel
# raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName('SpamHam'))

# SQLContext is deprecated since Spark 3.0 in favour of SparkSession; it is
# kept here because the loading cell reads the CSV through `sql.read`.
sql = SQLContext(sparkContext=sc)

In [None]:
# Loading the dataset

# header=True takes the column names from the first CSV line;
# inferSchema=True lets Spark infer the column types from the data.
rawDF = sql.read.csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    header=True,
    inferSchema=True,
)
rawDF

In [None]:
# Preprocessing

def preprocessing(df):
    """Reduce the raw SMS frame to the two columns ``class`` and ``text``.

    Parameters
    ----------
    df : DataFrame
        Raw frame with the columns ``v1`` (class) and ``v2`` (message text),
        optionally followed by ``_c2``, ``_c3`` and ``_c4``.

    Returns
    -------
    DataFrame
        Frame with ``class`` and ``text`` only, without rows whose text is
        the empty string.
    """
    # Drop the unused extra columns.
    trimmed = df.drop('_c2', '_c3', '_c4')

    # Rename the remaining columns: v1 -> class, v2 -> text.
    renamed = trimmed.selectExpr("v1 as class", "v2 as text")

    # Keep rows with a non-empty text. In Spark SQL a NULL text also fails
    # this comparison, so rows with a missing text are removed as well.
    has_text = renamed.text != ''
    return renamed.filter(has_text)

In [None]:
# Clean the raw frame: keep only the renamed class/text columns and drop rows with an empty text.
new_df = preprocessing(rawDF)

In [None]:
# Preview the first two cleaned rows.
new_df.show(2)

In [None]:
def pyspark_lib(df):
    """Tokenize ``text``, remove stop words and index ``class`` as ``label``.

    Parameters
    ----------
    df : DataFrame
        Frame with a string column ``text`` and a string column ``class``.

    Returns
    -------
    DataFrame
        The input plus ``tokenized`` (regex tokens), ``filtered`` (tokens
        without stop words) and ``label`` (numeric index of ``class``); the
        ``class`` column itself is dropped.

    Notes
    -----
    StringIndexer is used with its default ordering, which assigns index 0.0
    to the most frequent class value.
    """
    # Split the text on every non-word character.
    tokenizer = RegexTokenizer(inputCol="text", outputCol="tokenized", pattern="\\W")
    tokenized_df = tokenizer.transform(df)

    # Stop words to strip from the token lists.
    stop_words = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
        "you", "your", "yours", "yourself", "yourselves",
        "he", "him", "his", "himself", "she", "her", "hers", "herself",
        "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
        "what", "which", "who", "whom", "this", "that", "these", "those",
        "am", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "having", "do", "does", "did", "doing",
        "a", "an", "the", "and", "but", "if", "or", "because", "as",
        "until", "while", "of", "at", "by", "for", "with", "about",
        "against", "between", "into", "through", "during", "before", "after",
        "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
        "over", "under", "again", "further", "then", "once", "here", "there",
        "when", "where", "why", "how", "all", "any", "both", "each", "few",
        "more", "most", "other", "some", "such", "no", "nor", "not", "only",
        "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
        "just", "don", "should", "now",
    ]
    remover = StopWordsRemover(
        inputCol="tokenized",
        outputCol="filtered",
        stopWords=stop_words,
    )
    filtered_df = remover.transform(tokenized_df)

    # Convert the string class into a numeric label, then drop the string.
    label_model = StringIndexer(inputCol="class", outputCol="label").fit(filtered_df)
    return label_model.transform(filtered_df).drop('class')

In [None]:
# Tokenize the text, remove stop words and replace the class string by a numeric label.
new_df_2 = pyspark_lib(new_df)

In [None]:
# First ten rows of the cleaned input (class / text), for comparison with the transformed frame.
new_df.show(10)

In [None]:
# First ten rows of the transformed frame: text, tokenized, filtered and label.
new_df_2.show(10)

In [None]:
spam_df = new_df_2.filter('label == 0.0')

In [None]:
spam_df.show(10)

In [None]:
non_spam_df = new_df_2.filter('label == 1.0')

In [None]:
non_spam_df.show(10)

In [None]:
# Creating N-grams (Bigrams)
from pyspark.ml.feature import NGram

# Pair up neighbouring tokens of the stop-word-filtered column into
# space-separated bigram strings.
ngram = NGram(n=2, inputCol="filtered", outputCol="bigrams")

spam_df_1 = ngram.transform(spam_df)


In [None]:
# Preview the first ten rows of spam_df_1, including the new bigrams column.
spam_df_1.show(10)

In [None]:
# Bring the bigrams to the driver: one Python list of bigram strings per row.
all_spam_bigrams = [row['bigrams'] for row in spam_df_1.select('bigrams').collect()]

In [None]:
from functools import reduce
from itertools import chain

# Flatten the per-message bigram lists into one list. chain.from_iterable does
# this in a single pass and also works for an empty input, whereas
# reduce(lambda x, y: x + y, ...) copies the growing list on every step and
# raises TypeError when there is nothing to reduce.
all_spam_bigrams_single_list = list(chain.from_iterable(all_spam_bigrams))
print(len(all_spam_bigrams_single_list))

# Distinct bigram strings (order is arbitrary because it comes from a set).
all_spam_bigrams_single_list_2 = list(set(all_spam_bigrams_single_list))
print(len(all_spam_bigrams_single_list_2))

In [None]:
# Number of collected bigram lists (one per row of spam_df_1).
len(all_spam_bigrams)

In [None]:
from collections import Counter

# Document frequency of every bigram: in how many messages it occurs.
# Counting the set of each message once is a single pass over the data and
# gives the same numbers as scanning every message for every distinct bigram.
spam_doc_freq = Counter()
for message_bigrams in all_spam_bigrams:
    spam_doc_freq.update(set(message_bigrams))

# One row per distinct bigram: [word_1, word_2, count].
# NOTE(review): the two words are stored in sorted order, so "a b" and "b a"
# end up as two rows with the same (word_1, word_2) but separate counts.
results = []
for bigram in all_spam_bigrams_single_list_2:
    word_pair = sorted(bigram.split())
    results.append([word_pair[0], word_pair[1], spam_doc_freq[bigram]])

In [None]:
# Number of result rows (one per distinct bigram string).
print(len(results))

In [None]:
# Column names of the bigram-count table.
columns = ["word_1", "word_2", "count"]

# getOrCreate() returns the session of the Spark application that is already
# running, or builds one named 'sparkdf' when none exists.
spark = (
    SparkSession.builder
    .appName('sparkdf')
    .getOrCreate()
)

# One row per entry of `results`: [word_1, word_2, count].
dataframe = spark.createDataFrame(results, schema=columns)

# Show the first ten rows.
dataframe.show(10)

In [None]:
# Spark writes a directory of part files at this path. The default save mode
# raises an error when the path already exists, which breaks every re-run of
# the notebook; overwrite the previous export instead.
dataframe.write.mode('overwrite').csv('spam_file.csv')

# For Non Spam Dataframe #

In [None]:
# Creating N-grams (Bigrams)
from pyspark.ml.feature import NGram

# Pair up neighbouring tokens of the stop-word-filtered column into
# space-separated bigram strings.
ngram = NGram(n=2, inputCol="filtered", outputCol="bigrams")

non_spam_df_1 = ngram.transform(non_spam_df)

In [None]:
# Bring the bigrams to the driver: one Python list of bigram strings per row.
all_non_spam_bigrams = [row['bigrams'] for row in non_spam_df_1.select('bigrams').collect()]

In [None]:
from functools import reduce
from itertools import chain

# Flatten the per-message bigram lists into one list. chain.from_iterable does
# this in a single pass and also works for an empty input, whereas
# reduce(lambda x, y: x + y, ...) copies the growing list on every step and
# raises TypeError when there is nothing to reduce.
all_non_spam_bigrams_single_list = list(chain.from_iterable(all_non_spam_bigrams))
print(len(all_non_spam_bigrams_single_list))

# Distinct bigram strings (order is arbitrary because it comes from a set).
all_non_spam_bigrams_single_list_2 = list(set(all_non_spam_bigrams_single_list))
print(len(all_non_spam_bigrams_single_list_2))

In [None]:
from collections import Counter

# Document frequency of every bigram: in how many messages it occurs.
# Counting the set of each message once is a single pass over the data and
# gives the same numbers as scanning every message for every distinct bigram.
non_spam_doc_freq = Counter()
for message_bigrams in all_non_spam_bigrams:
    non_spam_doc_freq.update(set(message_bigrams))

# One row per distinct bigram: [word_1, word_2, count].
# NOTE(review): the two words are stored in sorted order, so "a b" and "b a"
# end up as two rows with the same (word_1, word_2) but separate counts.
non_spam_results = []
for bigram in all_non_spam_bigrams_single_list_2:
    word_pair = sorted(bigram.split())
    non_spam_results.append([word_pair[0], word_pair[1], non_spam_doc_freq[bigram]])

In [None]:
# Column names of the non-spam bigram-count table.
columns_non_spam = ["word_1", "word_2", "count"]

# getOrCreate() returns the session of the Spark application that is already
# running, or builds one named 'sparkdf' when none exists.
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# Build the table from the non-spam counts. Passing the spam `results` list
# here (as the cell did before) made this frame a copy of the spam table.
dataframe_non_spam = spark.createDataFrame(non_spam_results, columns_non_spam)

# Show the first ten rows.
dataframe_non_spam.show(10)

In [None]:
# Spark writes a directory of part files at this path. The default save mode
# raises an error when the path already exists, which breaks every re-run of
# the notebook; overwrite the previous export instead.
dataframe_non_spam.write.mode('overwrite').csv('non_spam_file.csv')

In [None]:
# First ten rows of the `dataframe` bigram-count table.
dataframe.show(10)

In [None]:
# First ten rows of the `dataframe_non_spam` bigram-count table.
dataframe_non_spam.show(10)

In [None]:
# Rows of dataframe_non_spam whose word_2 equals 'super'.
# NOTE(review): each pair was sorted when the counts were built, so word_2
# holds the word that sorts later; bigrams in which 'super' sorts first are
# not matched by this filter.
df_non_spam = dataframe_non_spam.where(dataframe_non_spam['word_2'] == 'super')
df_non_spam.show()

In [None]:
# Rows of dataframe whose word_2 equals 'super'.
# NOTE(review): each pair was sorted when the counts were built, so word_2
# holds the word that sorts later; bigrams in which 'super' sorts first are
# not matched by this filter.
df_spam = dataframe.where(dataframe['word_2'] == 'super')
df_spam.show()

In [None]:
# Overwrite a previous export; the default save mode raises an error when the
# output path already exists, so the cell could not be run twice.
df_non_spam.write.mode('overwrite').csv('non_spam_frq_word.csv')

In [None]:
# Overwrite a previous export; the default save mode raises an error when the
# output path already exists, so the cell could not be run twice.
df_spam.write.mode('overwrite').csv('spam_frq_word.csv')

In [None]:
# Rows of both bigram-count tables whose word_2 equals 'bitter'.
df_spam1 = dataframe.filter(dataframe.word_2 == 'bitter')
df_non_spam1 = dataframe_non_spam.filter(dataframe_non_spam.word_2 == 'bitter')

# Overwrite previous exports; the default save mode raises an error when the
# output path already exists, so the cell could not be run twice.
df_non_spam1.write.mode('overwrite').csv('non_spam_frq_word1.csv')
df_spam1.write.mode('overwrite').csv('spam_frq_word1.csv')

In [None]:
# Rows of both bigram-count tables whose word_2 equals 'candid'.
df_spam2 = dataframe.filter(dataframe.word_2 == 'candid')
df_non_spam2 = dataframe_non_spam.filter(dataframe_non_spam.word_2 == 'candid')

# Overwrite previous exports; the default save mode raises an error when the
# output path already exists, so the cell could not be run twice.
df_non_spam2.write.mode('overwrite').csv('non_spam_frq_word2.csv')
df_spam2.write.mode('overwrite').csv('spam_frq_word2.csv')