In [1]:
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

import os

# Interpreter that Spark launches for its Python workers. It must be in the
# environment before the context is created to have any effect.
os.environ['PYSPARK_PYTHON'] = 'python'

# Reuse the active SparkContext when one already exists (e.g. this cell is
# re-run in the same kernel); a bare SparkContext() raises ValueError then.
sc = SparkContext.getOrCreate()

In [2]:
# Relative locations of the spam and non-spam ("ham") text files.
file_path_spam, file_path_non_spam = "data//spam.txt", "data//ham.txt"

# Load each file as an RDD of text lines.
spam_rdd, non_spam_rdd = (
    sc.textFile(path) for path in (file_path_spam, file_path_non_spam)
)

In [3]:
def _split_on_spaces(email):
    """Return the list of single-space-separated tokens of one text line."""
    return email.split(' ')


# Turn every line of each corpus into its list of tokens.
spam_words = spam_rdd.map(_split_on_spaces)
non_spam_words = non_spam_rdd.map(_split_on_spaces)

In [4]:
# Hashing term-frequency transformer with a 200-dimensional feature space.
tf = HashingTF(numFeatures=200)

# Convert every token list (one per line of input) into a feature vector.
spam_features, non_spam_features = map(tf.transform, (spam_words, non_spam_words))

In [5]:
# Attach the class label to every feature vector: spam -> 1, non-spam -> 0.
spam_samples = spam_features.map(lambda vec: LabeledPoint(1, vec))
non_spam_samples = non_spam_features.map(lambda vec: LabeledPoint(0, vec))

In [6]:
# Merge both classes into one data set (`+` on two RDDs is their union).
samples = spam_samples + non_spam_samples

In [7]:
# Hold out roughly 20% of the samples for testing. The seed is fixed so that
# re-running the notebook on the same input draws the same split, which keeps
# the reported accuracy comparable between runs.
train_samples, test_samples = samples.randomSplit([0.8, 0.2], seed=42)

# Keep the true labels and the feature vectors of the held-out set separately:
# the features are fed to the model, the labels are compared to its output.
test_labels = test_samples.map(lambda sample: sample.label)
test_features = test_samples.map(lambda sample: sample.features)

In [8]:
# Fit a logistic-regression classifier with the L-BFGS trainer on the training
# split; no hyper-parameters are passed, so the library defaults apply.
model = LogisticRegressionWithLBFGS.train(data=train_samples)

In [None]:
# Score the held-out feature vectors with the trained model.
predictions = model.predict(test_features)
# Pair every true label with its prediction. zip() matches elements purely by
# position, so this relies on test_labels and predictions staying aligned;
# both are derived from test_samples (labels directly, predictions through
# test_features). NOTE(review): do not filter or repartition only one side.
actuals_and_preds = test_labels.zip(predictions)

In [None]:
# Accuracy = correctly classified test samples / all test samples.
# With a small corpus the random 20% hold-out can be empty, which would make
# the division raise ZeroDivisionError, so that case is reported explicitly.
test_count = test_samples.count()
if test_count:
    correct_count = actuals_and_preds.filter(lambda pair: pair[0] == pair[1]).count()
    accuracy = correct_count / float(test_count)
    print(f"Model accuracy : {accuracy:.2f}")
else:
    # Keep `accuracy` defined so later cells do not fail with a NameError.
    accuracy = float('nan')
    print("Model accuracy : n/a (the test split is empty)")

# High-level APIs (pyspark.ml)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, lit
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Entry point for the DataFrame API; returns the already-active session if
# there is one, otherwise starts a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Relative locations of the spam and non-spam ("ham") text files.
file_path_spam, file_path_non_spam = "data//spam.txt", "data//ham.txt"

In [None]:
# Spam corpus as a DataFrame: one row per input line, with the text split on
# single spaces into the array column 'doc' and a constant label of 1.0.
spam_df = (
    spark.read.text(file_path_spam)
    .select(split(col('value'), " ").alias('doc'))
    .withColumn('label', lit(1.0))
)
spam_df.show(5)
print("Number of rows:", spam_df.count())

In [None]:
# Non-spam corpus as a DataFrame: one row per input line, with the text split
# on single spaces into the array column 'doc' and a constant label of 0.0.
non_spam_df = (
    spark.read.text(file_path_non_spam)
    .select(split(col('value'), " ").alias('doc'))
    .withColumn('label', lit(0.0))
)
non_spam_df.show(5)
print("Number of rows:", non_spam_df.count())

In [None]:
# Stack the two labelled DataFrames into a single data set and report its size.
samples_df = spam_df.union(non_spam_df)
samples_row_count = samples_df.count()
print("Number of rows:", samples_row_count)

In [None]:
# Sanity check: show the hashed 200-dimensional feature vector produced for a
# single document, without truncating the printed columns.
features_preview_df = HashingTF(
    inputCol='doc',
    outputCol='features',
    numFeatures=200,
).transform(samples_df)
features_preview_df.show(1, truncate=False)

In [None]:
# Hold out roughly 20% of the rows for evaluation. The seed is fixed so that
# re-running the notebook on the same input draws the same split, which keeps
# the reported accuracy comparable between runs.
train_samples_df, test_samples_df = samples_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Stage 1: hash the tokens in 'doc' into a 200-dimensional 'features' vector.
ht = HashingTF(
    inputCol='doc',
    outputCol='features',
    numFeatures=200,
)
# Stage 2: logistic regression reading 'features' and the 'label' column.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)
# Chain both stages and fit them end-to-end on the training split.
pipeline = Pipeline(stages=[ht, lr])
lr_mod = pipeline.fit(train_samples_df)

In [None]:
# Accuracy evaluator. Named `evaluator` rather than `eval` so the Python
# builtin eval() is not shadowed for the rest of the kernel session.
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
# Run the fitted pipeline on the held-out rows and score its predictions.
predictions = lr_mod.transform(test_samples_df)
accuracy = evaluator.evaluate(predictions)
print(f"Model accuracy : {accuracy:.2f}")