In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [1 InRelease gpgv 1,575 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [1 InRelease gpgv 1,575 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 1,575 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 http://security.ubuntu.com/ubu

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [3]:
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [4]:
# Read in CSV
from pyspark import SparkFiles
df = spark.read.csv(SparkFiles.get("/content/news_TextHero.csv"),sep=",", escape='"', encoding="utf-8", quote='"',  header=True)
df.show(5)

+--------------------+--------------------+-------+----------+------------+
|               title|                text|subject|      date|news_outcome|
+--------------------+--------------------+-------+----------+------------+
|['donald', 'trump...|['donald', 'trump...|   News|31/12/2017|           1|
|['drunk', 'braggi...|['house', 'intell...|   News|31/12/2017|           1|
|['sheriff', 'davi...|['friday', 'revea...|   News|30/12/2017|           1|
|['trump', 'obsess...|['christmas', 'da...|   News|29/12/2017|           1|
|['pope', 'francis...|['pope', 'francis...|   News|25/12/2017|           1|
+--------------------+--------------------+-------+----------+------------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
df = df.withColumn('length_title', length(df['title']))
df = df.withColumn('length_text', length(df['text']))
df.show(3)

+--------------------+--------------------+-------+----------+------------+------------+-----------+
|               title|                text|subject|      date|news_outcome|length_title|length_text|
+--------------------+--------------------+-------+----------+------------+------------+-----------+
|['donald', 'trump...|['donald', 'trump...|   News|31/12/2017|           1|          87|       2825|
|['drunk', 'braggi...|['house', 'intell...|   News|31/12/2017|           1|          91|       1871|
|['sheriff', 'davi...|['friday', 'revea...|   News|30/12/2017|           1|          97|       3533|
+--------------------+--------------------+-------+----------+------------+------------+-----------+
only showing top 3 rows



In [6]:
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- news_outcome: string (nullable = true)
 |-- length_title: integer (nullable = true)
 |-- length_text: integer (nullable = true)



In [7]:
#changing the title type to array
from pyspark.sql.functions import udf, col, split

tolist_udf = udf(lambda x: x.replace("[","").replace("]","").replace("'",""))

In [8]:
df_2 = df.withColumn("title", tolist_udf(col("title")))
df_2 = df_2.withColumn("label", tolist_udf(col("news_outcome")))

In [9]:
from pyspark.sql.types import IntegerType
df_2 = df_2.withColumn("label", df_2["label"].cast(IntegerType()))

In [10]:
df_3 = df_2.select(split(col("title"),",").alias("TitleArray"), "label", "length_title") \
    .drop("title")
df_3.printSchema()

root
 |-- TitleArray: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: integer (nullable = true)
 |-- length_title: integer (nullable = true)



Hashing Term Frequency

In [11]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="TitleArray", outputCol="TitleHashedValues", numFeatures=pow(2,5))

In [12]:
hashed_df = hashing.transform(df_3)

In [13]:
hashed_df.show(3)

+--------------------+-----+------------+--------------------+
|          TitleArray|label|length_title|   TitleHashedValues|
+--------------------+-----+------------+--------------------+
|[donald,  trump, ...|    1|          87|(32,[4,9,11,13,22...|
|[drunk,  bragging...|    1|          91|(32,[0,2,10,14,16...|
|[sheriff,  david,...|    1|          97|(32,[3,5,6,7,10,1...|
+--------------------+-----+------------+--------------------+
only showing top 3 rows



In [14]:
text_df = df_2.withColumn("text", tolist_udf(col("text")))

In [15]:
text_df2 = text_df.select(split(col("text"),",").alias("TextArray"), "label", "length_text") \
    .drop("text")
text_df2.printSchema()

root
 |-- TextArray: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: integer (nullable = true)
 |-- length_text: integer (nullable = true)



In [16]:
# Run the hashing term frequency - text
hashing = HashingTF(inputCol="TextArray", outputCol="TextHashedValues", numFeatures=pow(2,5))

In [17]:
hashed_df_text = hashing.transform(text_df2)

In [18]:
hashed_df_text.show(3)

+--------------------+-----+-----------+--------------------+
|           TextArray|label|length_text|    TextHashedValues|
+--------------------+-----+-----------+--------------------+
|[donald,  trump, ...|    1|       2825|(32,[0,1,2,3,4,6,...|
|[house,  intellig...|    1|       1871|(32,[0,1,2,3,4,5,...|
|[friday,  reveal,...|    1|       3533|(32,[0,1,2,3,4,5,...|
+--------------------+-----+-----------+--------------------+
only showing top 3 rows



Fitting IDF on the data set

In [19]:
# Fit the IDF on the data set 
idf = IDF(inputCol="TitleHashedValues", outputCol="TitleFeatures")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [20]:
rescaledData.show(3)

+--------------------+-----+------------+--------------------+--------------------+
|          TitleArray|label|length_title|   TitleHashedValues|       TitleFeatures|
+--------------------+-----+------------+--------------------+--------------------+
|[donald,  trump, ...|    1|          87|(32,[4,9,11,13,22...|(32,[4,9,11,13,22...|
|[drunk,  bragging...|    1|          91|(32,[0,2,10,14,16...|(32,[0,2,10,14,16...|
|[sheriff,  david,...|    1|          97|(32,[3,5,6,7,10,1...|(32,[3,5,6,7,10,1...|
+--------------------+-----+------------+--------------------+--------------------+
only showing top 3 rows



In [21]:
# Fit the IDF on the data set - text
idf2 = IDF(inputCol="TextHashedValues", outputCol="TextFeatures")
idfModel2 = idf2.fit(hashed_df_text)
rescaledData2 = idfModel2.transform(hashed_df_text)

In [46]:
rescaledData2.show(3)

+--------------------+-----+-----------+--------------------+--------------------+
|           TextArray|label|length_text|    TextHashedValues|        TextFeatures|
+--------------------+-----+-----------+--------------------+--------------------+
|[donald,  trump, ...|    1|       2825|(32,[0,1,2,3,4,6,...|(32,[0,1,2,3,4,6,...|
|[house,  intellig...|    1|       1871|(32,[0,1,2,3,4,5,...|(32,[0,1,2,3,4,5,...|
|[friday,  reveal,...|    1|       3533|(32,[0,1,2,3,4,5,...|(32,[0,1,2,3,4,5,...|
+--------------------+-----+-----------+--------------------+--------------------+
only showing top 3 rows



Naive Bayes

In [23]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['TitleFeatures', 'length_title'], outputCol='features')

In [24]:
cleaner = clean_up.transform(rescaledData)

In [25]:
cleaner.show(2)

+--------------------+-----+------------+--------------------+--------------------+--------------------+
|          TitleArray|label|length_title|   TitleHashedValues|       TitleFeatures|            features|
+--------------------+-----+------------+--------------------+--------------------+--------------------+
|[donald,  trump, ...|    1|          87|(32,[4,9,11,13,22...|(32,[4,9,11,13,22...|(33,[4,9,11,13,22...|
|[drunk,  bragging...|    1|          91|(32,[0,2,10,14,16...|(32,[0,2,10,14,16...|(33,[0,2,10,14,16...|
+--------------------+-----+------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [26]:
 from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaner.randomSplit([0.7, 0.3])
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [27]:
 # Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|          TitleArray|label|length_title|   TitleHashedValues|       TitleFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|[1,  million,  do...|    1|          94|(32,[1,7,11,15,21...|(32,[1,7,11,15,21...|(33,[1,7,11,15,21...|[-92.428614557579...|[0.71786765147133...|       0.0|
|[1,  percenter,  ...|    1|         131|(32,[3,5,6,9,10,1...|(32,[3,5,6,9,10,1...|(33,[3,5,6,9,10,1...|[-119.95077874140...|[0.53936011576607...|       0.0|
|[10,  day,  orlan...|    1|          77|(32,[1,10,17,27,2...|(32,[1,10,17,27,2...|(33,[1,10,17,27,2...|[-83.398425002981...|[0.12128236301639...|       1.0|
|[10,  reason,  vo...|    1|          71|(32,[0,3,8,

In [28]:
 # Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.642055


In [29]:
# Create feature vectors
clean_up2 = VectorAssembler(inputCols=['TextFeatures', 'length_text'], outputCol='features')
cleaner_text = clean_up2.transform(rescaledData2)

In [30]:
cleaner_text.show(2)

+--------------------+-----+-----------+--------------------+--------------------+--------------------+
|           TextArray|label|length_text|    TextHashedValues|        TextFeatures|            features|
+--------------------+-----+-----------+--------------------+--------------------+--------------------+
|[donald,  trump, ...|    1|       2825|(32,[0,1,2,3,4,6,...|(32,[0,1,2,3,4,6,...|[0.90043953270749...|
|[house,  intellig...|    1|       1871|(32,[0,1,2,3,4,5,...|(32,[0,1,2,3,4,5,...|[0.65486511469635...|
+--------------------+-----+-----------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [31]:
 from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaner_text.randomSplit([0.7, 0.3])
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [32]:
 # Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+---------+-----+-----------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
|TextArray|label|length_text|TextHashedValues|        TextFeatures|            features|       rawPrediction|         probability|prediction|
+---------+-----+-----------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|     

In [44]:
 # Tranform the model with the testing data
train_results = predictor.transform(training)
train_results.show(5)

+---------+-----+-----------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
|TextArray|label|length_text|TextHashedValues|        TextFeatures|            features|       rawPrediction|         probability|prediction|
+---------+-----+-----------+----------------+--------------------+--------------------+--------------------+--------------------+----------+
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|       []|    1|          2| (32,[28],[1.0])|(32,[28],[0.05212...|(33,[28,32],[0.05...|[-1.2127030127202...|[0.47463655470847...|       1.0|
|     

In [35]:
 # Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting fake news was: %f" % acc)

Accuracy of model at predicting fake news was: 0.657543


In [45]:
train_eval = MulticlassClassificationEvaluator()
train = acc_eval.evaluate(train_results)
print("Accuracy of model at predicting fake news was: %f" % train)

Accuracy of model at predicting fake news was: 0.665101


In [53]:
f1_eval = MulticlassClassificationEvaluator(metricName='f1')
f1 = f1_eval.evaluate(test_results)
print("F1 score of model at predicting fake news was: %f" % f1)

F1 score of model at predicting fake news was: 0.657543
