In [None]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [                                                                               Get:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:5 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRe

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell; the builder is given
# "Hashing" as the application name.
spark = (
    SparkSession.builder
    .appName("Hashing")
    .getOrCreate()
)

In [None]:
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [None]:
# Read in CSV
from pyspark import SparkFiles  # kept so the name stays available to any cell that relies on it

# SparkFiles.get() resolves files distributed with SparkContext.addFile();
# no addFile() call is visible in this notebook, so the local path is passed
# to the reader directly instead of being routed through SparkFiles.
df = spark.read.csv(
    "/content/dataset.csv",
    sep=",",
    quote='"',
    escape='"',  # a quote character inside a quoted field is escaped by another quote
    encoding="utf-8",
    header=True,  # first line supplies the column names
)
df.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|founding father w...|    1|
|wow bravo tomi re...|    1|
|karma bitch way a...|    1|
|washington reuter...|    0|
|may president tru...|    1|
+--------------------+-----+
only showing top 5 rows



In [None]:
from pyspark.sql.types import IntegerType

# Replace the `label` column with an integer-typed version of itself.
df = df.withColumn(
    "label",
    df.label.cast(IntegerType()),
)

In [None]:
from pyspark.sql.functions import length

# Add a `length` column derived from `text` (used later as an extra feature),
# then preview the first three rows.
df = df.withColumn("length", length(df.text))
df.show(n=3)

+--------------------+-----+------+
|                text|label|length|
+--------------------+-----+------+
|founding father w...|    1|  3299|
|wow bravo tomi re...|    1|   208|
|karma bitch way a...|    1|   147|
+--------------------+-----+------+
only showing top 3 rows



In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Text-feature stages, linked through their column names:
#   text -> token_text -> stop_tokens -> hash_token -> idf_token
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="token_text",
)
stopremove = StopWordsRemover(
    inputCol="token_text",
    outputCol="stop_tokens",
)
hashingTF = HashingTF(
    inputCol="stop_tokens",
    outputCol="hash_token",
)
idf = IDF(
    inputCol="hash_token",
    outputCol="idf_token",
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Assemble the IDF output and the text length into one `features` vector column.
clean_up = VectorAssembler(
    outputCol="features",
    inputCols=["idf_token", "length"],
)

In [None]:
# Create a data processing Pipeline (it is fit and run in the next cell)
from pyspark.ml import Pipeline

# Stages are listed in execution order.
data_prep_pipeline = Pipeline(
    stages=[
        tokenizer,
        stopremove,
        hashingTF,
        idf,
        clean_up,
    ]
)

In [None]:
 # Fit and transform the pipeline
# `cleaner` is the fitted pipeline; `cleaned` is df with the pipeline's
# output columns appended.
cleaner = data_prep_pipeline.fit(dataset=df)
cleaned = cleaner.transform(dataset=df)

In [None]:
 # Show label and resulting features
# show() prints 20 rows when called without arguments; the count is spelled out here.
cleaned.select("label", "features").show(n=20)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(262145,[991,2220...|
|    1|(262145,[1696,270...|
|    1|(262145,[531,1512...|
|    0|(262145,[3775,538...|
|    1|(262145,[2504,892...|
|    1|(262145,[1880,383...|
|    0|(262145,[2306,243...|
|    1|(262145,[2731,392...|
|    1|(262145,[5381,634...|
|    0|(262145,[1968,236...|
|    1|(262145,[378,2325...|
|    1|(262145,[3834,538...|
|    1|(262145,[1971,659...|
|    0|(262145,[1303,538...|
|    1|(262145,[11391,13...|
|    0|(262145,[3775,522...|
|    0|(262145,[4409,649...|
|    0|(262145,[14875,15...|
|    1|(262145,[2437,611...|
|    1|(262145,[619,1696...|
+-----+--------------------+
only showing top 20 rows



In [None]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set (weight 0.7) and a testing set (weight 0.3).
# A fixed seed keeps the split repeatable, so the metrics reported further
# down can be compared between runs instead of shifting on every execution.
training, testing = cleaned.randomSplit([0.7, 0.3], seed=42)

# Create a Naive Bayes model and fit training data.
# No column names are passed, so the estimator's defaults are relied on
# (`features` and `label`, which is what the pipeline and CSV provide).
nb = NaiveBayes()
predictor = nb.fit(training)

In [None]:
# Score the testing rows with the fitted model and preview five of them.
test_results = predictor.transform(dataset=testing)
test_results.show(n=5)

+--------------------+-----+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|label|length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|abc announced dev...|    1|   923|[abc, announced, ...|[abc, announced, ...|(262144,[3834,407...|(262144,[3834,407...|(262145,[3834,407...|[-5748.2826101435...|[9.44091033354833...|       1.0|
|abidjan reuters f...|    0|   346|[abidjan, reuters...|[abidjan, reuters...|(262144,[3282,597...|(262144,[3282,597...|(262145,[3282,597...|[-2188.5674556179...|[1.0,2.1385350942...|       0.0|
|abidjan reuters h...|    0|  

In [None]:
# Transform the training data with the fitted model
# Score the rows the model was fitted on, so they can be compared with the
# testing-set results.
train_results = predictor.transform(dataset=training)
train_results.show(n=5)

+--------------------+-----+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|label|length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|abc announced dev...|    1|   923|[abc, announced, ...|[abc, announced, ...|(262144,[3834,407...|(262144,[3834,407...|(262145,[3834,407...|[-5748.2826101435...|[9.44091033354833...|       1.0|
|abc cancelled tim...|    1|  2457|[abc, cancelled, ...|[abc, cancelled, ...|(262144,[1696,232...|(262144,[1696,232...|(262145,[1696,232...|[-14338.729321783...|           [0.0,1.0]|       1.0|
|abc cancelled tim...|    1|  

In [None]:
 # Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# MulticlassClassificationEvaluator defaults to metricName="f1", so accuracy
# has to be requested explicitly for the number printed below to match its label.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting fake news was: %f" % acc)

Accuracy of model at predicting fake news was: 0.940584


In [None]:
# Accuracy on the training predictions, for comparison with the testing-set figure.
# The metric is named explicitly because the evaluator defaults to "f1", and
# the evaluator created here is the one actually used for the evaluation.
train_eval = MulticlassClassificationEvaluator(metricName="accuracy")
train = train_eval.evaluate(train_results)
print("Accuracy of model at predicting fake news was: %f" % train)

Accuracy of model at predicting fake news was: 0.963500


In [None]:
# F1 score of the testing-set predictions.
f1_eval = MulticlassClassificationEvaluator(metricName="f1")
f1 = f1_eval.evaluate(dataset=test_results)
print("F1 score of model at predicting fake news was: %f" % f1)

F1 score of model at predicting fake news was: 0.940584
