In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [3,16

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [3]:
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [22]:
# Read in CSV
from pyspark import SparkFiles
df = spark.read.csv(SparkFiles.get("/content/news.csv"),sep=",", escape='"', encoding="utf-8", quote='"',  header=True)
df.show(5)

+--------------------+--------------------+-------+----------+------------+
|               title|                text|subject|      date|news_outcome|
+--------------------+--------------------+-------+----------+------------+
|donald, trump, se...|donald, trump, wi...|   News|31/12/2017|           1|
|drunk, bragging, ...|house, intelligen...|   News|31/12/2017|           1|
|sheriff, david, c...|friday, reveal, f...|   News|30/12/2017|           1|
|trump, obsess, ev...|christmas, day, d...|   News|29/12/2017|           1|
|pope, francis, ca...|pope, francis, us...|   News|25/12/2017|           1|
+--------------------+--------------------+-------+----------+------------+
only showing top 5 rows



In [28]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import array
df_new = df.withColumn("title", array(df["title"]))
df_new = df_new.withColumn("text", array(df["text"]))
df_new.printSchema()

root
 |-- title: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- text: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- news_outcome: string (nullable = true)



In [48]:
 # Run the hashing term frequency
hashing = HashingTF(inputCol="title", outputCol="hashedTitles")

# Transform into a DF
hashed_df = hashing.transform(df_new)
hashed_df.show(5, truncate=False)

+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [52]:
 # Run the hashing term frequency
hashing2 = HashingTF(inputCol="text", outputCol="hashedText")

# Transform into a DF
hashed_df2 = hashing2.transform(hashed_df)
hashed_df2.show(5)

+--------------------+--------------------+-------+----------+------------+--------------------+--------------------+
|               title|                text|subject|      date|news_outcome|        hashedTitles|          hashedText|
+--------------------+--------------------+-------+----------+------------+--------------------+--------------------+
|[donald, trump, s...|[donald, trump, w...|   News|31/12/2017|           1|(262144,[185097],...|(262144,[214095],...|
|[drunk, bragging,...|[house, intellige...|   News|31/12/2017|           1|(262144,[228680],...|(262144,[53903],[...|
|[sheriff, david, ...|[friday, reveal, ...|   News|30/12/2017|           1|(262144,[245365],...|(262144,[6623],[1...|
|[trump, obsess, e...|[christmas, day, ...|   News|29/12/2017|           1|(262144,[34496],[...|(262144,[116024],...|
|[pope, francis, c...|[pope, francis, u...|   News|25/12/2017|           1|(262144,[253547],...|(262144,[240902],...|
+--------------------+--------------------+-------+-----

In [53]:
# Fit the IDF on the data set 
idf = IDF(inputCol="hashedTitles", outputCol="title_features")
idfModel = idf.fit(hashed_df2)
rescaledData = idfModel.transform(hashed_df2)

In [63]:
# Fit the IDF on the data set 
idf2 = IDF(inputCol="hashedText", outputCol="text_features")
idfModel2 = idf2.fit(rescaledData)
rescaledData2 = idfModel2.transform(rescaledData)

In [55]:
 # Display the DataFrame
rescaledData.select("title", "title_features").show(truncate=False)

+-----------------------------------------------------------------------------------------------+-------------------------------------+
|title                                                                                          |title_features                       |
+-----------------------------------------------------------------------------------------------+-------------------------------------+
|[donald, trump, send, embarrassing, new, year, eve, message, disturb]                          |(262144,[185097],[10.01364156102802])|
|[drunk, bragging, trump, staffer, start, russian, collusion, investigation]                    |(262144,[228680],[10.01364156102802])|
|[sheriff, david, clarke, become, internet, joke, threaten, poke, people, eye]                  |(262144,[245365],[10.01364156102802])|
|[trump, obsess, even, obama, name, coded, website, image]                                      |(262144,[34496],[9.608176452919857]) |
|[pope, francis, call, donald, trump, christmas,

In [78]:
 # Display the DataFrame
rescaledData2.select("text", "text_features").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------