In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Waiting for headers] [1 InRelease 14.2 kB/88.7 kB 16%] [Connecting to cloud                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic I

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [3]:
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [4]:
# Read in CSV
from pyspark import SparkFiles
df = spark.read.csv(SparkFiles.get("/content/news.csv"),sep=",", escape='"', encoding="utf-8", quote='"',  header=True)
df.show(5)

+--------------------+--------------------+-------+----------+------------+
|               title|                text|subject|      date|news_outcome|
+--------------------+--------------------+-------+----------+------------+
|['donald', 'trump...|['donald', 'trump...|   News|31/12/2017|           1|
|['drunk', 'braggi...|['house', 'intell...|   News|31/12/2017|           1|
|['sheriff', 'davi...|['friday', 'revea...|   News|30/12/2017|           1|
|['trump', 'obsess...|['christmas', 'da...|   News|29/12/2017|           1|
|['pope', 'francis...|['pope', 'francis...|   News|25/12/2017|           1|
+--------------------+--------------------+-------+----------+------------+
only showing top 5 rows



In [5]:
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- news_outcome: string (nullable = true)



In [6]:
from pyspark.sql.functions import udf, col, split

tolist_udf = udf(lambda x: x.replace("[","").replace("]","").replace("'",""))

In [7]:
df_2 = df.withColumn("title", tolist_udf(col("title")))

In [44]:
df_3 = df_2.select(split(col("title"),",").alias("TitleArray")) \
    .drop("title")
df_3.printSchema()

root
 |-- TitleArray: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [29]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="TitleArray", outputCol="TitleHashedValues", numFeatures=pow(2,5))

In [30]:
hashed_df = hashing.transform(df_3)

In [45]:
hashed_df.show(3, truncate=False)

+--------------------------------------------------------------------------------------+-------------------------------------------------------------------+
|TitleArray                                                                            |TitleHashedValues                                                  |
+--------------------------------------------------------------------------------------+-------------------------------------------------------------------+
|[donald,  trump,  send,  embarrassing,  new,  year,  eve,  message,  disturb]         |(32,[4,9,11,13,22,25,28,30],[1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0])     |
|[drunk,  bragging,  trump,  staffer,  start,  russian,  collusion,  investigation]    |(32,[0,2,10,14,16,17,22,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])     |
|[sheriff,  david,  clarke,  become,  internet,  joke,  threaten,  poke,  people,  eye]|(32,[3,5,6,7,10,13,16,22,24],[1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0])|
+---------------------------------------------------------

In [21]:
text_df = df_2.withColumn("text", tolist_udf(col("text")))

In [47]:
text_df2 = text_df.select(split(col("text"),",").alias("TextArray"), "text") \
    .drop("text")
text_df2.printSchema()

root
 |-- TextArray: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [25]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="TextArray", outputCol="TextHashedValues", numFeatures=pow(2,5))

In [26]:
hashed_df_text = hashing.transform(text_df2)

In [51]:
hashed_df_text.show(3)

+--------------------+--------------------+
|           TextArray|    TextHashedValues|
+--------------------+--------------------+
|[donald,  trump, ...|(32,[0,1,2,3,4,6,...|
|[house,  intellig...|(32,[0,1,2,3,4,5,...|
|[friday,  reveal,...|(32,[0,1,2,3,4,5,...|
+--------------------+--------------------+
only showing top 3 rows



In [36]:
# Fit the IDF on the data set 
idf = IDF(inputCol="TitleHashedValues", outputCol="TitleFeatures")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [53]:
rescaledData.show(3)

+--------------------+--------------------+--------------------+
|          TitleArray|   TitleHashedValues|       TitleFeatures|
+--------------------+--------------------+--------------------+
|[donald,  trump, ...|(32,[4,9,11,13,22...|(32,[4,9,11,13,22...|
|[drunk,  bragging...|(32,[0,2,10,14,16...|(32,[0,2,10,14,16...|
|[sheriff,  david,...|(32,[3,5,6,7,10,1...|(32,[3,5,6,7,10,1...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [39]:
# Fit the IDF on the data set 
idf2 = IDF(inputCol="TextHashedValues", outputCol="TextFeatures")
idfModel2 = idf2.fit(hashed_df_text)
rescaledData2 = idfModel2.transform(hashed_df_text)

In [52]:
rescaledData2.show(3)

+--------------------+--------------------+--------------------+
|           TextArray|    TextHashedValues|        TextFeatures|
+--------------------+--------------------+--------------------+
|[donald,  trump, ...|(32,[0,1,2,3,4,6,...|(32,[0,1,2,3,4,6,...|
|[house,  intellig...|(32,[0,1,2,3,4,5,...|(32,[0,1,2,3,4,5,...|
|[friday,  reveal,...|(32,[0,1,2,3,4,5,...|(32,[0,1,2,3,4,5,...|
+--------------------+--------------------+--------------------+
only showing top 3 rows

