In [24]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.2'
spark_version = 'spark-3.2.3'
os.environ['SPARK_VERSION']=spark_version
# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"
# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [W                                                                               Hit:2 http://security.ubuntu.com/ubuntu focal-security InRelease
                                                                               0% [Waiting for headers] [Waiting for headers] [Waiting for headers]                                                                    Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers]                                                                    Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
                                                                    0% [Waiting for headers] [Waiting for headers] 

In [25]:
# Create (or fetch) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("sparkDates")
    .getOrCreate()
)

In [28]:
# Load in data
from pyspark import SparkFiles

# Register the remote CSV with the Spark context, then read it back through
# SparkFiles using its file name.
url = "https://movie-lens-data-p4t1.s3.amazonaws.com/genome-tags.csv"
spark.sparkContext.addFile(url)

df = spark.read.csv(
    SparkFiles.get("genome-tags.csv"),
    sep=",",
    header=True,
    inferSchema=True,
    timestampFormat="yyyy/MM/dd HH:mm:ss",
)

# Preview the first rows
df.show()

+-----+---------------+
|tagId|            tag|
+-----+---------------+
|    1|            007|
|    2|   007 (series)|
|    3|   18th century|
|    4|          1920s|
|    5|          1930s|
|    6|          1950s|
|    7|          1960s|
|    8|          1970s|
|    9|          1980s|
|   10|   19th century|
|   11|             3d|
|   12|           70mm|
|   13|            80s|
|   14|           9/11|
|   15|        aardman|
|   16|aardman studios|
|   17|       abortion|
|   18|         absurd|
|   19|         action|
|   20|  action packed|
+-----+---------------+
only showing top 20 rows



In [29]:
# Start Spark session
# NOTE(review): getOrCreate() returns the already-running session when one
# exists, so if the earlier session cell has run, the "Hashing" app name is
# presumably not applied -- confirm whether this cell is still needed.
spark = (
    SparkSession.builder
    .appName("Hashing")
    .getOrCreate()
)

In [30]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [31]:
# Split each tag string into a list of tokens
tokenizer = Tokenizer().setInputCol("tag").setOutputCol("tokens")

# Adds a "tokens" column alongside the original columns
wordsData = tokenizer.transform(df)
wordsData.show()

+-----+---------------+------------------+
|tagId|            tag|            tokens|
+-----+---------------+------------------+
|    1|            007|             [007]|
|    2|   007 (series)|   [007, (series)]|
|    3|   18th century|   [18th, century]|
|    4|          1920s|           [1920s]|
|    5|          1930s|           [1930s]|
|    6|          1950s|           [1950s]|
|    7|          1960s|           [1960s]|
|    8|          1970s|           [1970s]|
|    9|          1980s|           [1980s]|
|   10|   19th century|   [19th, century]|
|   11|             3d|              [3d]|
|   12|           70mm|            [70mm]|
|   13|            80s|             [80s]|
|   14|           9/11|            [9/11]|
|   15|        aardman|         [aardman]|
|   16|aardman studios|[aardman, studios]|
|   17|       abortion|        [abortion]|
|   18|         absurd|          [absurd]|
|   19|         action|          [action]|
|   20|  action packed|  [action, packed]|
+-----+----

In [32]:
# Hash each token list into a fixed-length term-frequency vector.
# 2 ** 4 gives 16 buckets, which keeps the vectors easy to read; with so few
# buckets, different tokens can share the same index.
hashing = HashingTF(
    numFeatures=2 ** 4,
    inputCol="tokens",
    outputCol="hashedValues",
)

# Apply the hasher to the tokenized DataFrame
hashed_df = hashing.transform(wordsData)

In [33]:
# Show the first 20 rows without truncating the wide vector column
hashed_df.show(n=20, truncate=False)

+-----+---------------+------------------+---------------------+
|tagId|tag            |tokens            |hashedValues         |
+-----+---------------+------------------+---------------------+
|1    |007            |[007]             |(16,[8],[1.0])       |
|2    |007 (series)   |[007, (series)]   |(16,[8,10],[1.0,1.0])|
|3    |18th century   |[18th, century]   |(16,[1,13],[1.0,1.0])|
|4    |1920s          |[1920s]           |(16,[14],[1.0])      |
|5    |1930s          |[1930s]           |(16,[12],[1.0])      |
|6    |1950s          |[1950s]           |(16,[5],[1.0])       |
|7    |1960s          |[1960s]           |(16,[1],[1.0])       |
|8    |1970s          |[1970s]           |(16,[1],[1.0])       |
|9    |1980s          |[1980s]           |(16,[13],[1.0])      |
|10   |19th century   |[19th, century]   |(16,[1,14],[1.0,1.0])|
|11   |3d             |[3d]              |(16,[13],[1.0])      |
|12   |70mm           |[70mm]            |(16,[2],[1.0])       |
|13   |80s            |[8

In [None]:
# TODO: add code to convert the hashedValues column into a features column for use in a model