# Content-Based Recommendation

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, desc, broadcast, concat, lit
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer, PCA
import numpy as np

# Initialize the Spark session (reuses an existing one if already running).
# Driver and executor memory are both set to 4g, and the driver JVM's
# file.encoding is set to UTF-8.
spark = (
    SparkSession.builder
    .appName("Content-basedRecommendationSystem")
    .config("spark.driver.memory", "4g")
    # Key was previously misspelled "sparl.executor.memory", so the executor
    # memory setting never took effect.
    .config("spark.executor.memory", "4g")
    .config("spark.driver.extraJavaOptions", "-Dfile.encoding=UTF-8")
    .getOrCreate()
)

In [10]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): none of the cells visible here read from /content/drive —
# confirm this mount is still needed.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [1]:
# Open Colab's file-upload prompt; the return value is kept in `uploaded`.
# The recorded output shows transactions_data.csv being saved, which matches
# the relative path passed to spark.read.csv in the load step.
from google.colab import files
uploaded = files.upload()

Saving transactions_data.csv to transactions_data.csv


#### Load Dataset and Select Relevant Columns

In [6]:
# Load the raw transactions CSV; header=True takes column names from the first row.
# No schema is supplied and inferSchema is not enabled, so columns are read as strings.
transactions_data = spark.read.csv("transactions_data.csv", header=True)

# Keep one row per distinct product and build a single text feature by joining
# product_name, product_description and category with single spaces.
# NOTE(review): concat() yields NULL when any input is NULL — confirm these three
# columns have no missing values, otherwise such products get a NULL description.
df = (
    transactions_data
    .select(
        col("product_name"),
        col("product_description"),
        col("category"),
        concat(
            col("product_name"), lit(" "),
            col("product_description"), lit(" "),
            col("category"),
        ).alias("full_product_description"),
    )
    .distinct()
)

# Preview the first 10 products without truncating the long text columns.
df.show(10, truncate=False)

+-------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+
|product_name |product_description                              |category            |full_product_description                                              |
+-------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+
|Paper Towels |Ultra-absorbent paper towels.                    |Household Essentials|Paper Towels Ultra-absorbent paper towels. Household Essentials       |
|Apples       |Fresh organic apples, rich in fiber and vitamins.|Fresh Produce       |Apples Fresh organic apples, rich in fiber and vitamins. Fresh Produce|
|Almonds      |Roasted almonds, a healthy snack.                |Snacks & Sweets     |Almonds Roasted almonds, a healthy snack. Snacks & Sweets             |
|Milk         |Whole organic milk, high in calcium a

#### Feature Engineering with TF-IDF

In [17]:
# Tokenize the combined product text (name + description + category) into
# lowercase, whitespace-separated words. Punctuation stays attached to its
# token: the preview below shows both "towels" and "towels.".
tokenizer = Tokenizer(
    inputCol="full_product_description",
    outputCol="words",
)
wordData = tokenizer.transform(df)

wordData.show(n=5, truncate=False)

+------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------+
|product_name|product_description                              |category            |full_product_description                                              |words                                                                              |
+------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------+
|Paper Towels|Ultra-absorbent paper towels.                    |Household Essentials|Paper Towels Ultra-absorbent paper towels. Household Essentials       |[paper, towels, ultra-absorbent, paper, towels., household, essentials]            |
|Apples      |Fresh organic apples, 

In [18]:
# Filter stop words out of the token lists. No custom list is passed, so the
# remover's default stop-word list applies.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
filteredData = remover.transform(wordData)

filteredData.show(n=5, truncate=False)

+------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------+-------------------------------------------------------------------------+
|product_name|product_description                              |category            |full_product_description                                              |words                                                                              |filtered                                                                 |
+------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------+-------------------------------------------------------------------------+
|Paper Towels|Ultra-absorbent paper towels.            

In [19]:
# Term frequency: hash the filtered tokens into a 1000-dimensional count vector.
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=1000,
)
featurizedData = hashingTF.transform(filteredData)

featurizedData.show(n=5, truncate=False)

+------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------+-------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|product_name|product_description                              |category            |full_product_description                                              |words                                                                              |filtered                                                                 |rawFeatures                                                                              |
+------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+----------------------------------

In [21]:
# Inverse Document Frequency (IDF): fit the IDF weights on the term-frequency vectors.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)

# TF-IDF: rescale the raw term frequencies with the fitted IDF weights.
rescaledData = idfModel.transform(featurizedData)

rescaledData.show(n=10, truncate=False)

+-------------+-------------------------------------------------+--------------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------+----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_name |product_description                              |category            |full_product_description                                              |words                                                                              |filtered                                                                    |rawFeatures                         

#### Using PCA to Reduce Features

In [22]:
# Dimensionality reduction: project the TF-IDF vectors onto the top 5 principal components.
pca = PCA(
    k=5,
    inputCol="features",
    outputCol="pcaFeatures",
)
pcaModel = pca.fit(rescaledData)
pcaData = pcaModel.transform(rescaledData)

# Preview the reduced vectors next to the text they were derived from.
(
    pcaData
    .select("product_name", "full_product_description", "pcaFeatures")
    .show(n=10, truncate=False)
)

+-------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|product_name |full_product_description                                              |pcaFeatures                                                                                          |
+-------------+----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|Paper Towels |Paper Towels Ultra-absorbent paper towels. Household Essentials       |[0.4227433601435139,-2.8527575735066852,0.7815012058048467,2.851357712791382,0.5166529130973089]     |
|Apples       |Apples Fresh organic apples, rich in fiber and vitamins. Fresh Produce|[-0.2578555688665118,-0.31818330959050434,0.6848604781956911,-1.2392067275061625,0.41110667580394844]|
|Almonds      |Almonds Roasted almonds, a healthy snack