In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

This step computes per-user statistics from the dataset: each user's preferred categories, preferred brands, and the distribution of prices they interact with.

In [None]:
file_path_prefix = "/content/drive/MyDrive/datasets/sampled-data/"

# Monthly extracts in chronological order (Oct 2019 through Apr 2020).
month_tags = ["2019-Oct", "2019-Nov", "2019-Dec", "2020-Jan", "2020-Feb", "2020-Mar", "2020-Apr"]
filename_list = [f"filtered-sampled-cleaned-{tag}.csv" for tag in month_tags]

# The individual names stay defined for any cell that refers to one file directly.
(file1_name, file2_name, file3_name, file4_name,
 file5_name, file6_name, file7_name) = filename_list

In [None]:
from collections import defaultdict
import numpy as np

In [None]:
chunk_size = 1000000

# user_id -> {category level or brand -> accumulated rating}. The inner counter
# is created on first access, so the loop needs no membership checks.
user_preffered_category = defaultdict(lambda: defaultdict(int))
user_preffered_brand = defaultdict(lambda: defaultdict(int))
# str(user_id) -> list of every price seen for that user. The key is kept as a
# string, as before, so later cells see the same keys.
user_price_dict = defaultdict(list)

for filename in filename_list:

  # load dataset in chunks
  chunks = pd.read_csv(file_path_prefix + filename, chunksize=chunk_size)
  print(f"reading {filename}")

  for chunk_index, chunk in enumerate(chunks):
    print(f"-> chunk: {chunk_index+1}")

    # Iterate plain Python values column-wise instead of DataFrame.iterrows(),
    # which builds a Series object for every single row.
    rows = zip(chunk["user_id"].tolist(),
               chunk["category_code"].tolist(),
               chunk["brand"].tolist(),
               chunk["rating"].tolist(),
               chunk["price"].tolist())

    for user_id, category_code, brand, rating, price in rows:
      # calculate user preferred categories: each dot-separated level of the
      # category code is weighted by its 1-based depth, so deeper (more
      # specific) levels receive a larger share of the rating
      category_scores = user_preffered_category[user_id]
      for depth, category in enumerate(category_code.split("."), start=1):
        category_scores[category] += rating * depth

      # calculate user preferred brand
      user_preffered_brand[user_id][brand] += rating

      # collect prices; summarised into statistics in a later cell
      user_price_dict[str(user_id)].append(price)

  print(f"{filename} completed")

reading filtered-sampled-cleaned-2019-Oct.csv
-> chunk: 1
-> chunk: 2
-> chunk: 3
filtered-sampled-cleaned-2019-Oct.csv completed
reading filtered-sampled-cleaned-2019-Nov.csv
-> chunk: 1
-> chunk: 2
-> chunk: 3
-> chunk: 4
-> chunk: 5
filtered-sampled-cleaned-2019-Nov.csv completed
reading filtered-sampled-cleaned-2019-Dec.csv
-> chunk: 1
-> chunk: 2
filtered-sampled-cleaned-2019-Dec.csv completed
reading filtered-sampled-cleaned-2020-Jan.csv
-> chunk: 1
-> chunk: 2
filtered-sampled-cleaned-2020-Jan.csv completed
reading filtered-sampled-cleaned-2020-Feb.csv
-> chunk: 1
-> chunk: 2
filtered-sampled-cleaned-2020-Feb.csv completed
reading filtered-sampled-cleaned-2020-Mar.csv
-> chunk: 1
-> chunk: 2
filtered-sampled-cleaned-2020-Mar.csv completed
reading filtered-sampled-cleaned-2020-Apr.csv
-> chunk: 1
-> chunk: 2
filtered-sampled-cleaned-2020-Apr.csv completed


In [None]:
def calculate_statistics(data):
    """Summarise a sequence of numbers.

    Args:
        data: non-empty sequence (or array) of numeric values.

    Returns:
        dict with the keys "q1", "median", "q3", "mean", "variance" and
        "count". Quartiles come from np.percentile, the variance is the
        population variance (np.var default), and "count" is len(data).
    """
    lower_quartile, upper_quartile = (np.percentile(data, pct) for pct in (25, 75))
    summary = {
        "q1": lower_quartile,
        "median": np.median(data),
        "q3": upper_quartile,
        "mean": np.mean(data),
        "variance": np.var(data),
        "count": len(data),
    }
    return summary

In [None]:
# Per-user price summary: user key -> statistics dict from calculate_statistics.
user_price_stat = {
    user_key: calculate_statistics(prices)
    for user_key, prices in user_price_dict.items()
}

In [None]:
user_preffered_category

defaultdict(int,
            {541078402: defaultdict(int,
                         {'electronics': 6,
                          'smartphone': 12,
                          'computers': 2,
                          'peripherals': 4,
                          'printer': 6,
                          'appliances': 2,
                          'kitchen': 4,
                          'coffee_grinder': 6,
                          'construction': 1,
                          'components': 2,
                          'faucet': 3}),
             541677055: defaultdict(int, {'electronics': 2, 'clocks': 4}),
             515721215: defaultdict(int,
                         {'appliances': 2,
                          'kitchen': 2,
                          'refrigerators': 3,
                          'auto': 1,
                          'accessories': 4,
                          'videoregister': 3,
                          'bag': 4,
                          'electronics': 7,
                 

In [None]:
user_preffered_brand

defaultdict(int,
            {541078402: defaultdict(int,
                         {'samsung': 3, 'apple': 3, 'lucente': 4, 'jade': 1}),
             541677055: defaultdict(int, {'samsung': 1, 'garmin': 1}),
             515721215: defaultdict(int,
                         {'alpicool': 1,
                          'sho-me': 1,
                          'ritmix': 2,
                          'xiaomi': 5,
                          'karcher': 1,
                          'huawei': 2,
                          'salamander': 1,
                          'bosch': 5,
                          'sony': 3,
                          'redmond': 2,
                          'apple': 1,
                          'stagg': 2}),
             539251936: defaultdict(int,
                         {'salamander': 4,
                          'baden': 1,
                          'respect': 2,
                          'vitek': 1}),
             512748185: defaultdict(int,
                         {'oppo': 2

In [None]:
user_price_stat

{'541078402': {'q1': 176.84,
  'median': 358.57,
  'q3': 380.31,
  'mean': 531.1477777777778,
  'variance': 531365.484506173,
  'count': 9},
 '541677055': {'q1': 587.5875,
  'median': 1023.565,
  'q3': 1459.5425,
  'mean': 1023.565,
  'variance': 760305.522025,
  'count': 2},
 '515721215': {'q1': 41.807500000000005,
  'median': 151.97,
  'q3': 186.68,
  'mean': 254.2846153846154,
  'variance': 155072.55784792898,
  'count': 26},
 '539251936': {'q1': 96.785,
  'median': 128.45,
  'q3': 141.9575,
  'mean': 114.97499999999998,
  'variance': 1152.4028916666668,
  'count': 6},
 '512748185': {'q1': 213.09,
  'median': 257.38,
  'q3': 267.04499999999996,
  'mean': 283.21,
  'variance': 17422.970314285714,
  'count': 7},
 '536074778': {'q1': 113.0,
  'median': 113.0,
  'q3': 113.0,
  'mean': 113.0,
  'variance': 0.0,
  'count': 1},
 '551219980': {'q1': 16.7075,
  'median': 25.715,
  'q3': 34.7225,
  'mean': 25.715,
  'variance': 324.54022499999996,
  'count': 2},
 '521143369': {'q1': 691.71499

In [None]:
import csv

output_file_path = file_path_prefix + 'user_preffered_category.csv'
with open(output_file_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header first, then one line per (user, category) pair with its score.
    csv_writer.writerow(['user_id', 'category', 'rating'])
    csv_writer.writerows(
        [user_id, category, rating]
        for user_id, categories in user_preffered_category.items()
        for category, rating in categories.items()
    )

In [None]:
output_file_path = file_path_prefix + 'user_preffered_brand.csv'
with open(output_file_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header first, then one line per (user, brand) pair with its score.
    csv_writer.writerow(['user_id', 'brand', 'rating'])
    csv_writer.writerows(
        [user_id, brand, rating]
        for user_id, brands in user_preffered_brand.items()
        for brand, rating in brands.items()
    )

In [None]:
output_file_path = file_path_prefix + 'user_preffered_price.csv'

# Statistic names in the column order used by the file.
stat_columns = ["q1", "median", "q3", "mean", "variance", "count"]

with open(output_file_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header: the user key followed by the statistic names.
    csv_writer.writerow(['user_id'] + stat_columns)

    # One line per user, statistics in the same order as the header.
    for user_id, stats in user_price_stat.items():
        csv_writer.writerow([user_id] + [stats[name] for name in stat_columns])

In [None]:
# extract user-product-rating

# user input: user embedding [10]
# product input: item embedding [10], most representative category, brand, price [5]
# output: (rating) [1]

In [None]:
!pip install pyspark py4j



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Initialize Spark session
spark = SparkSession.builder.appName("JoinCSVParquet").getOrCreate()

In [None]:
# Read the CSV file into a DataFrame
csv_file_path = "/content/drive/MyDrive/datasets/sampled-data/user-product-rating.csv"
csv_df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Read the Parquet file into a DataFrame
user_parquet_file_path = "/content/drive/MyDrive/spark-model/userFactors.parquet"
user_parquet_df = spark.read.parquet(user_parquet_file_path)

product_parquet_file_path = "/content/drive/MyDrive/spark-model/itemFactors.parquet"
product_parquet_df = spark.read.parquet(product_parquet_file_path)

In [None]:
csv_df.show(5)

+--------+----------+------+
| user_id|product_id|rating|
+--------+----------+------+
|12511517| 100042492|     1|
|29515875|   1801638|     1|
|31198833|   1003549|     1|
|34526405|  18600003|     1|
|34916060|  12600007|     1|
+--------+----------+------+
only showing top 5 rows



In [None]:
user_parquet_df.show(5)

+---------+--------------------+
|       id|            features|
+---------+--------------------+
| 80970791|[-0.13640156, -0....|
| 95036441|[0.29266906, 0.64...|
|122003351|[1.100352, 0.0836...|
|126150271|[0.16644049, -0.4...|
|145970761|[0.26003927, -0.3...|
+---------+--------------------+
only showing top 5 rows



In [None]:
product_parquet_df.show(5)

+-------+--------------------+
|     id|            features|
+-------+--------------------+
|1001402|[0.4387882, 0.243...|
|1002042|[-0.08665615, 0.5...|
|1002062|[0.0031371613, 0....|
|1002102|[0.2362063, -0.04...|
|1002482|[0.060402397, 0.2...|
+-------+--------------------+
only showing top 5 rows



In [None]:
joined_df = csv_df.join(user_parquet_df, csv_df.user_id == user_parquet_df.id, "inner")

In [None]:
joined_df.show(5)

+---------+----------+------+---------+--------------------+
|  user_id|product_id|rating|       id|            features|
+---------+----------+------+---------+--------------------+
| 31198833|   1003549|     1| 31198833|[0.21142678, -0.6...|
| 82098029|  32801015|     1| 82098029|[0.55766875, -0.2...|
| 82098029|  34700097|     1| 82098029|[0.55766875, -0.2...|
|110760953| 100019316|     1|110760953|[-0.10180759, -0....|
|110760953| 100045147|     1|110760953|[-0.10180759, -0....|
+---------+----------+------+---------+--------------------+
only showing top 5 rows



In [None]:
joined_df = joined_df.withColumnRenamed("features", "user_embedding")
joined_df.show()

+---------+----------+------+---------+--------------------+
|  user_id|product_id|rating|       id|      user_embedding|
+---------+----------+------+---------+--------------------+
| 31198833|   1003549|     1| 31198833|[0.21142678, -0.6...|
| 82098029|  32801015|     1| 82098029|[0.55766875, -0.2...|
| 82098029|  34700097|     1| 82098029|[0.55766875, -0.2...|
|110760953| 100019316|     1|110760953|[-0.10180759, -0....|
|110760953| 100045147|     1|110760953|[-0.10180759, -0....|
|116566414|   2701022|     1|116566414|[-0.15877374, 0.3...|
|116566414|   5000096|     1|116566414|[-0.15877374, 0.3...|
|116566414|   5000776|     1|116566414|[-0.15877374, 0.3...|
|122384079|   4900173|     1|122384079|[0.87699986, 0.13...|
|122384079|   4900383|     1|122384079|[0.87699986, 0.13...|
|125917727|  15200473|     1|125917727|[0.11588339, 0.09...|
|127670265| 100051486|     1|127670265|[0.15974428, 0.19...|
|134184564|  15200987|     1|134184564|[0.13365045, 0.03...|
|138365902|  16300033|  

In [None]:
joined_df = joined_df.join(product_parquet_df, csv_df.product_id == product_parquet_df.id, "inner")

In [None]:
joined_df = joined_df.withColumnRenamed("features", "product_embedding")
joined_df.show()

+---------+----------+------+---------+--------------------+---------+--------------------+
|  user_id|product_id|rating|       id|      user_embedding|       id|   product_embedding|
+---------+----------+------+---------+--------------------+---------+--------------------+
| 31198833|   1003549|     1| 31198833|[0.21142678, -0.6...|  1003549|[0.043348957, -0....|
| 82098029|  32801015|     1| 82098029|[0.55766875, -0.2...| 32801015|[0.39218187, -0.1...|
| 82098029|  34700097|     1| 82098029|[0.55766875, -0.2...| 34700097|[0.24300474, 0.05...|
|110760953| 100019316|     1|110760953|[-0.10180759, -0....|100019316|[-0.55461156, 0.4...|
|110760953| 100045147|     1|110760953|[-0.10180759, -0....|100045147|[0.60582936, -1.1...|
|116566414|   2701022|     1|116566414|[-0.15877374, 0.3...|  2701022|[-0.648407, 0.262...|
|116566414|   5000096|     1|116566414|[-0.15877374, 0.3...|  5000096|[-0.010925028, 0....|
|116566414|   5000776|     1|116566414|[-0.15877374, 0.3...|  5000776|[0.3724867

In [None]:
result_df = joined_df.drop("id")

In [None]:
result_df.show()

+---------+----------+------+--------------------+--------------------+
|  user_id|product_id|rating|      user_embedding|   product_embedding|
+---------+----------+------+--------------------+--------------------+
| 31198833|   1003549|     1|[0.21142678, -0.6...|[0.043348957, -0....|
| 82098029|  32801015|     1|[0.55766875, -0.2...|[0.39218187, -0.1...|
| 82098029|  34700097|     1|[0.55766875, -0.2...|[0.24300474, 0.05...|
|110760953| 100019316|     1|[-0.10180759, -0....|[-0.55461156, 0.4...|
|110760953| 100045147|     1|[-0.10180759, -0....|[0.60582936, -1.1...|
|116566414|   2701022|     1|[-0.15877374, 0.3...|[-0.648407, 0.262...|
|116566414|   5000096|     1|[-0.15877374, 0.3...|[-0.010925028, 0....|
|116566414|   5000776|     1|[-0.15877374, 0.3...|[0.3724867, 0.576...|
|122384079|   4900173|     1|[0.87699986, 0.13...|[0.4079314, 0.733...|
|122384079|   4900383|     1|[0.87699986, 0.13...|[0.9032859, -0.35...|
|125917727|  15200473|     1|[0.11588339, 0.09...|[1.2610983, 1.

In [None]:
# Number of components read from each embedding array.
num_elements = 10

# Expand both embedding arrays into scalar columns named <prefix>1..<prefix>N
# (user columns first, then product columns). All columns are added in one
# withColumns projection instead of one withColumn call per column, which
# would stack a separate projection onto the query plan for every call.
# NOTE(review): DataFrame.withColumns needs PySpark >= 3.3 - confirm the
# installed version.
result_df = result_df.withColumns({
    f"{prefix}{i + 1}": col(prefix)[i]
    for prefix in ("user_embedding", "product_embedding")
    for i in range(num_elements)
})

result_df.show()

+---------+----------+------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  user_id|product_id|rating|      user_embedding|   product_embedding|user_embedding1|user_embedding2|user_embedding3|user_embedding4|user_embedding5|user_embedding6|user_embedding7|user_embedding8|user_embedding9|user_embedding10|product_embedding1|product_embedding2|product_embedding3|product_embedding4|product_embedding5|product_embedding6|product_embedding7|product_embedding8|product_embedding9|product_embedding10|
+---------+----------+------+--------------------+--------------------+---------------+---------------+---------------+---------------+---------------+-

In [None]:
result_df = result_df.drop("user_embedding", "product_embedding")
result_df.show()

+---------+----------+------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  user_id|product_id|rating|user_embedding1|user_embedding2|user_embedding3|user_embedding4|user_embedding5|user_embedding6|user_embedding7|user_embedding8|user_embedding9|user_embedding10|product_embedding1|product_embedding2|product_embedding3|product_embedding4|product_embedding5|product_embedding6|product_embedding7|product_embedding8|product_embedding9|product_embedding10|
+---------+----------+------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+------------------+------------------+--------

In [None]:
# Write the result to a new CSV file
output_csv_path = "/content/drive/MyDrive/datasets/sampled-data/embedding-user-product-rating"
result_df.coalesce(1).write.csv(output_csv_path, header=True, mode="overwrite")

# Merge User Embedding into Data Frame

In [None]:
user_price_csv = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/user_preffered_price.csv")
product_category_price_csv = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/sampled-product-dataset.csv")

In [None]:
import glob

# The Spark job writes a directory holding a single part file whose name
# contains a generated identifier (e.g. part-00000-e1d68937-...-c000.csv).
# Hard-coding that name breaks once the export is re-run, so find the file.
embedding_dir = "/content/drive/MyDrive/datasets/sampled-data/embedding-user-product-rating"
part_files = sorted(glob.glob(embedding_dir + "/part-*.csv"))
if len(part_files) != 1:
    raise FileNotFoundError(
        f"expected exactly one part-*.csv file in {embedding_dir}, found {len(part_files)}"
    )
embedding_csv = part_files[0]

user_embedding_csv = pd.read_csv(embedding_csv)

In [None]:
user_price_csv.head(5)

Unnamed: 0,user_id,q1,median,q3,mean,variance,count
0,541078402,176.84,358.57,380.31,531.147778,531365.484506,9
1,541677055,587.5875,1023.565,1459.5425,1023.565,760305.522025,2
2,515721215,41.8075,151.97,186.68,254.284615,155072.557848,26
3,539251936,96.785,128.45,141.9575,114.975,1152.402892,6
4,512748185,213.09,257.38,267.045,283.21,17422.970314,7


In [None]:
product_category_price_csv.head(5)

Unnamed: 0,product_id,category_id,category_code,brand,price
0,1004858,2053013555631882655,electronics.smartphone,samsung,134.41
1,5100719,2053013553341792533,electronics.clocks,samsung,151.61
2,46800027,2110187395394568257,appliances.kitchen.refrigerators,alpicool,668.93
3,28719200,2053013565639492569,apparel.shoes,salamander,146.46
4,1004992,2053013555631882655,electronics.smartphone,oppo,257.38


In [None]:
merged_df = pd.merge(user_embedding_csv, user_price_csv, on="user_id", how="inner")

In [None]:
merged_df = merged_df.drop(columns=["count"])
merged_df.head(5)

Unnamed: 0,user_id,product_id,rating,user_embedding1,user_embedding2,user_embedding3,user_embedding4,user_embedding5,user_embedding6,user_embedding7,...,product_embedding6,product_embedding7,product_embedding8,product_embedding9,product_embedding10,q1,median,q3,mean,variance
0,31198833,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.24148,-0.114664,0.096166,-0.11463,-0.167803,411.59,411.59,411.59,411.59,0.0
1,82098029,32801015,1,0.557669,-0.200036,-0.25295,0.364029,-0.343561,-0.541339,0.648282,...,-0.393886,0.376065,-0.182886,0.035876,-0.179808,137.02,154.63,172.24,154.63,1240.4484
2,82098029,34700097,1,0.557669,-0.200036,-0.25295,0.364029,-0.343561,-0.541339,0.648282,...,-0.189106,0.565716,-0.277306,0.221186,0.153996,137.02,154.63,172.24,154.63,1240.4484
3,110760953,100019316,1,-0.101808,-0.054436,0.320441,0.372512,0.293968,-0.158318,-0.082287,...,-0.158848,-0.332332,0.00665,0.05209,-0.913521,198.435,268.17,337.905,268.17,19451.8809
4,110760953,100045147,1,-0.101808,-0.054436,0.320441,0.372512,0.293968,-0.158318,-0.082287,...,-0.406273,0.267485,-0.48765,-0.968052,-0.371113,198.435,268.17,337.905,268.17,19451.8809


In [None]:
merged_df.to_csv("/content/drive/MyDrive/datasets/sampled-data/price-embedding-user-product-rating.csv", index=False, header=True)

In [None]:
del user_price_csv

# Merge Product Details into Data Frame

In [None]:
merged_df = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/price-embedding-user-product-rating.csv")
product_category_price_csv = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/sampled-product-dataset.csv")

In [None]:
product_category_price_csv.head(5)

Unnamed: 0,product_id,category_id,category_code,brand,price
0,1004858,2053013555631882655,electronics.smartphone,samsung,134.41
1,5100719,2053013553341792533,electronics.clocks,samsung,151.61
2,46800027,2110187395394568257,appliances.kitchen.refrigerators,alpicool,668.93
3,28719200,2053013565639492569,apparel.shoes,salamander,146.46
4,1004992,2053013555631882655,electronics.smartphone,oppo,257.38


In [None]:
merged_df = pd.merge(merged_df, product_category_price_csv, on="product_id", how="inner")

In [None]:
merged_df.head(5)

Unnamed: 0,user_id,product_id,rating,user_embedding1,user_embedding2,user_embedding3,user_embedding4,user_embedding5,user_embedding6,user_embedding7,...,product_embedding10,q1,median,q3,mean,variance,category_id,category_code,brand,price
0,31198833,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.167803,411.59,411.59,411.59,411.59,0.0,2053013555631882655,electronics.smartphone,samsung,344.64
1,363053807,1003549,1,0.508616,-0.018628,0.125026,0.910796,0.473574,-1.552954,-0.348487,...,-0.167803,295.7175,312.035,328.3525,312.035,1065.043225,2053013555631882655,electronics.smartphone,samsung,344.64
2,443084985,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.167803,344.9,344.9,344.9,344.9,0.0,2053013555631882655,electronics.smartphone,samsung,344.64
3,445996152,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.167803,360.11,360.11,360.11,360.11,0.0,2053013555631882655,electronics.smartphone,samsung,344.64
4,483453847,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.167803,411.59,411.59,411.59,411.59,0.0,2053013555631882655,electronics.smartphone,samsung,344.64


In [None]:
merged_df.size

405457920

In [None]:
merged_df = merged_df.drop(columns=["category_code"])
merged_df.head(5)

Unnamed: 0,user_id,product_id,rating,user_embedding1,user_embedding2,user_embedding3,user_embedding4,user_embedding5,user_embedding6,user_embedding7,...,product_embedding9,product_embedding10,q1,median,q3,mean,variance,category_id,brand,price
0,31198833,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.11463,-0.167803,411.59,411.59,411.59,411.59,0.0,2053013555631882655,samsung,344.64
1,363053807,1003549,1,0.508616,-0.018628,0.125026,0.910796,0.473574,-1.552954,-0.348487,...,-0.11463,-0.167803,295.7175,312.035,328.3525,312.035,1065.043225,2053013555631882655,samsung,344.64
2,443084985,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.11463,-0.167803,344.9,344.9,344.9,344.9,0.0,2053013555631882655,samsung,344.64
3,445996152,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.11463,-0.167803,360.11,360.11,360.11,360.11,0.0,2053013555631882655,samsung,344.64
4,483453847,1003549,1,0.211427,-0.60073,-0.093796,1.013287,0.544058,-1.177774,-0.559255,...,-0.11463,-0.167803,411.59,411.59,411.59,411.59,0.0,2053013555631882655,samsung,344.64


In [None]:
merged_df.to_csv("/content/drive/MyDrive/datasets/sampled-data/dnn-recommendation-dataset.csv", index=False, header=True)

# Data Preprocessing

The categorical columns (`user_id`, `product_id`, `category_id`, `brand`) are mapped to integer indices to reduce computational cost and processing time.

In [None]:
chunk_size = 10000

# Columns to load: identifiers and target, the ten user and ten product
# embedding components, the per-user price statistics, and product attributes.
selected_columns = (
    ['user_id', 'product_id', 'rating']
    + [f'user_embedding{k}' for k in range(1, 11)]
    + [f'product_embedding{k}' for k in range(1, 11)]
    + ['q1', 'median', 'q3', 'mean', 'variance']
    + ['category_id', 'brand', 'price']
)

dataset = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/dnn-recommendation-dataset.csv",
                      chunksize=chunk_size,
                      usecols = selected_columns
                      )

In [None]:
# input
# user: user_embedding 1 - 10,
# product: product_embedding 1 - 10, q1, median, q3, mean, variance, category_id, brand, price

In [None]:
user_id_index_dict = {}
product_id_index_dict = {}
category_id_index_dict = {}
brand_index_dict = {}
columns_to_handle = ["user_id", "product_id", "category_id", "brand"]

In [None]:
# extracting values to form lists
# Each distinct value of a categorical column gets the next free integer index,
# in order of first appearance. Only the distinct values of each chunk are
# visited (Series.unique keeps appearance order), which yields the same
# indices as a row-by-row scan without iterating over every row.
index_dict_by_column = {
  "user_id": user_id_index_dict,
  "product_id": product_id_index_dict,
  "category_id": category_id_index_dict,
  "brand": brand_index_dict,
}

for chunk in dataset:
  for column, index_dict in index_dict_by_column.items():
    # tolist() converts NumPy scalars to plain Python values for use as keys
    for value in chunk[column].unique().tolist():
      if value not in index_dict:
        index_dict[value] = len(index_dict)

In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/dnn-recommendation-dataset.csv",
                      chunksize=chunk_size,
                      usecols = selected_columns
                      )

In [None]:
# applying index for each column

import csv

output_file = "/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/cleaned-dnn-recommendation-dataset.csv"

# Output column order: the loaded columns followed by the four index columns.
field_names = (['user_id', 'product_id', 'rating']
               + [f'user_embedding{k}' for k in range(1, 11)]
               + [f'product_embedding{k}' for k in range(1, 11)]
               + ['q1', 'median', 'q3', 'mean', 'variance',
                  'category_id', 'brand', 'price',
                  "user_id_idx", "product_id_idx", "category_id_idx", "brand_idx"])

# Source column -> lookup table that provides its "<column>_idx" value.
index_dict_by_column = {
  "user_id": user_id_index_dict,
  "product_id": product_id_index_dict,
  "category_id": category_id_index_dict,
  "brand": brand_index_dict,
}

# Opened with 'w' rather than 'a': in append mode every re-run of this cell
# added another header line and another full copy of the data to the file.
with open(output_file, 'w', newline='') as output_csv:

  csv_writer = csv.writer(output_csv)
  csv_writer.writerow(field_names)

  for chunk in dataset:
    # Look the indices up one column at a time; a value missing from its
    # lookup table raises KeyError, exactly as a per-row lookup would.
    indexed_chunk = chunk.assign(**{
      f"{column}_idx": [index_dict[value] for value in chunk[column].tolist()]
      for column, index_dict in index_dict_by_column.items()
    })
    # Rows are emitted as plain tuples in the header's column order.
    csv_writer.writerows(indexed_chunk[field_names].itertuples(index=False, name=None))

  print(f"-> training data preparation completed")


In [None]:
for chunk in dataset:
  print(chunk.columns.tolist())
  print(len(chunk))
  break

['user_id', 'product_id', 'rating', 'user_embedding1', 'user_embedding2', 'user_embedding3', 'user_embedding4', 'user_embedding5', 'user_embedding6', 'user_embedding7', 'user_embedding8', 'user_embedding9', 'user_embedding10', 'product_embedding1', 'product_embedding2', 'product_embedding3', 'product_embedding4', 'product_embedding5', 'product_embedding6', 'product_embedding7', 'product_embedding8', 'product_embedding9', 'product_embedding10', 'q1', 'median', 'q3', 'mean', 'variance', 'category_id', 'brand', 'price']
10000


# Model Design

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [None]:
tf.__version__

'2.15.0'

In [None]:
chunk_size = 1000000
cleaned_dataset_path = "/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/cleaned-dnn-recommendation-dataset.csv"

# The cell that produces this file writes a header line, while the cells below
# address columns by integer position (header=None). Skip the header line when
# it is present so it is not parsed as a data row; a file without a header is
# read exactly as before.
with open(cleaned_dataset_path, newline='') as cleaned_file:
    has_header = cleaned_file.readline().startswith("user_id")

chunks = pd.read_csv(cleaned_dataset_path,
                     chunksize=chunk_size,
                     header=None,
                     skiprows=1 if has_header else 0)

In [None]:
# [0'user_id',1'product_id',2'rating',3'user_embedding1',4'user_embedding2',5'user_embedding3',6'user_embedding4',7'user_embedding5',8'user_embedding6',9'user_embedding7',
# 10'user_embedding8',11'user_embedding9',12'user_embedding10',13'product_embedding1',14'product_embedding2',15'product_embedding3',16'product_embedding4',17'product_embedding5',18'product_embedding6',19'product_embedding7',
# 20'product_embedding8',21'product_embedding9',22'product_embedding10',23'q1', 24'median', 25'q3', 26'mean', 27'variance',28'category_id', 29'brand',
# 30'price',31"user_id_idx",32"product_id_idx",33"category_id_idx",34"brand_idx"]

**Column Description**

0. user_id
1. product_id
2. rating
3. user_embedding1
4. user_embedding2
5. user_embedding3
6. user_embedding4
7. user_embedding5
8. user_embedding6
9. user_embedding7
10. user_embedding8
11. user_embedding9
12. user_embedding10
13. product_embedding1
14. product_embedding2
15. product_embedding3
16. product_embedding4
17. product_embedding5
18. product_embedding6
19. product_embedding7
20. product_embedding8
21. product_embedding9
22. product_embedding10
23. q1
24. median
25. q3
26. mean
27. variance
28. category_id
29. brand
30. price
31. user_id_idx
32. product_id_idx
33. category_id_idx
34. brand_idx



In [None]:
# looping through the chunks to find the min max values

# Smallest / largest rating (column 2) seen in the sampled rows.
global_min = None
global_max = None

# Per-chunk samples are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame every time.
sampled_parts = []

for chunk_index, chunk in enumerate(chunks):

  # Keep half of each chunk. The seed depends on the chunk position so the
  # sample is reproducible across runs.
  sampled_chunk = chunk.sample(frac=0.5, random_state=42 + chunk_index)
  if sampled_chunk.empty:
    continue

  min_rating = sampled_chunk[2].min()
  max_rating = sampled_chunk[2].max()

  if global_min is None or min_rating < global_min: global_min = min_rating
  if global_max is None or max_rating > global_max: global_max = max_rating

  # Every chunk contributes its sample, the first one included (it used to be
  # consumed for the min/max only and then skipped).
  sampled_parts.append(sampled_chunk)

sampled_df = pd.concat(sampled_parts, ignore_index=True) if sampled_parts else pd.DataFrame()

print(f"min: {global_min}, max: {global_max}")

min: 1, max: 253


In [None]:
sampled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,512380048,1004249,6,-4.987651,0.222951,-1.003102,4.013586,1.697001,-0.616291,3.673216,-0.371695,-3.421739,-1.276262,-0.441914,-0.030003,-0.287731,0.321831,0.095122,-0.154138,0.148423,-0.134409,-0.192603,-0.109458,739.0400,739.210,739.2100,765.687778,5366.713240,2053013555631882655,apple,733.89,1033351,942,0,71
1,521213773,1004659,2,0.684224,0.358782,-1.057332,1.310991,0.978886,-1.721648,0.288140,-1.407572,-0.501258,-1.422345,-0.059965,0.170683,-0.329993,0.143163,-0.102422,-0.117643,-0.043494,-0.347111,0.062951,-0.231157,81.0650,279.110,463.7300,295.180256,49537.511551,2053013555631882655,samsung,787.08,10928,833,0,0
2,540451444,1004856,1,-0.055376,-0.027079,-0.565257,0.490118,-0.498776,-0.496873,-0.534287,-0.044980,-0.842681,-0.549805,-0.022906,-0.011201,-0.233817,0.202736,-0.206318,-0.205530,-0.221007,-0.018606,-0.348573,-0.227426,131.0200,131.020,131.0200,131.020000,0.000000,2053013555631882655,samsung,130.70,1346079,1138,0,0
3,535610357,1004856,1,-0.533492,0.443116,-0.292479,0.708290,0.954507,-1.504061,-0.398331,-0.300297,-0.945521,-0.934270,-0.022906,-0.011201,-0.233817,0.202736,-0.206318,-0.205530,-0.221007,-0.018606,-0.348573,-0.227426,142.3250,172.230,215.5150,196.335714,7878.724024,2053013555631882655,samsung,130.70,72013,1138,0,0
4,530192037,100119370,1,-3.171978,2.518868,1.023160,0.011114,-0.286361,0.021201,0.374223,2.404803,-0.518048,-1.502740,-0.027533,0.115755,0.104660,0.005433,0.094579,-0.130613,0.272036,-0.015397,0.179639,-0.350931,17.9500,283.140,1247.6500,600.623333,320899.078511,2232732093077520756,samsung,1029.34,1118075,895,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5835275,558097670,100142442,1,0.168577,-0.422955,-1.018468,0.121391,-0.214414,-0.511140,-0.907451,0.158392,-0.139942,-0.674009,-0.629219,-0.462969,-0.159120,1.057128,-0.842543,0.532155,-0.909335,-0.588942,-0.389877,0.414926,19.8375,51.460,137.8150,124.029333,49738.930795,2085718636156158307,stanley,27.69,179312,95504,724,110
5835276,512959666,13903102,1,0.131537,1.093951,0.252410,0.349420,0.044337,-2.515295,-0.737434,-0.280736,0.166731,-0.854856,0.039422,-0.532190,1.014249,0.252563,0.392235,-0.446651,-0.269843,-0.405452,0.530185,0.486147,118.6950,366.675,458.6150,376.245000,102631.859342,2053013557343158789,grohe,92.55,12381,150143,551,1129
5835277,526078965,100024424,2,-0.082128,1.552372,0.457269,0.351833,0.223436,-0.141488,0.393145,0.048412,-0.752455,0.250501,-0.642777,0.485268,0.685840,-0.064138,1.388168,0.844786,-0.085675,1.060635,-1.004483,-0.309436,347.2400,388.430,430.2550,355.548571,11912.410869,2232732110022509290,rossignol,437.33,3615269,158800,731,446
5835278,543678517,17700784,3,0.058537,1.405534,1.027430,-0.194949,1.338806,-0.056467,1.373643,-1.485485,-1.129754,-0.455268,0.572662,0.007508,1.753722,1.069586,-0.177754,0.584967,0.822083,-0.312285,0.281796,-0.730343,10.8100,11.195,57.7575,97.075000,29891.756000,2053013558861496931,bioderma,10.81,702618,99120,332,256


In [None]:
# # looping through the chunks again to normalise "rating" column

# chunks = pd.read_csv("/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/cleaned-dnn-recommendation-dataset.csv", chunksize=chunk_size, header=None)

# for _, chunk in enumerate(chunks):
#   sampled_chunk = chunk.sample(frac=0.5)
#   merged_df = pd.concat([result_df, sampled_chunk], ignore_index=True)

In [None]:
# normalise the sampled dataframe
# Min-max scale the rating (column 2) using the range found while sampling.
sampled_df[2] = (sampled_df[2] - global_min) / (global_max - global_min)

In [None]:
# Feature columns by position: 3-27 and 30-34. Positions 0-1, 28 and 29 are
# not used as features; position 2 (rating) is the target.
feature_columns = list(range(3, 28)) + list(range(30, 35))
X = sampled_df[feature_columns]
y = sampled_df[[2]]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the first five feature rows (five is the default for head()).
X.head()

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,30,31,32,33,34
0,-4.987651,0.222951,-1.003102,4.013586,1.697001,-0.616291,3.673216,-0.371695,-3.421739,-1.276262,-0.441914,-0.030003,-0.287731,0.321831,0.095122,-0.154138,0.148423,-0.134409,-0.192603,-0.109458,739.04,739.21,739.21,765.687778,5366.71324,733.89,1033351,942,0,71
1,0.684224,0.358782,-1.057332,1.310991,0.978886,-1.721648,0.28814,-1.407572,-0.501258,-1.422345,-0.059965,0.170683,-0.329993,0.143163,-0.102422,-0.117643,-0.043494,-0.347111,0.062951,-0.231157,81.065,279.11,463.73,295.180256,49537.511551,787.08,10928,833,0,0
2,-0.055376,-0.027079,-0.565257,0.490118,-0.498776,-0.496873,-0.534287,-0.04498,-0.842681,-0.549805,-0.022906,-0.011201,-0.233817,0.202736,-0.206318,-0.20553,-0.221007,-0.018606,-0.348573,-0.227426,131.02,131.02,131.02,131.02,0.0,130.7,1346079,1138,0,0
3,-0.533492,0.443116,-0.292479,0.70829,0.954507,-1.504061,-0.398331,-0.300297,-0.945521,-0.93427,-0.022906,-0.011201,-0.233817,0.202736,-0.206318,-0.20553,-0.221007,-0.018606,-0.348573,-0.227426,142.325,172.23,215.515,196.335714,7878.724024,130.7,72013,1138,0,0
4,-3.171978,2.518868,1.02316,0.011114,-0.286361,0.021201,0.374223,2.404803,-0.518048,-1.50274,-0.027533,0.115755,0.10466,0.005433,0.094579,-0.130613,0.272036,-0.015397,0.179639,-0.350931,17.95,283.14,1247.65,600.623333,320899.078511,1029.34,1118075,895,48,0


In [None]:
def get_model():
  """Build the (uncompiled) two-tower rating model.

  The model takes 29 inputs, each of shape ``(1,)``, in this order:
  ``user_embedding1`` .. ``user_embedding10``,
  ``product_embedding1`` .. ``product_embedding10``,
  ``q1``, ``median``, ``q3``, ``mean``, ``variance``, ``price``,
  ``product_id_idx``, ``category_id_idx``, ``brand_idx``.

  The ten user inputs are concatenated and passed through
  Dense(32, relu) -> Dense(8, relu, L2 0.01) (layer name ``user_embedding``).
  The other nineteen inputs go through an identical stack (layer name
  ``product_embedding``). The dot product of the two 8-wide vectors is fed to
  a single sigmoid unit.

  Returns:
    A ``keras.models.Model`` with the 29 inputs above and one output of
    shape ``(batch, 1)``.
  """
  def scalar_input(name):
    # Every feature enters the network as its own one-value input.
    return keras.layers.Input(shape=(1,), name=name)

  user_inputs = [scalar_input(f'user_embedding{i}') for i in range(1, 11)]

  product_inputs = [scalar_input(f'product_embedding{i}') for i in range(1, 11)]
  product_inputs += [
      scalar_input(name)
      for name in ('q1', 'median', 'q3', 'mean', 'variance', 'price',
                   'product_id_idx', 'category_id_idx', 'brand_idx')
  ]

  # User tower.
  user_vector = tf.keras.layers.concatenate(user_inputs)
  user_vector = layers.Dense(32, activation='relu')(user_vector)
  user_vector = layers.Dense(8, activation='relu', name="user_embedding",
                             kernel_regularizer=tf.keras.regularizers.l2(0.01))(user_vector)

  # Product tower.
  product_vector = tf.keras.layers.concatenate(product_inputs)
  product_vector = layers.Dense(32, activation='relu')(product_vector)
  product_vector = layers.Dense(8, activation='relu', name='product_embedding',
                                kernel_regularizer=tf.keras.regularizers.l2(0.01))(product_vector)

  # Dot(axes=1) yields a (batch, 1) tensor: the same value as
  # reduce_sum(user * product, axis=1) followed by expand_dims, but expressed
  # as a Keras layer rather than raw TensorFlow ops on symbolic tensors.
  dot_user_product = layers.Dot(axes=1)([user_vector, product_vector])

  output = layers.Dense(1, activation='sigmoid')(dot_user_product)

  # Input order here is the order callers must use when passing a list of arrays.
  return keras.models.Model(inputs=user_inputs + product_inputs,
                            outputs=[output])

In [None]:
# Build the network and configure training: RMSprop minimising mean squared error.
# NOTE(review): 'accuracy' is tracked although the target is a normalised
# rating, so it is probably not a meaningful metric here — confirm.
model = get_model()
model.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['accuracy', 'mae'],
)

# Model Training

In [None]:
# The model takes one array per input, in the order declared in get_model().
# Column 31 is present in X_train but is not a model input, so it is skipped.
model_input_columns = list(range(3, 28)) + [30, 32, 33, 34]
fit_x_train = [X_train[column] for column in model_input_columns]

print(type(X_train))
print(type(fit_x_train))

<class 'pandas.core.frame.DataFrame'>
<class 'list'>


In [None]:
from datetime import datetime

# Timestamped sub-directory so each run gets its own TensorBoard log folder.
TIMESTAMP = "{0:%Y-%m-%dt%H-%M-%S/}".format(datetime.now())

# The path previously began with ",/" (a typo for "./"), which put the logs
# inside a directory literally named ",".
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs/logs_" + TIMESTAMP)

In [None]:
# Train for 5 epochs with mini-batches of 32, logging to TensorBoard.
fit_options = dict(
    batch_size=32,
    epochs=5,
    verbose=1,
    callbacks=[tensorboard_callback],
)
history = model.fit(fit_x_train, y_train, **fit_options)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Persist the trained model to Google Drive.
model.save("/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/model")

# Model Prediction

In [None]:
# Test-set features: columns 3-27 and 30-34, the same selection used to build X.
test_feature_columns = list(range(3, 28)) + list(range(30, 35))
inputs = X_test[test_feature_columns]

In [None]:
# Predict on the test set, passing one column per model input
# (column 31 is not a model input and is skipped).
predict_columns = list(range(3, 28)) + [30, 32, 33, 34]
test_res = model.predict([inputs[column] for column in predict_columns])



In [None]:
# Show the raw predictions returned by model.predict() for the test inputs.
test_res

array([[0.0000000e+00],
       [0.0000000e+00],
       [0.0000000e+00],
       ...,
       [2.4400456e-06],
       [0.0000000e+00],
       [8.7514163e-31]], dtype=float32)

In [None]:
# Per-epoch values recorded by model.fit().
fit_history = history.history
training_loss, training_accuracy = fit_history['loss'], fit_history['accuracy']

print(f'Training Loss: {training_loss}')
print(f'Training Accuracy: {training_accuracy}')

Training Loss: [0.0009605138911865652, 1.2872248589701485e-05, 1.2268315913388506e-05, 1.2250159670657013e-05, 1.2278281246835832e-05]
Training Accuracy: [0.8644872307777405, 0.8644872307777405, 0.8644872307777405, 0.8644872307777405, 0.8644872307777405]


In [None]:
# Evaluate on the held-out set. The result list starts with the loss, followed
# by the compiled metrics in order, so index 1 is 'accuracy'.
eval_columns = list(range(3, 28)) + [30, 32, 33, 34]
evaluation_result = model.evaluate(
    x=[inputs[column] for column in eval_columns],
    y=y_test,
)
print("Test Loss:", evaluation_result[0])
print("Test Accuracy:", evaluation_result[1])

Test Loss: 1.1896421710844152e-05
Test Accuracy: 0.8645197749137878


In [None]:
from tensorflow.keras import backend as K

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed with Keras backend ops over whatever tensors are passed in.
    NOTE(review): when used as a Keras metric this is presumably evaluated
    per batch and then averaged, which is not the RMSE of the whole dataset
    — confirm.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Re-compile only to swap the reported metric; the weights are untouched.
# A function metric is averaged across batches, and the mean of per-batch
# RMSEs is not the dataset RMSE (the previously recorded output showed an
# RMSE that was not the square root of the reported MSE). The stateful
# RootMeanSquaredError metric accumulates squared error over all batches and
# takes the square root once at the end.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')])

# One array per model input, in get_model() order (column 31 is not an input).
rmse_eval_columns = list(range(3, 28)) + [30, 32, 33, 34]
evaluation_result = model.evaluate(
    x=[inputs[column] for column in rmse_eval_columns],
    y=y_test,
)
print("Test Loss:", evaluation_result[0])
print("Test RMSE:", evaluation_result[1])

Test Loss: 1.127727227867581e-05
Test RMSE: 0.002750461921095848


# DNN Recommendation

steps:
1. import model
2. select a user
3. get the user embedding
4. for each product, feed the user embedding together with that product's embedding into the model to predict a rating
5. store the predicted rating into a dataframe: product_id, rating
6. sort the rating in descending order
7. filter out the products already purchased by that user
8. display the top 20 products

In [None]:
from tensorflow.keras.models import load_model
# Reload the model from the same path it was saved to after training.
recommendation_model = load_model("/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/model")

In [None]:
# The user to generate recommendations for.
selected_user_id = 567950899

In [None]:
# Load the user-embedding table (no header row; column 0 is compared against
# the user id), keep only the selected user's row(s), then release the table.
user_embedding = pd.read_csv(
    "/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/tensorflow-user-embedding.csv",
    header=None,
)
is_selected_user = user_embedding[0] == selected_user_id
selected_user_embedding = user_embedding.loc[is_selected_user]
del user_embedding

In [None]:
# Show the embedding row(s) found for the selected user.
selected_user_embedding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
1350361,567950899,1.745698,-0.37917,-1.544525,-0.567893,0.872955,-3.845304,3.077188,0.793373,-1.694098,-3.356388,1350361


In [None]:
# [0'user_id',1'product_id',2'rating',3'user_embedding1',4'user_embedding2',5'user_embedding3',6'user_embedding4',7'user_embedding5',8'user_embedding6',9'user_embedding7',
# 10'user_embedding8',11'user_embedding9',12'user_embedding10',13'product_embedding1',14'product_embedding2',15'product_embedding3',16'product_embedding4',17'product_embedding5',18'product_embedding6',19'product_embedding7',
# 20'product_embedding8',21'product_embedding9',22'product_embedding10',23'q1', 24'median', 25'q3', 26'mean', 27'variance',28'category_id', 29'brand',
# 30'price',31"user_id_idx",32"product_id_idx",33"category_id_idx",34"brand_idx"]

In [None]:
# Load the product-embedding table (no header row) and preview its first five rows.
product_embedding = pd.read_csv(
    "/content/drive/MyDrive/datasets/sampled-data/tensorflow-data/tensorflow-product-embedding.csv",
    header=None,
)
product_embedding.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1003549,0.043349,-0.123168,-0.019231,0.207755,0.111549,-0.24148,-0.114664,0.096166,-0.11463,-0.167803,411.59,411.59,411.59,411.59,0.0,344.64,0,0,0
1,32801015,0.392182,-0.181261,-0.080616,0.155197,-0.27699,-0.393886,0.376065,-0.182886,0.035876,-0.179808,137.02,154.63,172.24,154.63,1240.4484,199.06,1,1,1
2,34700097,0.243005,0.056804,-0.455283,0.516227,-0.024201,-0.189106,0.565716,-0.277306,0.221186,0.153996,137.02,154.63,172.24,154.63,1240.4484,119.41,2,2,2
3,100019316,-0.554612,0.485401,0.530732,0.787051,0.083901,-0.158848,-0.332332,0.00665,0.05209,-0.913521,198.435,268.17,337.905,268.17,19451.88,128.7,3,3,3
4,100045147,0.605829,-1.174564,0.421368,0.163907,1.158821,-0.406273,0.267485,-0.48765,-0.968052,-0.371113,198.435,268.17,337.905,268.17,19451.88,407.64,4,4,4


In [None]:
import numpy as np

# Score a reproducible random sample of 500 products for the selected user.
sampled_product_embedding = product_embedding.sample(n=500, random_state=42)

if selected_user_embedding.empty:
  # Without a user row there is nothing to feed the user inputs with.
  raise ValueError(f"no embedding found for user {selected_user_id}")

num_candidates = len(sampled_product_embedding)

# Columns 1-10 of the user row feed the ten user inputs; the same value is
# repeated for every candidate so all products are scored in one batch.
user_inputs = [
  np.full((num_candidates, 1), selected_user_embedding[column].iloc[0], dtype="float32")
  for column in range(1, 11)
]

# Columns 1-19 of the product table feed the nineteen product inputs, in order.
# Each is reshaped to (n, 1) to match the model's Input(shape=(1,)) layers.
product_inputs = [
  sampled_product_embedding[column].to_numpy(dtype="float32").reshape(-1, 1)
  for column in range(1, 20)
]

# One batched predict call replaces one call per product.
predictions = recommendation_model.predict(user_inputs + product_inputs)

# Build the result frame once instead of concatenating row by row.
# Product ids (column 0) are stored as digit-only strings, as before.
result_df = pd.DataFrame({
  "product_id": sampled_product_embedding[0].astype("int64").astype(str).to_numpy(),
  "predicted_rating": predictions[:, 0],
})

In [None]:
# Inspect the product ids of the scored candidates.
result_df["product_id"]

0       28718149
1      100021660
2      100060935
3      100040368
4      100209344
         ...    
495      3900003
496     35109140
497    100109132
498     14100141
499     45601020
Name: product_id, Length: 500, dtype: object

In [None]:
# Inspect the predicted ratings of the scored candidates.
result_df["predicted_rating"]

0      1.211088e-18
1      2.723898e-11
2      5.348310e-30
3      6.862940e-04
4      9.792425e-14
           ...     
495    7.024572e-04
496    2.404461e-10
497    1.052600e-06
498    1.395882e-12
499    1.781138e-21
Name: predicted_rating, Length: 500, dtype: float32

In [None]:
# Rank candidates from highest to lowest predicted rating and show the top 50.
df_sorted = result_df.sort_values("predicted_rating", ascending=False)
df_sorted.head(n=50)

Unnamed: 0,product_id,predicted_rating
495,3900003,0.0007024572
3,100040368,0.000686294
103,100091278,0.0005800619
391,45000072,0.0004743732
444,100044713,0.0004425176
128,3801304,0.0003733535
194,1004946,0.0002009009
69,1005085,0.0001789879
52,16000749,0.0001736213
105,21400599,0.0001655621


In [None]:
file_path_prefix = "/content/drive/MyDrive/datasets/sampled-data/"

# Products the selected user has already purchased.
purchase_df = pd.read_csv(file_path_prefix + "purchase_history.csv")
purchased_by_user_list = purchase_df.loc[
  purchase_df["user_id"] == selected_user_id, "product_id"
].tolist()

# df_sorted["product_id"] holds strings, whereas ids read from the CSV are
# presumably parsed as integers; isin() never matches across those types, so
# purchased products were not being removed. Compare both sides as strings.
purchased_product_ids = {str(product_id) for product_id in purchased_by_user_list}
already_purchased = df_sorted["product_id"].astype(str).isin(purchased_product_ids)

# Keep the 50 best-rated products the user has not bought yet.
df_target_product_id = (
  df_sorted[~already_purchased]
  .sort_values(by="predicted_rating", ascending=False)
  .head(50)[["product_id", "predicted_rating"]]
)

In [None]:
# Load the product catalogue used to describe the recommended products.
products_df = pd.read_csv(file_path_prefix + "sampled-product-dataset.csv")

# Cast the join key to str on both sides so the merge compares like with like.
for frame in (df_target_product_id, products_df):
    frame["product_id"] = frame["product_id"].astype(str)

print(df_target_product_id["product_id"].dtype)
print(products_df["product_id"].dtype)

# Inner join on product_id, then keep only the columns shown to the reader.
result_columns = ["product_id", "category_code", "brand", "price", "predicted_rating"]
df_result = df_target_product_id.merge(products_df, on="product_id")[result_columns]

object
object


In [None]:
# Show the recommended products with their category, brand, price and predicted rating.
df_result

Unnamed: 0,product_id,category_code,brand,price,predicted_rating
0,3900003,appliances.environment.water_heater,ariston,120.45,0.0007024572
1,100040368,electronics.video.tv,ams,193.06,0.000686294
2,100091278,construction.tools.welding,alteco,520.68,0.0005800619
3,45000072,apparel.shoes,meinl,100.39,0.0004743732
4,100044713,computers.peripherals.mouse,xiaomi,33.21,0.0004425176
5,3801304,appliances.iron,tefal,65.61,0.0003733535
6,1004946,electronics.smartphone,oneplus,746.2,0.0002009009
7,1005085,electronics.smartphone,inoi,66.04,0.0001789879
8,16000749,accessories.bag,scovo,8.39,0.0001736213
9,21400599,electronics.clocks,orient,180.18,0.0001655621
