In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
from ast import literal_eval

#data import
impressions = pd.read_csv(
    "TransformedData.csv",
    header = None,
    names= ['user_id','timestamp','history','category','subcategory','title','next_item']
    ) 

news_data = pd.read_table("news.tsv",
              header=None,
              names=[
                  'id', 'category', 'subcategory', 'title', 'abstract', 'url',
                  'title_entities', 'abstract_entities'
              ])

news_data = news_data.drop_duplicates('id')

In [2]:
history_eval = impressions["history"].map(lambda x: literal_eval(x)).tolist()
category_eval = impressions["category"].map(lambda x: literal_eval(x)).tolist()
subcategory_eval = impressions["subcategory"].map(lambda x: literal_eval(x)).tolist()
next_id_eval = impressions["next_item"].map(lambda x: literal_eval(x)).tolist()

#Create Data Tensors && dataset
history_tensor = tf.convert_to_tensor(list(zip(history_eval, category_eval, subcategory_eval)), dtype=tf.string)
next_news_tensor = tf.convert_to_tensor(next_id_eval, dtype=tf.string)


dataset = tf.data.Dataset.from_tensor_slices((next_news_tensor,history_tensor))

print(dataset)
#Vocabularies
news_id_vocabulary = tf.constant(news_data["id"], dtype=tf.string)
news_id_vocabulary = tf.data.Dataset.from_tensor_slices((news_id_vocabulary))
news_id_vocabulary = news_id_vocabulary.batch(1000)
news_id_vocabulary = np.unique(np.concatenate(list(news_id_vocabulary)))

news_category_vocabulary = tf.constant(news_data["category"], dtype=tf.string)
news_category_vocabulary = tf.data.Dataset.from_tensor_slices((news_category_vocabulary))
news_category_vocabulary = news_category_vocabulary.batch(1000)
news_category_vocabulary = np.unique(np.concatenate(list(news_category_vocabulary)))

news_subcategory_vocabulary = tf.constant(news_data["subcategory"], dtype=tf.string)
news_subcategory_vocabulary = tf.data.Dataset.from_tensor_slices((news_subcategory_vocabulary))
news_subcategory_vocabulary = news_subcategory_vocabulary.batch(1000)
news_subcategory_vocabulary = np.unique(np.concatenate(list(news_subcategory_vocabulary)))

news_id = list(news_data["id"].values)
news_category = list(news_data["category"].values)
news_subcategory = list(news_data["subcategory"].values)
newses = tf.convert_to_tensor(list(zip(news_id,news_category,news_subcategory)), dtype=tf.string)
news_dataset = tf.data.Dataset.from_tensor_slices((newses))

<TensorSliceDataset element_spec=(TensorSpec(shape=(3,), dtype=tf.string, name=None), TensorSpec(shape=(3, 10), dtype=tf.string, name=None))>


In [6]:
embedding_dimension=64
learning_rate=0.1
epochs=3

class UserModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        
        #Create History Model
        self.history_model = tf.keras.Sequential()
        self.history_model._name = "user_history"
        self.history_model.add(tf.keras.layers.StringLookup(vocabulary=news_id_vocabulary, mask_token=None))
        self.history_model.add(tf.keras.layers.Embedding(len(news_id_vocabulary)+1, embedding_dimension))
        self.history_model.add(tf.keras.layers.GRU(embedding_dimension))

        #Create Category Model
        self.category_model = tf.keras.Sequential()
        self.category_model._name = "user_category"
        self.category_model.add(tf.keras.layers.StringLookup(vocabulary=news_category_vocabulary, mask_token=None))
        self.category_model.add(tf.keras.layers.Embedding(len(news_category_vocabulary)+1, embedding_dimension))
        self.category_model.add(tf.keras.layers.GRU(embedding_dimension))

        #Create SubCategory Model
        self.subcategory_model = tf.keras.Sequential()
        self.subcategory_model._name = "user_subcategory"
        self.subcategory_model.add(tf.keras.layers.StringLookup(vocabulary=news_subcategory_vocabulary, mask_token=None))
        self.subcategory_model.add(tf.keras.layers.Embedding(len(news_subcategory_vocabulary)+1, embedding_dimension))
        self.subcategory_model.add(tf.keras.layers.GRU(embedding_dimension))

    def call(self, features):
        return tf.concat([
            self.history_model(features[0]),
            self.category_model(features[1]),
            self.subcategory_model(features[2]),
        ], axis = 1)
    
class NewsModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        
        # ID_model
        self.NewsId_model = tf.keras.Sequential()
        self.NewsId_model._name = "news_id"
        self.NewsId_model.add(tf.keras.layers.StringLookup(vocabulary=news_id_vocabulary, mask_token=None))
        self.NewsId_model.add(tf.keras.layers.Embedding(len(news_id_vocabulary) +1, embedding_dimension))
        
        # category model
        self.news_category_model = tf.keras.Sequential()
        self.news_category_model._name = "news_category"
        self.news_category_model.add(tf.keras.layers.StringLookup(vocabulary=news_category_vocabulary, mask_token=None))
        self.news_category_model.add(tf.keras.layers.Embedding(len(news_category_vocabulary) +1, embedding_dimension))
        
        # subcategory model
        self.news_subcategory_model = tf.keras.Sequential()
        self.news_subcategory_model._name = "news_subcategory"
        self.news_subcategory_model.add(tf.keras.layers.StringLookup(vocabulary=news_subcategory_vocabulary, mask_token=None))
        self.news_subcategory_model.add(tf.keras.layers.Embedding(len(news_subcategory_vocabulary) +1, embedding_dimension))

    def call(self, features):
        return tf.concat([
            self.NewsId_model(features[0]),
            self.news_category_model(features[1]),
            self.news_subcategory_model(features[2]),
        ], axis = 1)
    
class Model(tfrs.Model):
    def __init__(self):
        super().__init__()

        self.query_model = tf.keras.Sequential([
            UserModel(),
            tf.keras.layers.Dense(embedding_dimension)

        ])
        
        self.query_model._name = "query"
        
        self.candidate_model = tf.keras.Sequential([
            NewsModel(),
            tf.keras.layers.Dense(embedding_dimension)

        ])
        
        self.candidate_model._name = "candidate"
        
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates= news_dataset.batch(1024).map(self.candidate_model),
                ),
            name = "retrival_task"
        )

    def compute_loss(self, features, training=False):
        candidate_embedding = self.candidate_model(features[0])
        query_embedding = self.query_model(features[1])
        return self.task(query_embedding, candidate_embedding, compute_metrics=not training)

model = Model()

In [7]:
## Train Model
#training  constants

model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate))

train_ds = dataset.take(130_000)
test_ds = dataset.skip(130_000).take(10_000)
validation_ds = dataset.skip(130_000).skip(10_000)

cached_train = train_ds.shuffle(10_000).batch(10000).cache()
cached_test = test_ds.batch(1024).cache()

model.fit(cached_train, epochs=epochs)


model.evaluate(cached_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.03333333507180214,
 0.06666667014360428,
 0.10000000149011612,
 0.4333333373069763,
 0.8999999761581421,
 11.841562271118164,
 0,
 11.841562271118164]

In [9]:
identifier = tf.data.Dataset.from_tensor_slices(news_data["id"])

# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
# recommends movies out of the entire movies dataset.
index.index(
    news_dataset.batch(100).map(model.candidate_model),
    identifier
)

# Get recommendations.
_, titles = index(tf.constant([[
    ['N38629', 'N50155', 'N29177', 'N56426', 'N63842', 'N36565', 'N30710', 'N43854', 'N41229', 'N31983'],
    ['tv', 'music', 'tv', 'news', 'tv', 'health', 'health', 'health', 'entertainment', 'health'],
    ['tv-celebrity', 'musicnews', 'tv-celebrity', 'newsus', 'tv-celebrity', 'weightloss', 'weight-loss', 'wellness', 'entertainment-celebrity', 'weightloss']
]]))
print(f"Then Give this: {titles[0, :5]}")

ValueError: Attempt to convert a value (<MapDataset element_spec=TensorSpec(shape=(3, 64), dtype=tf.float32, name=None)>) with an unsupported type (<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>) to a Tensor.