In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
from ast import literal_eval
from typing import Dict, Text

#data import
impressions = pd.read_csv(
    "TransformedData.csv",
    header = None,
    names= ['user_id','timestamp','history','category','subcategory','title','next_item']
    ) 

impressions = impressions.drop(columns=['user_id','timestamp','title'])

news_data = pd.read_table("news.tsv",
              header=None,
              names=[
                  'next_id', 'next_category', 'next_subcategory', 'title', 'abstract', 'url',
                  'title_entities', 'abstract_entities'
              ])

news_data = news_data.drop(columns=['title', 'abstract','url', 'title_entities','abstract_entities'])
news_data = news_data.drop_duplicates('next_id')

history = impressions["history"].map(lambda x: literal_eval(x)).tolist()
category = impressions["category"].map(lambda x: literal_eval(x)).tolist()
subcategory = impressions["subcategory"].map(lambda x: literal_eval(x)).tolist()
next_id = impressions["next_item"].map(lambda x: literal_eval(x)[0])
next_category = impressions["next_item"].map(lambda x: literal_eval(x)[1])
next_subcategory = impressions["next_item"].map(lambda x: literal_eval(x)[2])

In [2]:
history = tf.ragged.constant(history, dtype=tf.string)
category = tf.ragged.constant(category, dtype=tf.string)
subcategory = tf.ragged.constant(subcategory, dtype=tf.string)
next_id = tf.constant(next_id, dtype=tf.string)
next_category = tf.constant(next_category, dtype=tf.string)
next_subcategory = tf.constant(next_subcategory, dtype=tf.string)

news_dict = {name: np.array(value) for name, value in news_data.items()}
impressions_dict = {
    "history" : history,
    "category" : category,
    "subcategory" : subcategory,
    "next_id" : next_id,
    "next_category" : next_category,
    "next_subcategory" : next_subcategory,
}

In [3]:
news_ds = tf.data.Dataset.from_tensor_slices(news_dict)
impressions_ds = tf.data.Dataset.from_tensor_slices(impressions_dict)

In [4]:
#Vocabularies
news_id_vocabulary = np.unique(np.concatenate(list(news_ds.batch(1_000).map(lambda x: x["next_id"]))))
news_category_vocabulary = np.unique(np.concatenate(list(news_ds.batch(1_000).map(lambda x: x["next_category"]))))
news_subcategory_vocabulary = np.unique(np.concatenate(list(news_ds.batch(1_000).map(lambda x: x["next_subcategory"]))))

In [5]:
news_ds = news_ds.map(lambda x: {
    "next_id": x['next_id'],
    "next_category": x['next_category'],
    "next_subcategory": x['next_subcategory'],
})

impressions_ds = impressions_ds.map(lambda x: {
    "history" : x["history"],
    "category" : x["category"],
    "subcategory" : x["subcategory"],
    "next_id" : x["next_id"],
    "next_category" : x["next_category"],
    "next_subcategory" : x["next_subcategory"],
})

In [7]:
embedding_dimension=128
learning_rate=0.1
epochs=100

class UserModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        
        #Create History Model
        self.history_model = tf.keras.Sequential()
        self.history_model._name = "user_history"
        self.history_model.add(tf.keras.layers.StringLookup(vocabulary=news_id_vocabulary, mask_token=None))
        self.history_model.add(tf.keras.layers.Embedding(len(news_id_vocabulary)+1, embedding_dimension))
        self.history_model.add(tf.keras.layers.GRU(embedding_dimension))

        #Create Category Model
        self.category_model = tf.keras.Sequential()
        self.category_model._name = "user_category"
        self.category_model.add(tf.keras.layers.StringLookup(vocabulary=news_category_vocabulary, mask_token=None))
        self.category_model.add(tf.keras.layers.Embedding(len(news_category_vocabulary)+1, embedding_dimension))
        self.category_model.add(tf.keras.layers.GRU(embedding_dimension))

        #Create SubCategory Model
        self.subcategory_model = tf.keras.Sequential()
        self.subcategory_model._name = "user_subcategory"
        self.subcategory_model.add(tf.keras.layers.StringLookup(vocabulary=news_subcategory_vocabulary, mask_token=None))
        self.subcategory_model.add(tf.keras.layers.Embedding(len(news_subcategory_vocabulary)+1, embedding_dimension))
        self.subcategory_model.add(tf.keras.layers.GRU(embedding_dimension))

    def call(self, features) -> tf.Tensor:
        return tf.concat([
            self.history_model(features["history"]),
            self.category_model(features["category"]),
            self.subcategory_model(features["subcategory"]),
        ], axis = 1)
    
class NewsModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        
        # ID_model
        self.NewsId_model = tf.keras.Sequential()
        self.NewsId_model._name = "news_id"
        self.NewsId_model.add(tf.keras.layers.StringLookup(vocabulary=news_id_vocabulary, mask_token=None))
        self.NewsId_model.add(tf.keras.layers.Embedding(len(news_id_vocabulary) +1, embedding_dimension))
        
        # category model
        self.news_category_model = tf.keras.Sequential()
        self.news_category_model._name = "news_category"
        self.news_category_model.add(tf.keras.layers.StringLookup(vocabulary=news_category_vocabulary, mask_token=None))
        self.news_category_model.add(tf.keras.layers.Embedding(len(news_category_vocabulary) +1, embedding_dimension))
        
        # subcategory model
        self.news_subcategory_model = tf.keras.Sequential()
        self.news_subcategory_model._name = "news_subcategory"
        self.news_subcategory_model.add(tf.keras.layers.StringLookup(vocabulary=news_subcategory_vocabulary, mask_token=None))
        self.news_subcategory_model.add(tf.keras.layers.Embedding(len(news_subcategory_vocabulary) +1, embedding_dimension))

    def call(self, features) -> tf.Tensor:
        return tf.concat([
            self.NewsId_model(features["next_id"]),
            self.news_category_model(features["next_category"]),
            self.news_subcategory_model(features["next_subcategory"]),
        ], axis = 1)
    
class Model(tfrs.Model):
    def __init__(self):
        super().__init__()

        self.query_model = tf.keras.Sequential([
            UserModel(),

        ])
        
        self.query_model._name = "query"
        
        self.candidate_model = tf.keras.Sequential([
            NewsModel(),
        ])
        
        self.candidate_model._name = "candidate"
        
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates= news_ds.batch(1024).map(self.candidate_model),
                ),
            name = "retrival_task"
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        candidate_embedding = self.candidate_model({
            "next_id": features["next_id"],
            "next_category":features["next_category"],
            "next_subcategory": features["next_subcategory"],
        })
        query_embedding = self.query_model({
            "history": features["history"],
            "category":features["category"],
            "subcategory": features["subcategory"],
        })
        return self.task(query_embedding, candidate_embedding, compute_metrics=not training)

model = Model()



In [8]:
## Train Model
#training  constants

model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate))

train_ds = impressions_ds.take(130_000)
test_ds = impressions_ds.skip(130_000).take(10_000)
validation_ds = impressions_ds.skip(130_000).skip(10_000)

cached_train = train_ds.shuffle(10_000).batch(10000).cache()
cached_test = test_ds.batch(1024).cache()

model.fit(cached_train, epochs=epochs)


model.evaluate(cached_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

[0.0012000000569969416,
 0.012000000104308128,
 0.01810000091791153,
 0.0617000013589859,
 0.09759999811649323,
 10042.166015625,
 0,
 10042.166015625]

In [10]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(news_ds.batch(100).map(lambda item: (item["next_id"], model.candidate_model(item))))

# Get recommendations.
_, titles = index({
    "history": tf.constant([['N10059', 'N54496', 'N871', 'N44559', 'N62342', 'N3909', 'N30867', 'N32939', 'N10414', 'N31801']]),
    "category": tf.constant([['news', 'news', 'tv', 'health', 'video', 'finance', 'news', 'movies', 'movies', 'news']]),
    "subcategory": tf.constant([['newsus', 'newsus', 'tv-celebrity', 'health-news', 'science', 'finance-real-estate', 'causes-environment', 'movies-celebrity', 'movienews', 'newspolitics']])
})

print(f"Recommendations: {titles[0, :5]}")


array([[b'N10059', b'N54496', b'N871', b'N44559', b'N62342', b'N3909',
        b'N30867', b'N32939', b'N10414', b'N31801']], dtype=object)>, 'category': <tf.Tensor: shape=(1, 10), dtype=string, numpy=
array([[b'news', b'news', b'tv', b'health', b'video', b'finance',
        b'news', b'movies', b'movies', b'news']], dtype=object)>, 'subcategory': <tf.Tensor: shape=(1, 10), dtype=string, numpy=
array([[b'newsus', b'newsus', b'tv-celebrity', b'health-news',
        b'science', b'finance-real-estate', b'causes-environment',
        b'movies-celebrity', b'movienews', b'newspolitics']], dtype=object)>}. Consider rewriting this model with the Functional API.
Recommendations: [b'N36221' b'N29715' b'N28413' b'N21707' b'N62318']
