In [2]:
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
from ast import literal_eval
from typing import Dict, Text

# Data import
# Impression log: the CSV has no header row, so column names are supplied here.
impressions = pd.read_csv(
    "finalDataB.csv",
    header=None,
    names=['user_id', 'timestamp', 'history', 'category', 'subcategory', 'title', 'next_item'],
)
# user_id and timestamp are not used by any later cell in this notebook.
impressions = impressions.drop(columns=['user_id', 'timestamp'])

# News catalogue: tab-separated, also header-less.
news_data = pd.read_table(
    "news.tsv",
    header=None,
    names=[
        'next_id', 'next_category', 'next_subcategory', 'next_title', 'abstract', 'url',
        'title_entities', 'abstract_entities'
    ],
)
# Keep only the four columns that serve as candidate features, one row per id.
news_data = news_data.drop(columns=['abstract', 'url', 'title_entities', 'abstract_entities'])
news_data = news_data.drop_duplicates('next_id')

# These columns are stored as Python-literal strings; parse them back into
# Python objects. literal_eval is passed directly (no wrapping lambda needed).
history = impressions["history"].map(literal_eval).tolist()
title = impressions["title"].map(literal_eval).tolist()
category = impressions["category"].map(literal_eval).tolist()
subcategory = impressions["subcategory"].map(literal_eval).tolist()

# Parse next_item ONCE and then pick out its four positional fields, instead
# of re-running literal_eval over the whole column for every field.
next_item_parsed = impressions["next_item"].map(literal_eval)
next_id = next_item_parsed.map(lambda item: item[0])
next_title = next_item_parsed.map(lambda item: item[1])
next_category = next_item_parsed.map(lambda item: item[2])
next_subcategory = next_item_parsed.map(lambda item: item[3])

In [None]:
# The three list-valued id/category columns are converted to ragged string
# tensors (rows may differ in length).
history, category, subcategory = (
    tf.ragged.constant(rows, dtype=tf.string)
    for rows in (history, category, subcategory)
)
# Titles go through convert_to_tensor, i.e. they become one dense string tensor.
title = tf.convert_to_tensor(title, dtype=tf.string)

# The fields of the next (target) item are one value per impression.
next_id, next_title, next_category, next_subcategory = (
    tf.constant(column, dtype=tf.string)
    for column in (next_id, next_title, next_category, next_subcategory)
)

# Column name -> NumPy array for every column of the news catalogue.
news_dict = {column: np.array(values) for column, values in news_data.items()}

# Feature name -> tensor for every impression feature.
impressions_dict = dict(
    history=history,
    title=title,
    category=category,
    subcategory=subcategory,
    next_id=next_id,
    next_title=next_title,
    next_category=next_category,
    next_subcategory=next_subcategory,
)

In [None]:
# Sanity check of the title tensor: report its shape and display only the
# first example as the cell's rich output, rather than printing the whole tensor.
print(title.shape)
title[:1]

In [None]:
# Slice both feature dictionaries along their first axis so that each dataset
# element is a dict holding the features of one news item / one impression.
news_ds, impressions_ds = (
    tf.data.Dataset.from_tensor_slices(features)
    for features in (news_dict, impressions_dict)
)

In [None]:
# Vocabularies
def unique_feature_values(dataset, feature, batch_size=1_000):
    """Return the sorted unique values of ``feature`` found in ``dataset``.

    The dataset is read in batches of ``batch_size``; the per-batch values of
    the requested feature are concatenated and de-duplicated with ``np.unique``.
    """
    feature_batches = dataset.batch(batch_size).map(lambda row: row[feature])
    return np.unique(np.concatenate(list(feature_batches)))


news_id_vocabulary = unique_feature_values(news_ds, "next_id")
news_title_vocabulary = unique_feature_values(news_ds, "next_title")
news_category_vocabulary = unique_feature_values(news_ds, "next_category")
news_subcategory_vocabulary = unique_feature_values(news_ds, "next_subcategory")

In [None]:
# Feature keys kept on each dataset; every element is projected onto exactly
# these keys.
NEWS_FEATURES = ("next_id", "next_title", "next_category", "next_subcategory")
IMPRESSION_FEATURES = ("history", "title", "category", "subcategory") + NEWS_FEATURES

news_ds = news_ds.map(
    lambda row: {key: row[key] for key in NEWS_FEATURES}
)

impressions_ds = impressions_ds.map(
    lambda row: {key: row[key] for key in IMPRESSION_FEATURES}
)

In [None]:
# Hyper-parameters read by the model classes and the training cell.
embedding_dimension = 128  # width of every embedding, GRU and final Dense layer
learning_rate = 0.1        # passed to the Adagrad optimizer
epochs = 30                # number of passes over the training set in model.fit

class UserModel(tf.keras.Model):
    """Query tower: encodes a user's reading history into one vector.

    Four sub-models each reduce one sequence feature to a vector of width
    ``embedding_dimension`` with a GRU:

    * ``history``     - news ids, looked up in ``news_id_vocabulary``
    * ``title``       - free text, tokenised by a ``TextVectorization`` layer
    * ``category``    - looked up in ``news_category_vocabulary``
    * ``subcategory`` - looked up in ``news_subcategory_vocabulary``

    ``call`` concatenates the four vectors along axis 1, giving an output of
    width ``4 * embedding_dimension``.
    """

    def __init__(self):
        super().__init__()

        # Upper bound on the title vocabulary; also the title embedding's input size.
        max_tokens = 5_000

        # History tower.
        self.history_model = self._id_sequence_tower("user_history", news_id_vocabulary)

        # Title tower. The vectorizer is kept as an attribute so it can be
        # adapted to the known news titles below.
        self.title_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)
        self.title_model = tf.keras.Sequential(
            [
                self.title_vectorizer,
                tf.keras.layers.Embedding(max_tokens, embedding_dimension, mask_zero=True),
                tf.keras.layers.GRU(embedding_dimension),
            ],
            name="user_title",
        )
        self.title_vectorizer.adapt(news_title_vocabulary)

        # Category and sub-category towers.
        self.category_model = self._id_sequence_tower("user_category", news_category_vocabulary)
        self.subcategory_model = self._id_sequence_tower(
            "user_subcategory", news_subcategory_vocabulary
        )

    @staticmethod
    def _id_sequence_tower(name, vocabulary):
        """Build a StringLookup -> Embedding -> GRU encoder called ``name``.

        The model name is passed through the public ``name=`` argument rather
        than by assigning the private ``_name`` attribute afterwards.
        """
        return tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
                # +1 leaves room for the out-of-vocabulary index that
                # StringLookup adds in front of the vocabulary.
                tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension),
                tf.keras.layers.GRU(embedding_dimension),
            ],
            name=name,
        )

    def call(self, features) -> tf.Tensor:
        """Encode ``features`` and concatenate the four tower outputs.

        Args:
            features: mapping with the keys "history", "title", "category"
                and "subcategory".

        Returns:
            The four tower outputs concatenated along axis 1.
        """
        return tf.concat([
            self.history_model(features["history"]),
            self.title_model(features["title"]),
            self.category_model(features["category"]),
            self.subcategory_model(features["subcategory"]),
        ], axis = 1)
    
class NewsModel(tf.keras.Model):
    """Candidate tower: encodes a single news article into one vector.

    Four sub-models each produce a vector of width ``embedding_dimension``:

    * ``next_id``          - looked up in ``news_id_vocabulary`` and embedded
    * ``next_title``       - tokenised, embedded and average-pooled over tokens
    * ``next_category``    - looked up in ``news_category_vocabulary``
    * ``next_subcategory`` - looked up in ``news_subcategory_vocabulary``

    ``call`` concatenates the four vectors along axis 1, giving an output of
    width ``4 * embedding_dimension``.
    """

    def __init__(self):
        super().__init__()

        # Upper bound on the title vocabulary; also the title embedding's input size.
        max_tokens = 5_000

        # Id tower.
        self.NewsId_model = self._id_tower("news_id", news_id_vocabulary)

        # Title tower. The vectorizer is kept as an attribute so it can be
        # adapted to the known news titles below.
        self.title_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)
        self.news_title_model = tf.keras.Sequential(
            [
                self.title_vectorizer,
                tf.keras.layers.Embedding(max_tokens, embedding_dimension, mask_zero=True),
                tf.keras.layers.GlobalAveragePooling1D(),
            ],
            name="news_title",
        )
        self.title_vectorizer.adapt(news_title_vocabulary)

        # Category and sub-category towers.
        self.news_category_model = self._id_tower("news_category", news_category_vocabulary)
        self.news_subcategory_model = self._id_tower(
            "news_subcategory", news_subcategory_vocabulary
        )

    @staticmethod
    def _id_tower(name, vocabulary):
        """Build a StringLookup -> Embedding encoder called ``name``.

        The model name is passed through the public ``name=`` argument rather
        than by assigning the private ``_name`` attribute afterwards.
        """
        return tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
                # +1 leaves room for the out-of-vocabulary index that
                # StringLookup adds in front of the vocabulary.
                tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension),
            ],
            name=name,
        )

    def call(self, features) -> tf.Tensor:
        """Encode ``features`` and concatenate the four tower outputs.

        Args:
            features: mapping with the keys "next_id", "next_title",
                "next_category" and "next_subcategory".

        Returns:
            The four tower outputs concatenated along axis 1.
        """
        return tf.concat([
            self.NewsId_model(features["next_id"]),
            self.news_title_model(features["next_title"]),
            self.news_category_model(features["next_category"]),
            self.news_subcategory_model(features["next_subcategory"]),
        ], axis = 1)
    
class Model(tfrs.Model):
    """Two-tower retrieval model for next-article recommendation.

    ``query_model`` (UserModel + Dense) and ``candidate_model`` (NewsModel +
    Dense) both project into a space of width ``embedding_dimension``. They are
    trained with a ``tfrs.tasks.Retrieval`` task whose ``FactorizedTopK``
    metric uses every article in ``news_ds`` as a candidate.
    """

    def __init__(self):
        super().__init__()

        # Names are supplied through the public ``name=`` argument instead of
        # assigning the private ``_name`` attribute after construction.
        self.query_model = tf.keras.Sequential(
            [
                UserModel(),
                tf.keras.layers.Dense(embedding_dimension),
            ],
            name="query",
        )

        self.candidate_model = tf.keras.Sequential(
            [
                NewsModel(),
                tf.keras.layers.Dense(embedding_dimension),
            ],
            name="candidate",
        )

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # Candidate embeddings for the whole catalogue, in batches of 1024.
                candidates=news_ds.batch(1024).map(self.candidate_model),
            ),
            # NOTE: the spelling "retrival_task" is kept as-is because it is
            # the task layer's runtime name.
            name = "retrival_task"
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch of impressions.

        Args:
            features: batch dict holding the user-side keys ("history",
                "title", "category", "subcategory") and the item-side keys
                ("next_id", "next_title", "next_category", "next_subcategory").
            training: whether the call happens during training. The top-K
                metrics are computed only when this is False.

        Returns:
            The value returned by the retrieval task for the query and
            candidate embeddings of this batch.
        """
        candidate_embedding = self.candidate_model({
            "next_id": features["next_id"],
            "next_title": features["next_title"],
            "next_category": features["next_category"],
            "next_subcategory": features["next_subcategory"],
        })
        query_embedding = self.query_model({
            "history": features["history"],
            "title": features["title"],
            "category": features["category"],
            "subcategory": features["subcategory"],
        })
        return self.task(query_embedding, candidate_embedding, compute_metrics=not training)

# Instantiate the two-tower model. Constructing it builds both towers, which
# includes adapting their TextVectorization layers to news_title_vocabulary.
model = Model()

In [206]:
## Train Model
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate))

# Positional split of the impression dataset: the first 130,000 examples train
# the model, the next 10,000 are the test set, and the remainder is set aside
# as a validation set.
train_ds = impressions_ds.take(130_000)
test_ds = impressions_ds.skip(130_000).take(10_000)
validation_ds = impressions_ds.skip(130_000).skip(10_000)

# cache() replays exactly the elements it recorded on the first pass. Caching
# AFTER shuffle/batch therefore freezes the first epoch's order and batch
# composition for every later epoch. Cache the raw examples first, then
# shuffle and batch, so each epoch is reshuffled.
cached_train = train_ds.cache().shuffle(10_000).batch(10000)
# The test set is not shuffled, so caching the finished batches is fine.
cached_test = test_ds.batch(1024).cache()

model.fit(cached_train, epochs=epochs)


model.evaluate(cached_test)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


[0.0,
 9.999999747378752e-05,
 0.00019999999494757503,
 0.005200000014156103,
 0.01489999983459711,
 5724.77587890625,
 0,
 5724.77587890625]

In [208]:
# Build a brute-force retrieval layer that embeds raw query features with the
# trained query tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# Index the whole news dataset: each entry pairs an article's next_id with the
# embedding the candidate tower produces for it.
candidate_batches = news_ds.batch(100).map(
    lambda item: (item["next_id"], model.candidate_model(item))
)
index.index_from_dataset(candidate_batches)

# One hand-written example query: ten previously read articles, given as
# aligned id / title / category / subcategory lists.
sample_query = {
    "history": tf.constant([[
        'N11948', 'N39074', 'N52066', 'N13233', 'N20489',
        'N33513', 'N54496', 'N28818', 'N25114', 'N986',
    ]]),
    "title": tf.constant([[
        ["Ohio State thinks it's cute that Florida, LSU arguing over DBU"],
        ['Eliud Kipchoge runs 1:59 marathon, first to break 2 hours'],
        ['Elton John Defends Ellen DeGeneres Over George W. Bush Friendship'],
        ['Carlos Correa lost his mind while saving Astros vs. Yankees in ALCS'],
        ['Australian wrongly jailed for 19 years wins payout'],
        ['Trace Adkins ties the knot in New Orleans'],
        ['Matt Lauer allegations: Megyn Kelly lauds Meredith Vieira, Ann Curry'],
        ["Man mistakenly ID'd by Browns in beer-dumping incident sues team"],
        ["There's a mouse hiding among mushrooms in this viral brainteaser. Can you spot it?"],
        ["Ken Fisher's sexist comments have cost his company nearly $1 billion in assets"],
    ]]),
    "category": tf.constant([[
        'sports', 'sports', 'music', 'sports', 'news',
        'music', 'news', 'sports', 'lifestyle', 'finance',
    ]]),
    "subcategory": tf.constant([[
        'football_ncaa', 'more_sports', 'music-celebrity', 'baseball_mlb', 'newsworld',
        'music-celebrity', 'newsus', 'football_nfl', 'lifestylebuzz', 'finance-companies',
    ]]),
}

# Get recommendations. Despite its name, `titles` holds the indexed identifiers
# (the next_id values), not article titles.
_, titles = index(sample_query)

print(f"Recommendations: {titles[0, :5]}")


array([[b'N11948', b'N39074', b'N52066', b'N13233', b'N20489', b'N33513',
        b'N54496', b'N28818', b'N25114', b'N986']], dtype=object)>, 'title': <tf.Tensor: shape=(1, 10, 1), dtype=string, numpy=
array([[[b"Ohio State thinks it's cute that Florida, LSU arguing over DBU"],
        [b'Eliud Kipchoge runs 1:59 marathon, first to break 2 hours'],
        [b'Elton John Defends Ellen DeGeneres Over George W. Bush Friendship'],
        [b'Carlos Correa lost his mind while saving Astros vs. Yankees in ALCS'],
        [b'Australian wrongly jailed for 19 years wins payout'],
        [b'Trace Adkins ties the knot in New Orleans'],
        [b'Matt Lauer allegations: Megyn Kelly lauds Meredith Vieira, Ann Curry'],
        [b"Man mistakenly ID'd by Browns in beer-dumping incident sues team"],
        [b"There's a mouse hiding among mushrooms in this viral brainteaser. Can you spot it?"],
        [b"Ken Fisher's sexist comments have cost his company nearly $1 billion in assets"]]],
      dtype=