<a href="https://colab.research.google.com/github/argalusmp/CH2-PS_Recommendation-System/blob/faiqa/capstone_recsys_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To complete the project, we build a simple matrix factorization model on our CSV `Volunteering Datasets`, heavily referencing the MovieLens 100k recommendation system tutorial for TFRS (TensorFlow Recommenders). Whereas the original model recommends movies to a given user, this adapted version recommends volunteering events to volunteer applicants.

#### Import TFRS


In [None]:
!pip install -q tensorflow-recommenders

In [None]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow import keras

tf.__version__

### Read the data

In [None]:
# Mount Google Drive into the Colab runtime so the CSV paths below resolve.
from google.colab import drive
drive.mount('/content/drive')

# Volunteers data
volunteers = pd.read_csv('/content/drive/MyDrive/CH2-PS374 | Capstone Project/dataset_volunteers.csv')
# Events data
events = pd.read_csv('/content/drive/MyDrive/CH2-PS374 | Capstone Project/dataset_events.csv')

Preprocess the data

In [None]:
# Re-wrap the loaded tables as DataFrames under the `df_` names used by the
# following cells (pd.read_csv already returns a DataFrame).
df_volunteers = pd.DataFrame(volunteers)
df_events = pd.DataFrame(events)

In [None]:
# Cast every column of both tables to str so all features are handled as
# strings downstream.
df_volunteers = df_volunteers.astype(str)
df_events = df_events.astype(str)

In [None]:
# Stack volunteer rows and event rows into one table (axis=0). Despite its
# name, `both_dict` is a DataFrame. Columns that exist in only one of the two
# inputs are filled with missing values for the other input's rows.
both_dict = pd.concat([df_volunteers, df_events], axis=0)

In [None]:
# Cast again after the concat so that any missing values it introduced are
# turned into strings as well.
both_dict = both_dict.astype(str)

In [None]:
# One dataset element per table row: a dict mapping column name -> value.
both_ds = tf.data.Dataset.from_tensor_slices(dict(both_dict))

In [None]:
# Separate per-table datasets, again with one column-name -> value dict per row.
volunteer_ds = tf.data.Dataset.from_tensor_slices(dict(df_volunteers))
event_ds = tf.data.Dataset.from_tensor_slices(dict(df_events))

In [None]:
def map_both_features(element):
    """Keep only the feature columns used by the combined dataset.

    Args:
        element: Mapping from column name to value for one row of the
            combined volunteers/events table.

    Returns:
        A new dict holding the volunteer columns followed by the event
        columns. "Location" is shared by both tables and is listed once
        (a repeated key in a dict literal is silently overwritten).

    Raises:
        KeyError: If a selected column is missing from a plain-dict element.
    """
    return {
        # Volunteer-side features.
        "Volunteer Name": element["Volunteer Name"],
        "Type of Organization": element["Type of Organization"],
        "Location": element["Location"],
        "Skill 1": element["Skill 1"],
        "Skill 2 (Additional)": element["Skill 2 (Additional)"],
        # Event-side features.
        "Event_id": element["Event_id"],
        "Category": element["Category"],
        "Qualifications 1": element["Qualifications 1"],
        "Qualifications 2": element["Qualifications 2"],
        # Include other features to keep
    }

In [None]:
#both_ds = both_ds.map(lambda x: tf.py_function(np.array(map_both_features), inp=[x], Tout=[tf.strings]))

In [None]:
# Keep only the selected feature columns. Dataset.map traces the mapping
# function directly on the dict element, so no tf.py_function wrapper is
# needed (py_function requires `Tout` to be dtypes such as tf.string, not the
# `tf.strings` module, and cannot return a dict this way). The mapping is
# idempotent, so applying it again in a later cell is harmless.
both_ds = both_ds.map(map_both_features)

In [None]:
def map_volunteer_features(element):
    """Project a volunteer row onto the columns kept for modelling.

    Returns a new dict with "Volunteer Name" and "Type of Organization"
    copied from `element`; every other column is dropped.
    """
    kept_columns = ("Volunteer Name", "Type of Organization")
    # Extend `kept_columns` to include other features to keep.
    return {column: element[column] for column in kept_columns}

def map_event_features(element):
    """Project an event row onto the columns kept for modelling.

    Returns a new dict with "Event_id" and "Category" copied from
    `element`; every other column is dropped.
    """
    kept_columns = ("Event_id", "Category")
    # Extend `kept_columns` to include other features to keep.
    return {column: element[column] for column in kept_columns}

In [None]:
# Reduce each dataset to the feature columns selected by its mapping function.
volunteer_ds = volunteer_ds.map(map_volunteer_features)
event_ds = event_ds.map(map_event_features)

In [None]:
# Reduce the combined dataset to the columns selected by map_both_features.
both_ds = both_ds.map(map_both_features)

In [None]:
# Vocabulary of volunteer names, gathered batch by batch from the volunteer dataset.
volunteer_name_batches = volunteer_ds.batch(1000).map(lambda row: row["Volunteer Name"])
unique_volunteers = np.unique(np.concatenate(list(volunteer_name_batches)))

# Vocabulary of event ids, gathered from the combined dataset.
# NOTE(review): rows that came from the volunteers table presumably carry a
# placeholder string in "Event_id", which would end up in this vocabulary - confirm.
event_id_batches = both_ds.batch(1_000).map(lambda row: row["Event_id"])
unique_events = np.unique(np.concatenate(list(event_id_batches)))

In [None]:
# Randomly shuffle data and split between train and test.
# The seed is fixed and reshuffling is disabled so that `train` and `test`
# are drawn from one stable ordering and do not overlap.
tf.random.set_seed(42)
shuffled = both_ds.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# Hard-coded split sizes: first 600 rows for training, next 200 for testing.
# NOTE(review): confirm these match the actual number of rows in the dataset.
train = shuffled.take(600)
test = shuffled.skip(600).take(200)

### 💗 trying another type of **retrieval** model 💗

In [None]:
class VolunteeringModel(tfrs.Model):
    """Two-tower retrieval model matching volunteers (queries) to events (candidates).

    Args:
        user_model: Keras model that turns a batch of "Volunteer Name"
            strings into query embeddings.
        event_model: Keras model that turns a batch of "Event_id" strings
            into candidate embeddings.

    The towers passed in are used as-is, so the embedding size and
    vocabularies are decided by the caller rather than by module-level
    globals. Reads the module-level `event_ds` to build the metric
    candidates.
    """

    def __init__(self, user_model, event_model):
        super().__init__()

        # Candidate tower (events).
        self.event_model: tf.keras.Model = event_model

        # Query tower (volunteers).
        self.user_model: tf.keras.Model = user_model

        # Top-k accuracy over all events. The candidates are embedded from
        # "Event_id", the same feature compute_loss feeds to the event tower;
        # embedding a different column would look up strings that are not in
        # the tower's vocabulary.
        metrics = tfrs.metrics.FactorizedTopK(
            candidates=event_ds.batch(128).map(
                lambda x: self.event_model(x["Event_id"])))

        # The retrieval task bundles the loss and the metric computation.
        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(metrics=metrics)

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch of volunteer/event pairs.

        Args:
            features: Batch dict containing at least "Volunteer Name" and
                "Event_id".
            training: Unused; accepted for the tfrs.Model interface.
        """
        # Volunteer names go through the query tower ...
        user_embeddings = self.user_model(features["Volunteer Name"])
        # ... and event ids through the candidate tower.
        event_embeddings = self.event_model(features["Event_id"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, event_embeddings)

In [None]:
### Fitting and evaluating

# Dimensionality of the query and candidate representations.
embedding_dimension = 32

# Candidate tower: event id string -> integer index -> embedding.
event_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_events, mask_token=None),
    # We add an additional embedding to account for unknown tokens.
    tf.keras.layers.Embedding(len(unique_events) + 1, embedding_dimension),
])

# Query tower: volunteer name string -> integer index -> embedding.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_volunteers, mask_token=None),
    # We add an additional embedding to account for unknown tokens.
    tf.keras.layers.Embedding(len(unique_volunteers) + 1, embedding_dimension),
])

model = VolunteeringModel(user_model, event_model)

# Adagrad with a learning rate of 0.01 (other optimizers such as SGD or Adam
# could be tried as well).
# NOTE(review): an earlier note on this cell referred to 0.1 - confirm which
# value is intended.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01))

cached_train = train.shuffle(10).batch(32).cache()
cached_test = test.batch(32).cache()

# Fit the model for ten epochs.
model_hist = model.fit(cached_train, epochs=10)

In [None]:
# Batched, cached test split used for evaluation (same expression as in the
# fitting cell).
cached_test = test.batch(32).cache()

In [None]:
# Evaluate the model on the cached test batches; metrics come back as a dict.
model.evaluate(cached_test, return_dict=True)

In [None]:
# Create a model that takes in raw query features (volunteer names) and
# recommends events out of the entire events dataset.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# `event_ds` elements are dicts, so the event tower must be fed the
# "Event_id" column explicitly. Indexing (identifier, embedding) pairs makes
# the lookup return event ids.
index.index_from_dataset(
    event_ds.batch(16).map(
        lambda x: (x["Event_id"], model.event_model(x["Event_id"]))))

In [None]:
# Get recommendations version 1
j = str(40)
_, titles = index(tf.constant([j]))
# Plain f-string: mixing an f-string with %-formatting would fail if the
# rendered recommendations ever contained a '%' character.
print(f"Recommendations for user {j}: {titles[0]}")

In [None]:
# Get some recommendations version 2: query the index with the raw string
# "42" and show the first three returned identifiers.
_, titles = index(np.array(["42"]))
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")

## 🌞 then with **ranking** model 🌞

In [None]:
class RankingModel(tf.keras.Model):
    """Score a (volunteer name, event id) pair with an embedding + MLP model.

    Args:
        embedding_dimension: Width of the volunteer and event embeddings
            (defaults to 32).

    Reads the module-level vocabularies `unique_volunteers` and
    `unique_events`.
    """

    def __init__(self, embedding_dimension: int = 32):
        super().__init__()

        # Compute embeddings for volunteers. The extra embedding row accounts
        # for unknown tokens.
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_volunteers, mask_token=None),
            tf.keras.layers.Embedding(len(unique_volunteers) + 1, embedding_dimension),
        ])

        # Compute embeddings for events.
        self.event_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_events, mask_token=None),
            tf.keras.layers.Embedding(len(unique_events) + 1, embedding_dimension),
        ])

        # Compute predictions.
        self.ratings = tf.keras.Sequential([
            # Learn multiple dense layers.
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            # Make rating predictions in the final layer.
            tf.keras.layers.Dense(1),
        ])

    def call(self, inputs):
        """Return one predicted score per (volunteer name, event id) pair.

        Args:
            inputs: Tuple `(user_name, event_id)` of string tensors.
        """
        user_name, event_id = inputs

        user_embeddings = self.user_embeddings(user_name)
        event_embeddings = self.event_embeddings(event_id)

        # Concatenate both embeddings along the feature axis before the MLP.
        return self.ratings(tf.concat([user_embeddings, event_embeddings], axis=1))

In [None]:
class VolunteeringModel(tfrs.models.Model):
    """Ranking variant: wraps RankingModel in a TFRS ranking task (MSE loss, RMSE metric)."""

    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
          loss = tf.keras.losses.MeanSquaredError(),
          metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the ranking loss for one batch; `training` is not used."""
        rating_predictions = self.ranking_model(
            (features["Volunteer Name"], features["Event_id"]))

        # The task computes the loss and the metrics.
        # NOTE(review): "Category" is used as the label here, but the
        # preprocessing cells cast every column to str. A mean-squared-error
        # loss presumably needs a numeric label - confirm which column should
        # serve as the rating.
        return self.task(labels=features["Category"], predictions=rating_predictions)

In [None]:
# Build and compile the ranking model.
model = VolunteeringModel()

model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

# Batches of 32; the training split is shuffled before batching and caching.
cached_train = train.shuffle(100_000).batch(32).cache()
cached_test = test.batch(32).cache()

# Train for 100 epochs, then report the metrics on the test batches.
model.fit(cached_train, epochs=100)

model.evaluate(cached_test, return_dict=True)

## ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾ ⚾




In [None]:
class VolunteeringModel(tfrs.Model):
    """Retrieval model assembled from externally built towers and task.

    tfrs.Model is a thin convenience base class; the towers remain plain
    Keras models.
    """

    def __init__(
        self,
        volunteers_model: tf.keras.Model,
        events_model: tf.keras.Model,
        task: tfrs.tasks.Retrieval,
    ):
        super().__init__()

        # Volunteer (query) and event (candidate) representations.
        self.volunteers_model = volunteers_model
        self.events_model = events_model

        # Retrieval task that produces the loss.
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Embed both sides of the batch and hand the embeddings to the task."""
        query_vectors = self.volunteers_model(features["Volunteer Name"])
        candidate_vectors = self.events_model(features["Event_id"])
        return self.task(query_vectors, candidate_vectors)

Define the two models and the retrieval task.

In [None]:
# Define the volunteer and event models.
# NOTE(review): `volunteers_set`, `event_ids` and `mapped_event_ds` are not
# defined in any cell visible in this notebook. They are presumably lookup
# layers exposing `vocab_size()` and a dataset of event candidates - confirm
# where they are created before running this cell.
volunteers_model = tf.keras.Sequential([
    volunteers_set,
    tf.keras.layers.Embedding(volunteers_set.vocab_size(), 64)
])
events_model = tf.keras.Sequential([
    event_ids,
    tf.keras.layers.Embedding(event_ids.vocab_size(), 64)
])

# Define the objective: a retrieval task with a factorized top-k metric over
# `mapped_event_ds`.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    mapped_event_ds
  )
)


### Fit the Model and Evaluate

In [None]:
# Assemble the retrieval model from the two towers and the task, then compile.
model = VolunteeringModel(volunteers_model, events_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for three epochs on batches of 128.
training_batches = mapped_volunteer_ds.batch(128)
model.fit(training_batches, epochs=3)

# Brute-force search over the trained representations: queries go through the
# volunteer tower, candidates are indexed as (id, embedding) pairs.
index = tfrs.layers.factorized_top_k.BruteForce(model.volunteers_model)
candidate_pairs = mapped_event_ds.batch(100).map(
    lambda event_id: (event_id, model.events_model(event_id)))
index.index_from_dataset(candidate_pairs)

# Show the first three identifiers returned for the query "42".
_, titles = index(np.array(["42"]))
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")

## Dumps 🧻

In [None]:
# One-hot encode the listed categorical columns of each table.
volunteer = pd.get_dummies(df_volunteers, columns=['Skill 1', 'Skill 2 (Additional)', 'Location', 'Type of Organization'])
event = pd.get_dummies(df_events, columns=['Qualifications 1', 'Qualifications 2', 'Location', 'Category'])

In [None]:
# Align both frames on their columns (axis=1); columns missing from one frame
# are added there and filled with 0.
volunteer, event = volunteer.align(event, fill_value=0, axis=1)

In [None]:
# Convert each aligned frame into a dataset of per-row column-name -> value dicts.
volunteer = tf.data.Dataset.from_tensor_slices(dict(volunteer))
event = tf.data.Dataset.from_tensor_slices(dict(event))

In [None]:
# Keep the volunteer name plus the organization type, falling back to the
# literal "Unknown" when the element has no "Type of Organization" key.
# NOTE(review): the one-hot encoding cell lists 'Type of Organization' among
# the encoded columns, so the fallback is presumably always taken - confirm.
volunteer = volunteer.map(lambda x: {
    "Volunteer Name": x["Volunteer Name"],
    "Type of Organization": x.get("Type of Organization", "Unknown")
})

In [None]:
# Sanity check: print the first 5 mapped elements.
for element in volunteer.take(5):  # Print the first 5 elements
    print(element)

In [None]:
# Select the basic features.
# `volunteers` and `events` are the pandas DataFrames returned by read_csv, so
# a per-row dict mapping cannot run on them. It has to run on the tf.data
# datasets built from those tables instead.
volunteer = volunteer_ds.map(lambda x: {
    "Volunteer Name": x["Volunteer Name"],
    "Type of Organization": x["Type of Organization"]
})
# Events are reduced to their bare "Category" value.
event = event_ds.map(lambda x: x["Category"])

## 🌹 Just defining using volunteer's skill 🌹

In [None]:
# Narrow both tables down to the identifier column plus one descriptive column.
df_volunteers = pd.DataFrame(
    volunteers[['Volunteer Name', 'Type of Organization']],
    columns=['Volunteer Name', 'Type of Organization'],
)

df_events = pd.DataFrame(events[['Event_id', 'Category']])

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_volunteers.info()
df_events.info()

In [None]:
# Convert the pandas DataFrames to tf.data datasets whose elements are
# per-row dicts mapping column name -> value.
volunteer_ds = tf.data.Dataset.from_tensor_slices(dict(df_volunteers))
event_ds = tf.data.Dataset.from_tensor_slices(dict(df_events))

In [None]:
# Volunteers: keep name and organization type as a dict per row.
volunteer = volunteer_ds.map(
    lambda row: {
        "Volunteer Name": row["Volunteer Name"],
        "Type of Organization": row["Type of Organization"],
    }
)

# Events: keep only the bare "Category" value ("Event_id" is not carried over).
event = event_ds.map(lambda row: row["Category"])

In [None]:
# Vocabularies for the embedding lookups used by the model below.
usernames = volunteer.map(lambda row: row['Volunteer Name'])

username_batches = list(usernames.batch(1000))
unique_users = np.unique(np.concatenate(username_batches))

# `event` holds bare category values here, so this vocabulary is made of categories.
event_batches = list(event.batch(1000))
unique_events = np.unique(np.concatenate(event_batches))

the volunteer dataset

In [None]:
# Deterministic 80/20 split of the volunteer dataset.
tf.random.set_seed(42)
n_volunteers = len(df_volunteers)
vshuffled = volunteer.shuffle(n_volunteers, seed=42, reshuffle_each_iteration=False)

n_vtrain = int(n_volunteers * 0.8)
n_vtest = int(n_volunteers * 0.2)
vtrain = vshuffled.take(n_vtrain)
vtest = vshuffled.skip(n_vtrain).take(n_vtest)

In [None]:
# Deterministic 80/20 split of the event dataset.
tf.random.set_seed(42)
eshuffled = event.shuffle(len(df_events), seed=42, reshuffle_each_iteration=False)

# Slice the shuffled *event* dataset; slicing `vshuffled` here would put
# volunteer rows into the event splits.
etrain = eshuffled.take(int(len(df_events)*0.8))
etest = eshuffled.skip(int(len(df_events)*0.8)).take(int(len(df_events)*0.2))

### Model

In [None]:
class VolunteerRetreival(tfrs.Model):
    """Two-tower retrieval model built from the module-level vocabularies.

    Reads the globals `unique_users`, `unique_events` and `event` when
    constructed.
    """

    def __init__(self):
        super().__init__()

        embedding_dims = 32
        # Query tower: volunteer name -> index -> embedding. The extra
        # embedding row (+1) accounts for out-of-vocabulary names.
        self.user_model =  tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary= unique_users, mask_token=None),
            tf.keras.layers.Embedding(len(unique_users)+1, embedding_dims)
        ])

        # Candidate tower: looks strings up in `unique_events`, then embeds them.
        self.event_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_events, mask_token=None),
            tf.keras.layers.Embedding(len(unique_events)+1, embedding_dims)
        ])

        # Retrieval task with a top-k metric over every element of `event`,
        # embedded by the candidate tower in batches of 128.
        self.task = tfrs.tasks.Retrieval(
                        metrics=tfrs.metrics.FactorizedTopK(
                        candidates=event.batch(128).cache().map(self.event_model)
                        ))


    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch; `training` is not used."""
        user_embeddings = self.user_model(features['Volunteer Name'])
        # NOTE(review): this reads features['Category'], but the volunteer
        # dataset mapped earlier only keeps "Volunteer Name" and
        # "Type of Organization" - confirm the training data really carries
        # a 'Category' key.
        event_embeddings = self.event_model(features['Category'])
        return self.task(user_embeddings, event_embeddings)

In [None]:
# Instantiate the retrieval model and compile it with Adagrad (learning rate 0.5).
model = VolunteerRetreival()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

In [None]:
freq = 2          # run validation every `freq` epochs
epochs = 20
batch_size = 128

# The datasets must be batched before fitting: the retrieval task multiplies
# the query and candidate embedding matrices, which requires a leading batch
# dimension on both.
history = model.fit(
    vtrain.batch(batch_size),
    validation_data=vtest.batch(batch_size),
    validation_freq=freq,
    epochs=epochs,
    verbose=0)

In [None]:
# Mapping function for the volunteer dataset
def map_volunteer_features(element):
    """Return a new dict holding only the volunteer columns kept for modelling."""
    kept_columns = (
        "Volunteer Name",
        "Type of Organization",
        "Location",
        "Skill 1",
        "Skill 2 (Additional)",
        # Include other features to keep
    )
    return {column: element[column] for column in kept_columns}

# Mapping function for the event dataset
def map_event_features(element):
    """Return a new dict holding only the event columns kept for modelling."""
    kept_columns = (
        "Event_id",
        "Category",
        "Location",
        "Qualifications 1",
        "Qualifications 2",
        # Include other features to keep
    )
    return {column: element[column] for column in kept_columns}

# Project both datasets onto their selected feature columns.
# NOTE(review): the mapping functions read five columns each; presumably the
# datasets must still carry all of them at this point - confirm this cell is
# not run after the two-column reduction above.
volunteer_ds = volunteer_ds.map(map_volunteer_features)
event_ds = event_ds.map(map_event_features)

# Print the first five elements of each dataset to verify the mapping.
for dataset in (volunteer_ds, event_ds):
    for element in dataset.take(5):
        print(element)