<a href="https://colab.research.google.com/github/argalusmp/CH2-PS_Recommendation-System/blob/mi/Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Recommender System Using Content-Based Filtering
**Content-based filtering** uses item features to recommend other items similar to what the user likes, based on their previous actions or explicit feedback. In this lab, we will try to use the **cosine similarity** method or the **dot product** to compute the filtering result. The Coursera recommender-systems material on content-based filtering is used as the reference; feel free to check it out!


In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset CSVs loaded later in this notebook are read from this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_recommenders as tfrs
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from typing import Dict, Text


Importing Dataset from .csv

In [5]:
# Read the raw user and event tables from the mounted Drive folder.
user_ds, event_ds = (
    pd.read_csv(csv_path)
    for csv_path in (
        "drive/MyDrive/Capstone Project/user_ds.csv",
        "drive/MyDrive/Capstone Project/event_ds.csv",
    )
)

# Working frames used by the preprocessing cells; the raw `*_ds` frames are
# kept so later cells can go back to the unmodified columns.
df_user, df_event = pd.DataFrame(user_ds), pd.DataFrame(event_ds)

# Row counts: users first, then events.
for frame in (df_user, df_event):
    print(len(frame))

1055
5599


In [6]:
# Display the user table to eyeball its columns and a sample of rows.
df_user

Unnamed: 0,Uid,Volunteer Name,Age,Gender,Skill 1,Skill 2 (Additional),Location,Type of Organization
0,U1,Mia Davis,27,Female,Dog training,Animal behavior modification,Solo,Pet and Animal Service
1,U2,Amelia Adams,28,Female,Pet grooming,Photography,Ngawi,Healthcare
2,U3,Jessica Thompson,29,Female,Animal shelter volunteering,Dog training,Ngawi,Healthcare
3,U4,Harper Lewis,26,Female,Mentoring,Community outreach,Solo,IT
4,U5,Mia Johnson,33,Female,Nursing,Medical assistance,Ngawi,Healthcare
...,...,...,...,...,...,...,...,...
1050,U1051,Ava Turner,26,Female,Environmental advocacy,Conservation,Tangerang,Environmental
1051,U1052,Noah Walker,32,Female,Public speaking,Advocacy,Tangerang,Event Organizer
1052,U1053,Mia Hernandez,26,Female,Mentoring,Community outreach,Bandung,IT
1053,U1054,Ethan Roberts,31,Female,Teaching,Computer literacy,Tangerang,Social


In [7]:
# Display the event table to eyeball its columns and a sample of rows.
df_event

Unnamed: 0,Event_id,Category,Domisili,Age,Qualifications 1,Qualifications 2,Qualifications 3
0,E1,Pet and Animal Service,Solo,>18,Veterinary assistance,Pet adoption,Veterinary medicine
1,E2,Pet and Animal Service,Jawa Barat,All,Veterinary medicine,Pet adoption,Dog walking
2,E3,Pet and Animal Service,Yogyakarta,All,Animal shelter volunteering,Animal care,Animal rescue
3,E4,Pet and Animal Service,Indonesia,18-24,Animal rescue,Animal behavior modification,Animal behavior modification
4,E5,Pet and Animal Service,Bandung,18-24,Animal rescue,Pet adoption support,Animal behavior modification
...,...,...,...,...,...,...,...
5594,E5595,Social,Tangerang,17-30,Career guidance,Research,Digital marketing
5595,E5596,Social,Kalimantan Selatan,17-30,Fundraising,Research,Community outreach
5596,E5597,Social,Jakarta,>20,Graphic design,Sports coaching,Medical research
5597,E5598,Social,Bali,16-38,Fundraising,Leadership training,Digital marketing


# Pre-processing Data


In [8]:
# Print the schema summary (columns, non-null counts, dtypes) of both frames,
# users first and events second.
for frame in (df_user, df_event):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Uid                   1055 non-null   object
 1   Volunteer Name        1055 non-null   object
 2   Age                   1055 non-null   int64 
 3   Gender                1055 non-null   object
 4   Skill 1               1055 non-null   object
 5   Skill 2 (Additional)  1045 non-null   object
 6   Location              1055 non-null   object
 7   Type of Organization  1055 non-null   object
dtypes: int64(1), object(7)
memory usage: 66.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Event_id          5599 non-null   object
 1   Category          5599 non-null   object
 2   Domisili          5599 non-null   object
 3   Ag

In [9]:
# df_user['Skills'] = df_user['Skills'].apply(lambda x: ' '.join(x.lower().split(', ')) if pd.notnull(x) else '')
# df_event['Qualifications'] = df_event['Qualifications'].apply(lambda x: ' '.join(x.lower().split(', ')) if pd.notnull(x) else '')

# df_user
# df_event

In [10]:
# Drop the columns that are not used as recommendation features.
unused_cols_user = ['Volunteer Name', 'Age', 'Gender']
unused_cols_event = ['Age']

# Derive the trimmed frames from the raw `*_ds` tables instead of from
# `df_user` / `df_event` themselves: that keeps this cell idempotent, so
# re-running it does not raise KeyError on columns that are already gone.
df_user = user_ds.drop(columns=unused_cols_user)
df_event = event_ds.drop(columns=unused_cols_event)

# Confirm the remaining columns and their null counts.
df_user.info()
df_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Uid                   1055 non-null   object
 1   Skill 1               1055 non-null   object
 2   Skill 2 (Additional)  1045 non-null   object
 3   Location              1055 non-null   object
 4   Type of Organization  1055 non-null   object
dtypes: object(5)
memory usage: 41.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Event_id          5599 non-null   object
 1   Category          5599 non-null   object
 2   Domisili          5599 non-null   object
 3   Qualifications 1  5599 non-null   object
 4   Qualifications 2  5599 non-null   object
 5   Qualifications 3  5599 non-null   object
dtypes: object(6)
memory usag

# Pre-processing Using One-Hot Encoding


In [11]:
# Categorical columns to one-hot encode for each table.
cols_user = ['Skill 1', 'Skill 2 (Additional)', 'Type of Organization', 'Location']
cols_event = ['Qualifications 1', 'Qualifications 2', 'Qualifications 3', 'Category', 'Domisili']

# One indicator frame per source column, in the order listed above.
# NOTE(review): the indicator columns are named after the raw values with no
# prefix, so a value that occurs in more than one source column (e.g.
# "Dog training" appears under both skill columns) produces duplicate column
# labels after the concat below. Confirm whether that is intended.
dummies_user = [pd.get_dummies(df_user[name]) for name in cols_user]
dummies_event = [pd.get_dummies(df_event[name]) for name in cols_event]

# Stitch the per-column indicator frames together side by side.
user_dummies = pd.concat(dummies_user, axis=1)
event_dummies = pd.concat(dummies_event, axis=1)

In [12]:
# Append the one-hot indicator columns to the right of the original columns.
user_df = pd.concat([df_user, user_dummies], axis="columns")
event_df = pd.concat([df_event, event_dummies], axis="columns")

In [13]:
# Final encoded frames: the identifier column plus the one-hot indicators,
# without the raw categorical columns they were derived from.
# Built directly from `df_user` / `df_event` and the indicator frames (rather
# than `user_df = user_df.drop(...)`) so the cell can be re-run without
# raising KeyError on columns that were already dropped.
user_df = pd.concat((df_user.drop(columns=cols_user), user_dummies), axis=1)
event_df = pd.concat((df_event.drop(columns=cols_event), event_dummies), axis=1)

# Confirm the resulting column counts and dtypes.
user_df.info()
event_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Columns: 119 entries, Uid to Tangerang
dtypes: object(1), uint8(118)
memory usage: 129.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Columns: 220 entries, Event_id to Yogyakarta
dtypes: object(1), uint8(219)
memory usage: 1.2+ MB


In [14]:
# Display the one-hot encoded user frame.
user_df

Unnamed: 0,Uid,Animal care,Animal rescue,Animal shelter volunteering,Art therapy,Childcare,Clinical research,Computer programming,Counseling,Data analysis,...,IT,Pet and Animal Service,Social,Youth Development,Bandung,Jakarta,Maluku,Ngawi,Solo,Tangerang
0,U1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,U2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,U3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,U4,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,U5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,U1051,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1051,U1052,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1052,U1053,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1053,U1054,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [15]:
# Display the one-hot encoded event frame.
event_df

Unnamed: 0,Event_id,Animal behavior,Animal care,Animal rescue,Animal shelter volunteering,Animal surgery,Art therapy,Camping,Career guidance,Case management,...,Indonesia,Jabodetabek,Jakarta,Jawa Barat,Kalimantan Selatan,Maluku,Ngawi,Solo,Tangerang,Yogyakarta
0,E1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,E2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,E3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,E4,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,E5,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5594,E5595,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5595,E5596,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5596,E5597,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5597,E5598,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Preprocessing Using Tokenizer


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cast every column to text so the feature columns can be concatenated.
# Missing values are replaced with an empty string first: a bare
# `astype(str)` would turn NaN (present in "Skill 2 (Additional)") into the
# literal word "nan", which the tokenizer would then learn as a real token.
df_user = df_user.fillna('').astype(str)
df_event = df_event.fillna('').astype(str)

# Preprocess user data: one lower-cased, space-separated string per user.
user_text_cols = ['Skill 1', 'Skill 2 (Additional)', 'Type of Organization', 'Location']
user_text = df_user[user_text_cols].agg(' '.join, axis=1).str.lower()

# Preprocess event data: one lower-cased, space-separated string per event.
event_text_cols = ['Qualifications 1', 'Qualifications 2', 'Qualifications 3', 'Category', 'Domisili']
event_text = df_event[event_text_cols].agg(' '.join, axis=1).str.lower()

# Tokenize: fit one shared vocabulary on users and events together so the
# same word maps to the same integer id on both sides.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([user_text, event_text]))

user_sequences = tokenizer.texts_to_sequences(user_text)
event_sequences = tokenizer.texts_to_sequences(event_text)

# Zero-pad each set of sequences to a rectangular integer array.
# Users and events are padded separately, each to its own longest sequence.
user_data = pad_sequences(user_sequences)
event_data = pad_sequences(event_sequences)

In [17]:
# Display the padded integer token sequences produced for the users.
user_data

array([[  0,  87,  33, ...,   1,   6, 120],
       [  0,   0,   0, ...,  54,   9,  20],
       [  0,   0,   0, ...,  33,   9,  20],
       ...,
       [  0,   0,   0, ...,  38,  11,  40],
       [  0,   0,   0, ..., 108,   5,  29],
       [  0,   0,   0, ...,  36,   9,  29]], dtype=int32)

# Preprocessing Using StringLookup()

In [24]:
# Output feature name -> source column, for each table.
# NOTE(review): "Qualifications 3" is not carried over for events — confirm
# that this is intentional.
USER_FEATURE_COLUMNS = {
    "Uid": "Uid",
    "Location": "Location",
    "Type": "Type of Organization",
    "Skill 1": "Skill 1",
    "Skill 2": "Skill 2 (Additional)",
}
EVENT_FEATURE_COLUMNS = {
    "Event_id": "Event_id",
    "Domisili": "Domisili",
    "Category": "Category",
    "Q1": "Qualifications 1",
    "Q2": "Qualifications 2",
}


def _row_to_features(row, feature_columns):
    """Return a dict of the selected values of `row`, keyed by feature name."""
    return {feature: row[column] for feature, column in feature_columns.items()}


# One feature dict per row of the raw tables.
user = user_ds.apply(_row_to_features, axis=1, args=(USER_FEATURE_COLUMNS,))
event = event_ds.apply(_row_to_features, axis=1, args=(EVENT_FEATURE_COLUMNS,))

In [None]:
# Vocabulary that maps each raw user id string to an integer index.
# The ids are passed as a NumPy string array taken from the raw user table.
user_ids = np.asarray(user_ds["Uid"], dtype=str)
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(user_ids)

# Vocabulary for the events, keyed by event id.
# The previous version cast `event` (a Series of dicts) to float32, which
# raises TypeError and overwrites `event`, and then adapted on an undefined
# `event_values`. `event` is now left untouched and `event_values` is defined
# explicitly here.
# NOTE(review): event ids are used as the lookup key; confirm this is the
# intended vocabulary rather than one of the event content features.
event_values = np.asarray(event_ds["Event_id"], dtype=str)
event_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
event_vocabulary.adapt(event_values)

# Define the Models :D

In [None]:
class VolunteerModel(tfrs.Model):
  """Two-tower retrieval model: a user tower and an event tower joined by a task.

  Deriving from `tfrs.Model` reduces boilerplate; under the hood this is
  still a plain Keras model, so only `compute_loss` has to be defined.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      event_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Store the two towers and the retrieval task.

    Args:
      user_model: Model applied to the "Uid" feature to embed a user.
      event_model: Model applied to the "Event_id" feature to embed an event.
      task: Retrieval task called with both embeddings to produce the loss.
    """
    super().__init__()

    # Set up user and event representations.
    self.user_model = user_model
    self.event_model = event_model

    # Set up a retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    Args:
      features: Batch dict that must contain the "Uid" and "Event_id" keys,
        the same key names used when the per-row feature dicts are built.
      training: Unused here; accepted to match the signature Keras calls.

    Returns:
      The value returned by the retrieval task for the two embeddings.
    """
    # "Uid" is the key the user feature dicts are built with ("user_id" is not).
    user_embeddings = self.user_model(features["Uid"])

    # A dict cannot be indexed with several keys at once:
    # features["Category", "Domisili", "Q1", "Q2"] looks up one tuple key and
    # raises KeyError. The event tower takes a single string feature, so it is
    # fed the event id.
    # NOTE(review): if the content features (Category, Domisili, Q1, Q2) are
    # meant to drive the event embedding, the event tower needs to be a
    # multi-input model; that is a design change, not done here.
    event_embeddings = self.event_model(features["Event_id"])

    return self.task(user_embeddings, event_embeddings)

In [None]:
# Define user and event models.
# Each tower is: string id -> integer index (StringLookup) -> 64-d embedding.
# The first layer must be the adapted lookup layer, not a DataFrame, and the
# embedding table size comes from the lookup layer's `vocabulary_size()`
# (DataFrames have no `vocab_size()`).
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
])
event_model = tf.keras.Sequential([
    event_vocabulary,
    tf.keras.layers.Embedding(event_vocabulary.vocabulary_size(), 64)
])

# Candidate set for the top-K metric: every event id, as a tf.data.Dataset so
# it can be batched and mapped through the event tower.
event_ids = tf.data.Dataset.from_tensor_slices(
    np.asarray(event_ds["Event_id"], dtype=str)
)

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    event_ids.batch(128).map(event_model)
  )
)

In [None]:
# Create a retrieval model.
model = VolunteerModel(user_model, event_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Training examples must be (user, event) pairs carrying both "Uid" and
# "Event_id"; the user table alone (and a pandas Series, which has no
# `.batch`) cannot be fed to `fit`.
# NOTE(review): the notebook loads no user-event interaction log. As a
# stand-in, a user is paired with every event whose Category equals the
# user's "Type of Organization" and whose Domisili equals the user's
# Location. Replace this with real interaction data when it exists.
pairs = user_ds.merge(
    event_ds,
    left_on=["Type of Organization", "Location"],
    right_on=["Category", "Domisili"],
)[["Uid", "Event_id"]]
if pairs.empty:
    raise ValueError(
        "No (Uid, Event_id) training pairs could be built from the user and event tables."
    )

train_pairs = tf.data.Dataset.from_tensor_slices({
    "Uid": np.asarray(pairs["Uid"], dtype=str),
    "Event_id": np.asarray(pairs["Event_id"], dtype=str),
})

# Train for 3 epochs.
model.fit(train_pairs.batch(4096), epochs=3)

# Use brute-force search to set up retrieval using the trained representations.
# The towers live on `model`, not on `event`.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
candidate_ids = tf.data.Dataset.from_tensor_slices(
    np.asarray(event_ds["Event_id"], dtype=str)
)
index.index_from_dataset(
    candidate_ids.batch(100).map(
        lambda event_id: (event_id, model.event_model(event_id))))

# Get some recommendations. User ids in the data look like "U42"; a bare "42"
# is not in the vocabulary and would be treated as an unknown user.
query_uid = "U42"
_, titles = index(np.array([query_uid]))
print(f"Top 3 recommendations for user {query_uid}: {titles[0, :3]}")