In [1]:
import re
import time
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

from gensim.models import Word2Vec
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, StringLookup, Flatten, TextVectorization, Embedding, Layer, Dense, Dropout, Dot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers import Adagrad, Adam, Ftrl


In [2]:
dataset, info=tfds.load('movielens/100k-ratings', split='train', with_info=True)
dataset=tfds.as_dataframe(dataset)

In [3]:
check=dataset.copy()
check.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,[7],b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4,b'doctor',4.0,b'53211'
1,25.0,"[4, 14]",b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5,b'entertainment',2.0,b'80525'
2,18.0,[4],b'412',"b'Very Brady Sequel, A (1996)'",24.0,882075110,True,b'301',17,b'student',4.0,b'55439'
3,50.0,"[5, 7]",b'56',b'Pulp Fiction (1994)',50.0,883326919,True,b'60',4,b'healthcare',4.0,b'06472'
4,50.0,"[10, 16]",b'895',b'Scream 2 (1997)',55.0,891409199,True,b'197',18,b'technician',3.0,b'75094'


# select relevant columns (for Collaborative Recomendation)

In [4]:
check=check[['movie_id', 'movie_title', 'user_id', 'user_rating']]
check['movie_title']=[string.decode('utf-8') for string in check['movie_title']]
check.head()

Unnamed: 0,movie_id,movie_title,user_id,user_rating
0,b'357',One Flew Over the Cuckoo's Nest (1975),b'138',4.0
1,b'709',Strictly Ballroom (1992),b'92',2.0
2,b'412',"Very Brady Sequel, A (1996)",b'301',4.0
3,b'56',Pulp Fiction (1994),b'60',4.0
4,b'895',Scream 2 (1997),b'197',3.0


# handling the text in dataset

In [5]:
encode_movie={title:i for i, title in enumerate(set(check['movie_title']))}
decode_movie={i:title for i, title in enumerate(set(check['movie_title']))}
encode_user={user:i for i,user in enumerate(set(check['user_id']))}
decode_user={i:user for i,user in enumerate(set(check['user_id']))}
movie_id={title:id for title, id in zip(check['movie_title'],check['movie_id'])}

In [6]:
check['encode_movie']=[encode_movie[x] for x in check['movie_title']]
check['encode_user']=[encode_user[x] for x in check['user_id']]
check.head()

Unnamed: 0,movie_id,movie_title,user_id,user_rating,encode_movie,encode_user
0,b'357',One Flew Over the Cuckoo's Nest (1975),b'138',4.0,1361,179
1,b'709',Strictly Ballroom (1992),b'92',2.0,900,239
2,b'412',"Very Brady Sequel, A (1996)",b'301',4.0,218,698
3,b'56',Pulp Fiction (1994),b'60',4.0,1479,116
4,b'895',Scream 2 (1997),b'197',3.0,522,557


In [7]:
print(f"we have {check['movie_id'].nunique()}movies, and {check['user_id'].nunique()}users")

we have 1682movies, and 943users


In [8]:
check.duplicated().any()

False

# user-item interaction matrix

In [9]:
interaction = check.pivot_table(index='encode_movie', columns='encode_user', values='user_rating').fillna(0)
interaction

encode_user,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
encode_movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1662,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
interaction=interaction.to_numpy()
np.count_nonzero(interaction)

99693

In [11]:
mask=np.where(interaction==0, 0, 1)
mask

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

$$ J(x^{(1)}, ..., x^{(n_m)}, \theta^{(1)}, ..., \theta^{(n_u)}) = \frac{1}{2} \sum_{(i,j):r(i,j)=1} ((\theta^{(j)})^T x^{(i)} - y^{(i,j)})^2 +  REGULARIZATION$$



$$ J(x^{(1)}, ..., x^{(n_m)}, \theta^{(1)}, ..., \theta^{(n_u)}) = \frac{1}{2} \sum_{(i,j):r(i,j)=1} ((\theta^{(j)})^T x^{(i)} - y^{(i,j)})^2 + \frac{\lambda}{2} \sum_{j=1}^{n_u} \sum_{k=1}^{n} (\theta_k^{(j)})^2 + \frac{\lambda}{2} \sum_{i=1}^{n_m} \sum_{k=1}^{n} (x_k^{(i)})^2 $$

In [12]:
class RecomendationSys:
    "this class accept and train item-user interaction matrix to recomend items"
    def __init__(self, features=100, lambda_=0.1):
        self.feature=features
        self.lambda_=lambda_

    def initialize(self, num_item, num_user):
        Xset=tf.cast(tf.convert_to_tensor(np.random.randn(num_item, self.feature)), tf.float64) #item features
        W=tf.cast(tf.convert_to_tensor(np.random.randn(num_user, self.feature)), tf.float64) #user taste
        b=tf.cast(tf.convert_to_tensor(np.zeros((1, num_user))), tf.float64)
        Xset=tf.Variable(Xset, name='Xset')
        W=tf.Variable(W, name='W')
        b=tf.Variable(b, name='b')
        return Xset, W, b

    def cost(self, Xset, W, b, Yset):
        j = (tf.linalg.matmul(Xset, tf.transpose(W)) + b - Yset)
        J = 0.5 * tf.reduce_sum(j**2) + (self.lambda_/2) * (tf.reduce_sum(Xset**2) + tf.reduce_sum(W**2))
        return J

    def compile(self, loss=None, optimizer=None, metrics=None):
        self.loss=loss
        self.optimizer=optimizer
        self.metrics=metrics
        
    def fit(self, Yset, epochs=200):
        self.mask=np.where(Yset != 0, True, False)
        num_item, num_user=Yset.shape
        Yset=tf.cast(tf.convert_to_tensor(Yset),tf.float64)
        Xset,W,b=self.initialize(num_item, num_user)
        for iter in range(epochs):
            start=time.time()
            with tf.GradientTape() as tape: 
                cost_value=self.cost(Xset, W, b, Yset) #foward pass
            grads=tape.gradient(cost_value, [Xset, W, b]) #backward pass
            self.optimizer.apply_gradients(zip(grads, [Xset,W,b]))
            if iter%20==0:
                print( f"Epoch {iter+20}/{epochs}\n6/6 ━━━━━━━━━━━━━━━━━━━━ 2s {time.time()-start:.2f}ms/step -- loss: {cost_value:.2f}")
        self.p = np.matmul(Xset.numpy(), np.transpose(W.numpy())) + b.numpy()
    
            
    def predict_rate(self, user_id=None, movie_id=None):    
        return self.p[movie_id, user_id]
        
    def predict_item(self, user_id, not_seen=True, limit=10):
        if not_seen==True:
            user_mask=self.mask[:,user_id]
            predict=self.p[:,user_id]
            predict[np.where(user_mask)[0]]=0.0
            return np.argsort(predict)[::-1][:limit]
        else:
            return np.argsort(self.p[:,user_id])[::-1][:limit]






In [13]:
model=RecomendationSys(features=50, lambda_=0.5)
model.compile(optimizer=Ftrl(learning_rate=1e-1))
model.fit(interaction)

Epoch 20/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.40ms/step -- loss: 39763290.35
Epoch 40/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.27ms/step -- loss: 5894506.71
Epoch 60/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.26ms/step -- loss: 2765667.77
Epoch 80/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.24ms/step -- loss: 1586784.22
Epoch 100/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.27ms/step -- loss: 1054816.94
Epoch 120/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.28ms/step -- loss: 791241.44
Epoch 140/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.25ms/step -- loss: 649749.23
Epoch 160/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.24ms/step -- loss: 570930.52
Epoch 180/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.27ms/step -- loss: 526469.58
Epoch 200/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 0.24ms/step -- loss: 500635.16


In [14]:
interaction[2,936]

0.0

In [15]:
model.predict_rate(user_id=2, movie_id=936)

0.0675667641876631

In [16]:
recomendations=[decode_movie[x] for x in model.predict_item(4)]
recomendations

['Star Trek IV: The Voyage Home (1986)',
 'Indiana Jones and the Last Crusade (1989)',
 'Cure, The (1995)',
 'Robin Hood: Prince of Thieves (1991)',
 'Back to the Future (1985)',
 'Sleepless in Seattle (1993)',
 'Clear and Present Danger (1994)',
 'Usual Suspects, The (1995)',
 'Lost World: Jurassic Park, The (1997)',
 'Mary Poppins (1964)']

## using tensorflow recomenders to train the above colaborative

selecting relevant feature

In [17]:
check=check[['encode_movie', 'encode_user', 'user_rating']]
check.head()

Unnamed: 0,encode_movie,encode_user,user_rating
0,1361,179,4.0
1,900,239,2.0
2,218,698,4.0
3,1479,116,4.0
4,522,557,3.0


In [18]:
interaction.shape

(1664, 943)

In [19]:
# building model for colaborative filtering
user_input=Input(shape=(943,))
item_input=Input(shape=(1664,))

user_embedding=Embedding(input_dim=943, output_dim=50, trainable=True)(user_input)
item_embedding=Embedding(input_dim=1664, output_dim=50, trainable=True)(item_input)
dot_layer=Dot(axes=-1)([item_embedding, user_embedding])


model=Model(inputs=[item_input, user_input], outputs=dot_layer)
model.summary()

In [20]:
user=tf.data.Dataset.from_tensor_slices(check['encode_user'].to_numpy()).batch(100)
item=tf.data.Dataset.from_tensor_slices(check['encode_movie'].to_numpy()).batch(100)
label=tf.data.Dataset.from_tensor_slices(interaction).batch(100)

In [21]:
X=tf.data.Dataset.zip(item,user)
data=tf.data.Dataset.zip(X,label)
model.compile(loss=tf.keras.losses.CosineSimilarity(),
              optimizer=Ftrl())
#model.fit(, label, epochs=10)
model.fit(user, item, label, epochs=10)

ValueError: When providing `x` as a tf.data.Dataset, `y` should not be passed. Instead, the targets should be included as part of the tf.data.Dataset.

In [1]:
import re

original_string = '⊛ face exhaling'
cleaned_string = re.sub(r'\W+', '', original_string)

print("Original string:", original_string)
print("Cleaned string:", cleaned_string)

Original string: ⊛ face exhaling
Cleaned string: faceexhaling
