In [9]:
# Dependency (run once if it is not installed yet): %pip install deepctr-torch

In [11]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc 

import torch
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from IPython.display import display
plt.style.use('ggplot')

import warnings as w
w.filterwarnings(action='ignore')
pd.set_option('display.max_columns',None)

In [12]:
# Load the ratings table (one row per user/movie rating, with the movie's
# title, genres and concatenated tags) from the working directory.
ratings_csv = 'data.csv'
data1 = pd.read_csv(ratings_csv)
data1

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,concatenated_tags
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar|pixar|fun
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,moldy|old
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,empty
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,mystery|twist ending|serial killer
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,mindfuck|suspense|thriller|tricky|twist ending...
...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller,empty
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller,action|dark hero|gun tactics|hitman|keanu reev...
100833,610,168250,5.0,1494273047,Get Out (2017),Horror,empty
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi,dark|emotional|gritty|heartbreaking|predictibl...


In [13]:
# Sanity check: number of missing values in the tag column.
data1['concatenated_tags'].isna().sum()

0

In [14]:
def split(x, gen):
    """Encode a '|'-separated string as a list of unique integer ids.

    Tokens not seen before are added to a module-level vocabulary:
    ``key2index`` when ``gen`` is truthy (the caller passes 1 for genres) and
    ``key3index`` otherwise (the caller passes 0 for tags). Ids start at 1
    because 0 is reserved as the padding value for sequence inputs.

    Parameters
    ----------
    x : str
        Tokens joined with '|'.
    gen : number
        Vocabulary selector flag; new tokens are registered only when the
        flag equals 1 (genres) or 0 (tags).

    Returns
    -------
    list of int
        The ids of the tokens in ``x`` with duplicates removed (built through
        a set, so the order is not the token order of ``x``).
    """
    vocab = key2index if gen else key3index
    # New tokens are only registered for the exact flag values 1 and 0.
    may_register = gen == (1 if gen else 0)

    tokens = x.split('|')
    for token in tokens:
        if may_register and token not in vocab:
            vocab[token] = len(vocab) + 1

    return list({vocab[token] for token in tokens})



if __name__ == "__main__":
    # Work on a copy so the raw frame loaded above stays untouched.
    data = data1.copy()
    # Categorical id columns fed to the model. Genres and tags are handled as
    # sequence features below; title, timestamp and rating are not inputs.
    sparse_features = ["movieId", "userId"]
    target = ['rating']

    # 1. Label-encode every sparse id column into contiguous integer codes.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # Vocabularies filled in by split(): key2index for genres, key3index for tags.
    key2index = {}
    key3index = {}

    # Encode each row's '|'-separated genres (flag 1) and then its tags (flag 0),
    # e.g. [[1, 2], [3, 4]].
    genres_list = [split(row_genres, 1.0) for row_genres in data['genres'].values]
    concatenated_tags_list = [split(row_tags, 0.0) for row_tags in data['concatenated_tags'].values]

    # Per-row sequence lengths, e.g. [2, 3, 4]; the longest one fixes the padded width.
    genres_length = np.array([len(ids) for ids in genres_list])
    concatenated_tags_length = np.array([len(ids) for ids in concatenated_tags_list])
    max_len_gen = genres_length.max()
    max_len_tag = concatenated_tags_length.max()

    # Right-pad with 0 (padding='post') to a rectangular array,
    # e.g. [[1, 2, 0, 0, 0], ...].
    genres_list = pad_sequences(genres_list, maxlen=max_len_gen, padding='post')
    concatenated_tags_list = pad_sequences(concatenated_tags_list, maxlen=max_len_tag, padding='post')

    # 2. Count the unique values of each sparse field and build the feature
    #    configuration. Illustrative shape of the fixed-length columns:
    #        SparseFeat('user_id', 1000, embedding_dim=4),
    #        SparseFeat('movie_id', 5000, embedding_dim=4),
    fixlen_feature_columns = [
        SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=16)
        for name in sparse_features
    ]

    # Sequence features are pooled with the mean of their embeddings. The
    # vocabulary size is len(vocab) + 1 because id 0 is the padding value.
    varlen_feature_columns = [
        VarLenSparseFeat(
            SparseFeat(name, vocabulary_size=len(vocab) + 1, embedding_dim=16),
            maxlen=seq_len,
            combiner='mean',
        )
        for name, vocab, seq_len in (
            ('genres', key2index, max_len_gen),
            ('concatenated_tags', key3index, max_len_tag),
        )
    ]

    # The linear part and the DNN part of the model use the same columns.
    linear_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]
    dnn_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]

# Let's load the model

In [15]:
# Rebuild the DeepFM architecture from the feature columns defined above; all
# other constructor arguments are left at the library defaults.
# NOTE(review): `task` is not set here, so the library default is used -
# confirm it matches the task the saved checkpoint was trained with.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
)

In [16]:
# Restore the trained weights from the local checkpoint file 'DeepFM'.
# map_location='cpu' remaps the stored tensors to CPU, so a checkpoint that was
# saved on a GPU machine also loads on a machine without CUDA.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load('DeepFM', map_location='cpu'))

<All keys matched successfully>

# Choose a user and the number of recommendations (k)

In [17]:
# Recommendation request. `user_id` is compared against the label-encoded
# `userId` column of `data`, so it is an encoded id rather than the raw id
# stored in data.csv. `k` is the number of top recommendations to return.
user_id, k = 123, 10

In [18]:
# Ratings already given by the selected user (ids shown are the encoded ones).
data.loc[data['userId'] == user_id]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,concatenated_tags
19179,123,0,4.0,1336584336,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar|pixar|fun
19180,123,46,4.5,1336412889,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,mindfuck|suspense|thriller|tricky|twist ending...
19181,123,97,3.5,1336584326,Braveheart (1995),Action|Drama|War,beautiful scenery|epic|historical|inspirationa...
19182,123,147,4.0,1336409774,Kids (1995),Drama,controversial|new york city|nudity (full frontal)
19183,123,220,3.0,1336409649,Junior (1994),Comedy|Sci-Fi,empty
19184,123,224,4.0,1336423436,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,classic|space action|action|sci-fi|epic|great ...
19185,123,257,4.5,1336423393,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,good dialogue|great soundtrack|non-linear|cult...
19186,123,277,4.5,1336412895,"Shawshank Redemption, The (1994)",Crime|Drama,prison|stephen king|wrongful imprisonment|morg...
19187,123,302,3.5,1336584451,Ace Ventura: Pet Detective (1994),Comedy,empty
19188,123,314,5.0,1336423402,Forrest Gump (1994),Comedy|Drama|Romance|War,shrimp|vietnam|bubba gump shrimp|lieutenant da...


In [19]:
# Candidate set: every distinct (label-encoded) movie id present in `data`.
item_ids = pd.unique(data['movieId'])

# One row per candidate movie, each carrying the same target user id.
user_features = {'userId': [user_id] * len(item_ids)}
user_df = pd.DataFrame(user_features)
item_df = pd.DataFrame(item_ids, columns=['movieId'])

In [20]:
# For every candidate movie, take the padded genre row of the first rating row
# of that movie in `data`.
# `genres_list` is positional (one row per row of `data`), so the first row
# position of each movie is computed once with np.unique instead of running a
# full boolean scan of `data` per movie.
# Guard: this cell overwrites `genres_list`. Once it already holds one row per
# candidate movie, indexing it again with positions into `data` would be
# wrong, so a re-run is a no-op. (If the lengths match before the first run,
# every movie occurs exactly once and the selection would be the identity.)
if len(genres_list) != len(item_df):
    _movie_col = data['movieId'].to_numpy()
    # Sorted unique movie ids and the position of each one's first occurrence.
    _uniq_ids, _first_pos = np.unique(_movie_col, return_index=True)
    _rows = _first_pos[np.searchsorted(_uniq_ids, item_df['movieId'].to_numpy())]
    genres_list = np.asarray(genres_list)[_rows]

In [21]:
# For every candidate movie, take the padded tag row of the first rating row
# of that movie in `data`.
# `concatenated_tags_list` is positional (one row per row of `data`), so the
# first row position of each movie is computed once with np.unique instead of
# running a full boolean scan of `data` per movie.
_movie_col = data['movieId'].to_numpy()
# Sorted unique movie ids and the position of each one's first occurrence.
_uniq_ids, _first_pos = np.unique(_movie_col, return_index=True)
_rows = _first_pos[np.searchsorted(_uniq_ids, item_df['movieId'].to_numpy())]
conc_list = np.asarray(concatenated_tags_list)[_rows]

In [22]:
# Place the user column and the candidate-movie column side by side: one
# (user, movie) pair per row.
all_data = pd.concat((user_df, item_df), axis='columns')
all_data

Unnamed: 0,userId,movieId
0,123,0
1,123,2
2,123,5
3,123,43
4,123,46
...,...,...
9719,123,9307
9720,123,9312
9721,123,9324
9722,123,9371


In [23]:
# Model input: one array per feature name used in the feature columns. The id
# features come from `all_data`; the sequence features are the per-movie padded
# arrays built in the cells above.
model_input = {
    'userId': all_data['userId'].to_numpy(),
    'movieId': all_data['movieId'].to_numpy(),
    'genres': genres_list,
    'concatenated_tags': conc_list,
}

In [24]:
# Score every (user, candidate movie) pair with the loaded model.
predictions = model.predict(model_input, batch_size=256)

# 3. Rank the candidates from highest to lowest score: argsort sorts in
#    ascending order, so the resulting index array is reversed.
ascending_order = np.argsort(predictions.reshape(-1))
sorted_indices = ascending_order[::-1]

# 4. Collect the movies the user has already rated so they can be excluded
#    from the recommendations.
user_rows = data['userId'] == user_id
interacted_item_ids = set(data.loc[user_rows, 'movieId'])

In [25]:
# Keep the ranking order but drop every movie the user has already rated.
filtered_indices = list(
    filter(lambda idx: item_ids[idx] not in interacted_item_ids, sorted_indices)
)

# 5. The first k remaining positions are the recommendations; map them back to
#    (encoded) movie ids.
top_k_indices = filtered_indices[:k]
top_k_items = item_ids[top_k_indices]

# Report the encoded ids, then one line per recommendation with its title.
print("Top {} Recommendations: {}".format(k, top_k_items))
for rank, movie_id in enumerate(top_k_items, start=1):
    title = data.loc[data['movieId'] == movie_id, 'title'].iloc[0]
    print("Top {} Recommendations: {}".format(rank, title))

Top 10 Recommendations: [9425 8975 9693 8089 7042 9216 4468 8943 1721 8968]
Top 1 Recommendations: I Am Not Your Negro (2017)
Top 2 Recommendations: Ooops! Noah is Gone... (2015)
Top 3 Recommendations: Won't You Be My Neighbor? (2018)
Top 4 Recommendations: Act of Killing, The (2012)
Top 5 Recommendations: Garfield's Pet Force (2009)
Top 6 Recommendations: Who Killed Chea Vichea? (2010)
Top 7 Recommendations: Umberto D. (1952)
Top 8 Recommendations: Villain (1971)
Top 9 Recommendations: Beloved (1998)
Top 10 Recommendations: Nasu: Summer in Andalusia (2003)
