Dataset: https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database?select=rating.csv

## Librerias

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import pandas as pd
import numpy as np

## Recolección de la data.

In [2]:
ratings = pd.read_csv("rating.csv")
items = pd.read_csv("anime.csv")

## Preparación de la data/ preprocesamiento la data

In [3]:
ratings.head(6)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
5,1,355,-1


In [4]:
items.head(6)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [5]:
items_titles = items[["anime_id", "name"]]
items_titles.head(6)

Unnamed: 0,anime_id,name
0,32281,Kimi no Na wa.
1,5114,Fullmetal Alchemist: Brotherhood
2,28977,Gintama°
3,9253,Steins;Gate
4,9969,Gintama&#039;
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...


In [6]:
items_ratings = ratings.merge(items_titles, on="anime_id")
items_ratings.head(6)

Unnamed: 0,user_id,anime_id,rating,name
0,1,20,-1,Naruto
1,3,20,8,Naruto
2,5,20,6,Naruto
3,6,20,-1,Naruto
4,10,20,-1,Naruto
5,21,20,8,Naruto


In [7]:
items_ratings.rename(columns={'anime_id': 'item_id'}, inplace = True)

In [8]:
items_titles[items_titles["name"] == "Naruto"]

Unnamed: 0,anime_id,name
841,20,Naruto


In [9]:
items_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813727 entries, 0 to 7813726
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  int64 
 1   item_id  int64 
 2   rating   int64 
 3   name     object
dtypes: int64(3), object(1)
memory usage: 238.5+ MB


In [10]:
items_ratings.isna().any()

user_id    False
item_id    False
rating     False
name       False
dtype: bool

In [11]:
items_ratings.loc[items_ratings['rating'] == -1, 'rating'] = 0

In [12]:
items_ratings = items_ratings.sample(10000)

## Entrenamiento del modelo.

In [13]:
encoder = LabelEncoder()

In [14]:
encoder.fit(items_ratings['user_id'])
items_ratings.loc[:, 'user_id'] = sorted(encoder.transform(items_ratings['user_id']))

encoder.fit(items_ratings['item_id'])
items_ratings.loc[:, 'item_id'] = encoder.transform(items_ratings['item_id'])

In [15]:
df_train, df_val = train_test_split(items_ratings, test_size=0.2, random_state=42)

In [16]:
n_users = len(items_ratings.user_id.unique())
n_items = len(items_ratings.item_id.unique())

train_matrix = np.zeros((n_users, n_items))

for row in df_train.itertuples():
    train_matrix[row[1]-1, row[2]-1] = row[3]

In [17]:
input_users = tf.keras.layers.Input(shape=[1])
embedding_users = tf.keras.layers.Embedding(n_users, 100)(input_users)
flatten_users = tf.keras.layers.Flatten()(embedding_users)

input_items = tf.keras.layers.Input(shape=[1])
embedding_items = tf.keras.layers.Embedding(n_items, 100)(input_items)
flatten_items = tf.keras.layers.Flatten()(embedding_items)

In [18]:
concat = tf.keras.layers.Concatenate()([flatten_users, flatten_items])

dense1 = tf.keras.layers.Dense(128, activation='relu')(concat)
dropout1 = tf.keras.layers.Dropout(0.2)(dense1)

dense2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
dropout2 = tf.keras.layers.Dropout(0.2)(dense2)

In [19]:
output = tf.keras.layers.Dense(1)(dropout2)

model = tf.keras.Model(inputs=[input_users, input_items], outputs=output)
model.compile(loss='mse', optimizer='adam')

In [20]:
model.fit([df_train.user_id, df_train.item_id], df_train.rating, epochs=10, verbose=1, validation_split=0.2, batch_size=70)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27b0130d480>

In [21]:
model.fit([df_train.user_id, df_train.item_id], df_train.rating, epochs=10, verbose=1, validation_split=0.2, batch_size=70)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27b01627580>

In [22]:
model.fit([df_train.user_id, df_train.item_id], df_train.rating, epochs=10, verbose=1, validation_split=0.2, batch_size=70)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27b016858a0>

In [23]:
test_loss = model.evaluate([df_val.user_id, df_val.item_id], df_val.rating)
print('Test loss:', test_loss)

Test loss: 14.180154800415039


In [24]:
predictions = model.predict([df_val.user_id, df_val.item_id])



In [25]:
predictions[0]

array([5.7409606], dtype=float32)

In [26]:
df_val['rating']

6965479     7
5916873     7
6190729     8
1371699    10
7488997     8
           ..
748240      8
1754798    10
6314206     7
6519316    10
5189450     7
Name: rating, Length: 2000, dtype: int64

In [27]:
user_id = 100
item_id = 200

rating_pred = model.predict([np.array([user_id]), np.array([item_id])])
print('Rating prediction:', rating_pred[0][0])

Rating prediction: 2.6572433


In [33]:
def recommend_items(user_id, model, n=2):
    # Obtener las películas que ya ha visto el usuario
    seen_movies = items_ratings[items_ratings['user_id'] == user_id]['item_id'].tolist()
    
    # Obtener las películas que el usuario aún no ha visto
    item_ids = list(set(items_ratings['item_id']) - set(seen_movies))
    
    # Predecir las puntuaciones para las películas no vistas
    ratings_pred = model.predict([np.array([user_id]*len(item_ids)), np.array(item_ids)])
    ratings_pred = np.array([rating[0] for rating in ratings_pred])
    
    # Obtener las mejores n películas recomendadas
    top_item_ids = (-ratings_pred).argsort()[:n]
    top_item_ids = [item_ids[i] for i in top_item_ids]
    top_item_ids += [0]*(n - len(top_item_ids))
    top_ratings_pred = ratings_pred[top_item_ids]

    items_pred = pd.DataFrame(columns=['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'])

    i = 0
    for ids in top_item_ids:
        items_pred.loc[i] = items.iloc[ids]
        i += 1

    items_pred['predicted_rating'] = top_ratings_pred
    
    return items_pred

In [34]:
user_id = 1
recommendations = recommend_items(user_id, model, n=3)
recommendations



Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,predicted_rating
0,4189,Winter Sonata,"Drama, Romance",TV,26,7.51,13582,6.404348
1,1145,Kaze no Youjinbou,"Action, Drama, Mystery, Shounen",TV,25,7.27,5646,7.870547
2,76,Mahou Shoujo Lyrical Nanoha,"Action, Comedy, Drama, Magic, Super Power",TV,13,7.52,62582,7.503816
