In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
import mlflow
import seaborn as sns
import matplotlib.pyplot as plt
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the latest version
anime_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "CooperUnion/anime-recommendations-database",
  "anime.csv",
)

rating_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "CooperUnion/anime-recommendations-database",
  "rating.csv",
)

# anime_df.to_csv("data/raw/anime.csv")
# rating_df.to_csv("data/raw/rating.csv")

# Cleaning rating_df
rating_df_cleaned = rating_df.drop_duplicates()
rating_df_cleaned.loc[rating_df_cleaned['rating'] == -1, "rating"] = 0
rating_df_cleaned['rating'] = rating_df_cleaned['rating'].values.astype(np.float32)

# FEATURE AUGMENTATION

# Calculate 'main_genre' on the master anime DataFrame
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['main_genre'] = anime_df['genre'].apply(
    lambda x: x.split(',')[0].strip() if pd.notna(x) and len(x.split(',')) > 0 else 'Unknown'
)

# FEATURE EXTRACTION

# Merge datasets to get 'type' and 'main_genre' columns
rating_df_merged = pd.merge(rating_df_cleaned, anime_df[['anime_id', 'type', 'main_genre']], on='anime_id')

# Filter by 'TV' series type
rating_df_cleaned = rating_df_merged[rating_df_merged['type'] == 'TV'].copy()

# DATA FILTERING: REMOVING COLD START USERS AND ITEMS

MIN_USER_RATINGS = 100
MIN_ANIME_RATINGS = 100

print(f"Initial filtered TV series ratings: {len(rating_df_cleaned)}.")

# Filter Users (Active Users)
user_counts = rating_df_cleaned['user_id'].value_counts()
active_users = user_counts[user_counts >= MIN_USER_RATINGS].index
rating_df_cleaned = rating_df_cleaned[rating_df_cleaned['user_id'].isin(active_users)]

# Filter Anime (Popular Items)
anime_counts = rating_df_cleaned['anime_id'].value_counts()
popular_anime = anime_counts[anime_counts >= MIN_ANIME_RATINGS].index
rating_df_cleaned = rating_df_cleaned[rating_df_cleaned['anime_id'].isin(popular_anime)]

print(f"Final data size after filtering (100/100): {len(rating_df_cleaned)} ratings.")


min_rating = rating_df_cleaned['rating'].min()
max_rating = rating_df_cleaned['rating'].max()

# --- ENCODING ALL FEATURES (User, Anime, Genre) ---

# Encoding User and Anime IDs
user_ids = rating_df_cleaned['user_id'].unique().tolist()
user_to_user_encoded = {x: i for i, x in enumerate(user_ids)}

anime_ids = rating_df_cleaned['anime_id'].unique().tolist()
anime_to_anime_encoded = {x: i for i, x in enumerate(anime_ids)}
anime_encoded_to_anime = {i: x for i, x in enumerate(anime_ids)}

# Encoding Main Genre
genre_names = rating_df_cleaned['main_genre'].unique().tolist()
genre_to_genre_encoded = {x: i for i, x in enumerate(genre_names)}

# Mapping encoded features to the DataFrame
rating_df_cleaned['user'] = rating_df_cleaned['user_id'].map(user_to_user_encoded)
rating_df_cleaned['anime'] = rating_df_cleaned['anime_id'].map(anime_to_anime_encoded)
# New encoded column for the hybrid model input
rating_df_cleaned['genre_code'] = rating_df_cleaned['main_genre'].map(genre_to_genre_encoded)

# Shuffle dataset
rating_df_cleaned = rating_df_cleaned.sample(frac=1, random_state=42)


  from .autonotebook import tqdm as notebook_tqdm
  anime_df = kagglehub.load_dataset(
  rating_df = kagglehub.load_dataset(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df_cleaned.loc[rating_df_cleaned['rating'] == -1, "rating"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df_cleaned['rating'] = rating_df_cleaned['rating'].values.astype(np.float32)


Initial filtered TV series ratings: 5283595.
Final data size after filtering (100/100): 3435611 ratings.


In [3]:
rating_df_cleaned

Unnamed: 0,user_id,anime_id,rating,type,main_genre,user,anime,genre_code
2257733,21764,10110,10.0,TV,Comedy,5087,243,1
1654590,16077,16524,6.0,TV,Action,3652,679,0
2935692,27306,25835,7.0,TV,Comedy,6558,1095,1
6651127,61368,11235,7.0,TV,Comedy,14570,553,1
4974481,47662,12967,6.0,TV,Action,11196,416,0
...,...,...,...,...,...,...,...,...
3801027,35524,12291,7.0,TV,Comedy,8526,1057,1
5314729,50375,21881,8.0,TV,Action,11901,86,0
5017487,48024,68,5.0,TV,Adventure,11287,126,2
6272814,58469,6707,6.0,TV,Action,13837,31,0


In [4]:
user_to_user_encoded

{1: 0,
 5: 1,
 7: 2,
 13: 3,
 17: 4,
 21: 5,
 38: 6,
 39: 7,
 43: 8,
 46: 9,
 54: 10,
 73: 11,
 80: 12,
 93: 13,
 98: 14,
 108: 15,
 109: 16,
 120: 17,
 123: 18,
 129: 19,
 139: 20,
 145: 21,
 155: 22,
 159: 23,
 160: 24,
 163: 25,
 166: 26,
 177: 27,
 183: 28,
 189: 29,
 191: 30,
 198: 31,
 201: 32,
 210: 33,
 226: 34,
 232: 35,
 233: 36,
 235: 37,
 244: 38,
 245: 39,
 247: 40,
 248: 41,
 250: 42,
 256: 43,
 261: 44,
 270: 45,
 271: 46,
 280: 47,
 281: 48,
 282: 49,
 285: 50,
 288: 51,
 294: 52,
 296: 53,
 301: 54,
 308: 55,
 317: 56,
 320: 57,
 321: 58,
 326: 59,
 341: 60,
 342: 61,
 348: 62,
 352: 63,
 361: 64,
 372: 65,
 373: 66,
 375: 67,
 379: 68,
 385: 69,
 392: 70,
 395: 71,
 398: 72,
 400: 73,
 407: 74,
 418: 75,
 421: 76,
 427: 77,
 428: 78,
 431: 79,
 435: 80,
 436: 81,
 438: 82,
 439: 83,
 443: 84,
 444: 85,
 446: 86,
 447: 87,
 455: 88,
 460: 89,
 461: 90,
 462: 91,
 475: 92,
 477: 93,
 478: 94,
 488: 95,
 492: 96,
 497: 97,
 500: 98,
 504: 99,
 507: 100,
 508: 101,
 511: 

In [5]:
genre_to_genre_encoded

{'Action': 0,
 'Comedy': 1,
 'Adventure': 2,
 'Drama': 3,
 'Magic': 4,
 'Fantasy': 5,
 'Romance': 6,
 'Historical': 7,
 'Shounen': 8,
 'Mystery': 9,
 'Harem': 10,
 'Psychological': 11,
 'Sci-Fi': 12,
 'Josei': 13,
 'Horror': 14,
 'Ecchi': 15,
 'Music': 16,
 'Dementia': 17,
 'Demons': 18,
 'Game': 19,
 'School': 20,
 'Military': 21,
 'Seinen': 22,
 'Sports': 23,
 'Mecha': 24,
 'Kids': 25,
 'Shoujo': 26,
 'Cars': 27,
 'Slice of Life': 28}