ANIME_REC_MODEL_V1

In [388]:
import pandas as pd
import numpy as np

# nlp
from transformers import Trainer, TrainingArguments

from scipy.spatial.distance import pdist, squareform

from sklearn.preprocessing import MultiLabelBinarizer, OrdinalEncoder

from util.performance import print_mem_mb


In [389]:
og_data = pd.read_csv("../data/anime_dataset_small_nomic.csv")

ord_encoder = OrdinalEncoder()
mlb = MultiLabelBinarizer(sparse_output=True)

Clean the data

In [390]:
# drop image links
data=og_data.drop(list(og_data.filter(regex="Path|Title")),axis=1)

drop_cols = ["Resources", "Char Tags", "Synopsis","Animation Work", "Chief Animation Direction", 
             "Direction", "Chief Direction", "Original Work", "Character Design", "Work",
             "Series Composition", "Animation Character Design", "Original Plan",
             "Music", "synopsis_length",
             "Cast", "Air Date", "Type", "Season", "Year"]

data= data.drop(drop_cols, axis=1)

# convert to lowercase and list
data["Tags"] = data["Tags"].str.lower()
data["Tags"] = data["Tags"].str.split("|")


### Encoding
Encoding the categorical columns first with one hot encoding.

In [391]:
categorical_cols = ["filter_type"]


# encode categorical columns
filter_type = pd.DataFrame(ord_encoder.fit_transform(data[categorical_cols]),
                columns=ord_encoder.get_feature_names_out(categorical_cols))

# drop categorical columns
data = data.drop(categorical_cols, axis=1)

data = pd.concat([data, filter_type], axis=1)


Then, encoding the anime's tags with a MultiLabelBinarizer. There are 1244 tags.

In [392]:
# fuck no
#data["Char Tags"] = data["Char Tags"].str.split("|")

# encode tags
tags = pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(data.pop("Tags")),
                          index=data.index,
                          columns=mlb.classes_)


# removes outer spaces
def strip(col):
  return col.strip()

tags = tags.rename(mapper=strip,axis=1)
data = data.join(tags)


In [393]:
# additional attibutes for distance calculation
attribute_cols = ["Max Rating",	"filter_year","filter_type"]

#test = data[attribute_cols].copy()
#data = data.drop(attribute_cols,axis=1)
#test


Correlation matrix testing

In [None]:
corr = data.corr()

#corr.iloc[0:5][0:5]
#corr["pantyjob"].dropna().sort_values().tail(20)
#corr

Unnamed: 0,Max Rating,filter_year,filter_type,1920s,1960s,3d cg animation,3d cg animation.1,4-koma manga,aboard airship,aboard ship,...,in medias res,japanese production,nudity,perpetual ongoing,plot continuity,romance,sex,slow-paced,storytelling,violence
Max Rating,1.000000,0.000312,-0.034114,-0.003985,0.039026,-0.144536,-0.049876,0.046009,0.027650,0.035415,...,-0.034929,0.078589,-0.041859,-0.019059,-0.040100,-0.008222,0.089199,0.023820,-0.059583,0.068382
filter_year,0.000312,1.000000,0.234873,-0.007775,-0.015701,0.032543,0.092117,0.038121,0.001404,-0.037070,...,0.029223,0.016676,-0.063276,0.040477,0.002018,0.042711,0.066430,0.034838,0.040477,0.029461
filter_type,-0.034114,0.234873,1.000000,-0.042442,-0.034619,0.032153,0.090484,-0.009694,-0.009694,0.032703,...,0.045517,-0.081063,-0.077414,0.032153,0.099958,-0.009694,0.078220,0.005449,-0.024455,0.016972
1920s,-0.003985,-0.007775,-0.042442,1.000000,-0.004924,-0.003478,-0.011653,-0.006036,0.329309,-0.008562,...,-0.004924,-0.003478,-0.057011,-0.003478,-0.016268,-0.006036,-0.010519,-0.004924,-0.003478,-0.038360
1960s,0.039026,-0.015701,-0.034619,-0.004924,1.000000,-0.002837,-0.009505,-0.004924,-0.004924,-0.006984,...,-0.004016,-0.002837,-0.046503,-0.002837,-0.013269,-0.004924,-0.008580,-0.004016,-0.002837,-0.031290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
romance,-0.008222,0.042711,-0.009694,-0.006036,-0.004924,-0.003478,-0.011653,-0.006036,-0.006036,-0.008562,...,-0.004924,-0.003478,-0.057011,-0.003478,-0.016268,1.000000,-0.010519,-0.004924,-0.003478,-0.038360
sex,0.089199,0.066430,0.078220,-0.010519,-0.008580,-0.006061,-0.020306,-0.010519,-0.010519,-0.014921,...,-0.008580,-0.006061,-0.099348,-0.006061,-0.028348,-0.010519,1.000000,-0.008580,-0.006061,-0.066847
slow-paced,0.023820,0.034838,0.005449,-0.004924,-0.004016,-0.002837,-0.009505,-0.004924,-0.004924,-0.006984,...,-0.004016,-0.002837,-0.046503,-0.002837,-0.013269,-0.004924,-0.008580,1.000000,-0.002837,-0.031290
storytelling,-0.059583,0.040477,-0.024455,-0.003478,-0.002837,-0.002004,-0.006714,-0.003478,-0.003478,-0.004934,...,-0.002837,-0.002004,-0.032849,-0.002004,-0.009373,-0.003478,-0.006061,-0.002837,1.000000,-0.022103


In [395]:
anime_name = "hunter x"
find_by = "Official Title (en)"
anime = og_data.index[og_data[find_by].str.contains(anime_name, case=False)].tolist()

Then calculate euclidean distance to find the closest animes by their distances

In [396]:

# vector of euclidean distances
result = pdist(data, 'euclidean')

# turn euclidean distances into a matrix 
dist_data = pd.DataFrame(squareform(result), index=og_data["Main Title"],columns=og_data["Main Title"])

num_results = 15

if len(anime) >= 1:
    print(dist_data.iloc[anime[0],:].sort_values().head(num_results).tail(num_results-1))
#dist_data
#og_data


Main Title
Hunter x Hunter: Original Video Animation     5.572513
Grappler Baki (2001)                          6.147268
Shadow Skill (1998)                           6.292225
Gekijouban Marco: Haha o Tazunete Sanzenri    6.325512
Captain Tsubasa (2001)                        6.404069
One Piece (2000)                              6.436031
Groove Adventure Rave                         6.492465
Hunter x Hunter: Greed Island                 6.509386
Street Fighter Zero The Animation             6.535143
Gensou Maden Saiyuuki                         6.535756
Tales of Eternia The Animation                6.556897
Shin Chou Kyou Ryo: Condor Hero               6.570114
Himiko-Den                                    6.597158
Kaze no Youjinbou                             6.603666
Name: Hunter x Hunter (1999), dtype: float64


In [397]:
# Testing for mem usage
#print_mem_mb(data)