In [2]:
import pandas as pd
import numpy as np 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers




In [3]:
animeDf = pd.read_csv('anime.csv')

In [4]:
print(len(animeDf))
animeDf.columns
columns = ['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites']
animeDf = animeDf[columns]

17562


In [5]:
def getAnimeName(id):
	name = animeDf[animeDf.MAL_ID == id]['English name'].values[0]
	if name is np.nan or name == "Unknown":
		name = animeDf[animeDf.MAL_ID == id]['Name'].values[0]
	return name
animeDf['eng_name'] = animeDf.MAL_ID.apply(lambda x :getAnimeName(x) )

In [49]:

animeDf = animeDf[['Score','Ranked','Genres','eng_name','Type','Episodes','Source','Studios']]
animeDf = animeDf[animeDf.Score != 'Unknown']
animeDf = animeDf[animeDf.Ranked!= 'Unknown']
animeDf = animeDf[animeDf.Episodes != 'Unknown']
animeDf

Unnamed: 0,Score,Ranked,Genres,eng_name,Type,Episodes,Source,Studios
0,8.78,28.0,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,TV,26,Original,Sunrise
1,8.39,159.0,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,Movie,1,Original,Bones
2,8.24,266.0,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,TV,26,Manga,Madhouse
3,7.27,2481.0,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,TV,26,Original,Sunrise
4,6.98,3710.0,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,TV,52,Manga,Toei Animation
...,...,...,...,...,...,...,...,...
17504,6.59,5461.0,Comedy,Kimetsu Gakuen: Valentine-hen,ONA,4,Original,Unknown
17505,7.52,1517.0,"Music, Romance",Heikousen,Music,1,Original,"10Gauge, Studio DURIAN"
17512,6.83,4382.0,"Slice of Life, Comedy",Cooking with Valkyries Spring Festival Special,Special,1,Game,Unknown
17513,4.81,10760.0,"Mystery, Psychological, Supernatural, Thriller...",The Promised Neverland Season 2 Episode 5.5,Special,1,Manga,CloverWorks


In [50]:
animeDf.Score = animeDf.Score.apply(lambda x: float(x))
animeDf.Ranked = animeDf.Ranked.apply(lambda x: float(x))
animeDf.Episodes = animeDf.Episodes.apply(lambda x: int(x))

In [51]:
train, val, test = np.split(animeDf.sample(frac=1), [int(0.8*len(animeDf)), int(0.9*len(animeDf))])
train

Unnamed: 0,Score,Ranked,Genres,eng_name,Type,Episodes,Source,Studios
13330,7.10,3218.0,"Comedy, Romance, School",Kud Wafter,Movie,1,Visual novel,J.C.Staff
1037,6.54,5691.0,"Adventure, Drama, Fantasy, Romance",A Tree of Palme,Movie,1,Unknown,Palm Studio
15591,5.56,9513.0,Fantasy,Donbei x Kemurikusa,ONA,1,Original,Unknown
2420,6.97,3733.0,"Comedy, Fantasy, Kids, Sci-Fi, Shounen",Doraemon: Doraemon Comes Back,Special,1,Unknown,Unknown
6173,7.65,1156.0,"Action, Historical, Demons, Drama, Fantasy, Josei",Hakuoki ~Demon of the Fleeting Blossom~ A Memo...,OVA,6,Visual novel,Studio Deen
...,...,...,...,...,...,...,...,...
886,6.91,3996.0,"Comedy, Ecchi, Harem, Romance, School, Shounen",Ichigo 100% Original DVD Animation,OVA,4,Manga,Madhouse
12702,5.45,9810.0,Action,Kotowa Jiten: Hetana Teppou mo Kazu Ucha Ataru!,Special,1,Original,Unknown
6720,6.28,7011.0,"Adventure, Fantasy, Kids",Treasure Hunting,Movie,1,Picture book,Studio Ghibli
4644,6.88,4121.0,"Comedy, Sci-Fi",It's a Rumic World: 50th Anniversary Weekly★Sh...,Special,1,Unknown,Unknown


In [99]:
l = animeDf.Genres.to_numpy()
l= [x.split(',') for x in l]
uniqueGenres = []
for x in l:
	for y in x:
		y.strip()
		if y not in uniqueGenres:
			uniqueGenres.append(y)
uniqueGenres = np.array(uniqueGenres)
uniqueGenres = [x.strip() for x in uniqueGenres]
uniqueGenres = np.unique(uniqueGenres)

In [95]:
uniqueGenres[45] == uniqueGenres[5]

False

In [52]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('Score')
  df = df.Ranked.apply(lambda x: float(x))
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  labels = [float(x) for x in labels]
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [53]:

train_ds = df_to_dataset(train, batch_size=5)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [56]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of scores:', train_features['Score'])
print('A batch of targets:', label_batch )

Every feature: ['Score', 'Ranked', 'Genres', 'eng_name', 'Type', 'Episodes', 'Source', 'Studios']
A batch of scores: tf.Tensor(
[[6.15]
 [5.85]
 [7.55]
 [6.55]
 [5.48]], shape=(5, 1), dtype=float64)
A batch of targets: tf.Tensor([6.15 5.85 7.55 6.55 5.48], shape=(5,), dtype=float32)


In [57]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
  normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [59]:
numEpisodes = train_features['Episodes']
layer = get_normalization_layer('Episodes',train_ds)
layer(numEpisodes)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[-0.23386315],
       [-0.05368987],
       [ 0.8696981 ],
       [-0.2563848 ],
       [-0.23386315]], dtype=float32)>

In [60]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [61]:
type_col = train_features['Type']
type_layer = get_category_encoding_layer(name='Type',
                                              dataset=train_ds,
                                              dtype='string')
type_layer(type_col)

<tf.Tensor: shape=(5, 7), dtype=float32, numpy=
array([[0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.]], dtype=float32)>

In [63]:

type_col = train_features['Studios']
type_layer = get_category_encoding_layer(name='Studios',
                                              dataset=train_ds,
                                              dtype='string')
type_layer(type_col)

<tf.Tensor: shape=(5, 779), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>