In [1]:
from transformers import pipeline
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
2025-01-27 19:18:07.753525: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-27 19:18:07.762553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738016287.773645    7162 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738016287.776827    7162 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-27 19:18:07.788861: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [2]:
# Read the anime dataset CSV from the local data directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./data/anime-dataset-2023.csv')


In [None]:
# Print the column labels of the loaded frame to see which fields are available.
print(data.columns)

In [None]:
# Display the first two rows of the loaded frame.
data.head(2)

In [3]:
# Narrow the frame to the four fields used by the rest of the notebook.
useful_columns = ['anime_id', 'Name', 'Genres', 'Synopsis']
data = data.loc[:, useful_columns]

data.head()

Unnamed: 0,anime_id,Name,Genres,Synopsis
0,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ..."
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Sci-Fi","Another day, another bounty—such is the life o..."
2,6,Trigun,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...


In [4]:
# Normalise every genre string: remove all spaces and lowercase it,
# e.g. "Action, Award Winning, Sci-Fi" -> "action,awardwinning,sci-fi".
# The vectorised .str accessor is used instead of a Python-level
# `lambda x: x.replace(...)` so that a missing genre (NaN) is propagated as
# NaN rather than raising AttributeError on a float.
data['Genres'] = (
    data['Genres']
    .str.replace(' ', '', regex=False)  # literal space removal, not a regex
    .str.lower()
)

In [13]:
# Display the first ten rows to inspect the current 'Genres' values.
data.head(10)

Unnamed: 0,anime_id,Name,Genres,Synopsis
0,1,Cowboy Bebop,"action,awardwinning,sci-fi","Crime is timeless. By the year 2071, humanity ..."
1,5,Cowboy Bebop: Tengoku no Tobira,"action,sci-fi","Another day, another bounty—such is the life o..."
2,6,Trigun,"action,adventure,sci-fi","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"action,drama,mystery,supernatural",Robin Sena is a powerful craft user drafted in...
4,8,Bouken Ou Beet,"adventure,fantasy,supernatural",It is the dark century and the people are suff...
5,15,Eyeshield 21,sports,"Shy, reserved, and small-statured, Deimon High..."
6,16,Hachimitsu to Clover,"comedy,drama,romance","Yuuta Takemoto, a sophomore at an arts college..."
7,17,Hungry Heart: Wild Striker,"comedy,sliceoflife,sports",As the younger brother of Japanese soccer star...
8,18,Initial D Fourth Stage,"action,drama",Takumi Fujiwara finally joins Ryousuke and Kei...
9,19,Monster,"drama,mystery,suspense","Dr. Kenzou Tenma, an elite neurosurgeon recent..."


In [5]:
# Hold-out split:
#   train -> a random 80% sample of the rows (fixed seed 0, so it is repeatable)
#   test  -> every row whose index label was not drawn into train (the other 20%)
train = data.sample(frac=0.8, random_state=0)
test = data[~data.index.isin(train.index)]

In [None]:
# Show the (rows, columns) shape of the full frame and of both splits.
data.shape, train.shape, test.shape

In [None]:
# Prepare the inputs for genre prediction.

# Synopsis, name and genre string of the first anime in the test split
anime_synopsis, anime_name, anime_genre = (
    test[column].iloc[0] for column in ('Synopsis', 'Name', 'Genres')
)

# Every distinct per-anime genre string: the column is joined with '|' and
# split back on '|', then de-duplicated through a set.
all_genres = list(set(data['Genres'].str.cat(sep='|').split('|')))

print(all_genres)

# Break each comma-separated genre string into its individual genre names
# (spaces removed) and keep each name once.
genres = {
    single_genre
    for genre_string in all_genres
    for single_genre in genre_string.replace(' ', '').split(',')
}

print(genres)

genres = list(genres)

In [None]:
# Print the synopsis currently held in `anime_synopsis`.
print(anime_synopsis)

In [None]:
# Build the zero-shot classification pipeline used as the baseline.
zeroshot_classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",  # change the model identifier here
)


In [None]:
from tqdm import tqdm
import json

# Outcome table: anime_id -> {true genre: "correct" | "error"}
results = {}

# Walk the test rows, with a progress bar
for index, row in tqdm(test.iterrows(), total=test.shape[0], desc="Processing anime"):
    anime_id, anime_name = row['anime_id'], row['Name']
    anime_synopsis, anime_genre = row['Synopsis'], row['Genres']

    # Plain concatenation on purpose: a non-string name/synopsis raises here
    # instead of being silently formatted into the prompt.
    text = (
        'The anime is called ' + anime_name
        + '. The synopsis of the anime is: ' + anime_synopsis
    )
    output = zeroshot_classifier(text, list(genres), multi_label=True)

    # Keep as many of the first returned labels as the anime has true genres.
    true_genres = [part.strip() for part in anime_genre.split(',')]
    anime_genre_len = len(true_genres)
    top_k_labels_predicted = output['labels'][:anime_genre_len]

    # A true genre counts as "correct" when it appears among those labels.
    results[anime_id] = {
        genre: ("correct" if genre in top_k_labels_predicted else "error")
        for genre in true_genres
    }

# Write the per-anime outcomes to disk as JSON
with open('results_based_model.json', 'w') as out_file:
    json.dump(results, out_file, indent=4)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd

# Split into train and test sets (80% / 20%, fixed seed so it is repeatable)
train = data.sample(frac=0.8, random_state=0)
test = data.drop(train.index)

# Extract all unique genres from the ENTIRE dataset.
# The loop variable is deliberately not called `genres`: that name is the
# notebook-level list of candidate labels consumed by the zero-shot baseline
# loop (`list(genres)`), and reusing it here would silently replace that list
# with the last row's genre string.
all_genres = set()
for genre_string in data['Genres'].dropna():
    # Genre names are comma-separated; strip stray whitespace around each one.
    all_genres.update(name.strip() for name in genre_string.split(','))
unique_genres = sorted(all_genres)  # sorted -> stable genre-to-index mapping


# Verify the number of unique genres
print("Number of unique genres:", len(unique_genres))
print("Unique genres:", unique_genres)

# Preprocessing function
def preprocess(df):
    """Add the model input text and multi-hot genre targets to a split.

    Args:
        df: DataFrame with 'Synopsis' and 'Genres' columns. 'Genres' holds
            comma-separated genre names; a non-string value (e.g. NaN) is
            treated as "no genres".

    Returns:
        A copy of ``df`` with two extra columns:
        'text'   - the synopsis prefixed with a fixed prompt sentence;
        'labels' - a list of ``len(unique_genres)`` floats, 1.0 at the index
                   of every genre the row has and 0.0 everywhere else.

    Raises:
        KeyError: if a row contains a genre that is not in ``unique_genres``.
    """
    df = df.copy()  # work on a copy so the caller's frame is not modified
    df['text'] = 'The synopsis of the anime is: ' + df['Synopsis']

    # Encode labels as binary vectors
    genre_to_id = {genre: idx for idx, genre in enumerate(unique_genres)}

    def encode_labels(genres):
        # Floats rather than ints: the multi-label loss selected below
        # (binary cross-entropy with logits) requires float targets.
        labels = [0.0] * len(unique_genres)
        if isinstance(genres, str):  # Check if genres is a string
            for genre in genres.split(','):
                labels[genre_to_id[genre.strip()]] = 1.0
        return labels

    df['labels'] = df['Genres'].apply(encode_labels)
    return df

# Preprocess train and test datasets
train = preprocess(train)
test = preprocess(test)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train[['text', 'labels']])
test_dataset = Dataset.from_pandas(test[['text', 'labels']])

# Tokenizer
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of texts to fixed 512-token inputs and carry the labels along."""
    tokenized = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = batch["labels"]
    return tokenized

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load Pre-trained Model
# An anime can have several genres at once, so this is a multi-label problem:
# problem_type="multi_label_classification" makes the model score every genre
# independently instead of treating the genres as mutually exclusive classes.
# id2label / label2id store the genre names in the saved config so that
# predictions come back as genre names rather than "LABEL_<n>".
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_genres),  # One output per unique genre
    problem_type="multi_label_classification",
    id2label=dict(enumerate(unique_genres)),
    label2id={genre: idx for idx, genre in enumerate(unique_genres)},
    ignore_mismatched_sizes=True    # The checkpoint's 2-way head is replaced by a new one
)

# Freeze pre-trained layers so that only the new classification head is trained
for param in model.base_model.parameters():
    param.requires_grad = False

# Training Arguments
# Evaluate and checkpoint once per epoch, train for three epochs with
# batches of 8, and log every 10 steps with the progress bar enabled.
training_config = {
    'output_dir': './results',
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'disable_tqdm': False,
}
training_args = TrainingArguments(**training_config)

# Trainer: trains on the tokenized train split, evaluates on the test split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train
trainer.train()

Number of unique genres: 22
Unique genres: ['action', 'adventure', 'avantgarde', 'awardwinning', 'boyslove', 'comedy', 'drama', 'ecchi', 'erotica', 'fantasy', 'girlslove', 'gourmet', 'hentai', 'horror', 'mystery', 'romance', 'sci-fi', 'sliceoflife', 'sports', 'supernatural', 'suspense', 'unknown']


Map: 100%|██████████| 19924/19924 [00:03<00:00, 6194.30 examples/s]
Map: 100%|██████████| 4981/4981 [00:00<00:00, 6418.06 examples/s]
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/deberta-v3-large-zeroshot-v2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([22]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([22, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Write the model and its tokenizer into the same directory so that the
# directory can be loaded again as a single checkpoint.
model.save_pretrained(save_directory='./model/fine-tuned-anime-genre-model')
tokenizer.save_pretrained(save_directory='./model/fine-tuned-anime-genre-model')

In [None]:
import json
from tqdm import tqdm

model_path = './model/fine-tuned-anime-genre-model'

# The fine-tuned checkpoint has one output per genre (a direct multi-label
# classifier). It is NOT an NLI model, so it cannot be served through the
# "zero-shot-classification" pipeline, which scores each candidate label via an
# entailment head. Use "text-classification" instead:
#   top_k=None                  -> return a score for every genre
#   function_to_apply="sigmoid" -> independent per-genre probabilities
#   truncation / max_length     -> same 512-token limit as during training
# A separate name is used so the baseline `zeroshot_classifier` is not overwritten.
finetuned_classifier = pipeline(
    "text-classification",
    model=model_path,
    top_k=None,
    function_to_apply="sigmoid",
    truncation=True,
    max_length=512,
)

# If the checkpoint was saved without genre names in its config, the pipeline
# reports "LABEL_<index>"; translate those back using the training label order.
label_to_genre = {f"LABEL_{idx}": genre for idx, genre in enumerate(unique_genres)}

# Outcome table: anime_id -> {true genre: "correct" | "error"}
results = {}

# Iterate over the test data with progress bar
for index, row in tqdm(test.iterrows(), total=test.shape[0], desc="Processing anime"):
    anime_id = row['anime_id']
    anime_synopsis = row['Synopsis']
    anime_name = row['Name']
    anime_genre = row['Genres']

    # Rows without a genre string have nothing to score (mirrors encode_labels,
    # which treats non-string genres as "no genres").
    if not isinstance(anime_genre, str):
        continue

    # Prepare input text for classification
    # NOTE(review): training text was only 'The synopsis of the anime is: ...';
    # the title prefix is kept here to match the baseline prompt - confirm intent.
    text = f"The anime is called {anime_name}. The synopsis of the anime is: {anime_synopsis}"

    # Predict genres using the fine-tuned model. A one-element list is passed so
    # the result is always a list with one list of {label, score} dicts.
    genre_scores = finetuned_classifier([text])[0]
    ranked = sorted(genre_scores, key=lambda item: item['score'], reverse=True)

    # Keep as many top-scoring genres as the anime has true genres
    true_genres = [name.strip().lower() for name in anime_genre.split(',')]
    top_k_labels_predicted = [
        label_to_genre.get(item['label'], item['label'])
        for item in ranked[:len(true_genres)]
    ]

    # Store results: a true genre is "correct" when it is among the top-k predictions
    results[anime_id] = {
        genre: ("correct" if genre in top_k_labels_predicted else "error")
        for genre in true_genres
    }

# Save the results to a JSON file
with open('results_finetuned_model.json', 'w') as f:
    json.dump(results, f, indent=4)