In [None]:
from transformers import pipeline
import pandas as pd

In [2]:
# Read the anime table from the local CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./data/anime-dataset-2023.csv')


In [None]:
# see column names
# Prints the column Index of the loaded frame; a later cell keeps only
# 'anime_id', 'Name', 'Genres' and 'Synopsis' out of these.
print(data.columns)

In [None]:
# Quick look at the first two rows of the raw table.
data.iloc[:2]

In [None]:
# Keep just the identifier, title, genre string and synopsis text;
# every other column is discarded from here on.
data = data.loc[:, ['anime_id', 'Name', 'Genres', 'Synopsis']]

# Preview the first five rows of the trimmed frame.
data.head(n=5)

In [4]:
# Hold-out split:
#   train -> a reproducible (seeded) random 80% of the rows
#   test  -> every row whose index label was not sampled into train (20%)
train = data.sample(frac=0.8, random_state=0)
test = data.loc[~data.index.isin(train.index)]

In [None]:
# (rows, columns) of the full table, the training split and the test split.
tuple(frame.shape for frame in (data, train, test))

In [None]:
# predict the genre of the anime

# Keep the first held-out anime around as a quick sanity-check example.
anime_synopsis = test['Synopsis'].iloc[0]
anime_name = test['Name'].iloc[0]
anime_genre = test['Genres'].iloc[0]

# Every distinct raw 'Genres' value; each one is a comma-separated string.
all_genres = list(set(data['Genres'].str.cat(sep='|').split('|')))

print(all_genres)

# Split each raw string into individual genre names.  Only the whitespace
# AROUND each name is trimmed: deleting inner spaces as well would turn every
# multi-word genre into a label that can never equal the `.strip()`-ed
# ground-truth name it is compared against in the evaluation loop.
genres = set()
for genre_string in all_genres:
    for genre_name in genre_string.split(','):
        genre_name = genre_name.strip()
        if genre_name:  # skip empty fragments such as a trailing comma
            genres.add(genre_name)

print(genres)

# Sorted so the candidate label list has the same order on every run
# (iteration order of a set of strings is not stable across interpreters).
genres = sorted(genres)

In [None]:
# Show the synopsis text currently held in `anime_synopsis`
# (set from the first test-set row in the genre-vocabulary cell).
print(anime_synopsis)

In [None]:
# Zero-shot classification pipeline; change the `model` identifier below to
# try a different checkpoint.
zeroshot_classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
)


In [None]:
from tqdm import tqdm
import json

# Per-anime outcome: anime_id -> {true genre: "correct" | "error"}
results = {}

# Walk the held-out rows with a progress bar
for index, row in tqdm(test.iterrows(), total=test.shape[0], desc="Processing anime"):
    anime_id = row['anime_id']
    anime_synopsis = row['Synopsis']
    anime_name = row['Name']
    anime_genre = row['Genres']

    # Prompt = title + synopsis, scored against every candidate genre
    text = 'The anime is called ' + anime_name + '. The synopsis of the anime is: ' + anime_synopsis
    output = zeroshot_classifier(text, list(genres), multi_label=True)

    # Ground-truth genre names for this anime, split once and trimmed
    true_genres = [part.strip() for part in anime_genre.split(',')]

    # Take as many leading labels from the classifier output as there are
    # ground-truth genres
    top_k_labels_predicted = output['labels'][:len(true_genres)]

    # A true genre counts as "correct" only if it appears among those labels
    results[anime_id] = {
        true_genre: "correct" if true_genre in top_k_labels_predicted else "error"
        for true_genre in true_genres
    }

# Persist the per-anime outcomes as pretty-printed JSON
with open('results_based_model.json', 'w') as results_file:
    json.dump(results, results_file, indent=4)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

def preprocess(df):
    """Add the model input text and multi-hot genre labels to an anime frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'Synopsis' column (text) and a 'Genres' column whose values
        are comma-separated genre names.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df`` with two extra columns, plus the sorted list of
        distinct genre names found in ``df``:

        * 'text'   - the synopsis prefixed with a fixed prompt sentence
        * 'labels' - a list of floats, one per entry of the returned genre
          list, 1.0 where the row has that genre and 0.0 elsewhere

    Notes
    -----
    The input frame is not modified.  Labels are floats because a multi-hot
    target for a BCE-style multi-label loss must be floating point.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['text'] = 'The synopsis of the anime is: ' + df['Synopsis']

    def split_genres(genres):
        # Trim the whitespace that follows each comma so "A, B" yields "B",
        # not " B" (which would be counted as a separate genre); drop empties.
        return [name.strip() for name in genres.split(',') if name.strip()]

    # Ensure genres are properly encoded as binary vectors for multi-label classification
    unique_genres = sorted({g for genre_list in df['Genres'] for g in split_genres(genre_list)})
    genre_to_id = {genre: idx for idx, genre in enumerate(unique_genres)}

    def encode_labels(genres):
        labels = [0.0] * len(unique_genres)
        for genre in split_genres(genres):
            labels[genre_to_id[genre]] = 1.0
        return labels

    df['labels'] = df['Genres'].apply(encode_labels)
    return df, unique_genres

# Encode train and test against ONE shared genre vocabulary.  preprocess()
# derives the label list from whatever frame it is given, so calling it on
# each split separately could place the same genre at a different index (or
# produce vectors of a different length) in train and test.
# train and test carry disjoint index labels (test was built by dropping
# train's index), so the combined frame can be split back by index.
combined, unique_genres = preprocess(pd.concat([train, test]))
train = combined.loc[train.index]
test = combined.loc[test.index]

# Convert to Hugging Face Dataset (the pandas index is not needed as a column)
train_dataset = Dataset.from_pandas(train[['text', 'labels']], preserve_index=False)
test_dataset = Dataset.from_pandas(test[['text', 'labels']], preserve_index=False)

# Tokenizer matching the checkpoint that is fine-tuned below
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize a batch of examples and carry their labels along.

    Every text is truncated / padded to exactly 512 tokens, and the batch's
    'labels' entry is copied onto the tokenizer output unchanged.
    """
    encoded = tokenizer(
        batch['text'],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    encoded["labels"] = batch["labels"]
    return encoded


# Apply the tokenizer to both splits, batch by batch
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load Pre-trained Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_genres),  # Set to the number of genres in your dataset
    # Each anime can have several genres at once, so say so explicitly instead
    # of letting the model guess the task from the label dtype.  This selects
    # a per-label sigmoid / BCE-with-logits loss, which expects the multi-hot
    # label vectors to be floating point.
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True   # Ignore size mismatches (the classification head is re-sized to num_labels)
)

# Freeze pre-trained layers: only parameters outside model.base_model
# (i.e. the classification head) keep requires_grad=True and get trained.
for param in model.base_model.parameters():
    param.requires_grad = False

# Training configuration
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Wire the model, the training configuration and both tokenized splits together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
trainer.train()