In [None]:
import torch

In [None]:
# Notebook configuration. The trailing `# @param` comments are Colab form annotations.
data_path = "jutsu.jsonl"  # @param {type:"string"}
text_column_name = "text"  # @param {type:"string"}
label_column_name = "jutsu"  # @param {type:"string"}

model_name = "distilbert-base-uncased"  # @param {type:"string"}
# Fraction of rows held out for evaluation. The split cell reads this as
# `test_size`; it was previously misspelled `text_size`, leaving that name undefined.
test_size = 0.2  # @param {type:"number"}
num_labels = 3  # @param {type:"number"}

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset from `data_path` (JSON Lines: one record per line).
try:
    df = pd.read_json(data_path, lines=True)
except Exception as e:
    # Report and re-raise instead of swallowing the error: every later cell
    # needs `df`, so carrying on would only produce a confusing NameError
    # further down, or silently reuse a stale `df` left in the kernel.
    print("Error loading JSON:", e)
    raise

print("Successfully loaded DataFrame!")
display(df.head())

In [None]:
def simplifiy_jutsu(jutsu):
    """Collapse a jutsu type value to a single coarse label.

    The labels are checked in the order "Genjutsu", "Taijutsu", "Ninjutsu";
    the first one contained in ``jutsu`` is returned.

    Args:
        jutsu: Value to inspect (anything supporting ``in``, e.g. a string),
            or a missing value (``None`` / float NaN).

    Returns:
        The matching label, or ``None`` when no label is contained in
        ``jutsu`` or when ``jutsu`` is missing.
    """
    # Missing cells reach `.apply` as None or NaN (a float); the `in` operator
    # would raise TypeError on them, so treat them as "no label".
    if jutsu is None or isinstance(jutsu, float):
        return None

    for label in ("Genjutsu", "Taijutsu", "Ninjutsu"):
        if label in jutsu:
            return label

    return None

In [None]:
# Reduce every raw `jutsu_type` value to its coarse label (or None).
df["jutsu_type_simplified"] = [
    simplifiy_jutsu(raw_type) for raw_type in df["jutsu_type"]
]

In [None]:
# Show how many rows fall under each simplified jutsu type.
df.loc[:, "jutsu_type_simplified"].value_counts()

In [None]:
# Model input: the jutsu name and its description joined by ". ".
df["text"] = df["jutsu_name"].add(". ").add(df["jutsu_description"])

In [None]:
# The simplified jutsu type is the classification target. No earlier cell
# creates a "jutsu" column, so selecting it directly would raise a KeyError;
# publish the simplified type under that name first, then keep only the
# model input and the label.
df["jutsu"] = df["jutsu_type_simplified"]
df = df[["text", "jutsu"]]

In [None]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis=0, how="any")

In [2]:
from bs4 import BeautifulSoup

In [None]:
class Cleaner:
    """Turns HTML-flavoured text into plain text, keeping paragraph breaks."""

    def put_line_breaks(self, text):
        """Insert a newline after every closing ``</p>`` tag.

        The previous version replaced the *opening* ``<p>`` tag with
        ``</p>\\n``, which destroyed the opening tags and added no break after
        the paragraph end.
        """
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup's ``lxml`` parser and return its text content."""
        return BeautifulSoup(text, "lxml").text

    def clean(self, text):
        """Add paragraph line breaks, then strip the markup.

        Args:
            text: Raw string that may contain HTML tags.

        Returns:
            The text with tags removed.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text

In [None]:
# Run every value of the configured text column through the cleaner and
# store the result alongside it as "text_cleaned".
cleaner = Cleaner()
df["text_cleaned"] = [cleaner.clean(raw_text) for raw_text in df[text_column_name]]

NameError: name 'df' is not defined

In [None]:
# Show the number of rows per label before encoding.
df.loc[:, "jutsu"].value_counts()

In [None]:
from sklearn import preprocessing

# Fit a LabelEncoder on the label column and store the resulting integer
# ids in a new "label" column; the fitted encoder stays available as `le`.
le = preprocessing.LabelEncoder()
df["label"] = le.fit_transform(df[label_column_name].tolist())

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# One weight per encoded label, computed with scikit-learn's "balanced"
# heuristic and stored as a plain Python list.
# `classes` is passed as a sorted NumPy array: scikit-learn documents the
# parameter as an ndarray, and recent releases validate the type and reject a
# plain list.
class_weights = compute_class_weight(
    "balanced",
    classes=df["label"].drop_duplicates().sort_values().to_numpy(),
    y=df["label"].to_numpy(),
).tolist()

In [None]:
# `train_test_split` is the function the split cell calls; the previous
# import named a non-existent `train_text_split`.
from sklearn.model_selection import train_test_split

In [None]:
# Hold out `test_size` of the rows, stratified on the encoded label.
# The fixed `random_state` makes the split identical on every
# Restart & Run All, so reported metrics are comparable between runs.
df_train, df_test = train_test_split(
    df, test_size=test_size, stratify=df["label"], random_state=42
)

In [None]:
from datasets import Dataset

In [None]:
# Wrap both pandas splits as `Dataset` objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test)
)

In [None]:
from transformers import AutoTokenizer

In [None]:
# Tokenizer matching the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the "text_cleaned" field of a batch of examples.

    Args:
        examples: Mapping with a "text_cleaned" entry holding the texts.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
    """
    # `truncation=True` (previously misspelled `trunation`, so it was never
    # applied) asks the tokenizer to truncate over-long texts.
    return tokenizer(examples["text_cleaned"], truncation=True)

In [None]:
# Apply `preprocess_function` to the training split, one batch at a time.
tokenized_train = train_dataset.map(function=preprocess_function, batched=True)

In [None]:
# Apply `preprocess_function` to the held-out split, one batch at a time.
tokenized_test = test_dataset.map(function=preprocess_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the checkpoint named by `model_name` as a sequence-classification model
# configured for `num_labels` output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

In [None]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

import evaluate
import numpy as np
import torch
from torch import nn

In [None]:
# Collator built around the tokenizer; it is handed to the trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric from the `evaluate` library, reused on every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    # `compute` accepts keyword arguments only; passing `predictions`
    # positionally raises a TypeError.
    return metric.compute(predictions=predictions, references=labels)

In [None]:
class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights come from the notebook-level `class_weights` list.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels".
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted because newer Trainer versions pass
                it to ``compute_loss``; not used here.
        """
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Put the weights on the logits' own device rather than the notebook's
        # global `device`, so the loss works wherever the Trainer placed the model.
        weight = torch.tensor(class_weights).to(device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        # Flatten to (N, num_labels) logits against (N,) targets.
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training hyper-parameters. Evaluation and logging both happen once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
)

# Train on the training split and evaluate on the held-out split.
# Previously the held-out split was passed as `train_dataset` and no
# `eval_dataset` was given: the model was fitted on the test data and the
# per-epoch evaluation had nothing to run on.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop configured on `trainer`.
trainer.train()

In [None]:
# Save the trainer's model under the path 'jutsu_model'.
trainer.save_model('jutsu_model')

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Classification report for `tokenized_train`: arg-max over the predicted
# scores, compared with the encoded labels stored in `df_train`.
preds = np.argmax(trainer.predict(tokenized_train).predictions, axis=1)
GT = df_train["label"].tolist()
print(classification_report(GT, preds))

In [None]:
# Classification report for `tokenized_test`: arg-max over the predicted
# scores, compared with the encoded labels stored in `df_test`.
preds = np.argmax(trainer.predict(tokenized_test).predictions, axis=1)
GT = df_test["label"].tolist()
print(classification_report(GT, preds))