In [1]:
!pip install transformers
!pip install datasets
!pip install --upgrade pandas
!pip install evaluate
!pip install transformers[torch]
!pip install accelerate -U



In [2]:
import torch

In [3]:
data_path = "jutsu.jsonl" #@param {type:"string"}
text_column_name = "text" #@param {type:"string"}
label_column_name = "jutsu" #@param {type:"string"}

model_name = "distilbert-base-uncased" #@param {type:"string"}
test_size = 0.2 #@param {type:"number"}
num_labels = 3 #@param {type:"number"}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
import pandas as pd

In [5]:
df = pd.read_json(data_path, lines=True)

In [6]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,,Taijutsu,Lars punches the opponent before striking them...
1,,"Taijutsu, Shurikenjutsu",The user punches the opponent twice with their...
2,,"Kekkei Genkai, Ninjutsu","Making use of his Gold Dust, the Fourth Kazeka..."
3,,"Ninjutsu, Kinjutsu",Akuta is an Earth Release technique that's cre...
4,,"Taijutsu, Shurikenjutsu",The user punches the opponent twice with their...


In [7]:
print(df)

     jutsu_name                          jutsu_type  \
0                                          Taijutsu   
1                           Taijutsu, Shurikenjutsu   
2                           Kekkei Genkai, Ninjutsu   
3                                Ninjutsu, Kinjutsu   
4                           Taijutsu, Shurikenjutsu   
...         ...                                 ...   
2884             Taijutsu, Collaboration Techniques   
2885                                       Taijutsu   
2886                                       Taijutsu   
2887                                       Taijutsu   
2888                                       Taijutsu   

                                      jutsu_description  
0     Lars punches the opponent before striking them...  
1     The user punches the opponent twice with their...  
2     Making use of his Gold Dust, the Fourth Kazeka...  
3     Akuta is an Earth Release technique that's cre...  
4     The user punches the opponent twice with th

In [8]:
def simplify_justu(jutsu):
    if 'Genjutsu' in jutsu:
        return 'Genjutsu'
    if 'Taijutsu' in jutsu:
        return 'Taijutsu'
    if 'Ninjutsu' in jutsu:
        return 'Ninjutsu'

    return None

In [9]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_justu)

In [10]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2005
Taijutsu     623
Genjutsu     101
Name: count, dtype: int64

In [11]:
df['text'] = df['jutsu_name']+'. '+df['jutsu_description']

In [12]:
df['jutsu'] = df['jutsu_type_simplified']

In [13]:
df= df[['text','jutsu']]

In [14]:
df = df.dropna()

In [15]:
from bs4 import BeautifulSoup

In [16]:
class Cleaner():
  def __init__(self):
    pass
  def put_line_breaks(self,text):
    text = text.replace('</p>','</p>\n')
    return text
  def remove_html_tags(self,text):
    cleantext = BeautifulSoup(text, "lxml").text
    return cleantext
  def clean(self,text):
    text = self.put_line_breaks(text)
    text = self.remove_html_tags(text)
    return text

In [17]:
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  cleantext = BeautifulSoup(text, "lxml").text


In [18]:
df['jutsu'].value_counts()

jutsu
Ninjutsu    2005
Taijutsu     623
Genjutsu     101
Name: count, dtype: int64

In [19]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())
df['label'] = le.transform(df[label_column_name].tolist())

df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,. Lars punches the opponent before striking th...,Taijutsu,. Lars punches the opponent before striking th...,2
1,. The user punches the opponent twice with the...,Taijutsu,. The user punches the opponent twice with the...,2
2,". Making use of his Gold Dust, the Fourth Kaze...",Ninjutsu,". Making use of his Gold Dust, the Fourth Kaze...",1
3,. Akuta is an Earth Release technique that's c...,Ninjutsu,. Akuta is an Earth Release technique that's c...,1
4,. The user punches the opponent twice with the...,Taijutsu,. The user punches the opponent twice with the...,2


In [20]:
from sklearn.utils.class_weight import compute_class_weight

In [21]:
class_weights = compute_class_weight('balanced',
                     classes=sorted(df['label'].unique().tolist()),
                     y=df['label'].tolist()).tolist()

In [22]:
class_weights

[9.006600660066006, 0.4536990856192851, 1.460139111824505]

In [23]:
from sklearn.model_selection import train_test_split

df_train,df_test = train_test_split(df,test_size=test_size,stratify=df['label'])

In [24]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [25]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["text_cleaned"], truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [26]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2183 [00:00<?, ? examples/s]

In [27]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/546 [00:00<?, ? examples/s]

In [28]:
from transformers import AutoModelForSequenceClassification

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
import torch
from torch import nn

In [31]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [32]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [33]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [34]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    logging_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)




In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0084,0.924029,0.851648
2,0.9489,0.729716,0.849817
3,0.9502,0.784018,0.879121
4,0.8954,0.831935,0.849817
5,0.8534,0.964893,0.862637


TrainOutput(global_step=1365, training_loss=0.9312449025583791, metrics={'train_runtime': 277.2762, 'train_samples_per_second': 39.365, 'train_steps_per_second': 4.923, 'total_flos': 611397882016146.0, 'train_loss': 0.9312449025583791, 'epoch': 5.0})

In [36]:
from sklearn.metrics import classification_report

In [37]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        81
           1       0.93      0.92      0.92      1604
           2       0.76      0.89      0.82       498

    accuracy                           0.88      2183
   macro avg       0.56      0.60      0.58      2183
weighted avg       0.85      0.88      0.87      2183



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1) #preds[:3][1]
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.91      0.91      0.91       401
           2       0.73      0.86      0.79       125

    accuracy                           0.86       546
   macro avg       0.55      0.59      0.57       546
weighted avg       0.84      0.86      0.85       546



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
