## Load the Dataset

In [2]:
import pandas as pd

In [3]:
# The dataset is stored as JSON Lines, hence lines=True.
data_path = "../data/jutsus.jsonl"
df = pd.read_json(path_or_buf=data_path, lines=True)
df.head(n=5)

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...
1,Amputation Punishment,"Bukijutsu, Taijutsu",Hidan attacks the enemy many times with his sc...
2,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic..."
3,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the..."
4,Amplification Summoning Technique,"Ninjutsu, Space–Time Ninjutsu",This is a summoning technique that bestows a s...


In [5]:
def simplify_jutsu(jutsu):
  """Collapse a raw jutsu type value into one coarse category.

  The categories are tested in the fixed priority order Genjutsu, Ninjutsu,
  Taijutsu; the first one contained in ``jutsu`` wins.

  Args:
    jutsu: Raw type value, typically a comma-separated string such as
      ``"Bukijutsu, Taijutsu"``.

  Returns:
    ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``; ``None`` when none of
    them is present or when ``jutsu`` does not support membership tests
    (for example a missing value represented as NaN or None).
  """
  try:
    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
      if category in jutsu:
        return category
  except TypeError:
    # ``in`` is undefined for floats/None, which is how missing values arrive.
    return None

  # No recognised category.
  return None

In [8]:
# Derive the coarse category for every row from its raw type string.
df = df.assign(jutsu_type_simplified=df['jutsu_type'].apply(simplify_jutsu))
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...,Taijutsu
1,Amputation Punishment,"Bukijutsu, Taijutsu",Hidan attacks the enemy many times with his sc...,Taijutsu
2,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic...",Taijutsu
3,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the...",Taijutsu
4,Amplification Summoning Technique,"Ninjutsu, Space–Time Ninjutsu",This is a summoning technique that bestows a s...,Ninjutsu


In [9]:
# value_counts() leaves out missing values by default, so rows for which
# simplify_jutsu returned nothing are not part of these totals.
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2258
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [10]:
# Model input is "<name>. <description>"; the target is the simplified type.
# Only those two columns are kept, and rows missing either one are dropped.
df = (
  df.assign(
    text=df['jutsu_name'] + '. ' + df['jutsu_description'],
    jutsu=df['jutsu_type_simplified'],
  )[['text', 'jutsu']]
  .dropna()
)
df

Unnamed: 0,text,jutsu
0,Absolute: Fang Passing Fang. Kiba and Akamaru ...,Taijutsu
1,Amputation Punishment. Hidan attacks the enemy...,Taijutsu
2,"16 Hit Combo. A very effective move, Ino uses ...",Taijutsu
3,100 Metre Punch. A shorter version of the 1000...,Taijutsu
4,Amplification Summoning Technique. This is a s...,Ninjutsu
...,...,...
2920,Adamantine Power: Acala. Hashirama kicks the o...,Ninjutsu
2921,Acidic Sludge. The user spits out a small purp...,Ninjutsu
2922,Acid Permeation. Utakata blows acidic bubbles ...,Ninjutsu
2923,Accelerated Armed Revolving Heaven. Tenten uns...,Ninjutsu


In [21]:
from bs4 import BeautifulSoup

class Cleaner():
  """Convert HTML-flavoured text into stripped plain text."""

  def __init__(self):
    pass

  def put_line_breaks(self, text):
    """Return ``text`` with a newline inserted after every ``</p>`` tag.

    This keeps paragraphs on separate lines once the tags are removed.
    """
    # The closing tag uses a forward slash; "<\p>" never matched anything and
    # is an invalid escape sequence.
    return text.replace("</p>", "</p>\n")

  def remove_html_tags(self, text):
    """Return the text content of ``text`` as parsed by BeautifulSoup/lxml."""
    clean_text = BeautifulSoup(text, "lxml").text
    return clean_text

  def clean(self, text):
    """Run the full pipeline: paragraph breaks, tag removal, then strip.

    Args:
      text: Raw string that may contain HTML markup.

    Returns:
      The plain-text version with leading/trailing whitespace removed.
    """
    text = self.put_line_breaks(text)
    text = self.remove_html_tags(text)
    text = text.strip()
    return text

In [22]:
# Column that holds the text fed to the model.
text_column_name = "text"
# Column that holds the class name to predict.
label_column_name = "jutsu"

In [23]:
# Run every text through Cleaner.clean and store the result in a new column.
cleaner = Cleaner()
df = df.assign(text_cleaned=df[text_column_name].apply(cleaner.clean))

  clean_text = BeautifulSoup(text, "lxml").text


In [24]:
from sklearn import preprocessing as pe

# Fit a label encoder on the class names found in the label column.
le = pe.LabelEncoder()
label_values = df[label_column_name].tolist()
le.fit(label_values)

In [25]:
# Map each encoded integer id back to its class name. The fitted classes are
# read through the public ``classes_`` attribute rather than ``__dict__``.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [27]:
# Add the integer class id produced by the fitted encoder for every row.
df = df.assign(label=le.transform(df[label_column_name].tolist()))
df.head(3)

Unnamed: 0,text,jutsu,text_cleaned,label
0,Absolute: Fang Passing Fang. Kiba and Akamaru ...,Taijutsu,Absolute: Fang Passing Fang. Kiba and Akamaru ...,2
1,Amputation Punishment. Hidan attacks the enemy...,Taijutsu,Amputation Punishment. Hidan attacks the enemy...,2
2,"16 Hit Combo. A very effective move, Ino uses ...",Taijutsu,"16 Hit Combo. A very effective move, Ino uses ...",2


In [30]:
from sklearn.model_selection import train_test_split

test_size = 0.2
# Fixed seed so that the train/test split is identical on every run.
random_state = 42

# Stratify on the label so both splits keep the same class proportions.
df_train, df_test = train_test_split(
  df,
  test_size=test_size,
  stratify=df['label'],
  random_state=random_state,
)

In [32]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "distilbert/distilbert-base-uncased"
# Fetches the tokenizer files that belong to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [33]:
def preprocess_function(tokenizer, examples):
  """Tokenize the ``text_cleaned`` entry of ``examples`` with truncation on.

  Args:
    tokenizer: Callable that accepts the texts and a ``truncation`` keyword.
    examples: Mapping that contains a ``'text_cleaned'`` entry.

  Returns:
    Whatever ``tokenizer`` returns for those texts.
  """
  cleaned_texts = examples['text_cleaned']
  encoded = tokenizer(cleaned_texts, truncation=True)
  return encoded

In [34]:
from datasets import Dataset

# Wrap both pandas splits as Hugging Face datasets.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


def _tokenize_batch(examples):
  """Bind the notebook's tokenizer so that ``map`` only has to pass a batch."""
  return preprocess_function(tokenizer, examples)


# Tokenize each split in batches.
tokenizer_train = train_dataset.map(_tokenize_batch, batched=True)
tokenizer_test = test_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]