In [21]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pyarrow 
from datasets import Dataset

# Load Dataset

In [2]:
# Read the jutsu records into a DataFrame. `lines=True` tells pandas the file
# is JSON Lines (one JSON object per line). The last line previews the first rows.
data_path = '../data/jutsus.jsonl'
df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Battōjutsu,Kenjutsu,User attacks the enemy with a sword at an incr...
2,Bat Controlling: Ultrasonic Mind Waves,Ninjutsu,"Once Rinji controls his bats, he commands them..."
3,Beast-Human Fury Kicks,"Hiden, Taijutsu","In their transformed state, a Tenrō clan shino..."
4,Beast Tearing Gale Palm,Ninjutsu,This technique is somewhat of an enhanced vers...


In [3]:
def simplify_justsu(jutsu):
    """Collapse a raw jutsu-type value into one of three coarse classes.

    The categories are checked in a fixed priority order (Genjutsu, then
    Ninjutsu, then Taijutsu), so a value that mentions several of them is
    assigned the first one in that order.

    Args:
        jutsu: Raw jutsu type, e.g. ``'Hiden, Taijutsu'``. Any value that
            supports the ``in`` operator is accepted.

    Returns:
        ``'Genjutsu'``, ``'Ninjutsu'`` or ``'Taijutsu'``; ``None`` when no
        category matches or when the value is missing (``None`` / NaN).
    """
    try:
        for category in ('Genjutsu', 'Ninjutsu', 'Taijutsu'):
            if category in jutsu:
                return category
    except TypeError:
        # Missing values (None, float NaN) do not support `in`; treat them
        # as "no category" instead of aborting the whole `.apply`.
        return None
    return None

In [4]:
# Derive a coarse label column by passing every raw `jutsu_type` value through
# simplify_justsu, then preview the first 20 rows to check the mapping.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_justsu)
df.head(20)

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Battōjutsu,Kenjutsu,User attacks the enemy with a sword at an incr...,
2,Bat Controlling: Ultrasonic Mind Waves,Ninjutsu,"Once Rinji controls his bats, he commands them...",Ninjutsu
3,Beast-Human Fury Kicks,"Hiden, Taijutsu","In their transformed state, a Tenrō clan shino...",Taijutsu
4,Beast Tearing Gale Palm,Ninjutsu,This technique is somewhat of an enhanced vers...,Ninjutsu
5,Beast Human Clone,"Ninjutsu, Clone Techniques","An ""Imitation Human Ninja Art"" (擬人忍法, Gijin Ni...",Ninjutsu
6,Bee Bomb Technique,"Hiden, Ninjutsu",This is a technique in which bees bombard the ...,Ninjutsu
7,Beast-Human Needle Senbon,"Hiden, Ninjutsu","In their transformed state, a Tenrō clan shino...",Ninjutsu
8,Beast Tearing Palm,Ninjutsu,A powerful technique of wind nature chakra tha...,Ninjutsu
9,Beast Sealing Technique,"Ninjutsu, Fūinjutsu","After creating the necessary seals, the seal f...",Ninjutsu


In [5]:
# Class distribution of the simplified labels (value_counts skips missing
# values by default, so unmatched rows are not counted here).
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2258
Taijutsu     397
Genjutsu     101
Name: count, dtype: int64

In [6]:
# Build the model input ("<name>. <description>") and the label column, keep
# only those two columns, and drop rows where either one is missing.
df = (
    df.assign(
        text=df['jutsu_name'] + '. ' + df['jutsu_description'],
        jutsus=df['jutsu_type_simplified'],
    )
    .loc[:, ['text', 'jutsus']]
    .dropna()
)
df.head()

Unnamed: 0,text,jutsus
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
2,Bat Controlling: Ultrasonic Mind Waves. Once R...,Ninjutsu
3,Beast-Human Fury Kicks. In their transformed s...,Taijutsu
4,Beast Tearing Gale Palm. This technique is som...,Ninjutsu
5,"Beast Human Clone. An ""Imitation Human Ninja A...",Ninjutsu


In [7]:
# Class distribution of the label column after dropping incomplete rows.
df['jutsus'].value_counts()

jutsus
Ninjutsu    2258
Taijutsu     397
Genjutsu     101
Name: count, dtype: int64

In [8]:
class Cleaner():
    """Turn HTML-flavoured text into plain text.

    ``clean`` is the entry point: it adds a newline after every closing
    paragraph tag, strips all markup, and trims surrounding whitespace.
    """

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after each ``</p>`` tag.

        The newline keeps paragraphs separated once the tags are stripped.
        """
        # The HTML closing tag uses a forward slash; '<\p>' is an invalid
        # escape sequence in Python and never occurs in real markup.
        return text.replace('</p>', '</p>\n')

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup's lxml parser and return its text content."""
        clean_text = BeautifulSoup(text, 'lxml').text
        return clean_text

    def clean(self, text):
        """Apply paragraph line breaks, strip markup, and trim whitespace.

        Args:
            text: Raw string, possibly containing HTML markup.

        Returns:
            The cleaned plain-text string.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()

        return text
        

In [9]:
# Names of the DataFrame columns holding the raw input text and the class
# label; later cells index `df` through these variables.
text_column_name = 'text'
label_column_name = 'jutsus'

In [10]:
# Clean Text
# Run Cleaner.clean over every value of the text column and store the result
# in a new `text_cleaned` column (the original text column is kept).
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, 'lxml').text


In [11]:
# Preview the first two rows to compare `text` with `text_cleaned`.
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
2,Bat Controlling: Ultrasonic Mind Waves. Once R...,Ninjutsu,Bat Controlling: Ultrasonic Mind Waves. Once R...


In [12]:
# Encode labels
# Fit the encoder on the label column's values so each distinct label name is
# assigned an integer class id.
le = LabelEncoder()
le.fit(df[label_column_name].tolist())

In [13]:
# Map each integer class id to its label name. `classes_` is the fitted
# encoder's public attribute, and its order defines the ids that
# `le.transform` produces.
label_dict = dict(enumerate(le.classes_))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [14]:
# Add the integer class id for every row, using the fitted encoder.
df['label'] = le.transform(df[label_column_name].tolist())

In [15]:
# Preview the frame with the new integer `label` column.
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
2,Bat Controlling: Ultrasonic Mind Waves. Once R...,Ninjutsu,Bat Controlling: Ultrasonic Mind Waves. Once R...,1
3,Beast-Human Fury Kicks. In their transformed s...,Taijutsu,Beast-Human Fury Kicks. In their transformed s...,2
4,Beast Tearing Gale Palm. This technique is som...,Ninjutsu,Beast Tearing Gale Palm. This technique is som...,1
5,"Beast Human Clone. An ""Imitation Human Ninja A...",Ninjutsu,"Beast Human Clone. An ""Imitation Human Ninja A...",1


In [16]:
# Hold out a fraction of the rows for evaluation. Stratifying on the integer
# label keeps the class proportions the same in both splits, and the fixed
# seed makes the split reproducible.
test_size = 0.2
split_options = dict(
    test_size=test_size,
    stratify=df['label'],
    random_state=42,
)
df_train, df_test = train_test_split(df, **split_options)

In [17]:
# Class distribution inside the training split.
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1806
Taijutsu     317
Genjutsu      81
Name: count, dtype: int64

In [18]:
# Hugging Face Hub identifier of the pretrained checkpoint whose tokenizer is loaded below.
model_name = 'distilbert/distilbert-base-uncased'

In [19]:
# Load the tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [20]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` field of ``examples``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping with a ``'text_cleaned'`` entry holding the text(s)
            to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for the cleaned text, with truncation
        enabled.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [22]:
def _tokenize_batch(examples):
    """Tokenize one batch of examples with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


# Wrap both pandas splits as Hugging Face datasets.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Tokenize each split in batches with the shared helper.
tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)

Map: 100%|██████████| 2204/2204 [00:00<00:00, 5385.11 examples/s]
Map: 100%|██████████| 552/552 [00:00<00:00, 8305.82 examples/s]
