In [30]:
from IPython.display import clear_output

# %pip install datasets
# %pip install janome mojimoji
# %pip install transformers
# %pip install sentencepiece
# %pip install tensorflow
# clear_output()

In [31]:
from janome.tokenizer import Tokenizer
import mojimoji

# Initialize Japanese tokenizer
tokenizer = Tokenizer()


def preprocess_japanese_text(text):
    # Convert full-width characters to half-width
    text = mojimoji.zen_to_han(text)

    # Tokenize the text
    tokens = tokenizer.tokenize(text, wakati=True)

    # Join tokens with a space to preserve word boundaries for translation models
    preprocessed_text = " ".join(tokens)

    # Remove any unnecessary whitespace
    preprocessed_text = preprocessed_text.strip()

    return preprocessed_text


# Example usage
text = "私は日本語のテキストを翻訳します。"
processed_text = preprocess_japanese_text(text)
print(processed_text)

私 は 日本語 の ﾃｷｽﾄ を 翻訳 し ます ｡


In [32]:
import pandas as pd

df = pd.read_csv(
    "hf://datasets/Verah/tatoeba_dedupe_en-jp_2024-March-01/tatoeba_dedupe_random_en-jp_2024-03-01.tsv.gz",
    sep="\t",
)
df.to_csv("./datas/eng_jap_dataset.csv")

In [33]:
df[:20]

Unnamed: 0,id,english,japanese
0,1,While in England I often consulted the guidebook.,イギリスにいる間、私はよくそのガイドブックを参考にした。
1,2,Look at the sports car over there.,あそこのスポーツカーを見なさい。
2,3,Never did I expect that he would fail the exam...,彼が試験に失敗するなんて私は予想もしなかった。
3,4,He knows no foreign language except English.,彼は英語以外の外国語は全く知らない。
4,5,All their secrets have been revealed.,彼らの秘密が全部暴かれた。
5,6,He promised to return the money without fail.,彼は間違いなく金を返すと約束した。
6,7,"To tell the truth, things at home haven't been...",実は家庭が上手くいってなくてさ・・・。離婚しようかと思ってるんだ。
7,8,Can I have your phone number?,電話番号教えてもらってもいい？
8,9,Even a monkey can solve a problem as simple as...,こんなに簡単な問題は猿さえも解けますよ。
9,10,You had better speak more naturally.,君はもっと自然に話す方がよい。


### Removing Emojis

In [34]:
import re
import pandas as pd
import tqdm


emoji_list_datas_path = "./datas//Emoji Sheets - Emoji Only.csv"
emoji_df = pd.read_csv(emoji_list_datas_path)

# Extract the emojis into a list
emoji_list = emoji_df["Emoji_List"].tolist()

# Start the pattern string
pattern = "["

# Append each code point to the pattern string, ensuring each one is 8 digits
for cp in emoji_list:
    pattern += f"\\U{cp[1:]:0>8}"

# Close the pattern string
pattern += "]"

# Compile the regular expression
emoji_pattern = re.compile(pattern, re.UNICODE)


# function to completely remove the emojis from the comments using re
def remove_emojis(text):
    if not isinstance(text, str):
        return text  # or return an empty string: return ''

    # Compile the regular expression
    emoji_pattern = re.compile(pattern, re.UNICODE)

    # Use the sub method to remove emojis
    text_no_emojis = emoji_pattern.sub(r"", text)
    return text_no_emojis

In [35]:
from tqdm import tqdm

tqdm.pandas(desc="Removing emojis from English text")
df["english"] = df["english"].progress_apply(remove_emojis)

tqdm.pandas(desc="Removing emojis from Japanese text")
df["japanese"] = df["japanese"].progress_apply(remove_emojis)

Removing emojis from English text: 100%|██████████| 201607/201607 [00:11<00:00, 18235.04it/s]
Removing emojis from Japanese text: 100%|██████████| 201607/201607 [00:05<00:00, 39889.13it/s]


## b

In [36]:
import mojimoji
def zen_to_han(text):
    return mojimoji.zen_to_han(text)

### Removing accented characters

In [37]:
import unicodedata

# English might have accent like é but Japanese doesn't have any accent I just create different function to ascii for Japanese and English
# Removing accented characters
def english_unicode_to_ascii(text):
    return "".join(
        ascii_text
        for ascii_text in unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("utf-8", "ignore")
    )

def japanese_unicode_to_ascii(text):
    return "".join(ascii_text for ascii_text in unicodedata.normalize("NFKD", text))


japanese_unicode_to_ascii("こんにちは。 今日は"), english_unicode_to_ascii(
    "Hello world é "
)

('こんにちは。 今日は', 'Hello world e ')

## Remove all punctuation and special characters, keeping only the specified characters (a-z, A-Z, Kanji, Katakana, Hiragana)

In [41]:
import re

def clean_text(text):
    # Define regex pattern for allowed characters (Latin letters and Japanese characters)
    allowed_pattern = r"[^a-zA-Z\u4E00-\u9FFF\u3040-\u30FF\s]"
    # Replace everything not in allowed pattern with a space
    cleaned_text = re.sub(allowed_pattern, " ", text)

    # Remove extra spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
    
    cleaned_text=cleaned_text.lower()

    return cleaned_text

# Example usage
text = "こんにちは、世界! This is an example sentence."
print(clean_text(text))

こんにちは 世界 this is an example sentence


In [39]:
from janome.tokenizer import Tokenizer
import mojimoji

# Initialize Japanese tokenizer
tokenizer = Tokenizer()


def preprocess_japanese_text(text):
    # Convert full-width characters to half-width
    text = mojimoji.zen_to_han(text)

    # Tokenize the text
    tokens = tokenizer.tokenize(text, wakati=True)

    # Join tokens with a space to preserve word boundaries for translation models
    preprocessed_text = " ".join(tokens)

    # Remove any unnecessary whitespace
    preprocessed_text = preprocessed_text.strip()

    return preprocessed_text


text = "私は日本語のテキストを翻訳します。"
processed_text = preprocess_japanese_text(text)
print(processed_text)

私 は 日本語 の ﾃｷｽﾄ を 翻訳 し ます ｡


In [40]:
# Sample Japanese sentence
sentence = "私は猫が好きです。"

# Tokenize and print POS information
for token in tokenizer.tokenize(sentence):
    print(f"Word: {token.surface}, POS: {token.part_of_speech}")

Word: 私, POS: 名詞,代名詞,一般,*
Word: は, POS: 助詞,係助詞,*,*
Word: 猫, POS: 名詞,一般,*,*
Word: が, POS: 助詞,格助詞,一般,*
Word: 好き, POS: 名詞,形容動詞語幹,*,*
Word: です, POS: 助動詞,*,*,*
Word: 。, POS: 記号,句点,*,*
