# Japanese Data Cleaning

In [4]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.util import ngrams
from pykakasi import kakasi
import re
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df  =  pd.read_excel('name_data/exiger_datasets/EXGR_Japanese names.xlsx')
print(df)

        Unnamed: 0                                        id  \
0                0  http://www.wikidata.org/entity/Q17229484   
1                1  http://www.wikidata.org/entity/Q19958115   
2                2  http://www.wikidata.org/entity/Q11237697   
3                3  http://www.wikidata.org/entity/Q11482333   
4                4  http://www.wikidata.org/entity/Q11528757   
...            ...                                       ...   
202435      202435  http://www.wikidata.org/entity/Q11672225   
202436      202436    http://www.wikidata.org/entity/Q495816   
202437      202437  http://www.wikidata.org/entity/Q11640390   
202438      202438   http://www.wikidata.org/entity/Q3984842   
202439      202439  http://www.wikidata.org/entity/Q11362511   

                  fullname  Family name  Given name  
0       Hirotoshi Nakamura          NaN         NaN  
1          Sachio Hosokawa          NaN         NaN  
2                      OZA          NaN         NaN  
3              

In [3]:
#shape before doing anything (202440,5)
df.shape

(202440, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,fullname,Family name,Given name
0,0,http://www.wikidata.org/entity/Q17229484,Hirotoshi Nakamura,,
1,1,http://www.wikidata.org/entity/Q19958115,Sachio Hosokawa,,
2,2,http://www.wikidata.org/entity/Q11237697,OZA,,
3,3,http://www.wikidata.org/entity/Q11482333,平塚利男,,
4,4,http://www.wikidata.org/entity/Q11528757,Jun Kochi,,


In [5]:
#cheaking how many null value each columns have 
nan_count = np.sum(df.isnull(), axis = 0)
nan_count


Unnamed: 0          0
id                  0
fullname            1
Family name    202440
Given name     202440
dtype: int64

In [6]:
#checking if full name have any duplicates 
df['fullname'].duplicated().any()

True

In [7]:
#create a new variable and drop the duplicates(went form 202440,5 to 193534,5)
df2 = df.drop_duplicates(subset=['fullname'])
print(df2)

        Unnamed: 0                                        id  \
0                0  http://www.wikidata.org/entity/Q17229484   
1                1  http://www.wikidata.org/entity/Q19958115   
2                2  http://www.wikidata.org/entity/Q11237697   
3                3  http://www.wikidata.org/entity/Q11482333   
4                4  http://www.wikidata.org/entity/Q11528757   
...            ...                                       ...   
202435      202435  http://www.wikidata.org/entity/Q11672225   
202436      202436    http://www.wikidata.org/entity/Q495816   
202437      202437  http://www.wikidata.org/entity/Q11640390   
202438      202438   http://www.wikidata.org/entity/Q3984842   
202439      202439  http://www.wikidata.org/entity/Q11362511   

                  fullname  Family name  Given name  
0       Hirotoshi Nakamura          NaN         NaN  
1          Sachio Hosokawa          NaN         NaN  
2                      OZA          NaN         NaN  
3              

In [8]:
# making sure all duplicates are gone 
df2['fullname'].duplicated().any()

False

In [9]:
#checking if the shape went down(went form (202440,5) to (193534,5))
df2.shape

(193534, 5)

In [10]:
# checking if the null values went down (it did) 
nan_count = np.sum(df2.isnull(), axis = 0)
nan_count

Unnamed: 0          0
id                  0
fullname            1
Family name    193534
Given name     193534
dtype: int64

In [11]:
#printing the col_names with null values
condition = nan_count != 0
col_names = nan_count[condition].index
nan_cols = list(col_names)
print(nan_cols)

print(df2['id'].unique())
print(df2['fullname'].unique())
print(df2['Family name'].unique())
print(df2['Given name'].unique())

['fullname', 'Family name', 'Given name']
['http://www.wikidata.org/entity/Q17229484'
 'http://www.wikidata.org/entity/Q19958115'
 'http://www.wikidata.org/entity/Q11237697' ...
 'http://www.wikidata.org/entity/Q11460373'
 'http://www.wikidata.org/entity/Q11384166'
 'http://www.wikidata.org/entity/Q11384100']
['Hirotoshi Nakamura' 'Sachio Hosokawa' 'OZA' ... 'Ryō Yuzawa' '菅原照仁'
 '世羅りさ']
[nan]
[nan]


In [12]:
#create new variable that has the dataframe with the drop column
df2 = df2.drop(columns = ['Unnamed: 0'])

nan_count = np.sum(df2.isnull(), axis = 0)
nan_count


id                  0
fullname            1
Family name    193534
Given name     193534
dtype: int64

In [13]:
#fullname still has one nan value 
df2[df2['fullname'].isna()]

Unnamed: 0,id,fullname,Family name,Given name
9364,http://www.wikidata.org/entity/Q24859852,,,


In [14]:
#dropping that nan value for good 
df2.dropna(subset=['fullname'], inplace= True)

nan_count = np.sum(df2.isnull(), axis = 0)
nan_count

id                  0
fullname            0
Family name    193533
Given name     193533
dtype: int64

In [15]:
df2[df2['fullname'].isna()]

Unnamed: 0,id,fullname,Family name,Given name


In [16]:
non_alnum_names_japanese = [name for name in df2['fullname'] if isinstance(name, str) and not name.replace(' ', '').isalnum()]
print(len(non_alnum_names_japanese))
non_alnum_names_japanese


4806


['佐藤直樹 (日活)',
 '島田陽子 (詩人)',
 'Fushimi-no-miya Sadakiyo-shinnō',
 '加藤歩 (アナウンサー)',
 '菅原健 (イラストレーター)',
 "Yūza San'yūtei",
 "Shin'ichi Tamura",
 'AKIRA (プロレスラー)',
 '朝日稔 (動物学者)',
 "Gen'ichirō Kakegawa",
 "Gen'e Imai",
 '山田泉 (教育者)',
 'Shigeakira-shinnō',
 'bird (日本の歌手)',
 '優希 (声優)',
 '笑福亭松鶴 (6代目)',
 '斎藤修 (一橋大学)',
 '松本龍 (アナウンサー)',
 'Tokiwai-no-miya Tsunenao-shinnō',
 "Kan'ichi Kawakami",
 "Shin'ichirō Ishiwatari",
 '江藤智 (政治家)',
 'T-岡田',
 "Kin'ichi Aoki",
 '阿修羅・原',
 '桃子 (AV女優)',
 'Enshi-naishinnō',
 "Ken'ichiro Kuroiwa",
 '清水哲也 (医学者)',
 "Shin'ichirō Hara",
 'ジェイムズ・アベグレン',
 "Ken'ichi Tsukamoto",
 "Ken'ichirō Hamada",
 'Rihō-joō',
 "Shin'ya Ōwada",
 '三浦大輔 (雀士)',
 "Shōko Nihon'yanagi",
 '井上薫 (弁護士)',
 'み〜こ',
 '伊藤武彦 (内務官僚)',
 'Nishimura Tei (government official)',
 'トゥイ・ティエン',
 '河野太郎 (美術家)',
 "Shin'ichi Kimura",
 "Gen'ichirō Ōshita",
 '2C = Galore',
 '佐藤まさみ (声優)',
 'シギ (歌手)',
 "Ken'ichirō Masaoka",
 "Jun'ichi Akimoto",
 '坂本茂 (野球)',
 "Kan'ichirō Shinobu",
 '中村俊介 (新聞記者)',
 '前田和之 (サッカー選手)',
 'Ooshima,

In [17]:
#Drop columns that won't be used in df2
print(df2.columns)
df2 = df2.drop(columns=['id', 'Family name', 'Given name'])

Index(['id', 'fullname', 'Family name', 'Given name'], dtype='object')


In [18]:
print(df2)

                  fullname
0       Hirotoshi Nakamura
1          Sachio Hosokawa
2                      OZA
3                     平塚利男
4                Jun Kochi
...                    ...
202435       Takada Makoto
202436    Masakiyo Maezono
202437          Ryō Yuzawa
202438                菅原照仁
202439                世羅りさ

[193533 rows x 1 columns]


In [19]:
def has_numbers(fullname):
    return any(char.isdigit() for char in str(fullname))

# Filter and print rows with numbers
df2 = df2[~df2['fullname'].apply(has_numbers)]
#number_rows = df2[df2['fullname'].apply(has_numbers)]
print(df2)
#print(number_rows)

                  fullname
0       Hirotoshi Nakamura
1          Sachio Hosokawa
2                      OZA
3                     平塚利男
4                Jun Kochi
...                    ...
202435       Takada Makoto
202436    Masakiyo Maezono
202437          Ryō Yuzawa
202438                菅原照仁
202439                世羅りさ

[193194 rows x 1 columns]


In [20]:
df2['original_fullname'] = df2['fullname']
df2['fullname'] = df2['fullname'].apply(str.lower)
df2

Unnamed: 0,fullname,original_fullname
0,hirotoshi nakamura,Hirotoshi Nakamura
1,sachio hosokawa,Sachio Hosokawa
2,oza,OZA
3,平塚利男,平塚利男
4,jun kochi,Jun Kochi
...,...,...
202435,takada makoto,Takada Makoto
202436,masakiyo maezono,Masakiyo Maezono
202437,ryō yuzawa,Ryō Yuzawa
202438,菅原照仁,菅原照仁


In [21]:
df2 = df2.reset_index(drop=True)
print(df2)
df2.shape

                  fullname   original_fullname
0       hirotoshi nakamura  Hirotoshi Nakamura
1          sachio hosokawa     Sachio Hosokawa
2                      oza                 OZA
3                     平塚利男                平塚利男
4                jun kochi           Jun Kochi
...                    ...                 ...
193189       takada makoto       Takada Makoto
193190    masakiyo maezono    Masakiyo Maezono
193191          ryō yuzawa          Ryō Yuzawa
193192                菅原照仁                菅原照仁
193193                世羅りさ                世羅りさ

[193194 rows x 2 columns]


(193194, 2)

In [22]:
#checking if there are still duplicates(no duplicates)
df2['fullname'].duplicated().any()
df2.shape

(193194, 2)

In [23]:
# Count the number of duplicates in the 'fullname' column
num_duplicates = df2['fullname'].duplicated().sum()

# Print the number of duplicate values
print(num_duplicates )

392


In [24]:
#drop the duplicates(went from 193194 to 192802)
df2 = df2.drop_duplicates(subset='fullname')
df2.shape


(192802, 2)

In [25]:
#check again if thee is any null value 
nan_count = np.sum(df2.isnull(), axis = 0)
nan_count


fullname             0
original_fullname    0
dtype: int64

In [26]:
df2 = df2.reset_index(drop=True)
print(df2)
df2.shape

                  fullname   original_fullname
0       hirotoshi nakamura  Hirotoshi Nakamura
1          sachio hosokawa     Sachio Hosokawa
2                      oza                 OZA
3                     平塚利男                平塚利男
4                jun kochi           Jun Kochi
...                    ...                 ...
192797       takada makoto       Takada Makoto
192798    masakiyo maezono    Masakiyo Maezono
192799          ryō yuzawa          Ryō Yuzawa
192800                菅原照仁                菅原照仁
192801                世羅りさ                世羅りさ

[192802 rows x 2 columns]


(192802, 2)

Feature Engineering 

In [27]:
#make the char_ngrams 
def get_ngrams(text, n):
    if isinstance(text,str):
        name = list(text)
    ngrams_list =  list(ngrams(list(text), n))
    return ngrams_list

In [28]:
# feature to check the name length
def name_length(name):
    if isinstance(name,str):
        return len(name.replace(' ', ''))
    else:
        return None

In [29]:
# feature to check the token length
def token_length(name):
    if isinstance(name,str):
       return len(name.split())
    else:
       return None

In [30]:
def japanese_lan(name):
    if isinstance(name, str):
        return [unicodedata.name(char).split(' ')[0] for char in name]
    else:
        return None

# Apply the function only to rows where 'fullname' is a string
df2['determine_alphabet'] = df2['fullname'].apply(japanese_lan)
print(df2['determine_alphabet'])


0         [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...
1         [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...
2                                     [LATIN, LATIN, LATIN]
3                                      [CJK, CJK, CJK, CJK]
4         [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
                                ...                        
192797    [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...
192798    [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...
192799    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
192800                                 [CJK, CJK, CJK, CJK]
192801                       [CJK, CJK, HIRAGANA, HIRAGANA]
Name: determine_alphabet, Length: 192802, dtype: object


In [31]:
# create a transliteratoion feature 
def remove_all_diacritics(text):
  normalized_text = unicodedata.normalize('NFKD', text)
  cleaned_text = ''.join([char for char in normalized_text if not unicodedata.combining(char)])
  return cleaned_text
def transliteration(name):
  kks = kakasi()
  kks.setMode("H", "a")
  kks.setMode("K", "a")
  kks.setMode("J", "a")
  kks.setMode("s", True) # Add space, default: no separator
  kks.setMode("C", False) # Do not capitalize
  kks.setMode("E", "K") # Remove accents
  kks.setMode("r", "Hepburn")
  conv = kks.getConverter()
  # Remove all diacritics before transliteration
  name_without_diacritics = remove_all_diacritics(name)
  return conv.do(name_without_diacritics)
df2['transliteration'] = df2['fullname'].apply(transliteration)
print(df2['transliteration'])



  kks.setMode("H", "a")
  kks.setMode("K", "a")
  kks.setMode("J", "a")
  kks.setMode("s", True) # Add space, default: no separator
  kks.setMode("C", False) # Do not capitalize
  kks.setMode("E", "K") # Remove accents
  kks.setMode("r", "Hepburn")
  conv = kks.getConverter()
  return conv.do(name_without_diacritics)


0            hirotoshi nakamura
1               sachio hosokawa
2                           oza
3              hiratsuka toshio
4                     jun kochi
                  ...          
192797            takada makoto
192798         masakiyo maezono
192799               ryo yuzawa
192800    sugawara shou hitoshi
192801                sera risa
Name: transliteration, Length: 192802, dtype: object


In [32]:
eliminate_charaters = r'[!&()*+=@:[\\\^",\'/_\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]'

contain_charaters = df2['transliteration'].str.contains(eliminate_charaters)

df2 = df2[~contain_charaters]
df2.reset_index(drop=True, inplace=True)
df2


Unnamed: 0,fullname,original_fullname,determine_alphabet,transliteration
0,hirotoshi nakamura,Hirotoshi Nakamura,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",hirotoshi nakamura
1,sachio hosokawa,Sachio Hosokawa,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",sachio hosokawa
2,oza,OZA,"[LATIN, LATIN, LATIN]",oza
3,平塚利男,平塚利男,"[CJK, CJK, CJK, CJK]",hiratsuka toshio
4,jun kochi,Jun Kochi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",jun kochi
...,...,...,...,...
187929,takada makoto,Takada Makoto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",takada makoto
187930,masakiyo maezono,Masakiyo Maezono,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",masakiyo maezono
187931,ryō yuzawa,Ryō Yuzawa,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",ryo yuzawa
187932,菅原照仁,菅原照仁,"[CJK, CJK, CJK, CJK]",sugawara shou hitoshi


In [33]:
#Creating the char_ngrams after transliteration
df2= df2.copy()
df2["unigrams"] = df2['transliteration'].apply(lambda name: list(name) if isinstance(name, str) else [])
df2["bigrams"] = df2['transliteration'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])
df2["trigrams"] = df2['transliteration'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])

df2['char_ngrams'] = df2["unigrams"] + df2["bigrams"] + df2["trigrams"]

In [34]:
#after translitration
df2['name_length'] = df2['transliteration'].apply(name_length)

df2['num_tokens'] = df2['transliteration'].apply(token_length)

#df2['determine_alphabet_tr'] = df2['transliteration'].apply(japanese_lan)

df2['period_freq'] = df2['transliteration'].apply(lambda name: name.count('.') if isinstance(name, str) else [])
df2['dash_freq'] = df2['transliteration'].apply(lambda name: name.count('-') if isinstance(name, str) else [])
df2['space_freq'] = df2['transliteration'].apply(lambda name: name.count(' ' )if isinstance(name, str) else [])
df2['apostrophe_freq'] = df2['transliteration'].apply(lambda name: name.count('\''))


In [35]:
df2

Unnamed: 0,fullname,original_fullname,determine_alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq
0,hirotoshi nakamura,Hirotoshi Nakamura,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",hirotoshi nakamura,"[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...","[(h, i), (i, r), (r, o), (o, t), (t, o), (o, s...","[(h, i, r), (i, r, o), (r, o, t), (o, t, o), (...","[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...",17,2,0,0,1,0
1,sachio hosokawa,Sachio Hosokawa,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",sachio hosokawa,"[s, a, c, h, i, o, , h, o, s, o, k, a, w, a]","[(s, a), (a, c), (c, h), (h, i), (i, o), (o, ...","[(s, a, c), (a, c, h), (c, h, i), (h, i, o), (...","[s, a, c, h, i, o, , h, o, s, o, k, a, w, a, ...",14,2,0,0,1,0
2,oza,OZA,"[LATIN, LATIN, LATIN]",oza,"[o, z, a]","[(o, z), (z, a)]","[(o, z, a)]","[o, z, a, (o, z), (z, a), (o, z, a)]",3,1,0,0,0,0
3,平塚利男,平塚利男,"[CJK, CJK, CJK, CJK]",hiratsuka toshio,"[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, o]","[(h, i), (i, r), (r, a), (a, t), (t, s), (s, u...","[(h, i, r), (i, r, a), (r, a, t), (a, t, s), (...","[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, ...",15,2,0,0,1,0
4,jun kochi,Jun Kochi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",jun kochi,"[j, u, n, , k, o, c, h, i]","[(j, u), (u, n), (n, ), ( , k), (k, o), (o, c...","[(j, u, n), (u, n, ), (n, , k), ( , k, o), (...","[j, u, n, , k, o, c, h, i, (j, u), (u, n), (n...",8,2,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187929,takada makoto,Takada Makoto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",takada makoto,"[t, a, k, a, d, a, , m, a, k, o, t, o]","[(t, a), (a, k), (k, a), (a, d), (d, a), (a, ...","[(t, a, k), (a, k, a), (k, a, d), (a, d, a), (...","[t, a, k, a, d, a, , m, a, k, o, t, o, (t, a)...",12,2,0,0,1,0
187930,masakiyo maezono,Masakiyo Maezono,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",masakiyo maezono,"[m, a, s, a, k, i, y, o, , m, a, e, z, o, n, o]","[(m, a), (a, s), (s, a), (a, k), (k, i), (i, y...","[(m, a, s), (a, s, a), (s, a, k), (a, k, i), (...","[m, a, s, a, k, i, y, o, , m, a, e, z, o, n, ...",15,2,0,0,1,0
187931,ryō yuzawa,Ryō Yuzawa,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",ryo yuzawa,"[r, y, o, , y, u, z, a, w, a]","[(r, y), (y, o), (o, ), ( , y), (y, u), (u, z...","[(r, y, o), (y, o, ), (o, , y), ( , y, u), (...","[r, y, o, , y, u, z, a, w, a, (r, y), (y, o),...",9,2,0,0,1,0
187932,菅原照仁,菅原照仁,"[CJK, CJK, CJK, CJK]",sugawara shou hitoshi,"[s, u, g, a, w, a, r, a, , s, h, o, u, , h, ...","[(s, u), (u, g), (g, a), (a, w), (w, a), (a, r...","[(s, u, g), (u, g, a), (g, a, w), (a, w, a), (...","[s, u, g, a, w, a, r, a, , s, h, o, u, , h, ...",19,3,0,0,2,0


In [36]:
def contains_special_characters(text):
    return bool(re.search(r'[!&*+@:=^():]', text))

def contains_japanese_or_special(text):
    return bool(re.search(r'[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]', text)) or contains_special_characters(text)

contains_japanese_or_special_characters_count = len(df2[df2['transliteration'].apply(contains_japanese_or_special)])

print(f'The number of names with Japanese characters or special characters: {contains_japanese_or_special_characters_count}')


The number of names with Japanese characters or special characters: 0


In [37]:
df2.shape

(187934, 14)

In [38]:
'''
Function that returns the relative frequency distribution for characters, aka unigrams, across the entire language.
Returns a hashmap sorted by the ASCII values of the keys in ascending order.

df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

In [39]:
# Creating the unigrams frequency distribution for the entire Indonesian language
unigram_fdist = create_lang_char_distribution(df2, 'transliteration')
print(len(unigram_fdist))
unigram_fdist

61


{' ': 0.07879491313522605,
 '-': 0.00022155737853618506,
 '.': 0.00021036371307007118,
 'a': 0.15459455578710574,
 'b': 0.007226090041073032,
 'c': 0.01000983884595625,
 'd': 0.010900314233208827,
 'e': 0.028484790703545034,
 'f': 0.003667662388241865,
 'g': 0.011578109976605239,
 'h': 0.055303269206496496,
 'i': 0.11785115590035476,
 'j': 0.008750744475250613,
 'k': 0.07267468004451219,
 'l': 0.0002709639019728256,
 'm': 0.04605035373912815,
 'n': 0.037970843203380025,
 'o': 0.09144298313500603,
 'p': 0.0005662450771996228,
 'q': 1.119366546611388e-05,
 'r': 0.03559354025145604,
 's': 0.06236608612715001,
 't': 0.04603259826976811,
 'u': 0.07241375184261244,
 'v': 7.410978515496085e-05,
 'w': 0.00889742009170314,
 'x': 5.59683273305694e-05,
 'y': 0.02905257973460205,
 'z': 0.008836047925871688,
 '~': 7.719769286975089e-07,
 '\xad': 3.8598846434875445e-07,
 '×': 1.1579653930462634e-06,
 'ł': 3.8598846434875445e-07,
 'ʼ': 1.9299423217437725e-06,
 'ʾ': 3.8598846434875445e-07,
 'θ': 3.859

In [40]:
'''
Function that returns all possible bigrams as a hashmap. Each possible bigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams
    

In [41]:
# Initializing all possible bigrams using all possible characters from unigrams frequency distribution
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', '-'): 0,
 (' ', '.'): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', '~'): 0,
 (' ', '\xad'): 0,
 (' ', '×'): 0,
 (' ', 'ł'): 0,
 (' ', 'ʼ'): 0,
 (' ', 'ʾ'): 0,
 (' ', 'θ'): 0,
 (' ', 'λ'): 0,
 (' ', 'φ'): 0,
 (' ', '\u200b'): 0,
 (' ', '\u200e'): 0,
 (' ', '\u200f'): 0,
 (' ', '‐'): 0,
 (' ', '―'): 0,
 (' ', '’'): 0,
 (' ', '∀'): 0,
 (' ', '◎'): 0,
 (' ', '★'): 0,
 (' ', '☆'): 0,
 (' ', '♂'): 0,
 (' ', '♪'): 0,
 (' ', '⻄'): 0,
 (' ', '、'): 0,
 (' ', '。'): 0,
 (' ', '々'): 0,
 (' ', '〆'): 0,
 (' ', '〜'): 0,
 (' ', '﨑'): 0,
 (' ', '𠀋'): 0,
 (' ', '𠮷'): 0,
 (' ', '𥙿'): 0,
 (' ', '𧶛'): 0,
 ('-',

In [42]:
'''
Function that returns the relative frequency distribution for -grams (bigrams, trigrams, etc.) across the entire language.
Returns a hashmap.

initialized_grams: a hashmap with all possible -grams as keys and all values initialized to 0. This parameter is copied in the function.
df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

In [43]:
# Creating the bigrams frequency distribution for the entire Indonesian language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df2, 'bigrams')
bigram_fdist

{(' ', ' '): 9.946658442985878e-05,
 (' ', '-'): 2.0808908876539496e-06,
 (' ', '.'): 1.8311839811354757e-05,
 (' ', 'a'): 0.004176348011521476,
 (' ', 'b'): 0.000497332922149294,
 (' ', 'c'): 0.0009339038303790926,
 (' ', 'd'): 0.0006013774665319914,
 (' ', 'e'): 0.0012015063985313904,
 (' ', 'f'): 0.0015519284240123154,
 (' ', 'g'): 0.000901441932531691,
 (' ', 'h'): 0.00663304779348573,
 (' ', 'i'): 0.003674437129419344,
 (' ', 'j'): 0.0010250468512583355,
 (' ', 'k'): 0.012837432064114746,
 (' ', 'l'): 1.9144196166416334e-05,
 (' ', 'm'): 0.009593323170262237,
 (' ', 'n'): 0.0054893901616311185,
 (' ', 'o'): 0.003177520385447581,
 (' ', 'p'): 2.3305977941724235e-05,
 (' ', 'q'): 2.0808908876539496e-06,
 (' ', 'r'): 0.0021562191377870227,
 (' ', 's'): 0.01071117775510994,
 (' ', 't'): 0.010208434516652745,
 (' ', 'u'): 0.0010487690073775905,
 (' ', 'v'): 2.3305977941724235e-05,
 (' ', 'w'): 0.0006667174404043255,
 (' ', 'x'): 2.7883937894562924e-05,
 (' ', 'y'): 0.007403393600095221

In [44]:
'''
Function that returns all possible trigrams as a hashmap. Each possible trigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

In [45]:
# Finding all possible transliterated characters
all_possible_chars_translit = create_lang_char_distribution(df2, 'transliteration').keys()
print(len(all_possible_chars_translit))

# Creating all possible trigrams from transliterated characters
initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
print(len(initialized_trigrams))
initialized_trigrams

61
226981


{(' ', ' ', ' '): 0,
 (' ', ' ', '-'): 0,
 (' ', ' ', '.'): 0,
 (' ', ' ', 'a'): 0,
 (' ', ' ', 'b'): 0,
 (' ', ' ', 'c'): 0,
 (' ', ' ', 'd'): 0,
 (' ', ' ', 'e'): 0,
 (' ', ' ', 'f'): 0,
 (' ', ' ', 'g'): 0,
 (' ', ' ', 'h'): 0,
 (' ', ' ', 'i'): 0,
 (' ', ' ', 'j'): 0,
 (' ', ' ', 'k'): 0,
 (' ', ' ', 'l'): 0,
 (' ', ' ', 'm'): 0,
 (' ', ' ', 'n'): 0,
 (' ', ' ', 'o'): 0,
 (' ', ' ', 'p'): 0,
 (' ', ' ', 'q'): 0,
 (' ', ' ', 'r'): 0,
 (' ', ' ', 's'): 0,
 (' ', ' ', 't'): 0,
 (' ', ' ', 'u'): 0,
 (' ', ' ', 'v'): 0,
 (' ', ' ', 'w'): 0,
 (' ', ' ', 'x'): 0,
 (' ', ' ', 'y'): 0,
 (' ', ' ', 'z'): 0,
 (' ', ' ', '~'): 0,
 (' ', ' ', '\xad'): 0,
 (' ', ' ', '×'): 0,
 (' ', ' ', 'ł'): 0,
 (' ', ' ', 'ʼ'): 0,
 (' ', ' ', 'ʾ'): 0,
 (' ', ' ', 'θ'): 0,
 (' ', ' ', 'λ'): 0,
 (' ', ' ', 'φ'): 0,
 (' ', ' ', '\u200b'): 0,
 (' ', ' ', '\u200e'): 0,
 (' ', ' ', '\u200f'): 0,
 (' ', ' ', '‐'): 0,
 (' ', ' ', '―'): 0,
 (' ', ' ', '’'): 0,
 (' ', ' ', '∀'): 0,
 (' ', ' ', '◎'): 0,
 (' ', ' ', '★')

In [46]:
# Changing trigrams column to become transliterated
df2['trigrams'] = df2['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# Creating the trigrams frequency distribution for the entire Indonesian language
trigram_fdist = create_lang_gram_distribution(initialized_trigrams, df2, 'trigrams')
trigram_fdist

{(' ', ' ', ' '): 0.0,
 (' ', ' ', '-'): 0.0,
 (' ', ' ', '.'): 0.0,
 (' ', ' ', 'a'): 5.417886067273891e-06,
 (' ', ' ', 'b'): 1.3544715168184728e-06,
 (' ', ' ', 'c'): 3.160433539243103e-06,
 (' ', ' ', 'd'): 4.514905056061576e-07,
 (' ', ' ', 'e'): 3.6119240448492607e-06,
 (' ', ' ', 'f'): 4.514905056061576e-07,
 (' ', ' ', 'g'): 1.3544715168184728e-06,
 (' ', ' ', 'h'): 1.5802167696215515e-05,
 (' ', ' ', 'i'): 8.578319606516994e-06,
 (' ', ' ', 'j'): 2.257452528030788e-06,
 (' ', ' ', 'k'): 1.2190243651366256e-05,
 (' ', ' ', 'l'): 0.0,
 (' ', ' ', 'm'): 9.932791123335467e-06,
 (' ', ' ', 'n'): 4.514905056061576e-06,
 (' ', ' ', 'o'): 9.029810112123152e-07,
 (' ', ' ', 'p'): 0.0,
 (' ', ' ', 'q'): 0.0,
 (' ', ' ', 'r'): 1.3544715168184728e-06,
 (' ', ' ', 's'): 1.5802167696215515e-05,
 (' ', ' ', 't'): 1.0835772134547783e-05,
 (' ', ' ', 'u'): 4.514905056061576e-07,
 (' ', ' ', 'v'): 0.0,
 (' ', ' ', 'w'): 4.514905056061576e-07,
 (' ', ' ', 'x'): 0.0,
 (' ', ' ', 'y'): 9.029810112

In [47]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
initialized_unigrams

{' ': 0,
 '-': 0,
 '.': 0,
 'a': 0,
 'b': 0,
 'c': 0,
 'd': 0,
 'e': 0,
 'f': 0,
 'g': 0,
 'h': 0,
 'i': 0,
 'j': 0,
 'k': 0,
 'l': 0,
 'm': 0,
 'n': 0,
 'o': 0,
 'p': 0,
 'q': 0,
 'r': 0,
 's': 0,
 't': 0,
 'u': 0,
 'v': 0,
 'w': 0,
 'x': 0,
 'y': 0,
 'z': 0,
 '~': 0,
 '\xad': 0,
 '×': 0,
 'ł': 0,
 'ʼ': 0,
 'ʾ': 0,
 'θ': 0,
 'λ': 0,
 'φ': 0,
 '\u200b': 0,
 '\u200e': 0,
 '\u200f': 0,
 '‐': 0,
 '―': 0,
 '’': 0,
 '∀': 0,
 '◎': 0,
 '★': 0,
 '☆': 0,
 '♂': 0,
 '♪': 0,
 '⻄': 0,
 '、': 0,
 '。': 0,
 '々': 0,
 '〆': 0,
 '〜': 0,
 '﨑': 0,
 '𠀋': 0,
 '𠮷': 0,
 '𥙿': 0,
 '𧶛': 0}

In [48]:
'''
Function to be applied to an ngrams column. Returns a hashmap of the relative frequency distribution for the current example.

grams_list: the list of -grams for this current example.
initialized_grams: a hashmap of all possible unigrams, bigrams, or trigrams as the keys and all values set to 0. This parameter is copied in the function.
'''
def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative

In [49]:
# UNIGRAMS individual frequency distributions
df2['indiv_unigrams_fdist'] = df2["unigrams"].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# checking that the functin worked for our first example, 'supriyadi'
print(df2.iloc[0]['indiv_unigrams_fdist'])

df2.tail()

{' ': 0.05555555555555555, '-': 0, '.': 0, 'a': 0.16666666666666666, 'b': 0, 'c': 0, 'd': 0, 'e': 0, 'f': 0, 'g': 0, 'h': 0.1111111111111111, 'i': 0.1111111111111111, 'j': 0, 'k': 0.05555555555555555, 'l': 0, 'm': 0.05555555555555555, 'n': 0.05555555555555555, 'o': 0.1111111111111111, 'p': 0, 'q': 0, 'r': 0.1111111111111111, 's': 0.05555555555555555, 't': 0.05555555555555555, 'u': 0.05555555555555555, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0, '~': 0, '\xad': 0, '×': 0, 'ł': 0, 'ʼ': 0, 'ʾ': 0, 'θ': 0, 'λ': 0, 'φ': 0, '\u200b': 0, '\u200e': 0, '\u200f': 0, '‐': 0, '―': 0, '’': 0, '∀': 0, '◎': 0, '★': 0, '☆': 0, '♂': 0, '♪': 0, '⻄': 0, '、': 0, '。': 0, '々': 0, '〆': 0, '〜': 0, '﨑': 0, '𠀋': 0, '𠮷': 0, '𥙿': 0, '𧶛': 0}


Unnamed: 0,fullname,original_fullname,determine_alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq,indiv_unigrams_fdist
187929,takada makoto,Takada Makoto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",takada makoto,"[t, a, k, a, d, a, , m, a, k, o, t, o]","[(t, a), (a, k), (k, a), (a, d), (d, a), (a, ...","[(t, a, k), (a, k, a), (k, a, d), (a, d, a), (...","[t, a, k, a, d, a, , m, a, k, o, t, o, (t, a)...",12,2,0,0,1,0,"{' ': 0.07692307692307693, '-': 0, '.': 0, 'a'..."
187930,masakiyo maezono,Masakiyo Maezono,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",masakiyo maezono,"[m, a, s, a, k, i, y, o, , m, a, e, z, o, n, o]","[(m, a), (a, s), (s, a), (a, k), (k, i), (i, y...","[(m, a, s), (a, s, a), (s, a, k), (a, k, i), (...","[m, a, s, a, k, i, y, o, , m, a, e, z, o, n, ...",15,2,0,0,1,0,"{' ': 0.0625, '-': 0, '.': 0, 'a': 0.1875, 'b'..."
187931,ryō yuzawa,Ryō Yuzawa,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",ryo yuzawa,"[r, y, o, , y, u, z, a, w, a]","[(r, y), (y, o), (o, ), ( , y), (y, u), (u, z...","[(r, y, o), (y, o, ), (o, , y), ( , y, u), (...","[r, y, o, , y, u, z, a, w, a, (r, y), (y, o),...",9,2,0,0,1,0,"{' ': 0.1, '-': 0, '.': 0, 'a': 0.2, 'b': 0, '..."
187932,菅原照仁,菅原照仁,"[CJK, CJK, CJK, CJK]",sugawara shou hitoshi,"[s, u, g, a, w, a, r, a, , s, h, o, u, , h, ...","[(s, u), (u, g), (g, a), (a, w), (w, a), (a, r...","[(s, u, g), (u, g, a), (g, a, w), (a, w, a), (...","[s, u, g, a, w, a, r, a, , s, h, o, u, , h, ...",19,3,0,0,2,0,"{' ': 0.09523809523809523, '-': 0, '.': 0, 'a'..."
187933,世羅りさ,世羅りさ,"[CJK, CJK, HIRAGANA, HIRAGANA]",sera risa,"[s, e, r, a, , r, i, s, a]","[(s, e), (e, r), (r, a), (a, ), ( , r), (r, i...","[(s, e, r), (e, r, a), (r, a, ), (a, , r), (...","[s, e, r, a, , r, i, s, a, (s, e), (e, r), (r...",8,2,0,0,1,0,"{' ': 0.1111111111111111, '-': 0, '.': 0, 'a':..."


In [50]:
# BIGRAMS individual frequency distributions
df2['indiv_bigrams_fdist'] = df2['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

# checking that it works for 'supriyadi'
print(df2.iloc[0]['indiv_bigrams_fdist'][('s', 'u')])
print(1 / len(df2.iloc[0]['bigrams']))

df2.head()

0
0.058823529411764705


Unnamed: 0,fullname,original_fullname,determine_alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq,indiv_unigrams_fdist,indiv_bigrams_fdist
0,hirotoshi nakamura,Hirotoshi Nakamura,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",hirotoshi nakamura,"[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...","[(h, i), (i, r), (r, o), (o, t), (t, o), (o, s...","[(h, i, r), (i, r, o), (r, o, t), (o, t, o), (...","[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...",17,2,0,0,1,0,"{' ': 0.05555555555555555, '-': 0, '.': 0, 'a'...","{(' ', ' '): 0, (' ', '-'): 0, (' ', '.'): 0, ..."
1,sachio hosokawa,Sachio Hosokawa,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",sachio hosokawa,"[s, a, c, h, i, o, , h, o, s, o, k, a, w, a]","[(s, a), (a, c), (c, h), (h, i), (i, o), (o, ...","[(s, a, c), (a, c, h), (c, h, i), (h, i, o), (...","[s, a, c, h, i, o, , h, o, s, o, k, a, w, a, ...",14,2,0,0,1,0,"{' ': 0.06666666666666667, '-': 0, '.': 0, 'a'...","{(' ', ' '): 0, (' ', '-'): 0, (' ', '.'): 0, ..."
2,oza,OZA,"[LATIN, LATIN, LATIN]",oza,"[o, z, a]","[(o, z), (z, a)]","[(o, z, a)]","[o, z, a, (o, z), (z, a), (o, z, a)]",3,1,0,0,0,0,"{' ': 0, '-': 0, '.': 0, 'a': 0.33333333333333...","{(' ', ' '): 0, (' ', '-'): 0, (' ', '.'): 0, ..."
3,平塚利男,平塚利男,"[CJK, CJK, CJK, CJK]",hiratsuka toshio,"[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, o]","[(h, i), (i, r), (r, a), (a, t), (t, s), (s, u...","[(h, i, r), (i, r, a), (r, a, t), (a, t, s), (...","[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, ...",15,2,0,0,1,0,"{' ': 0.0625, '-': 0, '.': 0, 'a': 0.125, 'b':...","{(' ', ' '): 0, (' ', '-'): 0, (' ', '.'): 0, ..."
4,jun kochi,Jun Kochi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",jun kochi,"[j, u, n, , k, o, c, h, i]","[(j, u), (u, n), (n, ), ( , k), (k, o), (o, c...","[(j, u, n), (u, n, ), (n, , k), ( , k, o), (...","[j, u, n, , k, o, c, h, i, (j, u), (u, n), (n...",8,2,0,0,1,0,"{' ': 0.1111111111111111, '-': 0, '.': 0, 'a':...","{(' ', ' '): 0, (' ', '-'): 0, (' ', '.'): 0, ..."


In [51]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2['indiv_unigrams_fdist'] = df2['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [52]:
# Calculating cosine similarity
df2['unigrams_cosine_sim'] = df2['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])

In [53]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2['indiv_bigrams_fdist'] = df2['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)


In [54]:
# Calculating cosine similarity
df2['bigrams_cosine_sim'] = df2['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])

In [55]:
df2


Unnamed: 0,fullname,original_fullname,determine_alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,hirotoshi nakamura,Hirotoshi Nakamura,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",hirotoshi nakamura,"[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...","[(h, i), (i, r), (r, o), (o, t), (t, o), (o, s...","[(h, i, r), (i, r, o), (r, o, t), (o, t, o), (...","[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...",17,2,0,0,1,0,"[[0.05555555555555555, 0.0, 0.0, 0.16666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.930607,0.612908
1,sachio hosokawa,Sachio Hosokawa,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",sachio hosokawa,"[s, a, c, h, i, o, , h, o, s, o, k, a, w, a]","[(s, a), (a, c), (c, h), (h, i), (i, o), (o, ...","[(s, a, c), (a, c, h), (c, h, i), (h, i, o), (...","[s, a, c, h, i, o, , h, o, s, o, k, a, w, a, ...",14,2,0,0,1,0,"[[0.06666666666666667, 0.0, 0.0, 0.2, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.805768,0.394186
2,oza,OZA,"[LATIN, LATIN, LATIN]",oza,"[o, z, a]","[(o, z), (z, a)]","[(o, z, a)]","[o, z, a, (o, z), (z, a), (o, z, a)]",3,1,0,0,0,0,"[[0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.523246,0.020991
3,平塚利男,平塚利男,"[CJK, CJK, CJK, CJK]",hiratsuka toshio,"[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, o]","[(h, i), (i, r), (r, a), (a, t), (t, s), (s, u...","[(h, i, r), (i, r, a), (r, a, t), (a, t, s), (...","[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, ...",15,2,0,0,1,0,"[[0.0625, 0.0, 0.0, 0.125, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.883438,0.600381
4,jun kochi,Jun Kochi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",jun kochi,"[j, u, n, , k, o, c, h, i]","[(j, u), (u, n), (n, ), ( , k), (k, o), (o, c...","[(j, u, n), (u, n, ), (n, , k), ( , k, o), (...","[j, u, n, , k, o, c, h, i, (j, u), (u, n), (n...",8,2,0,0,1,0,"[[0.1111111111111111, 0.0, 0.0, 0.0, 0.0, 0.11...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.646229,0.287507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187929,takada makoto,Takada Makoto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",takada makoto,"[t, a, k, a, d, a, , m, a, k, o, t, o]","[(t, a), (a, k), (k, a), (a, d), (d, a), (a, ...","[(t, a, k), (a, k, a), (k, a, d), (a, d, a), (...","[t, a, k, a, d, a, , m, a, k, o, t, o, (t, a)...",12,2,0,0,1,0,"[[0.07692307692307693, 0.0, 0.0, 0.30769230769...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.750043,0.466026
187930,masakiyo maezono,Masakiyo Maezono,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",masakiyo maezono,"[m, a, s, a, k, i, y, o, , m, a, e, z, o, n, o]","[(m, a), (a, s), (s, a), (a, k), (k, i), (i, y...","[(m, a, s), (a, s, a), (s, a, k), (a, k, i), (...","[m, a, s, a, k, i, y, o, , m, a, e, z, o, n, ...",15,2,0,0,1,0,"[[0.0625, 0.0, 0.0, 0.1875, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.822051,0.364580
187931,ryō yuzawa,Ryō Yuzawa,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",ryo yuzawa,"[r, y, o, , y, u, z, a, w, a]","[(r, y), (y, o), (o, ), ( , y), (y, u), (u, z...","[(r, y, o), (y, o, ), (o, , y), ( , y, u), (...","[r, y, o, , y, u, z, a, w, a, (r, y), (y, o),...",9,2,0,0,1,0,"[[0.1, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.630333,0.190562
187932,菅原照仁,菅原照仁,"[CJK, CJK, CJK, CJK]",sugawara shou hitoshi,"[s, u, g, a, w, a, r, a, , s, h, o, u, , h, ...","[(s, u), (u, g), (g, a), (a, w), (w, a), (a, r...","[(s, u, g), (u, g, a), (g, a, w), (a, w, a), (...","[s, u, g, a, w, a, r, a, , s, h, o, u, , h, ...",19,3,0,0,2,0,"[[0.09523809523809523, 0.0, 0.0, 0.14285714285...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.850570,0.549854


In [56]:
# Save with compression (gzip)
df2.to_pickle('japanese_df.pkl.gz', compression='gzip')

In [57]:
df_loaded_gz = pd.read_pickle('japanese_df.pkl.gz', compression='gzip')

: 