# Korean Data Cleaning

In [4]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.util import ngrams
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw Korean-names workbook into a DataFrame (the path is relative to the notebook's working directory).
df  =  pd.read_excel('../name_data/exiger_datasets/EXGR_Korean names.xlsx')
# Plain-text dump of the frame; the recorded output shows pandas eliding the middle rows.
print(df)

       Unnamed: 0                                        id       fullname  \
0               0    http://www.wikidata.org/entity/Q484396  Park Joo-bong   
1               1  http://www.wikidata.org/entity/Q55728351  KIM Jong hoon   
2               2  http://www.wikidata.org/entity/Q11266766            이민혁   
3               3  http://www.wikidata.org/entity/Q47492159         Lee Ho   
4               4   http://www.wikidata.org/entity/Q1075756            최민호   
...           ...                                       ...            ...   
21197       21197  http://www.wikidata.org/entity/Q11267007     Lee Han-wi   
21198       21198  http://www.wikidata.org/entity/Q12586843   Gil Jung-woo   
21199       21199    http://www.wikidata.org/entity/Q224639            이정희   
21200       21200  http://www.wikidata.org/entity/Q12586585            금태섭   
21201       21201  http://www.wikidata.org/entity/Q18684836      Oh Ui-sik   

       Family name  Given name  
0              NaN         NaN

In [3]:
# Inspect the dimensions of the raw frame; the recorded run shows (21202, 5).
df.shape

(21202, 5)

In [4]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0.1,Unnamed: 0,id,fullname,Family name,Given name
0,0,http://www.wikidata.org/entity/Q484396,Park Joo-bong,,
1,1,http://www.wikidata.org/entity/Q55728351,KIM Jong hoon,,
2,2,http://www.wikidata.org/entity/Q11266766,이민혁,,
3,3,http://www.wikidata.org/entity/Q47492159,Lee Ho,,
4,4,http://www.wikidata.org/entity/Q1075756,최민호,,


In [5]:
# Count the null entries in each column (axis=0 sums down the rows of the boolean mask).
# Recorded run: 'Family name' and 'Given name' are null in all 21202 rows.
nan_count = np.sum(df.isnull(), axis = 0)
nan_count

Unnamed: 0         0
id                 0
fullname           0
Family name    21202
Given name     21202
dtype: int64

In [6]:
# Check whether any 'fullname' value occurs more than once (recorded run: True).
df['fullname'].duplicated().any()

True

In [7]:
# Drop rows whose 'fullname' repeats an earlier row, then recount nulls per column.
# Recorded run: the null counts fall from 21202 to 19520, i.e. 19520 rows remain.
df2 =df.drop_duplicates(subset=['fullname'])

nan_count = np.sum(df2.isnull(), axis = 0)
nan_count

Unnamed: 0         0
id                 0
fullname           0
Family name    19520
Given name     19520
dtype: int64

In [8]:
# Confirm that no exact-duplicate 'fullname' values remain (recorded run: False).
df2['fullname'].duplicated().any()

False

In [9]:
# Show the current columns, then drop every column except 'fullname'.
print(df2.columns)
df2 = df2.drop(columns=['Unnamed: 0', 'id', 'Family name', 'Given name'])


Index(['Unnamed: 0', 'id', 'fullname', 'Family name', 'Given name'], dtype='object')


In [10]:
# Plain-text dump of the single-column frame (recorded run: 19520 rows x 1 column).
print(df2)

            fullname
0      Park Joo-bong
1      KIM Jong hoon
2                이민혁
3             Lee Ho
4                최민호
...              ...
21197     Lee Han-wi
21198   Gil Jung-woo
21199            이정희
21200            금태섭
21201      Oh Ui-sik

[19520 rows x 1 columns]


In [11]:
#row count went from 19520 to 19219 after removing names that contain digits
def has_numbers(fullname):
    """Return True when the string form of ``fullname`` contains at least one digit character.

    The argument is passed through ``str()`` first, so non-string values are
    inspected via their string representation.
    """
    for symbol in str(fullname):
        if symbol.isdigit():
            return True
    return False

# Keep only the rows whose 'fullname' contains no digit (the ~ inverts the has_numbers mask).
df2 = df2[~df2['fullname'].apply(has_numbers)]
# Disabled: selected the rows that DO contain digits for inspection (the author's note records 301 of them).
#number_rows = df2[df2['fullname'].apply(has_numbers)] # 301
print(df2)
#print(number_rows)

            fullname
0      Park Joo-bong
1      KIM Jong hoon
2                이민혁
3             Lee Ho
4                최민호
...              ...
21197     Lee Han-wi
21198   Gil Jung-woo
21199            이정희
21200            금태섭
21201      Oh Ui-sik

[19219 rows x 1 columns]


In [12]:
# Keep the original casing in its own column before normalising.
df2['original_fullname'] = df2['fullname']
# Lower-case every name; str.lower is called on each value, so every value must already be a str.
df2['fullname'] = df2['fullname'].apply(str.lower)
df2

Unnamed: 0,fullname,original_fullname
0,park joo-bong,Park Joo-bong
1,kim jong hoon,KIM Jong hoon
2,이민혁,이민혁
3,lee ho,Lee Ho
4,최민호,최민호
...,...,...
21197,lee han-wi,Lee Han-wi
21198,gil jung-woo,Gil Jung-woo
21199,이정희,이정희
21200,금태섭,금태섭


In [16]:
# Re-check for duplicates after lower-casing (recorded run: True, so duplicates are present again).
df2['fullname'].duplicated().any()

True

In [19]:
# Dimensions before the second de-duplication (recorded run: (19219, 2)).
df2.shape

(19219, 2)

In [17]:
# Count the rows whose 'fullname' repeats an earlier row (duplicated() marks every occurrence after the first).
num_duplicates = df2['fullname'].duplicated().sum()

# Print the bare count.
print(num_duplicates )

Number of duplicates: 101


In [21]:
# Drop the duplicates that remain after lower-casing (recorded run: 19219 -> 19118 rows).
df2 = df2.drop_duplicates(subset='fullname')
df2.shape

(19118, 2)

In [22]:
# Sanity check: count duplicate 'fullname' values again after the drop.
num_duplicates = df2['fullname'].duplicated().sum()

# Print the count (recorded run: 0).
print(num_duplicates )

0


In [23]:
# Boolean form of the same check (recorded run: False).
df2['fullname'].duplicated().any()

False

In [53]:
# Count the null entries in each column of df2.
# NOTE(review): this cell carries execution count 53 and its recorded output already lists the feature
# columns created further down, so it was presumably last run after those cells - confirm the intended order.
nan_count = np.sum(df2.isnull(), axis = 0)
nan_count

fullname                0
original_fullname       0
alphabet                0
transliteration         0
unigrams                0
bigrams                 0
trigrams                0
char_ngrams             0
num_tokens              0
period_freq             0
dash_freq               0
space_freq              0
name_length             0
avg_token_length        0
indiv_unigrams_fdist    0
indiv_bigrams_fdist     0
unigrams_cosine_sim     0
bigrams_cosine_sim      0
dtype: int64

Now For Feature Engineering 

In [24]:
# Determine the script (first word of the Unicode character name) of every character in a name.
def Korean_lan(name):
    """Return the script label of each character in ``name``.

    The label is the first space-separated word of the character's Unicode
    name, e.g. 'LATIN', 'HANGUL', 'SPACE' or 'HYPHEN-MINUS'.

    name: the value to inspect.

    Returns a list with one label per character when ``name`` is a str,
    otherwise None. Characters that have no Unicode name (for example tab or
    newline) are labelled 'UNKNOWN' rather than raising ValueError.
    """
    if isinstance(name, str):
        # unicodedata.name raises ValueError for unnamed characters unless a default is supplied.
        return [unicodedata.name(char, 'UNKNOWN').split(' ')[0] for char in name]
    else:
        return None

# Apply Korean_lan to every 'fullname'; the function itself returns None for non-string values.
df2['alphabet'] = df2['fullname'].apply(Korean_lan)
print(df2['alphabet'])


0        [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
1        [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
2                                 [HANGUL, HANGUL, HANGUL]
3               [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]
4                                 [HANGUL, HANGUL, HANGUL]
                               ...                        
21197    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
21198    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
21199                             [HANGUL, HANGUL, HANGUL]
21200                             [HANGUL, HANGUL, HANGUL]
21201    [LATIN, LATIN, SPACE, LATIN, LATIN, HYPHEN-MIN...
Name: alphabet, Length: 19118, dtype: object


In [25]:
# Character n-gram helper used for the bigram/trigram feature columns.
def get_ngrams(text, n):
    """Return the consecutive n-grams of ``text`` as a list of tuples.

    text: the sequence to slide over; a str is split into its characters.
    n: the size of each gram.

    Returns a list of n-tuples in order of appearance. Non-iterable input
    (e.g. None) raises TypeError from ``list(text)``, so callers guard with
    ``isinstance(name, str)`` before calling.
    """
    # list() turns a str into its characters; ngrams() yields the sliding windows.
    return list(ngrams(list(text), n))

"""
df2["unigrams"] = df2['fullname'].apply(lambda name: list(name) if isinstance(name, str) else [])
df2["bigrams"] = df2['fullname'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])
df2["trigrams"] = df2['fullname'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])

df2['char_ngrams'] = df2["unigrams"] + df2["bigrams"] + df2["trigrams"]
"""

'\ndf2["unigrams"] = df2[\'fullname\'].apply(lambda name: list(name) if isinstance(name, str) else [])\ndf2["bigrams"] = df2[\'fullname\'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])\ndf2["trigrams"] = df2[\'fullname\'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])\n\ndf2[\'char_ngrams\'] = df2["unigrams"] + df2["bigrams"] + df2["trigrams"]\n'

In [26]:
# Feature: number of whitespace-separated tokens in a name.
def token_length(name):
    """Return how many whitespace-separated tokens ``name`` contains.

    name: the value to measure.

    Returns an int for str input and None for anything else.
    """
    if not isinstance(name, str):
        return None
    pieces = name.split()
    return len(pieces)
"""
df2['num_tokens'] = df2['fullname'].apply(token_length)
print(df2['num_tokens'])
"""

"\ndf2['num_tokens'] = df2['fullname'].apply(token_length)\nprint(df2['num_tokens'])\n"

In [27]:
# Display the frame; the recorded output shows the columns fullname, original_fullname and alphabet.
df2


Unnamed: 0,fullname,original_fullname,alphabet
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT..."
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]"
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]"
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]"
...,...,...,...
21197,lee han-wi,Lee Han-wi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT..."
21198,gil jung-woo,Gil Jung-woo,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT..."
21199,이정희,이정희,"[HANGUL, HANGUL, HANGUL]"
21200,금태섭,금태섭,"[HANGUL, HANGUL, HANGUL]"


In [28]:
"""
"""

def transliteration(name):
    """Romanize ``name`` with the 'academic' rule and normalise its whitespace.

    name: the text handed to ``Transliter.translit``.

    Returns the transliterated text with its whitespace-separated words
    re-joined by single spaces.
    """
    romanizer = Transliter(academic)
    # split() with no argument drops leading/trailing whitespace and collapses runs of it.
    pieces = romanizer.translit(name).split()
    return ' '.join(pieces)
# Romanize every (lower-cased) fullname; the recorded output shows Latin names unchanged and Hangul names romanized.
df2['transliteration'] = df2['fullname'].apply(transliteration)
print(df2['transliteration'])


"""
"""


0        park joo-bong
1        kim jong hoon
2            iminhyeog
3               lee ho
4            choeminho
             ...      
21197       lee han-wi
21198     gil jung-woo
21199        ijeonghui
21200      geumtaeseob
21201        oh ui-sik
Name: transliteration, Length: 19118, dtype: object


'\n'

In [29]:
# Character n-gram features, built from the transliterated names.
# Non-string entries fall back to an empty list in each column.

df2["unigrams"] = df2['transliteration'].apply(lambda name: list(name) if isinstance(name, str) else [])
df2["bigrams"] = df2['transliteration'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])
df2["trigrams"] = df2['transliteration'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])

# Element-wise list concatenation: each row holds its unigrams, then bigrams, then trigrams.
df2['char_ngrams'] = df2["unigrams"] + df2["bigrams"] + df2["trigrams"]

In [30]:
#df2['determine_alphabet'] = df2['transliteration'].apply(Korean_lan)
# Number of whitespace-separated tokens in each transliterated name.
df2['num_tokens'] = df2['transliteration'].apply(token_length)

# Per-name counts of '.', '-' and ' '. Non-string entries become NaN (as in get_name_length)
# instead of an empty list, so the columns stay numeric.
df2['period_freq'] = df2['transliteration'].apply(lambda name: name.count('.') if isinstance(name, str) else np.nan)
df2['dash_freq'] = df2['transliteration'].apply(lambda name: name.count('-') if isinstance(name, str) else np.nan)
df2['space_freq'] = df2['transliteration'].apply(lambda name: name.count(' ') if isinstance(name, str) else np.nan)

In [31]:
# name_length: number of characters in the whole name string.
def get_name_length(fullname):
    """Return the character count of ``fullname``.

    fullname: the value to measure.

    Returns ``len(fullname)`` for str input and ``np.nan`` for anything else.
    """
    return len(fullname) if isinstance(fullname, str) else np.nan
    
# Length of each transliterated name, counting spaces and hyphens.
df2['name_length'] = df2['transliteration'].apply(get_name_length)

In [32]:
# Split each transliterated name on single spaces (note: token_length uses split() with no argument).
tokens = df2['transliteration'].apply(lambda name: name.split(' '))
# Show the last five rows as a spot check.
print(tokens[-5:], '\n')
# Character length of every token in every name.
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
print(token_lengths[-5:])
# Mean token length per name.
df2['avg_token_length'] = token_lengths.apply(np.mean)
df2

21197      [lee, han-wi]
21198    [gil, jung-woo]
21199        [ijeonghui]
21200      [geumtaeseob]
21201       [oh, ui-sik]
Name: transliteration, dtype: object 

21197    [3, 6]
21198    [3, 8]
21199       [9]
21200      [11]
21201    [2, 6]
Name: transliteration, dtype: object


Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",park joo-bong,"[p, a, r, k, , j, o, o, -, b, o, n, g]","[(p, a), (a, r), (r, k), (k, ), ( , j), (j, o...","[(p, a, r), (a, r, k), (r, k, ), (k, , j), (...","[p, a, r, k, , j, o, o, -, b, o, n, g, (p, a)...",2,0,1,1,13,6.000000
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",kim jong hoon,"[k, i, m, , j, o, n, g, , h, o, o, n]","[(k, i), (i, m), (m, ), ( , j), (j, o), (o, n...","[(k, i, m), (i, m, ), (m, , j), ( , j, o), (...","[k, i, m, , j, o, n, g, , h, o, o, n, (k, i)...",3,0,0,2,13,3.666667
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,9.000000
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",lee ho,"[l, e, e, , h, o]","[(l, e), (e, e), (e, ), ( , h), (h, o)]","[(l, e, e), (e, e, ), (e, , h), ( , h, o)]","[l, e, e, , h, o, (l, e), (e, e), (e, ), ( ,...",2,0,0,1,6,2.500000
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",choeminho,"[c, h, o, e, m, i, n, h, o]","[(c, h), (h, o), (o, e), (e, m), (m, i), (i, n...","[(c, h, o), (h, o, e), (o, e, m), (e, m, i), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,9.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21197,lee han-wi,Lee Han-wi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",lee han-wi,"[l, e, e, , h, a, n, -, w, i]","[(l, e), (e, e), (e, ), ( , h), (h, a), (a, n...","[(l, e, e), (e, e, ), (e, , h), ( , h, a), (...","[l, e, e, , h, a, n, -, w, i, (l, e), (e, e),...",2,0,1,1,10,4.500000
21198,gil jung-woo,Gil Jung-woo,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",gil jung-woo,"[g, i, l, , j, u, n, g, -, w, o, o]","[(g, i), (i, l), (l, ), ( , j), (j, u), (u, n...","[(g, i, l), (i, l, ), (l, , j), ( , j, u), (...","[g, i, l, , j, u, n, g, -, w, o, o, (g, i), (...",2,0,1,1,12,5.500000
21199,이정희,이정희,"[HANGUL, HANGUL, HANGUL]",ijeonghui,"[i, j, e, o, n, g, h, u, i]","[(i, j), (j, e), (e, o), (o, n), (n, g), (g, h...","[(i, j, e), (j, e, o), (e, o, n), (o, n, g), (...","[i, j, e, o, n, g, h, u, i, (i, j), (j, e), (e...",1,0,0,0,9,9.000000
21200,금태섭,금태섭,"[HANGUL, HANGUL, HANGUL]",geumtaeseob,"[g, e, u, m, t, a, e, s, e, o, b]","[(g, e), (e, u), (u, m), (m, t), (t, a), (a, e...","[(g, e, u), (e, u, m), (u, m, t), (m, t, a), (...","[g, e, u, m, t, a, e, s, e, o, b, (g, e), (e, ...",1,0,0,0,11,11.000000


In [33]:
def create_lang_char_distribution(df, col_name):
    '''
    Return the relative frequency of every item found in the entries of df[col_name].

    Each entry of the column is iterated item by item (for a string entry, character by character).
    Every item is tallied and each tally is divided by the total number of items seen in the column.

    df: a Pandas DataFrame with examples in only one language.
    col_name: the name of the column whose entries are iterated.

    Returns a hashmap of item -> relative frequency whose keys are in ascending sorted order.
    An input with no items at all yields an empty hashmap.
    '''
    tallies = {}
    seen = 0  # items counted across the entire column

    for entry in df[col_name]:
        for item in entry:
            tallies[item] = tallies.get(item, 0) + 1
            seen += 1

    ordered = {}
    for item in sorted(tallies):
        ordered[item] = tallies[item] / seen
    return ordered

In [34]:
# Creating the unigrams (character) frequency distribution over all transliterated Korean names.
# The 'transliteration' column holds strings, so the helper iterates them character by character.
unigram_fdist = create_lang_char_distribution(df2, 'transliteration')
# Number of distinct characters (recorded run: 74).
print(len(unigram_fdist))
unigram_fdist

74


{' ': 0.051050841788382206,
 "'": 1.8582524993496117e-05,
 '(': 0.002740922436540677,
 ')': 0.002740922436540677,
 ',': 5.110194373211432e-05,
 '-': 0.05476734678708143,
 '.': 0.00018582524993496116,
 '/': 4.645631248374029e-06,
 ':': 4.645631248374029e-06,
 'a': 0.05950589066042294,
 'b': 0.013769651020180622,
 'c': 0.014289961719998514,
 'd': 0.008733786746943174,
 'e': 0.0846619838703683,
 'f': 0.0002601553499089456,
 'g': 0.08767235291931468,
 'h': 0.05003344854498829,
 'i': 0.06272066748429776,
 'j': 0.03562734604378043,
 'k': 0.0254719961348348,
 'l': 0.019195748318281487,
 'm': 0.03319303526963244,
 'n': 0.12028003865165199,
 'o': 0.10017374660868919,
 'p': 0.005291373991898019,
 'q': 3.7165049986992234e-05,
 'r': 0.007210019697476493,
 's': 0.03936243356747315,
 't': 0.004984762329505333,
 'u': 0.057517560486118856,
 'v': 0.0006736165310142342,
 'w': 0.013346898576578586,
 'x': 8.362136247073252e-05,
 'y': 0.0433251570223362,
 'z': 0.00036235923737317426,
 '|': 4.64563124837402

In [35]:
def initialize_all_possible_bigrams(all_possible_chars):
    '''
    Return a hashmap with every ordered pair of characters as a key and every value set to 0.

    all_possible_chars: the characters to pair up; it is iterated once per first character,
    so it must be re-iterable (e.g. a list or a dict keys view).

    Keys are inserted first-character-major, following the order of all_possible_chars.
    '''
    return {
        (lead, trail): 0
        for lead in all_possible_chars    # first character of the bigram
        for trail in all_possible_chars   # second character of the bigram
    }
    

In [36]:
# Initializing all possible bigrams from the characters that occur in the unigram frequency distribution.
# unigram_fdist must still be a dict here (a later cell rebinds the name to a numpy array).
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', "'"): 0,
 (' ', '('): 0,
 (' ', ')'): 0,
 (' ', ','): 0,
 (' ', '-'): 0,
 (' ', '.'): 0,
 (' ', '/'): 0,
 (' ', ':'): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', '|'): 0,
 (' ', 'á'): 0,
 (' ', 'ã'): 0,
 (' ', 'é'): 0,
 (' ', 'í'): 0,
 (' ', 'ó'): 0,
 (' ', 'ô'): 0,
 (' ', 'õ'): 0,
 (' ', 'ø'): 0,
 (' ', 'ú'): 0,
 (' ', 'ą'): 0,
 (' ', 'ć'): 0,
 (' ', 'č'): 0,
 (' ', 'ē'): 0,
 (' ', 'ō'): 0,
 (' ', 'ŏ'): 0,
 (' ', 'ś'): 0,
 (' ', 'ş'): 0,
 (' ', 'š'): 0,
 (' ', 'ū'): 0,
 (' ', 'ŭ'): 0,
 (' ', 'ž'): 0,
 (' ', 'ț'): 0,
 (' ', 'ʻ'): 0,
 (' ', '\u200b'): 0,
 (' ', '\u200e'): 0,
 (' ', '\u200f

In [37]:
def create_lang_gram_distribution(initialized_grams, df, col_name):
    '''
    Return the relative frequency distribution for -grams (bigrams, trigrams, etc.) across the entire dataset.

    initialized_grams: a hashmap with all possible -grams as keys and all values initialized to 0.
        It is copied, so the caller's hashmap is left unchanged.
    df: a Pandas DataFrame with examples in only one language.
    col_name: the name of the column where each entry is a list of -grams for the corresponding example.

    Returns a hashmap of -gram -> count / total number of -grams, in the key order of initialized_grams.
    When the column contains no -grams at all, every frequency is 0.0 (instead of dividing by zero).

    Raises KeyError if the column contains a -gram that is not a key of initialized_grams.
    '''
    gram_freqs = initialized_grams.copy()  # work on a copy so initialized_grams can be reused
    total_num_grams = 0  # across the entire language/dataset

    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1

    if total_num_grams == 0:
        # Nothing was observed: report zero frequencies rather than raising ZeroDivisionError.
        return {gram: 0.0 for gram in gram_freqs}

    return {gram: count / total_num_grams for gram, count in gram_freqs.items()}

In [38]:
# Creating the bigrams frequency distribution over all transliterated Korean names.
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df2, 'bigrams')
bigram_fdist

{(' ', ' '): 0.0,
 (' ', "'"): 0.0,
 (' ', '('): 0.0030029876923390675,
 (' ', ')'): 0.0,
 (' ', ','): 0.0,
 (' ', '-'): 0.0,
 (' ', '.'): 0.0,
 (' ', '/'): 0.0,
 (' ', ':'): 0.0,
 (' ', 'a'): 0.0006627986417726294,
 (' ', 'b'): 0.0020546757894951513,
 (' ', 'c'): 0.0023758782082003486,
 (' ', 'd'): 0.003059070654335213,
 (' ', 'e'): 0.0011522499464662636,
 (' ', 'f'): 8.157521744893901e-05,
 (' ', 'g'): 0.0017232764686088366,
 (' ', 'h'): 0.006036566091221487,
 (' ', 'i'): 0.0010553793757456485,
 (' ', 'j'): 0.008437936554874629,
 (' ', 'k'): 0.003599506469934434,
 (' ', 'l'): 0.0006322079352292773,
 (' ', 'm'): 0.0027786558443544853,
 (' ', 'n'): 0.0006475032885009534,
 (' ', 'o'): 0.00036708847852022557,
 (' ', 'p'): 0.0004843528536030754,
 (' ', 'q'): 1.0196902181117377e-05,
 (' ', 'r'): 0.00032630086979575605,
 (' ', 's'): 0.009768632289510446,
 (' ', 't'): 0.0012032344573718505,
 (' ', 'u'): 0.0002243318479845823,
 (' ', 'v'): 6.118141308670426e-05,
 (' ', 'w'): 0.001269514321549

In [39]:
def initialize_all_possible_trigrams(all_possible_chars):
    '''
    Return a hashmap with every ordered triple of characters as a key and every value set to 0.

    all_possible_chars: the characters to combine; it is iterated repeatedly,
    so it must be re-iterable (e.g. a list or a dict keys view).

    Keys are inserted first-character-major, following the order of all_possible_chars.
    '''
    return {
        (head, middle, tail): 0
        for head in all_possible_chars      # first character of the trigram
        for middle in all_possible_chars    # second character of the trigram
        for tail in all_possible_chars      # third character of the trigram
    }

In [40]:
# Finding all characters that occur in the transliterated names (recorded run: 74).
all_possible_chars_translit = create_lang_char_distribution(df2, 'transliteration').keys()
print(len(all_possible_chars_translit))

# Creating every possible trigram from those characters (recorded run: 405224 = 74 ** 3 keys).
initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
print(len(initialized_trigrams))
initialized_trigrams

74
405224


{(' ', ' ', ' '): 0,
 (' ', ' ', "'"): 0,
 (' ', ' ', '('): 0,
 (' ', ' ', ')'): 0,
 (' ', ' ', ','): 0,
 (' ', ' ', '-'): 0,
 (' ', ' ', '.'): 0,
 (' ', ' ', '/'): 0,
 (' ', ' ', ':'): 0,
 (' ', ' ', 'a'): 0,
 (' ', ' ', 'b'): 0,
 (' ', ' ', 'c'): 0,
 (' ', ' ', 'd'): 0,
 (' ', ' ', 'e'): 0,
 (' ', ' ', 'f'): 0,
 (' ', ' ', 'g'): 0,
 (' ', ' ', 'h'): 0,
 (' ', ' ', 'i'): 0,
 (' ', ' ', 'j'): 0,
 (' ', ' ', 'k'): 0,
 (' ', ' ', 'l'): 0,
 (' ', ' ', 'm'): 0,
 (' ', ' ', 'n'): 0,
 (' ', ' ', 'o'): 0,
 (' ', ' ', 'p'): 0,
 (' ', ' ', 'q'): 0,
 (' ', ' ', 'r'): 0,
 (' ', ' ', 's'): 0,
 (' ', ' ', 't'): 0,
 (' ', ' ', 'u'): 0,
 (' ', ' ', 'v'): 0,
 (' ', ' ', 'w'): 0,
 (' ', ' ', 'x'): 0,
 (' ', ' ', 'y'): 0,
 (' ', ' ', 'z'): 0,
 (' ', ' ', '|'): 0,
 (' ', ' ', 'á'): 0,
 (' ', ' ', 'ã'): 0,
 (' ', ' ', 'é'): 0,
 (' ', ' ', 'í'): 0,
 (' ', ' ', 'ó'): 0,
 (' ', ' ', 'ô'): 0,
 (' ', ' ', 'õ'): 0,
 (' ', ' ', 'ø'): 0,
 (' ', ' ', 'ú'): 0,
 (' ', ' ', 'ą'): 0,
 (' ', ' ', 'ć'): 0,
 (' ', ' ', '

In [41]:
# Recompute the trigrams column directly from the transliterated names.
df2['trigrams'] = df2['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# Creating the trigrams frequency distribution over all transliterated Korean names.
trigram_fdist = create_lang_gram_distribution(initialized_trigrams, df2, 'trigrams')
trigram_fdist

{(' ', ' ', ' '): 0.0,
 (' ', ' ', "'"): 0.0,
 (' ', ' ', '('): 0.0,
 (' ', ' ', ')'): 0.0,
 (' ', ' ', ','): 0.0,
 (' ', ' ', '-'): 0.0,
 (' ', ' ', '.'): 0.0,
 (' ', ' ', '/'): 0.0,
 (' ', ' ', ':'): 0.0,
 (' ', ' ', 'a'): 0.0,
 (' ', ' ', 'b'): 0.0,
 (' ', ' ', 'c'): 0.0,
 (' ', ' ', 'd'): 0.0,
 (' ', ' ', 'e'): 0.0,
 (' ', ' ', 'f'): 0.0,
 (' ', ' ', 'g'): 0.0,
 (' ', ' ', 'h'): 0.0,
 (' ', ' ', 'i'): 0.0,
 (' ', ' ', 'j'): 0.0,
 (' ', ' ', 'k'): 0.0,
 (' ', ' ', 'l'): 0.0,
 (' ', ' ', 'm'): 0.0,
 (' ', ' ', 'n'): 0.0,
 (' ', ' ', 'o'): 0.0,
 (' ', ' ', 'p'): 0.0,
 (' ', ' ', 'q'): 0.0,
 (' ', ' ', 'r'): 0.0,
 (' ', ' ', 's'): 0.0,
 (' ', ' ', 't'): 0.0,
 (' ', ' ', 'u'): 0.0,
 (' ', ' ', 'v'): 0.0,
 (' ', ' ', 'w'): 0.0,
 (' ', ' ', 'x'): 0.0,
 (' ', ' ', 'y'): 0.0,
 (' ', ' ', 'z'): 0.0,
 (' ', ' ', '|'): 0.0,
 (' ', ' ', 'á'): 0.0,
 (' ', ' ', 'ã'): 0.0,
 (' ', ' ', 'é'): 0.0,
 (' ', ' ', 'í'): 0.0,
 (' ', ' ', 'ó'): 0.0,
 (' ', ' ', 'ô'): 0.0,
 (' ', ' ', 'õ'): 0.0,
 (' ', ' ',

In [42]:
# Zero-valued template with one key per character that occurs in the dataset.
# unigram_fdist must still be a dict here (a later cell rebinds the name to a numpy array).
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
initialized_unigrams

{' ': 0,
 "'": 0,
 '(': 0,
 ')': 0,
 ',': 0,
 '-': 0,
 '.': 0,
 '/': 0,
 ':': 0,
 'a': 0,
 'b': 0,
 'c': 0,
 'd': 0,
 'e': 0,
 'f': 0,
 'g': 0,
 'h': 0,
 'i': 0,
 'j': 0,
 'k': 0,
 'l': 0,
 'm': 0,
 'n': 0,
 'o': 0,
 'p': 0,
 'q': 0,
 'r': 0,
 's': 0,
 't': 0,
 'u': 0,
 'v': 0,
 'w': 0,
 'x': 0,
 'y': 0,
 'z': 0,
 '|': 0,
 'á': 0,
 'ã': 0,
 'é': 0,
 'í': 0,
 'ó': 0,
 'ô': 0,
 'õ': 0,
 'ø': 0,
 'ú': 0,
 'ą': 0,
 'ć': 0,
 'č': 0,
 'ē': 0,
 'ō': 0,
 'ŏ': 0,
 'ś': 0,
 'ş': 0,
 'š': 0,
 'ū': 0,
 'ŭ': 0,
 'ž': 0,
 'ț': 0,
 'ʻ': 0,
 '\u200b': 0,
 '\u200e': 0,
 '\u200f': 0,
 '‑': 0,
 '人': 0,
 '卓': 0,
 '政': 0,
 '治': 0,
 '燮': 0,
 '物': 0,
 '賢': 0,
 '趙': 0,
 '郑': 0,
 '镇': 0,
 '高': 0}

In [43]:
def create_indiv_gram_distribution(grams_list, initialized_grams):
    '''
    Return a hashmap of the relative frequency distribution of -grams for one example.

    grams_list: the list of -grams for this current example.
    initialized_grams: a hashmap of all possible unigrams, bigrams, or trigrams as the keys and all values set to 0.
        It is copied, so the caller's hashmap is left unchanged.

    Every occurrence of a -gram adds 1 / len(grams_list) to its entry; -grams that never occur keep their
    initial value. An empty grams_list returns an unmodified copy. A -gram missing from initialized_grams
    raises KeyError.
    '''
    distribution = initialized_grams.copy()
    total = len(grams_list)  # number of -grams in this example

    if total:
        share = 1 / total  # weight contributed by a single occurrence
        for gram in grams_list:
            distribution[gram] = distribution[gram] + share

    return distribution

In [44]:
# UNIGRAMS: per-name relative frequency distribution over every character in initialized_unigrams.
df2['indiv_unigrams_fdist'] = df2["unigrams"].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# Checking that the function worked for our first example, 'park joo-bong'.
print(df2.iloc[0]['indiv_unigrams_fdist'])

df2.tail()

{' ': 0.07692307692307693, "'": 0, '(': 0, ')': 0, ',': 0, '-': 0.07692307692307693, '.': 0, '/': 0, ':': 0, 'a': 0.07692307692307693, 'b': 0.07692307692307693, 'c': 0, 'd': 0, 'e': 0, 'f': 0, 'g': 0.07692307692307693, 'h': 0, 'i': 0, 'j': 0.07692307692307693, 'k': 0.07692307692307693, 'l': 0, 'm': 0, 'n': 0.07692307692307693, 'o': 0.23076923076923078, 'p': 0.07692307692307693, 'q': 0, 'r': 0.07692307692307693, 's': 0, 't': 0, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0, '|': 0, 'á': 0, 'ã': 0, 'é': 0, 'í': 0, 'ó': 0, 'ô': 0, 'õ': 0, 'ø': 0, 'ú': 0, 'ą': 0, 'ć': 0, 'č': 0, 'ē': 0, 'ō': 0, 'ŏ': 0, 'ś': 0, 'ş': 0, 'š': 0, 'ū': 0, 'ŭ': 0, 'ž': 0, 'ț': 0, 'ʻ': 0, '\u200b': 0, '\u200e': 0, '\u200f': 0, '‑': 0, '人': 0, '卓': 0, '政': 0, '治': 0, '燮': 0, '物': 0, '賢': 0, '趙': 0, '郑': 0, '镇': 0, '高': 0}


Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist
21197,lee han-wi,Lee Han-wi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",lee han-wi,"[l, e, e, , h, a, n, -, w, i]","[(l, e), (e, e), (e, ), ( , h), (h, a), (a, n...","[(l, e, e), (e, e, ), (e, , h), ( , h, a), (...","[l, e, e, , h, a, n, -, w, i, (l, e), (e, e),...",2,0,1,1,10,4.5,"{' ': 0.1, ''': 0, '(': 0, ')': 0, ',': 0, '-'..."
21198,gil jung-woo,Gil Jung-woo,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",gil jung-woo,"[g, i, l, , j, u, n, g, -, w, o, o]","[(g, i), (i, l), (l, ), ( , j), (j, u), (u, n...","[(g, i, l), (i, l, ), (l, , j), ( , j, u), (...","[g, i, l, , j, u, n, g, -, w, o, o, (g, i), (...",2,0,1,1,12,5.5,"{' ': 0.08333333333333333, ''': 0, '(': 0, ')'..."
21199,이정희,이정희,"[HANGUL, HANGUL, HANGUL]",ijeonghui,"[i, j, e, o, n, g, h, u, i]","[(i, j), (j, e), (e, o), (o, n), (n, g), (g, h...","[(i, j, e), (j, e, o), (e, o, n), (o, n, g), (...","[i, j, e, o, n, g, h, u, i, (i, j), (j, e), (e...",1,0,0,0,9,9.0,"{' ': 0, ''': 0, '(': 0, ')': 0, ',': 0, '-': ..."
21200,금태섭,금태섭,"[HANGUL, HANGUL, HANGUL]",geumtaeseob,"[g, e, u, m, t, a, e, s, e, o, b]","[(g, e), (e, u), (u, m), (m, t), (t, a), (a, e...","[(g, e, u), (e, u, m), (u, m, t), (m, t, a), (...","[g, e, u, m, t, a, e, s, e, o, b, (g, e), (e, ...",1,0,0,0,11,11.0,"{' ': 0, ''': 0, '(': 0, ')': 0, ',': 0, '-': ..."
21201,oh ui-sik,Oh Ui-sik,"[LATIN, LATIN, SPACE, LATIN, LATIN, HYPHEN-MIN...",oh ui-sik,"[o, h, , u, i, -, s, i, k]","[(o, h), (h, ), ( , u), (u, i), (i, -), (-, s...","[(o, h, ), (h, , u), ( , u, i), (u, i, -), (...","[o, h, , u, i, -, s, i, k, (o, h), (h, ), ( ...",2,0,1,1,9,4.0,"{' ': 0.1111111111111111, ''': 0, '(': 0, ')':..."


In [45]:
# BIGRAMS: per-name relative frequency distribution over every key of initialized_bigrams.
df2['indiv_bigrams_fdist'] = df2['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

In [46]:
# Converting fdists to numpy arrays of shape (1, n_keys) so we can pass them into cosine_similarity.
# Only values that are still dicts are converted, so this cell is safe to run more than once.
df2['indiv_unigrams_fdist'] = df2['indiv_unigrams_fdist'].apply(
    lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1) if isinstance(fdist, dict) else fdist
)
if isinstance(unigram_fdist, dict):
    # Rebinds the name from the dict to its value vector, in the dict's key order.
    unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [47]:
# Cosine similarity between each name's unigram distribution and the dataset-wide one.
# cosine_similarity returns a 2-D array; [0][0] extracts the single value.
df2['unigrams_cosine_sim'] = df2['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])

In [48]:
# Converting fdists to numpy arrays of shape (1, n_keys) so we can pass them into cosine_similarity.
# Only values that are still dicts are converted, so this cell is safe to run more than once.
df2['indiv_bigrams_fdist'] = df2['indiv_bigrams_fdist'].apply(
    lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1) if isinstance(fdist, dict) else fdist
)
if isinstance(bigram_fdist, dict):
    # Rebinds the name from the dict to its value vector, in the dict's key order.
    bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)


In [49]:
# Cosine similarity between each name's bigram distribution and the dataset-wide one.
# cosine_similarity returns a 2-D array; [0][0] extracts the single value.
df2['bigrams_cosine_sim'] = df2['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])

In [50]:
# Display the fully featurised frame.
df2

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",park joo-bong,"[p, a, r, k, , j, o, o, -, b, o, n, g]","[(p, a), (a, r), (r, k), (k, ), ( , j), (j, o...","[(p, a, r), (a, r, k), (r, k, ), (k, , j), (...","[p, a, r, k, , j, o, o, -, b, o, n, g, (p, a)...",2,0,1,1,13,6.000000,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.680590,0.377660
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",kim jong hoon,"[k, i, m, , j, o, n, g, , h, o, o, n]","[(k, i), (i, m), (m, ), ( , j), (j, o), (o, n...","[(k, i, m), (i, m, ), (m, , j), ( , j, o), (...","[k, i, m, , j, o, n, g, , h, o, o, n, (k, i)...",3,0,0,2,13,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.762211,0.552090
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,9.000000,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.757701,0.344410
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",lee ho,"[l, e, e, , h, o]","[(l, e), (e, e), (e, ), ( , h), (h, o)]","[(l, e, e), (e, e, ), (e, , h), ( , h, o)]","[l, e, e, , h, o, (l, e), (e, e), (e, ), ( ,...",2,0,0,1,6,2.500000,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.537098,0.143205
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",choeminho,"[c, h, o, e, m, i, n, h, o]","[(c, h), (h, o), (o, e), (e, m), (m, i), (i, n...","[(c, h, o), (h, o, e), (o, e, m), (e, m, i), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,9.000000,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665396,0.171394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21197,lee han-wi,Lee Han-wi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",lee han-wi,"[l, e, e, , h, a, n, -, w, i]","[(l, e), (e, e), (e, ), ( , h), (h, a), (a, n...","[(l, e, e), (e, e, ), (e, , h), ( , h, a), (...","[l, e, e, , h, a, n, -, w, i, (l, e), (e, e),...",2,0,1,1,10,4.500000,"[[0.1, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.675313,0.212623
21198,gil jung-woo,Gil Jung-woo,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",gil jung-woo,"[g, i, l, , j, u, n, g, -, w, o, o]","[(g, i), (i, l), (l, ), ( , j), (j, u), (u, n...","[(g, i, l), (i, l, ), (l, , j), ( , j, u), (...","[g, i, l, , j, u, n, g, -, w, o, o, (g, i), (...",2,0,1,1,12,5.500000,"[[0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.769942,0.394563
21199,이정희,이정희,"[HANGUL, HANGUL, HANGUL]",ijeonghui,"[i, j, e, o, n, g, h, u, i]","[(i, j), (j, e), (e, o), (o, n), (n, g), (g, h...","[(i, j, e), (j, e, o), (e, o, n), (o, n, g), (...","[i, j, e, o, n, g, h, u, i, (i, j), (j, e), (e...",1,0,0,0,9,9.000000,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.777239,0.508705
21200,금태섭,금태섭,"[HANGUL, HANGUL, HANGUL]",geumtaeseob,"[g, e, u, m, t, a, e, s, e, o, b]","[(g, e), (e, u), (u, m), (m, t), (t, a), (a, e...","[(g, e, u), (e, u, m), (u, m, t), (m, t, a), (...","[g, e, u, m, t, a, e, s, e, o, b, (g, e), (e, ...",1,0,0,0,11,11.000000,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.614584,0.229460


In [51]:
# Persist the featurised frame as a gzip-compressed pickle (the path is relative to the notebook's working directory).
df2.to_pickle('../pickled_dataframes/korean_df.pkl.gz', compression='gzip')

In [52]:
# Read the pickle back to check the round trip.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df_loaded_gz = pd.read_pickle('../pickled_dataframes/korean_df.pkl.gz', compression='gzip')