# Data Consolidation

In [1]:
import pandas as pd
import numpy as np
import unicodedata
from unidecode import unidecode
from nltk import edit_distance, ngrams
from sklearn.metrics.pairwise import cosine_similarity

### Importing dataframes

In [2]:
# Exiger datasets
df_indo = pd.read_pickle('pickled_dataframes/df_indo.pkl.gz', compression = 'gzip')
print(df_indo.shape)
df_malay = pd.read_pickle('pickled_dataframes/df_malay.pkl.gz', compression = 'gzip')
print(df_malay.shape)
df_viet = pd.read_pickle('pickled_dataframes/viet_df.pkl.gz', compression = 'gzip')
print(df_viet.shape)
df_cnrom = pd.read_pickle('pickled_dataframes/cnrom_df.pkl.gz', compression = 'gzip')
print(df_cnrom.shape)
df_cnchar = pd.read_pickle('pickled_dataframes/cnchar_df.pkl.gz', compression = 'gzip')
print(df_cnchar.shape)
df_turk = pd.read_pickle('pickled_dataframes/turkish_df.pkl.gz', compression = 'gzip')
print(df_turk.shape)
df_korean = pd.read_pickle('pickled_dataframes/korean_df.pkl.gz', compression ='gzip') 
print(df_korean.shape)
df_japan = pd.read_pickle('pickled_dataframes/japanese_df.pkl.gz', compression ='gzip') 
print(df_japan.shape)

(11246, 22)
(2908, 22)
(2290, 18)
(10478, 18)
(11055, 18)
(18037, 21)
(19118, 18)
(187934, 18)


In [3]:
# company_csv                                                                     number of samples
# df_arabic = pd.read_pickle('arabic_df.pkl.gz', compression = 'gzip')                  # 40
df_arabic_latin = pd.read_pickle('pickled_dataframes/arabicLatin_df.pkl.gz', compression = 'gzip')         # 1046
# df_bulgar = pd.read_pickle('bulgarian_df.pkl.gz', compression = 'gzip')               # 2
# df_bulgar_latin = pd.read_pickle('bulgarianLatin_df.pkl.gz', compression = 'gzip')    # 474
# df_croatian = pd.read_pickle('croatian_df.pkl.gz', compression = 'gzip')                # 582
# df_danish = pd.read_pickle('danish_df.pkl.gz', compression = 'gzip')                  # 408
# df_dutch = pd.read_pickle('dutch_df.pkl.gz', compression = 'gzip')                      # 695
df_english = pd.read_pickle('pickled_dataframes/english_df.pkl.gz', compression = 'gzip')                  # 22779
# df_finnish = pd.read_pickle('finnish_df.pkl.gz', compression = 'gzip')                # 451
df_french = pd.read_pickle('pickled_dataframes/french_df.pkl.gz', compression = 'gzip')                    # 1164
df_german = pd.read_pickle('pickled_dataframes/german_df.pkl.gz', compression = 'gzip')                    # 1064
# df_hindi = pd.read_pickle('hindi_df.pkl.gz', compression = 'gzip')                    # 12
# df_hindi_latin = pd.read_pickle('hindiLatin_df.pkl.gz', compression = 'gzip')           # 781
# df_hungar = pd.read_pickle('hungarian_df.pkl.gz', compression = 'gzip')               # 434
df_italian = pd.read_pickle('pickled_dataframes/italian_df.pkl.gz', compression = 'gzip')                  # 1207
# df_norwegian = pd.read_pickle('norwegian_df.pkl.gz', compression = 'gzip')            # 403
# df_polish = pd.read_pickle('polish_df.pkl.gz', compression = 'gzip')                    # 561
df_portug = pd.read_pickle('pickled_dataframes/portuguese_df.pkl.gz', compression = 'gzip')                # 1068
# df_russian = pd.read_pickle('russian_df.pkl.gz', compression = 'gzip')                # 15
# df_russian_latin = pd.read_pickle('russianLatin_df.pkl.gz', compression = 'gzip')       # 968
df_spanish = pd.read_pickle('pickled_dataframes/spanish_df.pkl.gz', compression = 'gzip')                  # 2502

# company_csv_dfs = [df_arabic_latin, df_croatian, df_dutch, df_english, df_french, df_german, df_hindi_latin, df_italian, df_polish,
#                    df_portug, df_russian_latin, df_spanish]
company_csv_dfs = [df_arabic_latin, df_english, df_french, df_german, df_italian,
                   df_portug, df_spanish]

In [4]:
df_arabic_latin.head()

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,...,dash_freq,apostrophe_freq,space_freq,word_ngrams,accent_count,detected_accents,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
15,Al Dokali Al Seyed,1,ar,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",3.75,4,al dokali al seyed,al dokali al seyed,"[(a,), (l,), ( ,), (d,), (o,), (k,), (a,), (l,...","[a, l, , d, o, k, a, l, i, , a, l, , s, e, ...",...,0,0,3,"[Al, Dokali, Al, Seyed]",0,,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.058823529411...",0.789154,0.374614
116,Afzal Ansari,1,ar,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",5.5,2,afzal ansari,afzal ansari,"[(a,), (f,), (z,), (a,), (l,), ( ,), (a,), (n,...","[a, f, z, a, l, , a, n, s, a, r, i]",...,0,0,1,"[Afzal, Ansari]",0,,"[[0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.090909090909...",0.831071,0.397864
165,Naguib Mahfouz,1,ar,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,naguib mahfouz,naguib mahfouz,"[(n,), (a,), (g,), (u,), (i,), (b,), ( ,), (m,...","[n, a, g, u, i, b, , m, a, h, f, o, u, z]",...,0,0,1,"[Naguib, Mahfouz]",0,,"[[0.07142857142857142, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.75002,0.229505
316,Fahad Barakah Al-Marwani Al-Johani,1,ar,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.75,4,fahad barakah al-marwani al-johani,fahad barakah al-marwani al-johani,"[(f,), (a,), (h,), (a,), (d,), ( ,), (b,), (a,...","[f, a, h, a, d, , b, a, r, a, k, a, h, , a, ...",...,2,0,3,"[Fahad, Barakah, Al-Marwani, Al-Johani]",0,,"[[0.08823529411764705, 0.0, 0.0588235294117647...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.060606060606...",0.909744,0.636527
320,Zulfiqar Ahmed,1,ar,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",6.5,2,zulfiqar ahmed,zulfiqar ahmed,"[(z,), (u,), (l,), (f,), (i,), (q,), (a,), (r,...","[z, u, l, f, i, q, a, r, , a, h, m, e, d]",...,0,0,1,"[Zulfiqar, Ahmed]",0,,"[[0.07142857142857142, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.076923076923...",0.843715,0.30737


In [5]:
for df in company_csv_dfs:
    df['name_length'] = df['name'].apply(len)

In [6]:
all_dfs = [df_indo, df_malay, df_viet, df_cnrom, df_cnchar, df_turk, df_korean, df_japan]
for df in company_csv_dfs:
    all_dfs.append(df)

In [7]:
# finding counts
# df_names = ['Indonesian', 'Malay', 'Vietnamese', 'Chinese (Romanized)', 'Chinese (Characters)', 'Turkish', 'Korean (Romanized & Characters)']
# total_size = 0

# for i, df in enumerate(all_dfs):
#     total_size += df.shape[0]
#     print(df_names[i], ':', df.shape[0])
    
# for i, df in enumerate(all_dfs):
#     print(df_names[i], ':', df.shape[0] / total_size)

### Cleaning up column names

column names to KEEP: (10 so far)

* name_length
* avg_token_length
* num_tokens
* period_freq
* dash_freq
* apostrophe_freq
* space_freq
* unigrams_cosine_sim
* bigrams_cosine_sim
* language

In [8]:
#This is where you rename columns to all match
df_viet.rename(columns = {'word_length': 'name_length'}, inplace = True)
df_cnrom.rename(columns = {'word_length': 'name_length'}, inplace = True)
df_cnchar.rename(columns = {'word_length': 'name_length'}, inplace = True)
df_viet.head()

Unnamed: 0,fullname,alphabet,name_length,num_tokens,char_ngrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration,unigrams,bigrams,trigrams,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(T,), (ừ,), ( ,), (H,), (o,), (à,), (n,), (g,...",0,0,2,0,tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...",4.0,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07...",0.805625,0.508198
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(N,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (T,...",0,0,3,0,nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...",4.75,"[[0.13636363636363635, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.884792,0.667716
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(N,), (i,), (c,), (k,), ( ,), (Ú,), (t,), (N,...",0,0,1,0,nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...",3.0,"[[0.14285714285714285, 0.0, 0.0, 0.14285714285...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.59269,0.0056
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(C,), (a,), (o,), ( ,), (V,), (ă,), (n,), ( ,...",0,0,2,0,cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...",3.0,"[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665965,0.243176
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(T,), (ạ,), ( ,), (T,), (h,), (u,), ( ,), (T,...",0,0,2,0,ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...",3.0,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.596114,0.288942


### Redoing frequency distributions across all Latin names

#### Frequency Distribution Functions

In [9]:
# Functions from IndoMalay.ipynb

def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams

def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative

def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    trigrams_fdist_relative = init_trigrams
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

# TRIGRAMS individual frequency distributions
#df_indo['indiv_trigrams_fdist'] = df_indo.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

#### Determining which languages use Latin

For these lines of code to work, the datasets must have been pickled to preserve data types! `pd.csv` turns everything into strings; for example, a list of `[LATIN, LATIN, LATIN, ...]` becomes `'[LATIN, LATIN, LATIN, ...]'` (i.e., `'['` becomes a character).

In [10]:
indo_latin_percent = create_lang_char_distribution(df_indo, 'alphabet')['LATIN']
malay_latin_percent = create_lang_char_distribution(df_malay, 'alphabet')['LATIN']
viet_latin_percent = create_lang_char_distribution(df_viet, 'alphabet')['LATIN']
cnrom_latin_percent = create_lang_char_distribution(df_cnrom, 'alphabet')['LATIN']
# cnchar_latin_percent = create_lang_char_distribution(df_cnchar, 'alphabet')['LATIN'] error -> no latin
turk_latin_percent = create_lang_char_distribution(df_turk, 'alphabet')['LATIN']
korean_latin_percent = create_lang_char_distribution(df_korean, 'alphabet')['LATIN']
korean_latin_percent

0.6589739940220817

In [11]:
df_cnchar.head()

Unnamed: 0,original_fullname,transliteration,alphabet,name_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,丁一平,ding yi ping,"[CJK, CJK, CJK]",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , p, i, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.333333,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.083333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.774279,0.548928
1,丁世雄,ding shi xiong,"[CJK, CJK, CJK]",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[d, i, n, g, , s, h, i, , x, i, o, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,0,2,0,4.0,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.071428...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.811762,0.560151
2,丁亦昕,ding yi xin,"[CJK, CJK, CJK]",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , x, i, n]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.0,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.77639,0.510394
3,丁仲礼,ding zhong li,"[CJK, CJK, CJK]",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[d, i, n, g, , z, h, o, n, g, , l, i]","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,0,2,0,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.076923...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.841584,0.605839
4,丁伟,ding wei,"[CJK, CJK]",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[d, i, n, g, , w, e, i]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,0,1,0,3.5,"[[0.125, 0.0, 0.0, 0.0, 0.125, 0.125, 0.0, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710349,0.440812


In [12]:
df_korean.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",park joo-bong,"[p, a, r, k, , j, o, o, -, b, o, n, g]","[(p, a), (a, r), (r, k), (k, ), ( , j), (j, o...","[(p, a, r), (a, r, k), (r, k, ), (k, , j), (...","[p, a, r, k, , j, o, o, -, b, o, n, g, (p, a)...",2,0,1,1,13,6.0,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.68059,0.37766
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",kim jong hoon,"[k, i, m, , j, o, n, g, , h, o, o, n]","[(k, i), (i, m), (m, ), ( , j), (j, o), (o, n...","[(k, i, m), (i, m, ), (m, , j), ( , j, o), (...","[k, i, m, , j, o, n, g, , h, o, o, n, (k, i)...",3,0,0,2,13,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.762211,0.55209
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,9.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.757701,0.34441
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",lee ho,"[l, e, e, , h, o]","[(l, e), (e, e), (e, ), ( , h), (h, o)]","[(l, e, e), (e, e, ), (e, , h), ( , h, o)]","[l, e, e, , h, o, (l, e), (e, e), (e, ), ( ,...",2,0,0,1,6,2.5,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.537098,0.143205
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",choeminho,"[c, h, o, e, m, i, n, h, o]","[(c, h), (h, o), (o, e), (e, m), (m, i), (i, n...","[(c, h, o), (h, o, e), (o, e, m), (e, m, i), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,9.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665396,0.171394


### Cleaning up other columns

In [15]:
df_korean.columns

Index(['fullname', 'original_fullname', 'alphabet', 'transliteration',
       'unigrams', 'bigrams', 'trigrams', 'char_ngrams', 'num_tokens',
       'period_freq', 'dash_freq', 'space_freq', 'name_length',
       'avg_token_length', 'indiv_unigrams_fdist', 'indiv_bigrams_fdist',
       'unigrams_cosine_sim', 'bigrams_cosine_sim'],
      dtype='object')

In [16]:
df_turk.columns

Index(['id', 'original_fullname', 'fullname', 'transliteration', 'alphabet',
       'unigrams', 'bigrams', 'trigrams', 'char_ngrams', 'name_length',
       'num_tokens', 'avg_token_length', 'period_freq', 'dash_freq',
       'space_freq', 'indiv_unigrams_fdist', 'indiv_bigrams_fdist',
       'indiv_trigrams_fdist', 'unigrams_cosine_sim', 'bigrams_cosine_sim',
       'trigrams_cosine_sim'],
      dtype='object')

In [17]:
df_turk['apostrophe_freq'] = df_turk['fullname'].apply(lambda name: name.count('\''))
df_korean['apostrophe_freq'] = df_korean['fullname'].apply(lambda name: name.count('\''))

### More consolidation: separating characters from romanized, accent features, etc.

#### Adding avg_token_length column to dfs missing it

In [18]:
# NEED:
# name_length	avg_token_length	num_tokens	period_freq	dash_freq	apostrophe_freq	space_freq	unigrams_cosine_sim	bigrams_cosine_sim

In [19]:
df_turk.columns

Index(['id', 'original_fullname', 'fullname', 'transliteration', 'alphabet',
       'unigrams', 'bigrams', 'trigrams', 'char_ngrams', 'name_length',
       'num_tokens', 'avg_token_length', 'period_freq', 'dash_freq',
       'space_freq', 'indiv_unigrams_fdist', 'indiv_bigrams_fdist',
       'indiv_trigrams_fdist', 'unigrams_cosine_sim', 'bigrams_cosine_sim',
       'trigrams_cosine_sim', 'apostrophe_freq'],
      dtype='object')

In [20]:
tokens = df_korean['fullname'].apply(lambda name: name.split(' '))
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
df_korean['avg_token_length'] = token_lengths.apply(np.mean)

In [21]:
tokens = df_japan['fullname'].apply(lambda name: name.split(' '))
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
df_japan['avg_token_length'] = token_lengths.apply(np.mean)

#### Redoing Korean and Japanese ngrams columns using transliterated names

In the ngrams, there are some characters that haven't been transliterated:

In [22]:
create_lang_char_distribution(df_korean, 'unigrams')

{' ': 0.051050841788382206,
 "'": 1.8582524993496117e-05,
 '(': 0.002740922436540677,
 ')': 0.002740922436540677,
 ',': 5.110194373211432e-05,
 '-': 0.05476734678708143,
 '.': 0.00018582524993496116,
 '/': 4.645631248374029e-06,
 ':': 4.645631248374029e-06,
 'a': 0.05950589066042294,
 'b': 0.013769651020180622,
 'c': 0.014289961719998514,
 'd': 0.008733786746943174,
 'e': 0.0846619838703683,
 'f': 0.0002601553499089456,
 'g': 0.08767235291931468,
 'h': 0.05003344854498829,
 'i': 0.06272066748429776,
 'j': 0.03562734604378043,
 'k': 0.0254719961348348,
 'l': 0.019195748318281487,
 'm': 0.03319303526963244,
 'n': 0.12028003865165199,
 'o': 0.10017374660868919,
 'p': 0.005291373991898019,
 'q': 3.7165049986992234e-05,
 'r': 0.007210019697476493,
 's': 0.03936243356747315,
 't': 0.004984762329505333,
 'u': 0.057517560486118856,
 'v': 0.0006736165310142342,
 'w': 0.013346898576578586,
 'x': 8.362136247073252e-05,
 'y': 0.0433251570223362,
 'z': 0.00036235923737317426,
 '|': 4.64563124837402

In [23]:
create_lang_char_distribution(df_japan, 'unigrams')

{' ': 0.07879491313522605,
 '-': 0.00022155737853618506,
 '.': 0.00021036371307007118,
 'a': 0.15459455578710574,
 'b': 0.007226090041073032,
 'c': 0.01000983884595625,
 'd': 0.010900314233208827,
 'e': 0.028484790703545034,
 'f': 0.003667662388241865,
 'g': 0.011578109976605239,
 'h': 0.055303269206496496,
 'i': 0.11785115590035476,
 'j': 0.008750744475250613,
 'k': 0.07267468004451219,
 'l': 0.0002709639019728256,
 'm': 0.04605035373912815,
 'n': 0.037970843203380025,
 'o': 0.09144298313500603,
 'p': 0.0005662450771996228,
 'q': 1.119366546611388e-05,
 'r': 0.03559354025145604,
 's': 0.06236608612715001,
 't': 0.04603259826976811,
 'u': 0.07241375184261244,
 'v': 7.410978515496085e-05,
 'w': 0.00889742009170314,
 'x': 5.59683273305694e-05,
 'y': 0.02905257973460205,
 'z': 0.008836047925871688,
 '~': 7.719769286975089e-07,
 '\xad': 3.8598846434875445e-07,
 '×': 1.1579653930462634e-06,
 'ł': 3.8598846434875445e-07,
 'ʼ': 1.9299423217437725e-06,
 'ʾ': 3.8598846434875445e-07,
 'θ': 3.859

In [24]:
df_korean.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,apostrophe_freq
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",park joo-bong,"[p, a, r, k, , j, o, o, -, b, o, n, g]","[(p, a), (a, r), (r, k), (k, ), ( , j), (j, o...","[(p, a, r), (a, r, k), (r, k, ), (k, , j), (...","[p, a, r, k, , j, o, o, -, b, o, n, g, (p, a)...",2,0,1,1,13,6.0,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.68059,0.37766,0
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",kim jong hoon,"[k, i, m, , j, o, n, g, , h, o, o, n]","[(k, i), (i, m), (m, ), ( , j), (j, o), (o, n...","[(k, i, m), (i, m, ), (m, , j), ( , j, o), (...","[k, i, m, , j, o, n, g, , h, o, o, n, (k, i)...",3,0,0,2,13,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.762211,0.55209,0
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.757701,0.34441,0
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",lee ho,"[l, e, e, , h, o]","[(l, e), (e, e), (e, ), ( , h), (h, o)]","[(l, e, e), (e, e, ), (e, , h), ( , h, o)]","[l, e, e, , h, o, (l, e), (e, e), (e, ), ( ,...",2,0,0,1,6,2.5,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.537098,0.143205,0
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",choeminho,"[c, h, o, e, m, i, n, h, o]","[(c, h), (h, o), (o, e), (e, m), (m, i), (i, n...","[(c, h, o), (h, o, e), (o, e, m), (e, m, i), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665396,0.171394,0


In [25]:
df_korean['transliteration'] = df_korean['fullname'].apply(lambda name: unidecode(name)).apply(str.lower)
df_korean['unigrams'] = df_korean['transliteration'].apply(lambda name: list(name))
df_korean['bigrams'] = df_korean['transliteration'].apply(lambda name: list(ngrams(list(name), 2)))
df_korean['trigrams'] = df_korean['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

In [26]:
df_japan['transliteration'] = df_japan['fullname'].apply(lambda name: unidecode(name)).apply(str.lower)
df_japan['unigrams'] = df_japan['transliteration'].apply(lambda name: list(name))
df_japan['bigrams'] = df_japan['transliteration'].apply(lambda name: list(ngrams(list(name), 2)))
df_japan['trigrams'] = df_japan['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

Checking that we no longer have CJK characters:

In [27]:
create_lang_char_distribution(df_korean, 'unigrams')

{' ': 0.05231878132042639,
 "'": 1.901809571807575e-05,
 '(': 0.002805169118416173,
 ')': 0.002805169118416173,
 ',': 5.229976322470831e-05,
 '-': 0.03653376187442351,
 '.': 0.00019018095718075748,
 '/': 4.754523929518937e-06,
 ':': 4.754523929518937e-06,
 '`': 9.509047859037874e-06,
 'a': 0.060967260348221336,
 'b': 0.014111427022812206,
 'c': 0.014786569420803895,
 'd': 0.008976541178931754,
 'e': 0.0889523881973698,
 'f': 0.0002662533400530605,
 'g': 0.08978918440896513,
 'h': 0.04493500565788348,
 'i': 0.06424788185958939,
 'j': 0.03646244401548073,
 'k': 0.026021509466257142,
 'l': 0.015157422287306372,
 'm': 0.03397107347641281,
 'n': 0.1231231516788224,
 'o': 0.10263115354259578,
 'p': 0.005396384660003994,
 'q': 3.80361914361515e-05,
 'r': 0.011872046252008786,
 's': 0.040327871970179625,
 't': 0.005068322508867187,
 'u': 0.05675950667059707,
 'v': 0.0006894059697802459,
 'w': 0.013664501773437426,
 'x': 9.509047859037874e-05,
 'y': 0.04651826212641328,
 'z': 0.0004231526297271

In [28]:
create_lang_char_distribution(df_japan, 'unigrams')

{' ': 0.15807851164007788,
 '"': 0.00020052242491341805,
 "'": 6.7551880733243666e-06,
 '*': 7.11072428770986e-07,
 ',': 3.55536214385493e-07,
 '-': 0.00020514439570042944,
 '.': 4.977507001396902e-05,
 '/': 1.066608643156479e-06,
 '`': 3.55536214385493e-07,
 'a': 0.1184124028097316,
 'b': 0.008188710089726674,
 'c': 0.012258177599583028,
 'd': 0.010506806207520088,
 'e': 0.03521728417974062,
 'f': 0.004212748604253707,
 'g': 0.041813192029020285,
 'h': 0.05323052648158162,
 'i': 0.11140762831390862,
 'j': 0.013004803649792562,
 'k': 0.035685169837871934,
 'l': 0.007763488777321624,
 'm': 0.028342991474597116,
 'n': 0.07994160673214933,
 'o': 0.06366871419972531,
 'p': 0.001771281420068526,
 'q': 0.0039748948768298115,
 'r': 0.020156414602156753,
 's': 0.041249667129219285,
 't': 0.030270708828995258,
 'u': 0.05768575078404624,
 'v': 7.608474987849549e-05,
 'w': 0.0068078074330534194,
 'x': 0.008624597488563288,
 'y': 0.027706937187061468,
 'z': 0.019473785070536608,
 '~': 4.6219707870

We only change the ngrams columns because we redo the frequency distributions later on.

#### Separating Korean and Japanese characters from romanized

In [29]:
df_japan.rename(columns = {'determine_alphabet': 'alphabet'}, inplace = True)
df_japan.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,avg_token_length
0,hirotoshi nakamura,Hirotoshi Nakamura,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",hirotoshi nakamura,"[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...","[(h, i), (i, r), (r, o), (o, t), (t, o), (o, s...","[(h, i, r), (i, r, o), (r, o, t), (o, t, o), (...","[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...",17,2,0,0,1,0,"[[0.05555555555555555, 0.0, 0.0, 0.16666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.930607,0.612908,8.5
1,sachio hosokawa,Sachio Hosokawa,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",sachio hosokawa,"[s, a, c, h, i, o, , h, o, s, o, k, a, w, a]","[(s, a), (a, c), (c, h), (h, i), (i, o), (o, ...","[(s, a, c), (a, c, h), (c, h, i), (h, i, o), (...","[s, a, c, h, i, o, , h, o, s, o, k, a, w, a, ...",14,2,0,0,1,0,"[[0.06666666666666667, 0.0, 0.0, 0.2, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.805768,0.394186,7.0
2,oza,OZA,"[LATIN, LATIN, LATIN]",oza,"[o, z, a]","[(o, z), (z, a)]","[(o, z, a)]","[o, z, a, (o, z), (z, a), (o, z, a)]",3,1,0,0,0,0,"[[0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.523246,0.020991,3.0
3,平塚利男,平塚利男,"[CJK, CJK, CJK, CJK]",ping zhong li nan,"[p, i, n, g, , z, h, o, n, g, , l, i, , n, ...","[(p, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(p, i, n), (i, n, g), (n, g, ), (g, , z), (...","[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, ...",15,2,0,0,1,0,"[[0.0625, 0.0, 0.0, 0.125, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.883438,0.600381,4.0
4,jun kochi,Jun Kochi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",jun kochi,"[j, u, n, , k, o, c, h, i]","[(j, u), (u, n), (n, ), ( , k), (k, o), (o, c...","[(j, u, n), (u, n, ), (n, , k), ( , k, o), (...","[j, u, n, , k, o, c, h, i, (j, u), (u, n), (n...",8,2,0,0,1,0,"[[0.1111111111111111, 0.0, 0.0, 0.0, 0.0, 0.11...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.646229,0.287507,4.0


In [30]:
df_korean['alphabet']

0        [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
1        [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
2                                 [HANGUL, HANGUL, HANGUL]
3               [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]
4                                 [HANGUL, HANGUL, HANGUL]
                               ...                        
21197    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
21198    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
21199                             [HANGUL, HANGUL, HANGUL]
21200                             [HANGUL, HANGUL, HANGUL]
21201    [LATIN, LATIN, SPACE, LATIN, LATIN, HYPHEN-MIN...
Name: alphabet, Length: 19118, dtype: object

In [31]:
df_korean.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,apostrophe_freq
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",park joo-bong,"[p, a, r, k, , j, o, o, -, b, o, n, g]","[(p, a), (a, r), (r, k), (k, ), ( , j), (j, o...","[(p, a, r), (a, r, k), (r, k, ), (k, , j), (...","[p, a, r, k, , j, o, o, -, b, o, n, g, (p, a)...",2,0,1,1,13,6.0,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.68059,0.37766,0
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",kim jong hoon,"[k, i, m, , j, o, n, g, , h, o, o, n]","[(k, i), (i, m), (m, ), ( , j), (j, o), (o, n...","[(k, i, m), (i, m, ), (m, , j), ( , j, o), (...","[k, i, m, , j, o, n, g, , h, o, o, n, (k, i)...",3,0,0,2,13,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.762211,0.55209,0
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.757701,0.34441,0
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",lee ho,"[l, e, e, , h, o]","[(l, e), (e, e), (e, ), ( , h), (h, o)]","[(l, e, e), (e, e, ), (e, , h), ( , h, o)]","[l, e, e, , h, o, (l, e), (e, e), (e, ), ( ,...",2,0,0,1,6,2.5,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.537098,0.143205,0
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",coeminho,"[c, o, e, m, i, n, h, o]","[(c, o), (o, e), (e, m), (m, i), (i, n), (n, h...","[(c, o, e), (o, e, m), (e, m, i), (m, i, n), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665396,0.171394,0


In [32]:
latin_mask = df_korean['fullname'].str.contains(r'[a-zA-Z]')
df_korean_latin = df_korean[latin_mask]
df_korean_non_latin = df_korean[~latin_mask]
df_korean_non_latin.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,apostrophe_freq
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.757701,0.34441,0
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",coeminho,"[c, o, e, m, i, n, h, o]","[(c, o), (o, e), (e, m), (m, i), (i, n), (n, h...","[(c, o, e), (o, e, m), (e, m, i), (m, i, n), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665396,0.171394,0
6,이수경,이수경,"[HANGUL, HANGUL, HANGUL]",isugyeong,"[i, s, u, g, y, e, o, n, g]","[(i, s), (s, u), (u, g), (g, y), (y, e), (e, o...","[(i, s, u), (s, u, g), (u, g, y), (g, y, e), (...","[i, s, u, g, y, e, o, n, g, (i, s), (s, u), (u...",1,0,0,0,9,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.803067,0.547257,0
7,서하준,서하준,"[HANGUL, HANGUL, HANGUL]",seohajun,"[s, e, o, h, a, j, u, n]","[(s, e), (e, o), (o, h), (h, a), (a, j), (j, u...","[(s, e, o), (e, o, h), (o, h, a), (h, a, j), (...","[s, e, o, h, a, j, u, n, (s, e), (e, o), (o, h...",1,0,0,0,8,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.753968,0.330478,0
9,윤종규,윤종규,"[HANGUL, HANGUL, HANGUL]",yunjonggyu,"[y, u, n, j, o, n, g, g, y, u]","[(y, u), (u, n), (n, j), (j, o), (o, n), (n, g...","[(y, u, n), (u, n, j), (n, j, o), (j, o, n), (...","[y, u, n, j, o, n, g, g, y, u, (y, u), (u, n),...",1,0,0,0,10,3.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.692096,0.458872,0


In [33]:
latin_mask = df_japan['fullname'].str.contains(r'[a-zA-Z]')
df_japan_latin = df_japan[latin_mask]
df_japan_non_latin = df_japan[~latin_mask]
df_japan_latin.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,avg_token_length
0,hirotoshi nakamura,Hirotoshi Nakamura,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",hirotoshi nakamura,"[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...","[(h, i), (i, r), (r, o), (o, t), (t, o), (o, s...","[(h, i, r), (i, r, o), (r, o, t), (o, t, o), (...","[h, i, r, o, t, o, s, h, i, , n, a, k, a, m, ...",17,2,0,0,1,0,"[[0.05555555555555555, 0.0, 0.0, 0.16666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.930607,0.612908,8.5
1,sachio hosokawa,Sachio Hosokawa,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",sachio hosokawa,"[s, a, c, h, i, o, , h, o, s, o, k, a, w, a]","[(s, a), (a, c), (c, h), (h, i), (i, o), (o, ...","[(s, a, c), (a, c, h), (c, h, i), (h, i, o), (...","[s, a, c, h, i, o, , h, o, s, o, k, a, w, a, ...",14,2,0,0,1,0,"[[0.06666666666666667, 0.0, 0.0, 0.2, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.805768,0.394186,7.0
2,oza,OZA,"[LATIN, LATIN, LATIN]",oza,"[o, z, a]","[(o, z), (z, a)]","[(o, z, a)]","[o, z, a, (o, z), (z, a), (o, z, a)]",3,1,0,0,0,0,"[[0.0, 0.0, 0.0, 0.3333333333333333, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.523246,0.020991,3.0
4,jun kochi,Jun Kochi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",jun kochi,"[j, u, n, , k, o, c, h, i]","[(j, u), (u, n), (n, ), ( , k), (k, o), (o, c...","[(j, u, n), (u, n, ), (n, , k), ( , k, o), (...","[j, u, n, , k, o, c, h, i, (j, u), (u, n), (n...",8,2,0,0,1,0,"[[0.1111111111111111, 0.0, 0.0, 0.0, 0.0, 0.11...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.646229,0.287507,4.0
5,suguru kubota,Suguru Kubota,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",suguru kubota,"[s, u, g, u, r, u, , k, u, b, o, t, a]","[(s, u), (u, g), (g, u), (u, r), (r, u), (u, ...","[(s, u, g), (u, g, u), (g, u, r), (u, r, u), (...","[s, u, g, u, r, u, , k, u, b, o, t, a, (s, u)...",12,2,0,0,1,0,"[[0.07692307692307693, 0.0, 0.0, 0.07692307692...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.604463,0.239627,6.0


In [34]:
df_japan_non_latin.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,period_freq,dash_freq,space_freq,apostrophe_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,avg_token_length
3,平塚利男,平塚利男,"[CJK, CJK, CJK, CJK]",ping zhong li nan,"[p, i, n, g, , z, h, o, n, g, , l, i, , n, ...","[(p, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(p, i, n), (i, n, g), (n, g, ), (g, , z), (...","[h, i, r, a, t, s, u, k, a, , t, o, s, h, i, ...",15,2,0,0,1,0,"[[0.0625, 0.0, 0.0, 0.125, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.883438,0.600381,4.0
7,中野正俊,中野正俊,"[CJK, CJK, CJK, CJK]",zhong ye zheng jun,"[z, h, o, n, g, , y, e, , z, h, e, n, g, , ...","[(z, h), (h, o), (o, n), (n, g), (g, ), ( , y...","[(z, h, o), (h, o, n), (o, n, g), (n, g, ), (...","[n, a, k, a, n, o, , m, a, s, a, t, o, s, h, ...",15,2,0,0,1,0,"[[0.0625, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.86512,0.623953,4.0
10,尺振八,尺振八,"[CJK, CJK, CJK]",chi zhen ba,"[c, h, i, , z, h, e, n, , b, a, ]","[(c, h), (h, i), (i, ), ( , z), (z, h), (h, e...","[(c, h, i), (h, i, ), (i, , z), ( , z, h), (...","[s, h, a, k, u, , s, h, i, n, p, a, c, h, i, ...",14,2,0,0,1,0,"[[0.06666666666666667, 0.0, 0.0, 0.13333333333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.758203,0.466828,3.0
11,吉永みち子,吉永みち子,"[CJK, CJK, HIRAGANA, HIRAGANA, CJK]",ji yong michizi,"[j, i, , y, o, n, g, , m, i, c, h, i, z, i, ]","[(j, i), (i, ), ( , y), (y, o), (o, n), (n, g...","[(j, i, ), (i, , y), ( , y, o), (y, o, n), (...","[y, o, s, h, i, n, a, g, a, , m, i, c, h, i, ...",16,3,0,0,2,0,"[[0.1111111111111111, 0.0, 0.0, 0.111111111111...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.869671,0.576394,5.0
13,かかし朝浩,かかし朝浩,"[HIRAGANA, HIRAGANA, HIRAGANA, CJK, CJK]",kakashizhao hao,"[k, a, k, a, s, h, i, z, h, a, o, , h, a, o, ]","[(k, a), (a, k), (k, a), (a, s), (s, h), (h, i...","[(k, a, k), (a, k, a), (k, a, s), (a, s, h), (...","[k, a, k, a, s, h, i, , a, s, a, , h, i, r, ...",17,3,0,0,2,0,"[[0.10526315789473684, 0.0, 0.0, 0.21052631578...","[[0.0, 0.0, 0.0, 0.05555555555555555, 0.0, 0.0...",0.857157,0.650052,5.0


#### Accent features

We had to separate CJK and create a list of Latin dataframes only because otherwise, CJK characters would get counted as accents.

In [35]:
# don't include CJK or HANGUL
# company_csv_dfs = [df_arabic_latin, df_croatian, df_dutch, df_english, df_french, df_german, df_hindi_latin, df_italian, df_polish,
#                    df_portug, df_russian_latin, df_spanish]
# all_dfs = [df_indo, df_malay, df_viet, df_cnrom, df_cnchar, df_turk, df_korean, df_japan]
df_latin = company_csv_dfs
df_latin.extend([df_indo, df_malay, df_viet, df_cnrom, df_turk, df_korean_latin, df_japan_latin])

In [36]:
df_cnrom.rename(columns = {'original_fullname': 'fullname'}, inplace = True)

In [37]:
for df in company_csv_dfs:
    df.rename(columns = {'name_lower': 'fullname'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'name_lower': 'fullname'}, inplace = True)


In [38]:
# trying accent counts
def count_accents(name):
    num_accents = 0
    for char in name:
        if unicodedata.normalize('NFD', char) != char:
            num_accents += 1
    return num_accents

In [39]:
for df in df_latin:
    df['accent_count'] = df['fullname'].apply(count_accents)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['accent_count'] = df['fullname'].apply(count_accents)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['accent_count'] = df['fullname'].apply(count_accents)


In [40]:
# set other accents to 0
df_non_latin = [df_cnchar, df_japan_non_latin, df_korean_non_latin]
for df in df_non_latin:
    df['accent_count'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['accent_count'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['accent_count'] = 0


In [41]:
def find_all_accents(df, col_name, all_accents):
    for name in df[col_name]:
        for char in name:
            if unicodedata.normalize('NFD', char) != char:
                all_accents.add(char)

In [42]:
all_accents = set()
for df in df_latin:
    find_all_accents(df, 'fullname', all_accents)
print(len(all_accents))
all_accents

139


{'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 'ù',
 'ú',
 'û',
 'ü',
 'ý',
 'ÿ',
 'ā',
 'ă',
 'ą',
 'ć',
 'č',
 'ē',
 'ę',
 'ğ',
 'ĩ',
 'ī',
 'ō',
 'ŏ',
 'ś',
 'ş',
 'š',
 'ţ',
 'ũ',
 'ū',
 'ŭ',
 'ű',
 'ž',
 'ơ',
 'ư',
 'ǎ',
 'ǧ',
 'ǹ',
 'ș',
 'ț',
 'ạ',
 'ả',
 'ấ',
 'ầ',
 'ẩ',
 'ẫ',
 'ậ',
 'ắ',
 'ằ',
 'ặ',
 'ế',
 'ề',
 'ể',
 'ễ',
 'ệ',
 'ỉ',
 'ị',
 'ọ',
 'ỏ',
 'ố',
 'ồ',
 'ổ',
 'ỗ',
 'ộ',
 'ớ',
 'ờ',
 'ở',
 'ợ',
 'ụ',
 'ủ',
 'ứ',
 'ừ',
 'ử',
 'ữ',
 'ự',
 'ỳ',
 'ỵ',
 'ỷ',
 'ỹ',
 'が',
 'ご',
 'ざ',
 'じ',
 'ず',
 'で',
 'ゴ',
 'ザ',
 'ジ',
 'ズ',
 'ダ',
 'デ',
 'ド',
 'パ',
 'ビ',
 'ブ',
 'ペ',
 'ボ',
 'ポ',
 'ヴ',
 '거',
 '고',
 '김',
 '나',
 '동',
 '딩',
 '라',
 '래',
 '릴',
 '몽',
 '성',
 '스',
 '욱',
 '유',
 '윤',
 '이',
 '조',
 '지',
 '진',
 '컷',
 '클',
 '타',
 '투',
 '퍼',
 '현'}

In [43]:
# hmm

In [44]:
# all_dfs = [df_indo, df_malay, df_viet, df_cnrom, df_cnchar, df_turk, df_korean_latin, df_korean_non_latin, df_japan_latin, df_japan_non_latin,
#            df_arabic_latin, df_croatian, df_dutch, df_english, df_french, df_german, df_hindi_latin, df_italian, df_polish,df_portug, df_russian_latin, df_spanish]
all_dfs = [df_indo, df_malay, df_viet, df_cnrom, df_cnchar, df_turk, df_korean_latin, df_korean_non_latin, df_japan_latin, df_japan_non_latin,
           df_arabic_latin, df_english, df_french, df_german, df_italian, df_portug, df_spanish]

In [45]:
#all_dfs_names = ['Indonesian', 'Malay', 'Vietnamese', 'Chinese (Romanized)', 'Chinese (Characters)', 'Turkish', 'Korean (Romanized)', 'Korean (Characters)', 'Japanese (Romanized)', '']

### Adding the language (label) to each dataset

In [46]:
df_indo['language'] = 'Indonesian'
df_malay['language'] = 'Malay'
df_viet['language'] = 'Vietnamese'
df_cnrom['language'] = 'Chinese (Romanized)'
df_cnchar['language'] = 'Chinese (Characters)'
df_turk['language'] = 'Turkish'
df_korean_latin['language'] = 'Korean (Romanized)'
df_korean_non_latin['language'] = 'Korean (Characters)'
df_japan_latin['language'] = 'Japanese (Romanized)'
df_japan_non_latin['language'] = 'Japanese (Characters)'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_korean_latin['language'] = 'Korean (Romanized)'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_korean_non_latin['language'] = 'Korean (Characters)'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_japan_latin['language'] = 'Japanese (Romanized)'
A value is trying to be set on a copy of a sli

In [47]:
# company_csv_dfs: df_arabic_latin, df_croatian, df_dutch, df_english, df_french, df_german, df_hindi_latin, df_italian, df_polish,
#                  df_portug, df_russian_latin, df_spanish

In [48]:
df_arabic_latin['language'] = 'Arabic (Romanized)'
# df_croatian['language'] = 'Croatian'
# df_dutch['language'] = 'Dutch'
df_english['language'] = 'English'
df_french['language'] = 'French'
df_german['language'] = 'German'
# df_hindi_latin['language'] = 'Hindi (Romanized)'
df_italian['language'] = 'Italian'
# df_polish['language'] = 'Polish'
df_portug['language'] = 'Portuguese'
# df_russian_latin['language'] = 'Russian (Romanized)'
df_spanish['language'] = 'Spanish'

### Finding which datasets have null fullname

In [49]:
for df in all_dfs:
    if 'fullname' not in df.columns:
        print(df['language'][0])

Chinese (Characters)


In [50]:
df_cnchar.head()

Unnamed: 0,original_fullname,transliteration,alphabet,name_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,accent_count,language
0,丁一平,ding yi ping,"[CJK, CJK, CJK]",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , p, i, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.333333,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.083333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.774279,0.548928,0,Chinese (Characters)
1,丁世雄,ding shi xiong,"[CJK, CJK, CJK]",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[d, i, n, g, , s, h, i, , x, i, o, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,0,2,0,4.0,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.071428...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.811762,0.560151,0,Chinese (Characters)
2,丁亦昕,ding yi xin,"[CJK, CJK, CJK]",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , x, i, n]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.0,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.77639,0.510394,0,Chinese (Characters)
3,丁仲礼,ding zhong li,"[CJK, CJK, CJK]",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[d, i, n, g, , z, h, o, n, g, , l, i]","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,0,2,0,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.076923...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.841584,0.605839,0,Chinese (Characters)
4,丁伟,ding wei,"[CJK, CJK]",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[d, i, n, g, , w, e, i]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,0,1,0,3.5,"[[0.125, 0.0, 0.0, 0.0, 0.125, 0.125, 0.0, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710349,0.440812,0,Chinese (Characters)


In [51]:
df_cnchar['fullname'] = df_cnchar['original_fullname']
df_cnchar.head()

Unnamed: 0,original_fullname,transliteration,alphabet,name_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,...,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,accent_count,language,fullname
0,丁一平,ding yi ping,"[CJK, CJK, CJK]",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , p, i, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,...,2,0,3.333333,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.083333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.774279,0.548928,0,Chinese (Characters),丁一平
1,丁世雄,ding shi xiong,"[CJK, CJK, CJK]",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[d, i, n, g, , s, h, i, , x, i, o, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,...,2,0,4.0,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.071428...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.811762,0.560151,0,Chinese (Characters),丁世雄
2,丁亦昕,ding yi xin,"[CJK, CJK, CJK]",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , x, i, n]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,...,2,0,3.0,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.77639,0.510394,0,Chinese (Characters),丁亦昕
3,丁仲礼,ding zhong li,"[CJK, CJK, CJK]",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[d, i, n, g, , z, h, o, n, g, , l, i]","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,...,2,0,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.076923...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.841584,0.605839,0,Chinese (Characters),丁仲礼
4,丁伟,ding wei,"[CJK, CJK]",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[d, i, n, g, , w, e, i]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,...,1,0,3.5,"[[0.125, 0.0, 0.0, 0.0, 0.125, 0.125, 0.0, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710349,0.440812,0,Chinese (Characters),丁伟


### Combining all names to make one big dataset

In [52]:
# as you can see from output, we need the columns in the concatenated df (in this case, viet) to match
# it's okay if some values are NaN bc we'll drop all non-numerical columns anyway
merged_df = pd.concat(all_dfs, ignore_index = True, join = 'outer')
merged_df

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim,accent_count,language,id,name,class,lang,detected_accents
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9,9.000000,...,0.664809,0.250640,0.085949,0,Indonesian,,,,,
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12,12.000000,...,0.686625,0.353292,0.117226,0,Indonesian,,,,,
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8,8.000000,...,0.688312,0.197139,0.090295,0,Indonesian,,,,,
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9,9.000000,...,0.581396,0.155386,0.060083,0,Indonesian,,,,,
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8,8.000000,...,0.463215,0.176917,0.052811,0,Indonesian,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293891,josé vizcaíno,,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[j, o, s, e, , v, i, z, c, a, i, n, o]","[(j, o), (o, s), (s, e), (e, ), ( , v), (v, i...","[(j, o, s), (o, s, e), (s, e, ), (e, , v), (...","[(j,), (o,), (s,), (e,), ( ,), (v,), (i,), (z,...","[José, Vizcaíno]",13,6.000000,...,0.743017,0.238011,,2,Spanish,,José Vizcaíno,1.0,es,"é, í"
293892,felipe harboe bascuñán,,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[f, e, l, i, p, e, , h, a, r, b, o, e, , b, ...","[(f, e), (e, l), (l, i), (i, p), (p, e), (e, ...","[(f, e, l), (e, l, i), (l, i, p), (i, p, e), (...","[(f,), (e,), (l,), (i,), (p,), (e,), ( ,), (h,...","[Felipe, Harboe, Bascuñán]",22,6.666667,...,0.873616,0.335940,,2,Spanish,,Felipe Harboe Bascuñán,1.0,es,"á, ñ"
293893,guillermo lorenzo,,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[g, u, i, l, l, e, r, m, o, , l, o, r, e, n, ...","[(g, u), (u, i), (i, l), (l, l), (l, e), (e, r...","[(g, u, i), (u, i, l), (i, l, l), (l, l, e), (...","[(g,), (u,), (i,), (l,), (l,), (e,), (r,), (m,...","[Guillermo, Lorenzo]",17,8.000000,...,0.726666,0.347654,,0,Spanish,,Guillermo Lorenzo,1.0,es,
293894,elena ramos,,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, l, e, n, a, , r, a, m, o, s]","[(e, l), (l, e), (e, n), (n, a), (a, ), ( , r...","[(e, l, e), (l, e, n), (e, n, a), (n, a, ), (...","[(e,), (l,), (e,), (n,), (a,), ( ,), (r,), (a,...","[Elena, Ramos]",11,5.000000,...,0.891172,0.318310,,0,Spanish,,Elena Ramos,1.0,es,


### Adding/editing features

#### num_alphabets

In [53]:
alphabet_fdist = create_lang_char_distribution(merged_df, 'alphabet')
print(len(alphabet_fdist))
alphabet_fdist

43


{'APOSTROPHE': 0.00011913719588663155,
 'ARABIC': 7.663796226625422e-06,
 'BLACK': 2.786834991500153e-06,
 'BULLSEYE': 6.967087478750383e-07,
 'CJK': 0.13366984365855697,
 'COLON': 3.4835437393751915e-07,
 'COMMA': 5.225315609062787e-06,
 'COMMERCIAL': 2.438480617562634e-06,
 'CYRILLIC': 6.967087478750383e-07,
 'DIGIT': 3.4835437393751915e-07,
 'EIGHTH': 6.967087478750383e-07,
 'FOR': 3.4835437393751915e-07,
 'FULL': 0.0010450631218125574,
 'FULLWIDTH': 3.4835437393751917e-06,
 'GREEK': 1.0450631218125575e-06,
 'HANGUL': 0.01058474765209152,
 'HIRAGANA': 0.008403352562494774,
 'HORIZONTAL': 3.4835437393751915e-07,
 'HYPHEN': 3.4835437393751915e-07,
 'HYPHEN-MINUS': 0.003217400997686927,
 'IDEOGRAPHIC': 0.00020239389125769862,
 'KATAKANA': 0.004122774015550539,
 'KATAKANA-HIRAGANA': 0.00023165565866845023,
 'LATIN': 0.7685627595240085,
 'LEFT': 0.00020552908062313632,
 'LEFT-TO-RIGHT': 1.0450631218125575e-06,
 'MALE': 3.4835437393751915e-07,
 'MODIFIER': 2.786834991500153e-06,
 'MULTIPL

In [54]:
# sort by highest frequency
sorted_alpha_fdist = dict(sorted(alphabet_fdist.items(), key = lambda item: item[1], reverse=True))
sorted_alpha_fdist

{'LATIN': 0.7685627595240085,
 'CJK': 0.13366984365855697,
 'SPACE': 0.06935456901596856,
 'HANGUL': 0.01058474765209152,
 'HIRAGANA': 0.008403352562494774,
 'KATAKANA': 0.004122774015550539,
 'HYPHEN-MINUS': 0.003217400997686927,
 'FULL': 0.0010450631218125574,
 'KATAKANA-HIRAGANA': 0.00023165565866845023,
 'RIGHT': 0.00021354123122369924,
 'LEFT': 0.00020552908062313632,
 'IDEOGRAPHIC': 0.00020239389125769862,
 'APOSTROPHE': 0.00011913719588663155,
 'WHITE': 1.1147339966000613e-05,
 'SYRIAC': 8.01215060056294e-06,
 'ARABIC': 7.663796226625422e-06,
 'SOLIDUS': 6.967087478750383e-06,
 'COMMA': 5.225315609062787e-06,
 'WAVE': 3.831898113312711e-06,
 'FULLWIDTH': 3.4835437393751917e-06,
 'BLACK': 2.786834991500153e-06,
 'MODIFIER': 2.786834991500153e-06,
 'COMMERCIAL': 2.438480617562634e-06,
 'RIGHT-TO-LEFT': 1.7417718696875958e-06,
 'ZERO': 1.3934174957500766e-06,
 'GREEK': 1.0450631218125575e-06,
 'LEFT-TO-RIGHT': 1.0450631218125575e-06,
 'MULTIPLICATION': 1.0450631218125575e-06,
 'BUL

In [55]:
# finding number of alphabets in single name
def find_number_alphabets(alpha_list):
    alpha_set = set()
    for alpha_type in alpha_list:
        alpha_set.add(alpha_type)
    return len(alpha_set)

In [56]:
merged_df['num_alphabets'] = merged_df['alphabet'].apply(find_number_alphabets)
merged_df['num_alphabets'].unique()

array([1, 2, 3, 4, 5])

#### edit_distance(fullname, transliterated)

In [57]:
merged_df.columns

Index(['fullname', 'original_fullname', 'alphabet', 'unigrams', 'bigrams',
       'trigrams', 'char_ngrams', 'word_ngrams', 'name_length',
       'avg_token_length', 'num_tokens', 'transliteration', 'period_freq',
       'dash_freq', 'apostrophe_freq', 'space_freq', 'indiv_unigrams_fdist',
       'indiv_bigrams_fdist', 'indiv_trigrams_fdist', 'unigrams_cosine_sim',
       'bigrams_cosine_sim', 'trigrams_cosine_sim', 'accent_count', 'language',
       'id', 'name', 'class', 'lang', 'detected_accents', 'num_alphabets'],
      dtype='object')

In [58]:
np.any(merged_df['fullname'].isnull())

False

In [59]:
merged_df[merged_df['fullname'].isnull()]

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,bigrams_cosine_sim,trigrams_cosine_sim,accent_count,language,id,name,class,lang,detected_accents,num_alphabets


In [60]:
# just in case
merged_df['fullname'].apply(str.lower)
merged_df['transliteration'].apply(str.lower)

0                      supriyadi
1                   triyaningsih
2                       soerjadi
3                      undunsyah
4                       soeripto
                   ...          
293891             jose vizcaino
293892    felipe harboe bascunan
293893         guillermo lorenzo
293894               elena ramos
293895             miguel bedoya
Name: transliteration, Length: 293896, dtype: object

In [61]:
np.any(merged_df['transliteration'].isnull())

False

In [62]:
#merged_df['edit_distance'] = merged_df['fullname'].apply(edit_distance())
merged_df['edit_distance'] = merged_df.apply(lambda row: edit_distance(row['fullname'], row['transliteration']), axis = 1)

In [63]:
merged_df.tail()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,trigrams_cosine_sim,accent_count,language,id,name,class,lang,detected_accents,num_alphabets,edit_distance
293891,josé vizcaíno,,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[j, o, s, e, , v, i, z, c, a, i, n, o]","[(j, o), (o, s), (s, e), (e, ), ( , v), (v, i...","[(j, o, s), (o, s, e), (s, e, ), (e, , v), (...","[(j,), (o,), (s,), (e,), ( ,), (v,), (i,), (z,...","[José, Vizcaíno]",13,6.0,...,,2,Spanish,,José Vizcaíno,1.0,es,"é, í",2,2
293892,felipe harboe bascuñán,,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[f, e, l, i, p, e, , h, a, r, b, o, e, , b, ...","[(f, e), (e, l), (l, i), (i, p), (p, e), (e, ...","[(f, e, l), (e, l, i), (l, i, p), (i, p, e), (...","[(f,), (e,), (l,), (i,), (p,), (e,), ( ,), (h,...","[Felipe, Harboe, Bascuñán]",22,6.666667,...,,2,Spanish,,Felipe Harboe Bascuñán,1.0,es,"á, ñ",2,2
293893,guillermo lorenzo,,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[g, u, i, l, l, e, r, m, o, , l, o, r, e, n, ...","[(g, u), (u, i), (i, l), (l, l), (l, e), (e, r...","[(g, u, i), (u, i, l), (i, l, l), (l, l, e), (...","[(g,), (u,), (i,), (l,), (l,), (e,), (r,), (m,...","[Guillermo, Lorenzo]",17,8.0,...,,0,Spanish,,Guillermo Lorenzo,1.0,es,,2,0
293894,elena ramos,,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, l, e, n, a, , r, a, m, o, s]","[(e, l), (l, e), (e, n), (n, a), (a, ), ( , r...","[(e, l, e), (l, e, n), (e, n, a), (n, a, ), (...","[(e,), (l,), (e,), (n,), (a,), ( ,), (r,), (a,...","[Elena, Ramos]",11,5.0,...,,0,Spanish,,Elena Ramos,1.0,es,,2,0
293895,miguel bedoya,,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[m, i, g, u, e, l, , b, e, d, o, y, a]","[(m, i), (i, g), (g, u), (u, e), (e, l), (l, ...","[(m, i, g), (i, g, u), (g, u, e), (u, e, l), (...","[(m,), (i,), (g,), (u,), (e,), (l,), ( ,), (b,...","[Miguel, Bedoya]",13,6.0,...,,0,Spanish,,Miguel Bedoya,1.0,es,,2,0


#### Checking character counts

In [64]:
# modified version of create_lang_char_distribution
# returns whole counts instead of decimals
def count_char_freqs(df, col_name):
    char_freqs = {}

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1

    return char_freqs

In [65]:
# Unigrams counts across all langs
char_counts = count_char_freqs(merged_df, 'unigrams')
print('# unique characters without transliteration:', len(char_counts))

# sorting to see which characters appear the least
char_counts = {key: val for key, val in sorted(char_counts.items(), key = lambda item: item[1])}
char_counts

# unique characters without transliteration: 120


{'ś': 1,
 'ë': 1,
 'ä': 1,
 '7': 1,
 'å': 1,
 'ć': 1,
 'ǹ': 1,
 'ỏ': 1,
 'ū': 1,
 'ē': 1,
 'ā': 1,
 'ǎ': 1,
 ':': 1,
 '|': 1,
 'H': 1,
 'ç': 2,
 'ẫ': 2,
 '*': 2,
 'ỷ': 3,
 'ó': 3,
 'ỵ': 3,
 'ỉ': 3,
 'ở': 3,
 'ī': 3,
 'ñ': 4,
 '`': 4,
 'ö': 5,
 'ủ': 5,
 'ð': 5,
 'ổ': 6,
 '@': 7,
 '’': 9,
 'ự': 9,
 'ẩ': 10,
 'ể': 12,
 'ớ': 12,
 '~': 13,
 'ã': 14,
 'é': 14,
 'ừ': 14,
 'ằ': 15,
 ',': 16,
 'ò': 16,
 'ử': 16,
 'ỹ': 17,
 'ộ': 19,
 'ợ': 20,
 'õ': 21,
 '/': 23,
 'ĩ': 24,
 'ỗ': 25,
 'ụ': 27,
 'ắ': 28,
 'ặ': 29,
 'ữ': 34,
 'ờ': 35,
 'ề': 37,
 'ỳ': 38,
 'ố': 39,
 'ý': 43,
 'ậ': 43,
 'ü': 54,
 'ứ': 56,
 'í': 57,
 'ấ': 58,
 'ì': 58,
 'ồ': 59,
 'ũ': 62,
 'ú': 74,
 'ù': 75,
 'ả': 77,
 'ọ': 84,
 'ệ': 87,
 'ế': 99,
 'á': 124,
 'ị': 138,
 'ạ': 159,
 'ầ': 163,
 'â': 165,
 'ô': 167,
 'ơ': 184,
 'ă': 219,
 'à': 237,
 'ê': 256,
 'ư': 257,
 'đ': 261,
 "'": 364,
 'ễ': 385,
 '"': 566,
 '(': 590,
 ')': 590,
 '.': 3012,
 'v': 8532,
 '-': 9242,
 'q': 14056,
 'p': 17861,
 'f': 24143,
 'x': 29285,
 'w': 34552,
 'b':

In [66]:
merged_df.shape

(293896, 31)

In [67]:
# merged_df[merged_df['unigrams'].apply(lambda name: True if '郑' in name else False)]

#### Redoing unigrams to compare to all languages

Creating base unigrams distribution:

In [68]:
# Unigrams frequency distribution across all languages
unigram_fdist = create_lang_char_distribution(merged_df, 'unigrams')
print('# unique characters without transliteration:', len(unigram_fdist))

# unique characters without transliteration: 120


In [69]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}

**1. Finding unigrams distributions for all languages**

We will use `create_lang_gram_distribution` because all language distributions must start from the same baseline distribution, `initialized_unigrams`; otherwise, they cannot be compared.

In [70]:
merged_df['language'].unique()

array(['Indonesian', 'Malay', 'Vietnamese', 'Chinese (Romanized)',
       'Chinese (Characters)', 'Turkish', 'Korean (Romanized)',
       'Korean (Characters)', 'Japanese (Romanized)',
       'Japanese (Characters)', 'Arabic (Romanized)', 'English', 'French',
       'German', 'Italian', 'Portuguese', 'Spanish'], dtype=object)

For ngrams frequency distributions, we must not distinguish between characters and romanized because for CJK, ngrams are always created from the transliteration of the name. We already have `df_korean` and `df_japan` with combined romanized and character names, so we only need to create `df_chinese`.

In [71]:
df_chinese = pd.concat([df_cnrom, df_cnchar], ignore_index = True, join = 'outer')

We use the following array of dataframes ONLY for creating ngrams frequency distributions.

In [72]:
ngrams_dfs = [df_indo, df_malay, df_viet, df_chinese, df_turk, df_korean, df_japan, df_arabic_latin, df_english, df_french, df_german, df_italian, df_portug, df_spanish]

We define an array of language names in order to construct our columns later on. They must be in the same order as how they appear in `ngrams_dfs` for the next cells to work properly.

In [73]:
# MUST BE IN SAME ORDER AS ABOVE
all_langs = ['indo', 'malay', 'viet', 'chinese', 'turk', 'korean', 'japan', 'arab_rom', 'eng', 'french', 'german', 'ital', 'portug', 'span']

In [74]:
lang_fdists = []
for df in ngrams_dfs:
    lang_fdists.append(create_lang_gram_distribution(initialized_unigrams, df, 'unigrams'))
lang_fdists

[{' ': 0.08276406157308593,
  '"': 0.0,
  "'": 0.0003547118577727765,
  '(': 0.0,
  ')': 0.0,
  '*': 0.0,
  ',': 1.2231443371475051e-05,
  '-': 0.0003791747445157266,
  '.': 0.0031862909982692507,
  '/': 0.0,
  '7': 0.0,
  ':': 0.0,
  '@': 0.0,
  'H': 0.0,
  '`': 0.0,
  'a': 0.1667268045965764,
  'b': 0.016616415820148858,
  'c': 0.009253086910520876,
  'd': 0.0441493948493392,
  'e': 0.04197219792921664,
  'f': 0.01045176836092543,
  'g': 0.017894601652468,
  'h': 0.03426638860518735,
  'i': 0.08912441212625295,
  'j': 0.013075412964106829,
  'k': 0.019111630267929766,
  'l': 0.03099447750331778,
  'm': 0.04122607988355666,
  'n': 0.07453230018408322,
  'o': 0.04553154795031588,
  'p': 0.012372104970247015,
  'q': 0.0007277708806027655,
  'r': 0.06448416945441647,
  's': 0.05043635674227737,
  't': 0.0384128479081174,
  'u': 0.04299963917242054,
  'v': 0.004568444099245931,
  'w': 0.013356736161650756,
  'x': 0.00059322500351654,
  'y': 0.024420076691149938,
  'z': 0.00587109281830802

In [75]:
# unigram_fdist_indo = create_lang_gram_distribution(initialized_unigrams, df_indo, 'unigrams')
# unigram_fdist_malay = create_lang_gram_distribution(initialized_unigrams, df_malay, 'unigrams')
# unigram_fdist_viet = create_lang_gram_distribution(initialized_unigrams, df_viet, 'unigrams')
# unigram_fdist_cnrom = create_lang_gram_distribution(initialized_unigrams, df_cnrom, 'unigrams')
# unigram_fdist_cnchar = create_lang_gram_distribution(initialized_unigrams, df_cnchar, 'unigrams')
# unigram_fdist_turk = create_lang_gram_distribution(initialized_unigrams, df_turk, 'unigrams')
# unigram_fdist_korom = create_lang_gram_distribution(initialized_unigrams, df_korean_latin, 'unigrams')
# unigram_fdist_kochar = create_lang_gram_distribution(initialized_unigrams, df_korean_non_latin, 'unigrams')
# unigram_fdist_jarom = create_lang_gram_distribution(initialized_unigrams, df_japan_latin, 'unigrams')
# unigram_fdist_jachar = create_lang_gram_distribution(initialized_unigrams, df_japan_non_latin, 'unigrams')
# unigram_fdist_arabrom = create_lang_gram_distribution(initialized_unigrams, df_arabic_latin, 'unigrams')
# unigram_fdist_eng = create_lang_gram_distribution(initialized_unigrams, df_english, 'unigrams')
# unigram_fdist_french = create_lang_gram_distribution(initialized_unigrams, df_french, 'unigrams')
# unigram_fdist_german = create_lang_gram_distribution(initialized_unigrams, df_german, 'unigrams')
# unigram_fdist_ital = create_lang_gram_distribution(initialized_unigrams, df_italian, 'unigrams')
# unigram_fdist_portug = create_lang_gram_distribution(initialized_unigrams, df_portug, 'unigrams')
# unigram_fdist_span = create_lang_gram_distribution(initialized_unigrams, df_spanish, 'unigrams')

**2. Finding individual frequency distributions:**

In [76]:
# unigrams individual frequency distributions
merged_df['indiv_unigrams_fdist'] = merged_df['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

**3. Comparing individual frequency distributions to each language distribution:**

In [77]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
merged_df['indiv_unigrams_fdist'] = merged_df['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [78]:
# Converting language fdists to numpy arrays
lang_fdists_arr = []
for fdist in lang_fdists:
    lang_fdists_arr.append(np.fromiter(fdist.values(), dtype = float).reshape(1, -1))

In [79]:
# all_langs = ['indo', 'malay', 'viet', 'cn_rom', 'cn_char', 'turk', 'ko_rom', 'ko_char', 'ja_rom', 'ja_char', 'arab_rom', 'eng', 'french', 'german', 'ital', 'portug', 'span']

In [80]:
# Calculating cosine similarity between individual distributions and all languages
for i, lang_fdist in enumerate(lang_fdists_arr):
    curr_lang = all_langs[i]
    merged_df[curr_lang + '_unigrams_cosine_sim'] = merged_df['indiv_unigrams_fdist'].apply(lambda indiv_fdist: cosine_similarity(indiv_fdist, lang_fdist)[0][0])

In [81]:
merged_df.head()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,turk_unigrams_cosine_sim,korean_unigrams_cosine_sim,japan_unigrams_cosine_sim,arab_rom_unigrams_cosine_sim,eng_unigrams_cosine_sim,french_unigrams_cosine_sim,german_unigrams_cosine_sim,ital_unigrams_cosine_sim,portug_unigrams_cosine_sim,span_unigrams_cosine_sim
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9,9.0,...,0.5933,0.418847,0.540195,0.558548,0.521782,0.549821,0.498951,0.585279,0.572371,0.537863
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12,12.0,...,0.627726,0.638403,0.662178,0.537752,0.604509,0.57168,0.611077,0.667908,0.554799,0.545731
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8,8.0,...,0.627556,0.56649,0.523689,0.623316,0.696796,0.729475,0.664509,0.718184,0.775375,0.744647
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9,9.0,...,0.536235,0.602072,0.522787,0.508871,0.46673,0.458651,0.465674,0.401518,0.419256,0.436786
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8,8.0,...,0.474674,0.514863,0.416068,0.328312,0.599574,0.616171,0.588177,0.651231,0.644292,0.594403


#### Failed attempt to redo bigrams distributions comparing to all languages

We attempt on Japanese names since they make up the largest percentage of our data.

In [82]:
# df_japan.shape

In [83]:
# # Doing this again since we turned it into a numpy array
# unigram_fdist = create_lang_char_distribution(merged_df, 'unigrams')

# # Initializing all possible bigrams using all possible characters from unigrams frequency distribution
# initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())

In [84]:
# Creating the bigrams frequency distribution for the entire JAPANESE language
# bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df_japan, 'bigrams')

In [85]:
# BIGRAMS individual frequency distributions
# df_japan['indiv_bigrams_fdist'] = df_japan['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

The kernel died at the above step.

### Adding paren_freq and quot_freq

In [86]:
merged_df['parentheses_freq'] = merged_df['fullname'].apply(lambda name: name.count('(')) # only ( is needed since the number of (s = the number of )s
merged_df['quotation_freq'] = merged_df['fullname'].apply(lambda name: name.count('"'))

### Keeping numerical columns only for each dataset

In [87]:
label_col = merged_df['language']
merged_df = merged_df.select_dtypes(exclude = 'object')
merged_df.drop('trigrams_cosine_sim', inplace = True, axis = 1)
merged_df['language'] = label_col
merged_df

Unnamed: 0,name_length,avg_token_length,num_tokens,period_freq,dash_freq,apostrophe_freq,space_freq,unigrams_cosine_sim,bigrams_cosine_sim,accent_count,...,arab_rom_unigrams_cosine_sim,eng_unigrams_cosine_sim,french_unigrams_cosine_sim,german_unigrams_cosine_sim,ital_unigrams_cosine_sim,portug_unigrams_cosine_sim,span_unigrams_cosine_sim,parentheses_freq,quotation_freq,language
0,9,9.000000,1,0,0,0,0,0.664809,0.250640,0,...,0.558548,0.521782,0.549821,0.498951,0.585279,0.572371,0.537863,0,0,Indonesian
1,12,12.000000,1,0,0,0,0,0.686625,0.353292,0,...,0.537752,0.604509,0.571680,0.611077,0.667908,0.554799,0.545731,0,0,Indonesian
2,8,8.000000,1,0,0,0,0,0.688312,0.197139,0,...,0.623316,0.696796,0.729475,0.664509,0.718184,0.775375,0.744647,0,0,Indonesian
3,9,9.000000,1,0,0,0,0,0.581396,0.155386,0,...,0.508871,0.466730,0.458651,0.465674,0.401518,0.419256,0.436786,0,0,Indonesian
4,8,8.000000,1,0,0,0,0,0.463215,0.176917,0,...,0.328312,0.599574,0.616171,0.588177,0.651231,0.644292,0.594403,0,0,Indonesian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293891,13,6.000000,2,0,0,0,1,0.743017,0.238011,2,...,0.569371,0.682720,0.711797,0.653862,0.799310,0.763259,0.743017,0,0,Spanish
293892,22,6.666667,3,0,0,0,2,0.873616,0.335940,2,...,0.798772,0.869674,0.898352,0.870256,0.832507,0.857226,0.873616,0,0,Spanish
293893,17,8.000000,2,0,0,0,1,0.726666,0.347654,0,...,0.473163,0.725036,0.729340,0.707887,0.714699,0.710887,0.726666,0,0,Spanish
293894,11,5.000000,2,0,0,0,1,0.891172,0.318310,0,...,0.791685,0.861220,0.873019,0.824537,0.807853,0.874240,0.891172,0,0,Spanish


In [88]:
merged_df.drop(labels = ['class', 'unigrams_cosine_sim'], axis = 1, inplace = True)

In [89]:
merged_df.head()

Unnamed: 0,name_length,avg_token_length,num_tokens,period_freq,dash_freq,apostrophe_freq,space_freq,bigrams_cosine_sim,accent_count,num_alphabets,...,arab_rom_unigrams_cosine_sim,eng_unigrams_cosine_sim,french_unigrams_cosine_sim,german_unigrams_cosine_sim,ital_unigrams_cosine_sim,portug_unigrams_cosine_sim,span_unigrams_cosine_sim,parentheses_freq,quotation_freq,language
0,9,9.0,1,0,0,0,0,0.25064,0,1,...,0.558548,0.521782,0.549821,0.498951,0.585279,0.572371,0.537863,0,0,Indonesian
1,12,12.0,1,0,0,0,0,0.353292,0,1,...,0.537752,0.604509,0.57168,0.611077,0.667908,0.554799,0.545731,0,0,Indonesian
2,8,8.0,1,0,0,0,0,0.197139,0,1,...,0.623316,0.696796,0.729475,0.664509,0.718184,0.775375,0.744647,0,0,Indonesian
3,9,9.0,1,0,0,0,0,0.155386,0,1,...,0.508871,0.46673,0.458651,0.465674,0.401518,0.419256,0.436786,0,0,Indonesian
4,8,8.0,1,0,0,0,0,0.176917,0,1,...,0.328312,0.599574,0.616171,0.588177,0.651231,0.644292,0.594403,0,0,Indonesian


In [90]:
# checking that there are no null values
np.any(merged_df.isnull())

False

In [91]:
# checking for duplicate values
len(merged_df[merged_df.duplicated()])

996

In [92]:
# dropping duplicates
merged_df.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(merged_df.duplicated()))

False


In [93]:
# FINAL SIZE 
merged_df.shape

(292900, 28)

In [94]:
merged_df['language'].value_counts()

language
Japanese (Characters)    94905
Japanese (Romanized)     92630
English                  22777
Turkish                  18037
Indonesian               11231
Chinese (Characters)     10486
Chinese (Romanized)      10478
Korean (Romanized)        9800
Korean (Characters)       9311
Malay                     2905
Spanish                   2502
Vietnamese                2289
Italian                   1207
French                    1164
Portuguese                1068
German                    1064
Arabic (Romanized)        1046
Name: count, dtype: int64

In [95]:
merged_df.columns

Index(['name_length', 'avg_token_length', 'num_tokens', 'period_freq',
       'dash_freq', 'apostrophe_freq', 'space_freq', 'bigrams_cosine_sim',
       'accent_count', 'num_alphabets', 'edit_distance',
       'indo_unigrams_cosine_sim', 'malay_unigrams_cosine_sim',
       'viet_unigrams_cosine_sim', 'chinese_unigrams_cosine_sim',
       'turk_unigrams_cosine_sim', 'korean_unigrams_cosine_sim',
       'japan_unigrams_cosine_sim', 'arab_rom_unigrams_cosine_sim',
       'eng_unigrams_cosine_sim', 'french_unigrams_cosine_sim',
       'german_unigrams_cosine_sim', 'ital_unigrams_cosine_sim',
       'portug_unigrams_cosine_sim', 'span_unigrams_cosine_sim',
       'parentheses_freq', 'quotation_freq', 'language'],
      dtype='object')

In [96]:
merged_df.to_pickle('merged_df.pkl.gz', compression='gzip')