# Chinese and Vietnamese Data Cleaning

In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from alphabet_detector import AlphabetDetector
import unicodedata
from langdetect import detect
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
import pinyin
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity
import regex as re

nltk.download('stopwords')

cnchar_df = pd.read_excel('name_data/exiger_datasets/EXGR_Chinese names(Character).xlsx')
cnrom_df = pd.read_excel("name_data/exiger_datasets/EXGR_Chinese names(Romantized).xlsx")
viet_df = pd.read_excel("name_data/exiger_datasets/EXGR_Vietnamese.xlsx")

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joliehuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#FUNCTIONS

#alphabet
def langname(name):
    if isinstance(name, str):
        result = []
        for char in name:
            try:
                char_name = unicodedata.name(char).split(' ')[0]
                result.append(char_name)
            except ValueError:
                # Handle characters without Unicode names
                result.append(char)
        return result
    else:
        return None  # Handle non-string or NaN values
#==========================================================================================================================================
#num_tokens (how many parts to the name?)
def calculate_token_length(name):
    if isinstance(name, str):
        # Check if the name contains Chinese characters
        if any('\u4e00' <= char <= '\u9fff' for char in name):
            # For Chinese names, count characters
            return len(name)
        else:
            # For non-Chinese names, split by spaces and count words
            return len(name.split())
    else:
        return None  # Handle non-string or NaN values
#==========================================================================================================================================
#char_ngrams
def generate_char_ngrams(text):
    if isinstance(text, str):
        # Tokenize the text into words
        words = text.split()
        
        # Create unigrams, bigrams, and trigrams
        unigrams = list(ngrams(text, 1))
        bigrams = list(ngrams(text, 2))
        trigrams = list(ngrams(text, 3))
        
        # Interpolated n-grams (combining unigrams, bigrams, and trigrams)
        interpolated_ngrams = unigrams + bigrams + trigrams
        
        return interpolated_ngrams
    else:
        return []
#==========================================================================================================================================
#name_length (length of the entire name string)
def get_name_length(fullname):
    if isinstance(fullname, str):
        return len(fullname)
    else:
        return np.nan 
#==========================================================================================================================================
#Commenting these out - there is a better way to do this with lambda

# #unigrams
# def generate_char_unigrams(text):
#     if isinstance(text, str):
#         # Tokenize the text into characters
#         characters = list(text)
        
#         # Create unigrams
#         unigrams = list(ngrams(characters, 1))
        
#         return unigrams
#     else:
#         return []

# #bigrams
# def generate_char_bigrams(text):
#     if isinstance(text, str):
#         # Tokenize the text into characters
#         characters = list(text)
        
#         # Create unigrams
#         bigrams = list(ngrams(characters, 2))
        
#         return bigrams
#     else:
#         return []

# #trigrams
# def generate_char_trigrams(text):
#     if isinstance(text, str):
#         # Tokenize the text into characters
#         characters = list(text)
        
#         # Create unigrams
#         trigrams = list(ngrams(characters, 3))
        
#         return trigrams
#     else:
#         return []
#==========================================================================================================================================
# Function to transliterate a Chinese name
def transliterate_name(name):
    return pinyin.get(name, format="strip", delimiter=" ")
#==========================================================================================================================================


In [4]:
#NEW FREQ DIST FUNCTIONS

def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams

def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative

EXGR_Chinese names(Character).xlsx - CLEANING
===

In [5]:
pd.set_option('display.max_columns', None)
# cnchar_df
duplicates = cnchar_df['fullname'].duplicated()
# print(cnchar_df[duplicates])
cnchar_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# cnchar_df
# null_values = cnchar_df['fullname'].isnull()
# print(cnchar_df[null_values])
# NO NULL VALUES
#==========================================================================================================================================
drop_notes = ['minority name', 'non-native', 'pseudonym', 'stage name',
       'should be侯赛因.江；minority name',
       '劉-family name of her husband;吳-her family name', 'satage name',
       '周-family name of her husband;梁-her family name', 'fake name',
       'pen name', '方-family name of her husband; 黄-her family name',
       '曹-family name of her husband;王-her family name',
       '朱-family name of her husband;李-her family name',
       '朱-family name of her husband;葉-her family name',
       '林-family name of her husband;鄭-her family name',
       '梁-family name of her husband;劉- her family name ',
       '梁-family name of her husband;高- her family name',
       'should be 泰迪.羅賓；non-native',
       '罗-family name of her husband;范-her family name',
       '范-family name of her husband;徐-her family name',
       '葉-family name of her husband；劉-her family name',
       '蘇-family name of her husband;周- her family name', 'buddhist',
       'family name should be in Chinese character 周',
       '陈-family name of her husband; 冯-her family name',
       '陳-family name of her husband;方-her family name',
       '高-family name of her husband;金-her family name',
       '黃-family name of her husband;馬-her family name']

cnchar_df = cnchar_df[~cnchar_df['Notes'].isin(drop_notes)]
# cnchar_df
#==========================================================================================================================================
# cnchar_df['Notes'].unique()
#**Maybe notes saying that it's a minority name/non-native name should be counted? Drop all the ones that say pseudonym/stage name? Maybe we can afford to drop certain examples of names, since we have so many of them...?
cnchar_df.drop(columns = ['Unnamed: 0', 'id', 'Family name', 'Given Name', 'Notes' ,'Unnamed: 6'], inplace = True)
# cnchar_df

Feature Engineering (Chinese Character)

In [6]:
# cnchar_df['alphabet'] = cnchar_df['fullname'].apply(langname)
# # cnchar_df
# cnchar_df['word_length'] = cnchar_df['fullname'].apply(get_name_length)
# # cnchar_df
# cnchar_df['num_tokens'] = cnchar_df['fullname'].apply(calculate_token_length)
# # cnchar_df
# cnchar_df['char_ngrams'] = cnchar_df['fullname'].apply(generate_char_ngrams)
# # cnchar_df
# cnchar_df['unigrams'] = cnchar_df['fullname'].apply(generate_char_unigrams)
# cnchar_df['bigrams'] = cnchar_df['fullname'].apply(generate_char_bigrams)
# cnchar_df['trigrams'] = cnchar_df['fullname'].apply(generate_char_trigrams)
# # cnchar_df
# cnchar_df['period_freq'] = cnchar_df['fullname'].apply(lambda name: name.count('.'))
# cnchar_df['dash_freq'] = cnchar_df['fullname'].apply(lambda name: name.count('-'))
# cnchar_df['space_freq'] = cnchar_df['fullname'].apply(lambda name: name.count(' '))
# cnchar_df['apostrophe_freq'] = cnchar_df['fullname'].apply(lambda name: name.count('\''))
# # cnchar_df
#==========================================================================================================================================
#Transliteration Feature Engineering

cnchar_df.rename(columns={'fullname': 'original_fullname'}, inplace=True)

cnchar_df['transliteration'] = cnchar_df['original_fullname'].apply(transliterate_name)
cnchar_df['transliteration'] = cnchar_df['transliteration'].str.lower()
# cnchar_df
cnchar_df['alphabet'] = cnchar_df['original_fullname'].apply(langname)
# cnchar_df
cnchar_df['word_length'] = cnchar_df['transliteration'].apply(get_name_length)
# cnchar_df
cnchar_df['num_tokens'] = cnchar_df['transliteration'].apply(calculate_token_length)
# cnchar_df
cnchar_df['char_ngrams'] = cnchar_df['transliteration'].apply(generate_char_ngrams)
# cnchar_df
# cnchar_df['unigrams'] = cnchar_df['transliteration'].apply(generate_char_unigrams)
# cnchar_df['bigrams'] = cnchar_df['transliteration'].apply(generate_char_bigrams)
# cnchar_df['trigrams'] = cnchar_df['transliteration'].apply(generate_char_trigrams)
cnchar_df['unigrams'] = cnchar_df['transliteration'].apply(lambda name: list(name))
cnchar_df['bigrams'] = cnchar_df['transliteration'].apply(lambda name: list(ngrams(list(name), 2)))
cnchar_df['trigrams'] = cnchar_df['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))
# cnchar_df
cnchar_df['period_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count('.'))
cnchar_df['dash_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count('-'))
cnchar_df['space_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count(' '))
cnchar_df['apostrophe_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count('\''))
# cnchar_df
# cnchar_df



unique_chars = set("".join(cnchar_df['transliteration']))
unique_chars_list = sorted(list(unique_chars))
print('OLD LIST:', unique_chars_list)
characters_to_check = ['(', ')', ',', '.', '0', '1', '3', '5', '6', '7', '8', '9', 'Q','·', '\u200f', '\u3000', '䓪', '（', '）', '𣋉', '𬸦']
pattern = '|'.join(map(re.escape, characters_to_check))
cnchar_df = cnchar_df[~cnchar_df['transliteration'].str.contains(pattern)]
cnchar_df = cnchar_df.reset_index(drop=True)
unique_chars = set("".join(cnchar_df['transliteration']))
unique_chars_list = sorted(list(unique_chars))
print('NEW LIST:', unique_chars_list)

OLD LIST: [' ', '(', ')', ',', '.', '0', '1', '3', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '·', '\u200f', '\u3000', '䓪', '（', '）', '𣋉', '𬸦']
NEW LIST: [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
# 6. avg_token_length
tokens = cnchar_df['transliteration'].apply(lambda name: name.split(' '))
# print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
# print(token_lengths[-5:])
cnchar_df['avg_token_length'] = token_lengths.apply(np.mean)
cnchar_df

Unnamed: 0,original_fullname,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length
0,丁一平,ding yi ping,"[CJK, CJK, CJK]",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , p, i, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.333333
1,丁世雄,ding shi xiong,"[CJK, CJK, CJK]",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[d, i, n, g, , s, h, i, , x, i, o, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,0,2,0,4.000000
2,丁亦昕,ding yi xin,"[CJK, CJK, CJK]",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , x, i, n]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.000000
3,丁仲礼,ding zhong li,"[CJK, CJK, CJK]",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[d, i, n, g, , z, h, o, n, g, , l, i]","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,0,2,0,3.666667
4,丁伟,ding wei,"[CJK, CJK]",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[d, i, n, g, , w, e, i]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,0,1,0,3.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,龚翔宇,gong xiang yu,"[CJK, CJK, CJK]",13,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[g, o, n, g, , x, i, a, n, g, , y, u]","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0,3.666667
11051,龚育之,gong yu zhi,"[CJK, CJK, CJK]",11,3,"[(g,), (o,), (n,), (g,), ( ,), (y,), (u,), ( ,...","[g, o, n, g, , y, u, , z, h, i]","[(g, o), (o, n), (n, g), (g, ), ( , y), (y, u...","[(g, o, n), (o, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.000000
11052,龚蓓苾,gong bei bi,"[CJK, CJK, CJK]",11,3,"[(g,), (o,), (n,), (g,), ( ,), (b,), (e,), (i,...","[g, o, n, g, , b, e, i, , b, i]","[(g, o), (o, n), (n, g), (g, ), ( , b), (b, e...","[(g, o, n), (o, n, g), (n, g, ), (g, , b), (...",0,0,2,0,3.000000
11053,龚贤永,gong xian yong,"[CJK, CJK, CJK]",14,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[g, o, n, g, , x, i, a, n, , y, o, n, g]","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0,4.000000


In [8]:
#Chinese CHARACTERS feature engineering

In [9]:
unigram_fdist = create_lang_char_distribution(cnchar_df, 'transliteration')
print(len(unigram_fdist))
# unigram_fdist

27


In [10]:
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 ('a', ' '): 0,
 ('a', 'a'): 0,
 ('a', 'b'): 0,
 ('a', 'c'): 0,
 ('a', 'd'): 0,
 ('a', 'e'): 0,
 ('a', 'f'): 0,
 ('a', 'g'): 0,
 ('a', 'h'): 0,
 ('a', 'i'): 0,
 ('a', 'j'): 0,
 ('a', 'k'): 0,
 ('a', 'l'): 0,
 ('a', 'm'): 0,
 ('a', 'n'): 0,
 ('a', 'o'): 0,
 ('a', 'p'): 0,
 ('a', 'q'): 0,
 ('a', 'r'): 0,
 ('a', 's'): 0,
 ('a', 't'): 0,
 ('a', 'u'): 0,
 ('a', 'v'): 0,
 ('a', 'w'): 0,
 ('a', 'x'): 0,
 ('a', 'y'): 0,
 ('a', 'z'): 0,
 ('b', ' '): 0,
 ('b', 'a'): 0,
 ('b', 'b'): 0,
 ('b', 'c'): 0,
 ('b', 'd'): 0,
 ('b', 'e'): 0,
 ('b', 'f'): 0,
 ('b', 'g'): 0,
 ('b', '

In [11]:
# Creating the bigrams frequency distribution for the entire Chinese transliterated language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, cnchar_df, 'bigrams')
bigram_fdist

{(' ', ' '): 0.000601596966492872,
 (' ', 'a'): 0.001175848616326977,
 (' ', 'b'): 0.00530499143180078,
 (' ', 'c'): 0.007611113136690123,
 (' ', 'd'): 0.00613446603711671,
 (' ', 'e'): 0.0006745178109162504,
 (' ', 'f'): 0.0061618113537754765,
 (' ', 'g'): 0.005769861814999818,
 (' ', 'h'): 0.012633536296350311,
 (' ', 'i'): 0.0,
 (' ', 'j'): 0.01626134830641339,
 (' ', 'k'): 0.003044445254676049,
 (' ', 'l'): 0.01274291756298538,
 (' ', 'm'): 0.0068545593757975715,
 (' ', 'n'): 0.0028074525103000693,
 (' ', 'o'): 3.646042221168921e-05,
 (' ', 'p'): 0.0037463083822510666,
 (' ', 'q'): 0.008194479892077151,
 (' ', 'r'): 0.00391949538775659,
 (' ', 's'): 0.010363875013672658,
 (' ', 't'): 0.00463047362088453,
 (' ', 'u'): 0.0,
 (' ', 'v'): 0.0,
 (' ', 'w'): 0.008267400736500528,
 (' ', 'x'): 0.01480293141794582,
 (' ', 'y'): 0.02316148320997557,
 (' ', 'z'): 0.013563277062748387,
 ('a', ' '): 0.006590221314762825,
 ('a', 'a'): 0.0,
 ('a', 'b'): 0.0,
 ('a', 'c'): 0.0,
 ('a', 'd'): 0.0,
 

In [12]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
# initialized_unigrams

In [13]:
# UNIGRAMS individual frequency distributions
cnchar_df['indiv_unigrams_fdist'] = cnchar_df['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))
# checking that the functin worked for our first example
print(cnchar_df.iloc[0]['indiv_unigrams_fdist'])
cnchar_df.tail()


{' ': 0.16666666666666666, 'a': 0, 'b': 0, 'c': 0, 'd': 0.08333333333333333, 'e': 0, 'f': 0, 'g': 0.16666666666666666, 'h': 0, 'i': 0.25, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 0.16666666666666666, 'o': 0, 'p': 0.08333333333333333, 'q': 0, 'r': 0, 's': 0, 't': 0, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0.08333333333333333, 'z': 0}


Unnamed: 0,original_fullname,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist
11050,龚翔宇,gong xiang yu,"[CJK, CJK, CJK]",13,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[g, o, n, g, , x, i, a, n, g, , y, u]","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0,3.666667,"{' ': 0.15384615384615385, 'a': 0.076923076923..."
11051,龚育之,gong yu zhi,"[CJK, CJK, CJK]",11,3,"[(g,), (o,), (n,), (g,), ( ,), (y,), (u,), ( ,...","[g, o, n, g, , y, u, , z, h, i]","[(g, o), (o, n), (n, g), (g, ), ( , y), (y, u...","[(g, o, n), (o, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.0,"{' ': 0.18181818181818182, 'a': 0, 'b': 0, 'c'..."
11052,龚蓓苾,gong bei bi,"[CJK, CJK, CJK]",11,3,"[(g,), (o,), (n,), (g,), ( ,), (b,), (e,), (i,...","[g, o, n, g, , b, e, i, , b, i]","[(g, o), (o, n), (n, g), (g, ), ( , b), (b, e...","[(g, o, n), (o, n, g), (n, g, ), (g, , b), (...",0,0,2,0,3.0,"{' ': 0.18181818181818182, 'a': 0, 'b': 0.1818..."
11053,龚贤永,gong xian yong,"[CJK, CJK, CJK]",14,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[g, o, n, g, , x, i, a, n, , y, o, n, g]","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0,4.0,"{' ': 0.14285714285714285, 'a': 0.071428571428..."
11054,龚鼎,gong ding,"[CJK, CJK]",9,2,"[(g,), (o,), (n,), (g,), ( ,), (d,), (i,), (n,...","[g, o, n, g, , d, i, n, g]","[(g, o), (o, n), (n, g), (g, ), ( , d), (d, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , d), (...",0,0,1,0,4.0,"{' ': 0.1111111111111111, 'a': 0, 'b': 0, 'c':..."


In [14]:
# BIGRAMS individual frequency distributions
cnchar_df['indiv_bigrams_fdist'] = cnchar_df['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))
print(cnchar_df.iloc[0]['indiv_bigrams_fdist'][('d', 'i')])
print(1 / len(cnchar_df.iloc[0]['bigrams']))

0.09090909090909091
0.09090909090909091


In [15]:
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
cnchar_df['indiv_unigrams_fdist'] = cnchar_df['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [16]:
# Calculating cosine similarity (UNIGRAMS)
cnchar_df['unigrams_cosine_sim'] = cnchar_df['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])
# viet_df

In [17]:
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
cnchar_df['indiv_bigrams_fdist'] = cnchar_df['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

# Calculating cosine similarity (BIGRAMS)
cnchar_df['bigrams_cosine_sim'] = cnchar_df['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])
cnchar_df

Unnamed: 0,original_fullname,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,丁一平,ding yi ping,"[CJK, CJK, CJK]",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , p, i, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.333333,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.083333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.774279,0.548928
1,丁世雄,ding shi xiong,"[CJK, CJK, CJK]",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[d, i, n, g, , s, h, i, , x, i, o, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,0,2,0,4.000000,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.071428...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.811762,0.560151
2,丁亦昕,ding yi xin,"[CJK, CJK, CJK]",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , x, i, n]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.000000,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.776390,0.510394
3,丁仲礼,ding zhong li,"[CJK, CJK, CJK]",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[d, i, n, g, , z, h, o, n, g, , l, i]","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,0,2,0,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.076923...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.841584,0.605839
4,丁伟,ding wei,"[CJK, CJK]",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[d, i, n, g, , w, e, i]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,0,1,0,3.500000,"[[0.125, 0.0, 0.0, 0.0, 0.125, 0.125, 0.0, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710349,0.440812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,龚翔宇,gong xiang yu,"[CJK, CJK, CJK]",13,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[g, o, n, g, , x, i, a, n, g, , y, u]","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0,3.666667,"[[0.15384615384615385, 0.07692307692307693, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.844943,0.665759
11051,龚育之,gong yu zhi,"[CJK, CJK, CJK]",11,3,"[(g,), (o,), (n,), (g,), ( ,), (y,), (u,), ( ,...","[g, o, n, g, , y, u, , z, h, i]","[(g, o), (o, n), (n, g), (g, ), ( , y), (y, u...","[(g, o, n), (o, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.000000,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.837370,0.501902
11052,龚蓓苾,gong bei bi,"[CJK, CJK, CJK]",11,3,"[(g,), (o,), (n,), (g,), ( ,), (b,), (e,), (i,...","[g, o, n, g, , b, e, i, , b, i]","[(g, o), (o, n), (n, g), (g, ), ( , b), (b, e...","[(g, o, n), (o, n, g), (n, g, ), (g, , b), (...",0,0,2,0,3.000000,"[[0.18181818181818182, 0.0, 0.1818181818181818...","[[0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.729526,0.363754
11053,龚贤永,gong xian yong,"[CJK, CJK, CJK]",14,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[g, o, n, g, , x, i, a, n, , y, o, n, g]","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0,4.000000,"[[0.14285714285714285, 0.07142857142857142, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.808012,0.637316


EXGR_Chinese names(Romantized).xlsx - CLEANING
===

In [18]:
# cnrom_df
cnrom_df['fullname'] = cnrom_df['fullname'].astype(str)
cnrom_df.rename(columns={'fullname': 'original_fullname'}, inplace=True)

cnrom_df['original_fullname'] = cnrom_df['original_fullname'].str.lower()

cnrom_df['transliteration'] = cnrom_df['original_fullname'].apply(lambda name: unidecode(name))
cnrom_df['transliteration'] = cnrom_df['transliteration'].str.lower()

cnrom_df

Unnamed: 0.1,Unnamed: 0,id,original_fullname,Family name,Given Name,Notes,transliteration
0,1806.0,http://www.wikidata.org/entity/Q2861472,8th arjia rinpoche,,,non-native,8th arjia rinpoche
1,9519.0,http://www.wikidata.org/entity/Q2107375,a lamusi,,,non-native,a lamusi
2,2157.0,http://www.wikidata.org/entity/Q16872,aaron kwok,,,,aaron kwok
3,22359.0,http://www.wikidata.org/entity/Q50366858,abduhamit abdugheni,,,non-native,abduhamit abdugheni
4,20305.0,http://www.wikidata.org/entity/Q2821416,abdul haq,,,non-native,abdul haq
...,...,...,...,...,...,...,...
12009,22920.0,http://www.wikidata.org/entity/Q8074892,zu xiaosun,,,,zu xiaosun
12010,436.0,http://www.wikidata.org/entity/Q8075316,zuo shusheng,,,,zuo shusheng
12011,5871.0,http://www.wikidata.org/entity/Q9090197,zuo xiaoqing,,,,zuo xiaoqing
12012,9205.0,http://www.wikidata.org/entity/Q24006407,zuo yiteng,,,,zuo yiteng


In [19]:
cnrom_df.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

In [20]:
cnrom_df['Notes'].unique()

array(['non-native', nan], dtype=object)

In [21]:
duplicates = cnrom_df['original_fullname'].duplicated()
# print(cnrom_df[duplicates])

In [22]:
cnrom_df.drop_duplicates(subset=['original_fullname'], keep='first', inplace=True)
# cnrom_df

In [23]:
#Get rid of all examples with 'non-native', to try and further narrow df down to just Chinese names
cnrom_df.drop(cnrom_df[cnrom_df['Notes'] == 'non-native'].index, inplace=True)
# cnrom_df

In [24]:
cnrom_df['Notes'].unique()

array([nan], dtype=object)

In [25]:
cnrom_df['alphabet'] = cnrom_df['original_fullname'].apply(langname)
# cnrom_df

In [26]:
cnrom_df['word_length'] = cnrom_df['original_fullname'].apply(get_name_length)
# cnrom_df

In [27]:
cnrom_df['num_tokens'] = cnrom_df['original_fullname'].apply(calculate_token_length)
# cnrom_df

In [28]:
cnrom_df['char_ngrams'] = cnrom_df['original_fullname'].apply(generate_char_ngrams)
# cnrom_df

In [29]:
cnrom_df['unigrams'] = cnrom_df['original_fullname'].apply(lambda name: list(name))
cnrom_df['bigrams'] = cnrom_df['original_fullname'].apply(lambda name: list(ngrams(list(name), 2)))
cnrom_df['trigrams'] = cnrom_df['original_fullname'].apply(lambda name: list(ngrams(list(name), 3)))
# cnrom_df

In [30]:
cnrom_df['period_freq'] = cnrom_df['original_fullname'].apply(lambda name: name.count('.'))
cnrom_df['dash_freq'] = cnrom_df['original_fullname'].apply(lambda name: name.count('-'))
cnrom_df['space_freq'] = cnrom_df['original_fullname'].apply(lambda name: name.count(' '))
cnrom_df['apostrophe_freq'] = cnrom_df['original_fullname'].apply(lambda name: name.count('\''))
cnrom_df

Unnamed: 0,original_fullname,Family name,Given Name,Notes,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq
2,aaron kwok,,,,aaron kwok,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10,2,"[(a,), (a,), (r,), (o,), (n,), ( ,), (k,), (w,...","[a, a, r, o, n, , k, w, o, k]","[(a, a), (a, r), (r, o), (o, n), (n, ), ( , k...","[(a, a, r), (a, r, o), (r, o, n), (o, n, ), (...",0,0,1,0
9,adhe tapontsang,,,,adhe tapontsang,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15,2,"[(a,), (d,), (h,), (e,), ( ,), (t,), (a,), (p,...","[a, d, h, e, , t, a, p, o, n, t, s, a, n, g]","[(a, d), (d, h), (h, e), (e, ), ( , t), (t, a...","[(a, d, h), (d, h, e), (h, e, ), (e, , t), (...",0,0,1,0
10,ai baojun,,,,ai baojun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9,2,"[(a,), (i,), ( ,), (b,), (a,), (o,), (j,), (u,...","[a, i, , b, a, o, j, u, n]","[(a, i), (i, ), ( , b), (b, a), (a, o), (o, j...","[(a, i, ), (i, , b), ( , b, a), (b, a, o), (...",0,0,1,0
11,ai guoxiang,,,,ai guoxiang,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,2,"[(a,), (i,), ( ,), (g,), (u,), (o,), (x,), (i,...","[a, i, , g, u, o, x, i, a, n, g]","[(a, i), (i, ), ( , g), (g, u), (u, o), (o, x...","[(a, i, ), (i, , g), ( , g, u), (g, u, o), (...",0,0,1,0
12,ai husheng,,,,ai husheng,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(a,), (i,), ( ,), (h,), (u,), (s,), (h,), (e,...","[a, i, , h, u, s, h, e, n, g]","[(a, i), (i, ), ( , h), (h, u), (u, s), (s, h...","[(a, i, ), (i, , h), ( , h, u), (h, u, s), (...",0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12009,zu xiaosun,,,,zu xiaosun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), ( ,), (x,), (i,), (a,), (o,), (s,...","[z, u, , x, i, a, o, s, u, n]","[(z, u), (u, ), ( , x), (x, i), (i, a), (a, o...","[(z, u, ), (u, , x), ( , x, i), (x, i, a), (...",0,0,1,0
12010,zuo shusheng,,,,zuo shusheng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (s,), (h,), (u,), (s,...","[z, u, o, , s, h, u, s, h, e, n, g]","[(z, u), (u, o), (o, ), ( , s), (s, h), (h, u...","[(z, u, o), (u, o, ), (o, , s), ( , s, h), (...",0,0,1,0
12011,zuo xiaoqing,,,,zuo xiaoqing,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (x,), (i,), (a,), (o,...","[z, u, o, , x, i, a, o, q, i, n, g]","[(z, u), (u, o), (o, ), ( , x), (x, i), (i, a...","[(z, u, o), (u, o, ), (o, , x), ( , x, i), (...",0,0,1,0
12012,zuo yiteng,,,,zuo yiteng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), (o,), ( ,), (y,), (i,), (t,), (e,...","[z, u, o, , y, i, t, e, n, g]","[(z, u), (u, o), (o, ), ( , y), (y, i), (i, t...","[(z, u, o), (u, o, ), (o, , y), ( , y, i), (...",0,0,1,0


In [31]:
cnrom_df['original_fullname'] = cnrom_df['original_fullname'].str.lower()
unique_chars = set("".join(str(val) for val in cnrom_df['original_fullname']))
unique_chars_list = sorted(list(unique_chars))
print(unique_chars_list)

[' ', "'", '(', ')', ',', '-', '.', '/', '1', '4', '6', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'á', 'ã', 'é', 'ê', 'ì', 'í', 'ô', 'ö', 'ù', 'ü', 'ā', 'đ', 'ē', 'ī', 'ū', 'ơ', 'ư', 'ǎ', 'ấ', 'ầ', 'ễ', '\u200f', 'ⁿ', '周', '立', '銘']


In [32]:
characters_to_check = ["'", '(', ')', ',', '-', '.', '/', '1', '4', '6', '8', '9',
                      '\u200f', 'ⁿ', '周', '立', '銘']
pattern = '|'.join(map(re.escape, characters_to_check))
cnrom_df['original_fullname'] = cnrom_df['original_fullname'].astype(str)
cnrom_df = cnrom_df[~cnrom_df['original_fullname'].str.contains(pattern)]

cnrom_df = cnrom_df.reset_index(drop=True)
unique_chars = set("".join(cnrom_df['original_fullname']))
unique_chars_list = sorted(list(unique_chars))
print(unique_chars_list)
cnrom_df

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'á', 'ã', 'é', 'ê', 'ì', 'í', 'ö', 'ù', 'ü', 'ā', 'đ', 'ē', 'ī', 'ū', 'ơ', 'ư', 'ǎ', 'ấ', 'ầ', 'ễ']


Unnamed: 0,original_fullname,Family name,Given Name,Notes,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq
0,aaron kwok,,,,aaron kwok,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10,2,"[(a,), (a,), (r,), (o,), (n,), ( ,), (k,), (w,...","[a, a, r, o, n, , k, w, o, k]","[(a, a), (a, r), (r, o), (o, n), (n, ), ( , k...","[(a, a, r), (a, r, o), (r, o, n), (o, n, ), (...",0,0,1,0
1,adhe tapontsang,,,,adhe tapontsang,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15,2,"[(a,), (d,), (h,), (e,), ( ,), (t,), (a,), (p,...","[a, d, h, e, , t, a, p, o, n, t, s, a, n, g]","[(a, d), (d, h), (h, e), (e, ), ( , t), (t, a...","[(a, d, h), (d, h, e), (h, e, ), (e, , t), (...",0,0,1,0
2,ai baojun,,,,ai baojun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9,2,"[(a,), (i,), ( ,), (b,), (a,), (o,), (j,), (u,...","[a, i, , b, a, o, j, u, n]","[(a, i), (i, ), ( , b), (b, a), (a, o), (o, j...","[(a, i, ), (i, , b), ( , b, a), (b, a, o), (...",0,0,1,0
3,ai guoxiang,,,,ai guoxiang,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,2,"[(a,), (i,), ( ,), (g,), (u,), (o,), (x,), (i,...","[a, i, , g, u, o, x, i, a, n, g]","[(a, i), (i, ), ( , g), (g, u), (u, o), (o, x...","[(a, i, ), (i, , g), ( , g, u), (g, u, o), (...",0,0,1,0
4,ai husheng,,,,ai husheng,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(a,), (i,), ( ,), (h,), (u,), (s,), (h,), (e,...","[a, i, , h, u, s, h, e, n, g]","[(a, i), (i, ), ( , h), (h, u), (u, s), (s, h...","[(a, i, ), (i, , h), ( , h, u), (h, u, s), (...",0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10473,zu xiaosun,,,,zu xiaosun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), ( ,), (x,), (i,), (a,), (o,), (s,...","[z, u, , x, i, a, o, s, u, n]","[(z, u), (u, ), ( , x), (x, i), (i, a), (a, o...","[(z, u, ), (u, , x), ( , x, i), (x, i, a), (...",0,0,1,0
10474,zuo shusheng,,,,zuo shusheng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (s,), (h,), (u,), (s,...","[z, u, o, , s, h, u, s, h, e, n, g]","[(z, u), (u, o), (o, ), ( , s), (s, h), (h, u...","[(z, u, o), (u, o, ), (o, , s), ( , s, h), (...",0,0,1,0
10475,zuo xiaoqing,,,,zuo xiaoqing,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (x,), (i,), (a,), (o,...","[z, u, o, , x, i, a, o, q, i, n, g]","[(z, u), (u, o), (o, ), ( , x), (x, i), (i, a...","[(z, u, o), (u, o, ), (o, , x), ( , x, i), (...",0,0,1,0
10476,zuo yiteng,,,,zuo yiteng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), (o,), ( ,), (y,), (i,), (t,), (e,...","[z, u, o, , y, i, t, e, n, g]","[(z, u), (u, o), (o, ), ( , y), (y, i), (i, t...","[(z, u, o), (u, o, ), (o, , y), ( , y, i), (...",0,0,1,0


In [33]:
# 6. avg_token_length
tokens = cnrom_df['original_fullname'].apply(lambda name: name.split(' '))
print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
print(token_lengths[-5:])
cnrom_df['avg_token_length'] = token_lengths.apply(np.mean)
cnrom_df

10473        [zu, xiaosun]
10474      [zuo, shusheng]
10475      [zuo, xiaoqing]
10476        [zuo, yiteng]
10477    [zuoxiao, zuzhou]
Name: original_fullname, dtype: object 

10473    [2, 7]
10474    [3, 8]
10475    [3, 8]
10476    [3, 6]
10477    [7, 6]
Name: original_fullname, dtype: object


Unnamed: 0,original_fullname,Family name,Given Name,Notes,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length
0,aaron kwok,,,,aaron kwok,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10,2,"[(a,), (a,), (r,), (o,), (n,), ( ,), (k,), (w,...","[a, a, r, o, n, , k, w, o, k]","[(a, a), (a, r), (r, o), (o, n), (n, ), ( , k...","[(a, a, r), (a, r, o), (r, o, n), (o, n, ), (...",0,0,1,0,4.5
1,adhe tapontsang,,,,adhe tapontsang,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15,2,"[(a,), (d,), (h,), (e,), ( ,), (t,), (a,), (p,...","[a, d, h, e, , t, a, p, o, n, t, s, a, n, g]","[(a, d), (d, h), (h, e), (e, ), ( , t), (t, a...","[(a, d, h), (d, h, e), (h, e, ), (e, , t), (...",0,0,1,0,7.0
2,ai baojun,,,,ai baojun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9,2,"[(a,), (i,), ( ,), (b,), (a,), (o,), (j,), (u,...","[a, i, , b, a, o, j, u, n]","[(a, i), (i, ), ( , b), (b, a), (a, o), (o, j...","[(a, i, ), (i, , b), ( , b, a), (b, a, o), (...",0,0,1,0,4.0
3,ai guoxiang,,,,ai guoxiang,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,2,"[(a,), (i,), ( ,), (g,), (u,), (o,), (x,), (i,...","[a, i, , g, u, o, x, i, a, n, g]","[(a, i), (i, ), ( , g), (g, u), (u, o), (o, x...","[(a, i, ), (i, , g), ( , g, u), (g, u, o), (...",0,0,1,0,5.0
4,ai husheng,,,,ai husheng,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(a,), (i,), ( ,), (h,), (u,), (s,), (h,), (e,...","[a, i, , h, u, s, h, e, n, g]","[(a, i), (i, ), ( , h), (h, u), (u, s), (s, h...","[(a, i, ), (i, , h), ( , h, u), (h, u, s), (...",0,0,1,0,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10473,zu xiaosun,,,,zu xiaosun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), ( ,), (x,), (i,), (a,), (o,), (s,...","[z, u, , x, i, a, o, s, u, n]","[(z, u), (u, ), ( , x), (x, i), (i, a), (a, o...","[(z, u, ), (u, , x), ( , x, i), (x, i, a), (...",0,0,1,0,4.5
10474,zuo shusheng,,,,zuo shusheng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (s,), (h,), (u,), (s,...","[z, u, o, , s, h, u, s, h, e, n, g]","[(z, u), (u, o), (o, ), ( , s), (s, h), (h, u...","[(z, u, o), (u, o, ), (o, , s), ( , s, h), (...",0,0,1,0,5.5
10475,zuo xiaoqing,,,,zuo xiaoqing,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (x,), (i,), (a,), (o,...","[z, u, o, , x, i, a, o, q, i, n, g]","[(z, u), (u, o), (o, ), ( , x), (x, i), (i, a...","[(z, u, o), (u, o, ), (o, , x), ( , x, i), (...",0,0,1,0,5.5
10476,zuo yiteng,,,,zuo yiteng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), (o,), ( ,), (y,), (i,), (t,), (e,...","[z, u, o, , y, i, t, e, n, g]","[(z, u), (u, o), (o, ), ( , y), (y, i), (i, t...","[(z, u, o), (u, o, ), (o, , y), ( , y, i), (...",0,0,1,0,4.5


In [34]:
#Ngram stuff now!

unigram_fdist = create_lang_char_distribution(cnrom_df, 'original_fullname')
print(len(unigram_fdist))
# unigram_fdist

48


In [35]:
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', 'à'): 0,
 (' ', 'á'): 0,
 (' ', 'ã'): 0,
 (' ', 'é'): 0,
 (' ', 'ê'): 0,
 (' ', 'ì'): 0,
 (' ', 'í'): 0,
 (' ', 'ö'): 0,
 (' ', 'ù'): 0,
 (' ', 'ü'): 0,
 (' ', 'ā'): 0,
 (' ', 'đ'): 0,
 (' ', 'ē'): 0,
 (' ', 'ī'): 0,
 (' ', 'ū'): 0,
 (' ', 'ơ'): 0,
 (' ', 'ư'): 0,
 (' ', 'ǎ'): 0,
 (' ', 'ấ'): 0,
 (' ', 'ầ'): 0,
 (' ', 'ễ'): 0,
 ('a', ' '): 0,
 ('a', 'a'): 0,
 ('a', 'b'): 0,
 ('a', 'c'): 0,
 ('a', 'd'): 0,
 ('a', 'e'): 0,
 ('a', 'f'): 0,
 ('a', 'g'): 0,
 ('a', 'h'): 0,
 ('a', 'i'): 0,
 ('a', 'j'): 0,
 ('a', 'k'): 0,
 ('a', 'l'): 0,
 ('a', 'm'): 0,
 ('a', '

In [36]:
# Creating the bigrams frequency distribution for the entire Chinese rom language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, cnrom_df, 'bigrams')
bigram_fdist

{(' ', ' '): 1.019752608017295e-05,
 (' ', 'a'): 0.0009483699254560844,
 (' ', 'b'): 0.0030184677197311934,
 (' ', 'c'): 0.00843335406830303,
 (' ', 'd'): 0.0035589366019803596,
 (' ', 'e'): 0.00039770351712674503,
 (' ', 'f'): 0.003273405871735517,
 (' ', 'g'): 0.003303998449976036,
 (' ', 'h'): 0.00788268765997369,
 (' ', 'i'): 0.0001835554694431131,
 (' ', 'j'): 0.009238958628636693,
 (' ', 'k'): 0.003660911862782089,
 (' ', 'l'): 0.008586316959505624,
 (' ', 'm'): 0.003997430223427797,
 (' ', 'n'): 0.0014582462294647318,
 (' ', 'o'): 0.00011217278688190245,
 (' ', 'p'): 0.0021822705811570114,
 (' ', 'q'): 0.0031408380326932686,
 (' ', 'r'): 0.0017539744857897473,
 (' ', 's'): 0.007015897943158989,
 (' ', 't'): 0.004150393114630391,
 (' ', 'u'): 4.07901043206918e-05,
 (' ', 'v'): 6.11851564810377e-05,
 (' ', 'w'): 0.0064448364826693045,
 (' ', 'x'): 0.00898402047663237,
 (' ', 'y'): 0.01400120330807746,
 (' ', 'z'): 0.008372168911821992,
 (' ', 'à'): 0.0,
 (' ', 'á'): 0.0,
 (' ', 'ã

In [37]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
# initialized_unigrams

In [38]:
# UNIGRAMS individual frequency distributions
cnrom_df['indiv_unigrams_fdist'] = cnrom_df['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

print(cnrom_df.iloc[0]['indiv_unigrams_fdist'])
cnrom_df.head()


{' ': 0.1, 'a': 0.2, 'b': 0, 'c': 0, 'd': 0, 'e': 0, 'f': 0, 'g': 0, 'h': 0, 'i': 0, 'j': 0, 'k': 0.2, 'l': 0, 'm': 0, 'n': 0.1, 'o': 0.2, 'p': 0, 'q': 0, 'r': 0.1, 's': 0, 't': 0, 'u': 0, 'v': 0, 'w': 0.1, 'x': 0, 'y': 0, 'z': 0, 'à': 0, 'á': 0, 'ã': 0, 'é': 0, 'ê': 0, 'ì': 0, 'í': 0, 'ö': 0, 'ù': 0, 'ü': 0, 'ā': 0, 'đ': 0, 'ē': 0, 'ī': 0, 'ū': 0, 'ơ': 0, 'ư': 0, 'ǎ': 0, 'ấ': 0, 'ầ': 0, 'ễ': 0}


Unnamed: 0,original_fullname,Family name,Given Name,Notes,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist
0,aaron kwok,,,,aaron kwok,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10,2,"[(a,), (a,), (r,), (o,), (n,), ( ,), (k,), (w,...","[a, a, r, o, n, , k, w, o, k]","[(a, a), (a, r), (r, o), (o, n), (n, ), ( , k...","[(a, a, r), (a, r, o), (r, o, n), (o, n, ), (...",0,0,1,0,4.5,"{' ': 0.1, 'a': 0.2, 'b': 0, 'c': 0, 'd': 0, '..."
1,adhe tapontsang,,,,adhe tapontsang,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15,2,"[(a,), (d,), (h,), (e,), ( ,), (t,), (a,), (p,...","[a, d, h, e, , t, a, p, o, n, t, s, a, n, g]","[(a, d), (d, h), (h, e), (e, ), ( , t), (t, a...","[(a, d, h), (d, h, e), (h, e, ), (e, , t), (...",0,0,1,0,7.0,"{' ': 0.06666666666666667, 'a': 0.2, 'b': 0, '..."
2,ai baojun,,,,ai baojun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9,2,"[(a,), (i,), ( ,), (b,), (a,), (o,), (j,), (u,...","[a, i, , b, a, o, j, u, n]","[(a, i), (i, ), ( , b), (b, a), (a, o), (o, j...","[(a, i, ), (i, , b), ( , b, a), (b, a, o), (...",0,0,1,0,4.0,"{' ': 0.1111111111111111, 'a': 0.2222222222222..."
3,ai guoxiang,,,,ai guoxiang,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,2,"[(a,), (i,), ( ,), (g,), (u,), (o,), (x,), (i,...","[a, i, , g, u, o, x, i, a, n, g]","[(a, i), (i, ), ( , g), (g, u), (u, o), (o, x...","[(a, i, ), (i, , g), ( , g, u), (g, u, o), (...",0,0,1,0,5.0,"{' ': 0.09090909090909091, 'a': 0.181818181818..."
4,ai husheng,,,,ai husheng,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(a,), (i,), ( ,), (h,), (u,), (s,), (h,), (e,...","[a, i, , h, u, s, h, e, n, g]","[(a, i), (i, ), ( , h), (h, u), (u, s), (s, h...","[(a, i, ), (i, , h), ( , h, u), (h, u, s), (...",0,0,1,0,4.5,"{' ': 0.1, 'a': 0.1, 'b': 0, 'c': 0, 'd': 0, '..."


In [39]:
# BIGRAMS individual frequency distributions
cnrom_df['indiv_bigrams_fdist'] = cnrom_df['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))
print(cnrom_df.iloc[0]['indiv_bigrams_fdist'][('a', 'a')])
print(1 / len(cnrom_df.iloc[0]['bigrams']))

0.1111111111111111
0.1111111111111111


In [40]:
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
cnrom_df['indiv_unigrams_fdist'] = cnrom_df['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)
# Calculating cosine similarity (UNIGRAMS)
cnrom_df['unigrams_cosine_sim'] = cnrom_df['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])
# viet_df

In [41]:
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
cnrom_df['indiv_bigrams_fdist'] = cnrom_df['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

# Calculating cosine similarity (BIGRAMS)
cnrom_df['bigrams_cosine_sim'] = cnrom_df['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])
# cnrom_df

In [42]:
cnrom_df.drop(columns=["Family name", "Given Name", "Notes"], inplace=True)
cnrom_df

Unnamed: 0,original_fullname,transliteration,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,aaron kwok,aaron kwok,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10,2,"[(a,), (a,), (r,), (o,), (n,), ( ,), (k,), (w,...","[a, a, r, o, n, , k, w, o, k]","[(a, a), (a, r), (r, o), (o, n), (n, ), ( , k...","[(a, a, r), (a, r, o), (r, o, n), (o, n, ), (...",0,0,1,0,4.5,"[[0.1, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.520125,0.110000
1,adhe tapontsang,adhe tapontsang,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15,2,"[(a,), (d,), (h,), (e,), ( ,), (t,), (a,), (p,...","[a, d, h, e, , t, a, p, o, n, t, s, a, n, g]","[(a, d), (d, h), (h, e), (e, ), ( , t), (t, a...","[(a, d, h), (d, h, e), (h, e, ), (e, , t), (...",0,0,1,0,7.0,"[[0.06666666666666667, 0.2, 0.0, 0.0, 0.066666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.711498,0.366279
2,ai baojun,ai baojun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9,2,"[(a,), (i,), ( ,), (b,), (a,), (o,), (j,), (u,...","[a, i, , b, a, o, j, u, n]","[(a, i), (i, ), ( , b), (b, a), (a, o), (o, j...","[(a, i, ), (i, , b), ( , b, a), (b, a, o), (...",0,0,1,0,4.0,"[[0.1111111111111111, 0.2222222222222222, 0.11...","[[0.0, 0.0, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",0.742319,0.149964
3,ai guoxiang,ai guoxiang,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,2,"[(a,), (i,), ( ,), (g,), (u,), (o,), (x,), (i,...","[a, i, , g, u, o, x, i, a, n, g]","[(a, i), (i, ), ( , g), (g, u), (u, o), (o, x...","[(a, i, ), (i, , g), ( , g, u), (g, u, o), (...",0,0,1,0,5.0,"[[0.09090909090909091, 0.18181818181818182, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0,...",0.827701,0.474799
4,ai husheng,ai husheng,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(a,), (i,), ( ,), (h,), (u,), (s,), (h,), (e,...","[a, i, , h, u, s, h, e, n, g]","[(a, i), (i, ), ( , h), (h, u), (u, s), (s, h...","[(a, i, ), (i, , h), ( , h, u), (h, u, s), (...",0,0,1,0,4.5,"[[0.1, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 0.1, 0.2,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11...",0.833236,0.425660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10473,zu xiaosun,zu xiaosun,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), ( ,), (x,), (i,), (a,), (o,), (s,...","[z, u, , x, i, a, o, s, u, n]","[(z, u), (u, ), ( , x), (x, i), (i, a), (a, o...","[(z, u, ), (u, , x), ( , x, i), (x, i, a), (...",0,0,1,0,4.5,"[[0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.716897,0.231276
10474,zuo shusheng,zuo shusheng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (s,), (h,), (u,), (s,...","[z, u, o, , s, h, u, s, h, e, n, g]","[(z, u), (u, o), (o, ), ( , s), (s, h), (h, u...","[(z, u, o), (u, o, ), (o, , s), ( , s, h), (...",0,0,1,0,5.5,"[[0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.643322,0.357847
10475,zuo xiaoqing,zuo xiaoqing,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12,2,"[(z,), (u,), (o,), ( ,), (x,), (i,), (a,), (o,...","[z, u, o, , x, i, a, o, q, i, n, g]","[(z, u), (u, o), (o, ), ( , x), (x, i), (i, a...","[(z, u, o), (u, o, ), (o, , x), ( , x, i), (...",0,0,1,0,5.5,"[[0.08333333333333333, 0.08333333333333333, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.760418,0.410750
10476,zuo yiteng,zuo yiteng,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",10,2,"[(z,), (u,), (o,), ( ,), (y,), (i,), (t,), (e,...","[z, u, o, , y, i, t, e, n, g]","[(z, u), (u, o), (o, ), ( , y), (y, i), (i, t...","[(z, u, o), (u, o, ), (o, , y), ( , y, i), (...",0,0,1,0,4.5,"[[0.1, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.1, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.760018,0.341783


EXGR_Vietnamese.xlsx - CLEANING
===

In [43]:
viet_df.drop(columns = ['Unnamed: 0', 'fullname','Family name','Given name'], inplace = True)
viet_df = viet_df.rename(columns={'id': 'fullname'})
viet_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# viet_df
viet_df['alphabet'] = viet_df['fullname'].apply(langname)
# viet_df
viet_df['word_length'] = viet_df['fullname'].apply(get_name_length)
# viet_df
viet_df['num_tokens'] = viet_df['fullname'].apply(calculate_token_length)
# viet_df
viet_df['char_ngrams'] = viet_df['fullname'].apply(generate_char_ngrams)
# viet_df
viet_df['period_freq'] = viet_df['fullname'].apply(lambda name: name.count('.'))
viet_df['dash_freq'] = viet_df['fullname'].apply(lambda name: name.count('-'))
viet_df['space_freq'] = viet_df['fullname'].apply(lambda name: name.count(' '))
viet_df['apostrophe_freq'] = viet_df['fullname'].apply(lambda name: name.count('\''))
viet_df['transliteration'] = viet_df['fullname'].apply(lambda name: unidecode(name))
viet_df['transliteration'] = viet_df['transliteration'].str.lower()
viet_df['fullname'] = viet_df['fullname'].str.lower()
viet_df[viet_df['fullname'] != viet_df['transliteration']][['fullname', 'transliteration']]
# unique_chars = set("".join(viet_df['transliteration']))
# unique_chars_list = sorted(list(unique_chars))
# print(unique_chars_list)
characters_to_check = ["'", '(', ')', ',', '-', '.', '1', '2', '5', '7', '8', '9']
pattern = '|'.join(map(re.escape, characters_to_check))
viet_df = viet_df[~viet_df['transliteration'].str.contains(pattern)]
viet_df = viet_df.reset_index(drop=True)
unique_chars = set("".join(viet_df['transliteration']))
unique_chars_list = sorted(list(unique_chars))
print(unique_chars_list)

#NOTE: DOING FOR TRANSLITERATION BC THERE'S TOO MANY ACCENTED CHARS. IN VIET LANGUAGE
#I changed to fullname - now that we know we're getting rid of trigrams

viet_df['unigrams'] = viet_df['fullname'].apply(lambda name: list(name))
viet_df['bigrams'] = viet_df['fullname'].apply(lambda name: list(ngrams(list(name), 2)))
viet_df['trigrams'] = viet_df['fullname'].apply(lambda name: list(ngrams(list(name), 3)))
# viet_df

# 6. avg_token_length
tokens = viet_df['fullname'].apply(lambda name: name.split(' '))
# print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
# print(token_lengths[-5:])
viet_df['avg_token_length'] = token_lengths.apply(np.mean)
viet_df

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration,unigrams,bigrams,trigrams,avg_token_length
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(T,), (ừ,), ( ,), (H,), (o,), (à,), (n,), (g,...",0,0,2,0,tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...",4.000000
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(N,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (T,...",0,0,3,0,nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...",4.750000
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(N,), (i,), (c,), (k,), ( ,), (Ú,), (t,), (N,...",0,0,1,0,nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...",3.000000
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(C,), (a,), (o,), ( ,), (V,), (ă,), (n,), ( ,...",0,0,2,0,cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...",3.000000
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(T,), (ạ,), ( ,), (T,), (h,), (u,), ( ,), (T,...",0,0,2,0,ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...",3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (D,...",0,0,2,0,nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...",4.333333
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (P,...",0,0,2,0,nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...",4.666667
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(L,), (ê,), ( ,), (P,), (h,), (ổ,), (L, ê), (...",0,0,1,0,le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]",2.500000
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(V,), (u,), ( ,), (N,), (g,), (o,), (c,), ( ,...",0,0,2,0,vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...",3.000000


In [44]:
# def create_character_frequency_hashmap(df, names_col):
#     char_freqs = {}
#     for name in df[names_col]:
#         for char in name:
#             if char not in char_freqs.keys():
#                 char_freqs[char] = 1
#             else:
#                 char_freqs[char] += 1
#     return char_freqs

In [45]:
# viet_uni = create_character_frequency_hashmap(viet_df, 'unigrams')
# print(viet_uni)

In [46]:
# viet_bi = create_character_frequency_hashmap(viet_df, 'bigrams')
# print(viet_bi)

In [47]:
# viet_tri = create_character_frequency_hashmap(viet_df, 'trigrams')
# print(viet_tri)

In [48]:
# viet_unifreqdist = nltk.FreqDist(viet_uni).most_common(30)
# # viet_unifreqdist

In [49]:
# viet_unifd = pd.DataFrame.from_dict(viet_unifreqdist)
# viet_unifd.rename(columns={0: 'unigram', 1: 'count'}, inplace=True)
# viet_unifd.plot(x="unigram", y="count", kind="bar") 

In [50]:
# viet_bifreqdist = nltk.FreqDist(viet_bi).most_common(30)
# viet_bifd = pd.DataFrame.from_dict(viet_bifreqdist)
# viet_bifd.rename(columns={0: 'bigram', 1: 'count'}, inplace=True)
# viet_bifd.plot(x="bigram", y="count", kind="bar") 

In [51]:
# viet_trifreqdist = nltk.FreqDist(viet_tri).most_common(30)
# viet_trifd = pd.DataFrame.from_dict(viet_trifreqdist)
# viet_trifd.rename(columns={0: 'trigram', 1: 'count'}, inplace=True)
# viet_trifd.plot(x="trigram", y="count", kind="bar") 

In [52]:
#todo: do it with relative frequency out of the whole dataset

In [53]:
#How many total unigrams/bigrams/trigrams across the whole dataset? - viet_df

In [54]:
# all_unigrams = [unigram for row in viet_df['unigrams'] for unigram in row]
# total_unigrams = len(all_unigrams)
# print(f"Total number of unigrams (viet_df): {total_unigrams}")

In [55]:
# all_bigrams = [bigram for row in viet_df['bigrams'] for bigram in row]
# total_bigrams = len(all_bigrams)
# print(f"Total number of bigrams (viet_df): {total_bigrams}")

In [56]:
# all_trigrams = [trigram for row in viet_df['trigrams'] for trigram in row]
# total_trigrams = len(all_trigrams)
# print(f"Total number of trigrams (viet_df): {total_trigrams}")

In [57]:
# def create_gram_frequency_hashmap(df, col_name):
#     gram_freqs = {}
#     total_grams = 0

#     for grams_list in df[col_name]:
#         for gram in grams_list:
#             if len(gram) == 1 or len(gram) == 2 or len(gram) == 3:
#                 if gram not in gram_freqs.keys():
#                     gram_freqs[gram] = 1
#                 else:
#                     gram_freqs[gram] += 1
#                 total_grams += 1

#     gram_freqs_relative = {gram: count / total_grams for gram, count in gram_freqs.items()}

#     return gram_freqs_relative

# viet_unifreqdist_relative = create_gram_frequency_hashmap(viet_df, 'unigrams')
# viet_bifreqdist_relative = create_gram_frequency_hashmap(viet_df, 'bigrams')
# viet_trifreqdist_relative = create_gram_frequency_hashmap(viet_df, 'trigrams')

# def plot_gram_frequency(gram_freq_dist, title, total_grams):
#     gram_freq_dist_sorted = sorted(gram_freq_dist.items(), key=lambda x: x[1], reverse=True)
#     gram_df_relative = pd.DataFrame(gram_freq_dist_sorted, columns=['gram', 'relative_frequency'])
#     top_30_gram_df_relative = gram_df_relative[:30]
#     ax = top_30_gram_df_relative.set_index('gram').plot(kind="bar", title=title)
#     ax.legend([f"Total number of {title.split()[0].capitalize()} ({total_grams})"])
#     plt.xlabel('Gram')
#     plt.ylabel('Relative Frequency')
#     plt.show()

# total_unigrams = 32024
# total_bigrams = 29654
# total_trigrams = 27284

# plot_gram_frequency(viet_unifreqdist_relative, 'Unigrams Relative Frequency', total_unigrams)
# plot_gram_frequency(viet_bifreqdist_relative, 'Bigrams Relative Frequency', total_bigrams)
# plot_gram_frequency(viet_trifreqdist_relative, 'Trigrams Relative Frequency', total_trigrams)

NEW FREQUENCY DISTRIBUTION + COSINE SIMILARITY WORK
---

In [58]:
unigram_fdist = create_lang_char_distribution(viet_df, 'fullname')
print(len(unigram_fdist))
# unigram_fdist

92


In [59]:
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', 'à'): 0,
 (' ', 'á'): 0,
 (' ', 'â'): 0,
 (' ', 'ã'): 0,
 (' ', 'ç'): 0,
 (' ', 'é'): 0,
 (' ', 'ê'): 0,
 (' ', 'ì'): 0,
 (' ', 'í'): 0,
 (' ', 'ð'): 0,
 (' ', 'ñ'): 0,
 (' ', 'ò'): 0,
 (' ', 'ó'): 0,
 (' ', 'ô'): 0,
 (' ', 'õ'): 0,
 (' ', 'ù'): 0,
 (' ', 'ú'): 0,
 (' ', 'ý'): 0,
 (' ', 'ă'): 0,
 (' ', 'ć'): 0,
 (' ', 'đ'): 0,
 (' ', 'ĩ'): 0,
 (' ', 'ũ'): 0,
 (' ', 'ơ'): 0,
 (' ', 'ư'): 0,
 (' ', 'ǹ'): 0,
 (' ', 'ạ'): 0,
 (' ', 'ả'): 0,
 (' ', 'ấ'): 0,
 (' ', 'ầ'): 0,
 (' ', 'ẩ'): 0,
 (' ', 'ẫ'): 0,
 (' ', 'ậ'): 0,
 (' ', 'ắ'): 0,
 (' ', 'ằ'): 0,
 (' ', '

In [60]:
# Creating the bigrams frequency distribution for the entire Viet language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, viet_df, 'bigrams')
bigram_fdist

{(' ', ' '): 0.0,
 (' ', 'a'): 0.0028203205883402947,
 (' ', 'b'): 0.004212630752204491,
 (' ', 'c'): 0.007604155510335225,
 (' ', 'd'): 0.007247152904216201,
 (' ', 'e'): 0.00017850130305951234,
 (' ', 'f'): 0.0,
 (' ', 'g'): 0.0013923101638641962,
 (' ', 'h'): 0.01717182535432509,
 (' ', 'i'): 0.00017850130305951234,
 (' ', 'j'): 0.0,
 (' ', 'k'): 0.006747349255649566,
 (' ', 'l'): 0.008282460461961372,
 (' ', 'm'): 0.006069044304023419,
 (' ', 'n'): 0.01199528756559923,
 (' ', 'o'): 0.0002142015636714148,
 (' ', 'p'): 0.0072114526436042985,
 (' ', 'q'): 0.006390346649530541,
 (' ', 'r'): 0.00017850130305951234,
 (' ', 's'): 0.0033201242369069292,
 (' ', 't'): 0.03716397129699047,
 (' ', 'u'): 0.0002142015636714148,
 (' ', 'v'): 0.015529613366177574,
 (' ', 'w'): 0.0,
 (' ', 'x'): 0.0026418192852807826,
 (' ', 'y'): 0.0004641033879547321,
 (' ', 'z'): 0.0,
 (' ', 'à'): 0.0,
 (' ', 'á'): 0.0002499018242833173,
 (' ', 'â'): 3.570026061190247e-05,
 (' ', 'ã'): 0.0,
 (' ', 'ç'): 0.0,
 ('

In [61]:
# Now: Individual Relative Frequency Distributions

In [62]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
# initialized_unigrams

In [63]:
# UNIGRAMS individual frequency distributions
viet_df['indiv_unigrams_fdist'] = viet_df['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))
# checking that the functin worked for our first example, 'supriyadi'
print(viet_df.iloc[0]['indiv_unigrams_fdist'])
viet_df.tail()
#tu hoàng thông

{' ': 0.14285714285714285, 'a': 0, 'b': 0, 'c': 0, 'd': 0, 'e': 0, 'f': 0, 'g': 0.14285714285714285, 'h': 0.14285714285714285, 'i': 0, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 0.14285714285714285, 'o': 0.07142857142857142, 'p': 0, 'q': 0, 'r': 0, 's': 0, 't': 0.14285714285714285, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0, 'à': 0.07142857142857142, 'á': 0, 'â': 0, 'ã': 0, 'ç': 0, 'é': 0, 'ê': 0, 'ì': 0, 'í': 0, 'ð': 0, 'ñ': 0, 'ò': 0, 'ó': 0, 'ô': 0.07142857142857142, 'õ': 0, 'ù': 0, 'ú': 0, 'ý': 0, 'ă': 0, 'ć': 0, 'đ': 0, 'ĩ': 0, 'ũ': 0, 'ơ': 0, 'ư': 0, 'ǹ': 0, 'ạ': 0, 'ả': 0, 'ấ': 0, 'ầ': 0, 'ẩ': 0, 'ẫ': 0, 'ậ': 0, 'ắ': 0, 'ằ': 0, 'ặ': 0, 'ế': 0, 'ề': 0, 'ể': 0, 'ễ': 0, 'ệ': 0, 'ỉ': 0, 'ị': 0, 'ọ': 0, 'ỏ': 0, 'ố': 0, 'ồ': 0, 'ổ': 0, 'ỗ': 0, 'ộ': 0, 'ớ': 0, 'ờ': 0, 'ở': 0, 'ợ': 0, 'ụ': 0, 'ủ': 0, 'ứ': 0, 'ừ': 0.07142857142857142, 'ử': 0, 'ữ': 0, 'ự': 0, 'ỳ': 0, 'ỵ': 0, 'ỷ': 0, 'ỹ': 0}


Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration,unigrams,bigrams,trigrams,avg_token_length,indiv_unigrams_fdist
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (D,...",0,0,2,0,nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...",4.333333,"{' ': 0.13333333333333333, 'a': 0, 'b': 0, 'c'..."
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (P,...",0,0,2,0,nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...",4.666667,"{' ': 0.125, 'a': 0.0625, 'b': 0, 'c': 0.0625,..."
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(L,), (ê,), ( ,), (P,), (h,), (ổ,), (L, ê), (...",0,0,1,0,le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]",2.5,"{' ': 0.16666666666666666, 'a': 0, 'b': 0, 'c'..."
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(V,), (u,), ( ,), (N,), (g,), (o,), (c,), ( ,...",0,0,2,0,vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...",3.0,"{' ': 0.18181818181818182, 'a': 0.090909090909..."
2289,hoang ke viem,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",13,3,"[(H,), (o,), (a,), (n,), (g,), ( ,), (K,), (e,...",0,0,2,0,hoang ke viem,"[h, o, a, n, g, , k, e, , v, i, e, m]","[(h, o), (o, a), (a, n), (n, g), (g, ), ( , k...","[(h, o, a), (o, a, n), (a, n, g), (n, g, ), (...",3.666667,"{' ': 0.15384615384615385, 'a': 0.076923076923..."


In [64]:
# BIGRAMS individual frequency distributions
viet_df['indiv_bigrams_fdist'] = viet_df['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))
print(viet_df.iloc[0]['indiv_bigrams_fdist'][('t', 'ừ')])
print(1 / len(viet_df.iloc[0]['bigrams']))

0.07692307692307693
0.07692307692307693


In [65]:
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
viet_df['indiv_unigrams_fdist'] = viet_df['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

# Calculating cosine similarity (UNIGRAMS)
viet_df['unigrams_cosine_sim'] = viet_df['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])
# viet_df

In [66]:
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
viet_df['indiv_bigrams_fdist'] = viet_df['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

# Calculating cosine similarity (BIGRAMS)
viet_df['bigrams_cosine_sim'] = viet_df['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])
viet_df

Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration,unigrams,bigrams,trigrams,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(T,), (ừ,), ( ,), (H,), (o,), (à,), (n,), (g,...",0,0,2,0,tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...",4.000000,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07...",0.805625,0.508198
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(N,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (T,...",0,0,3,0,nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...",4.750000,"[[0.13636363636363635, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.884792,0.667716
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(N,), (i,), (c,), (k,), ( ,), (Ú,), (t,), (N,...",0,0,1,0,nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...",3.000000,"[[0.14285714285714285, 0.0, 0.0, 0.14285714285...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.592690,0.005600
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(C,), (a,), (o,), ( ,), (V,), (ă,), (n,), ( ,...",0,0,2,0,cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...",3.000000,"[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665965,0.243176
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(T,), (ạ,), ( ,), (T,), (h,), (u,), ( ,), (T,...",0,0,2,0,ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...",3.000000,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.596114,0.288942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (D,...",0,0,2,0,nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...",4.333333,"[[0.13333333333333333, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.07142857142857142, 0.0...",0.765422,0.440203
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (P,...",0,0,2,0,nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...",4.666667,"[[0.125, 0.0625, 0.0, 0.0625, 0.0, 0.0625, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.901822,0.607800
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(L,), (ê,), ( ,), (P,), (h,), (ổ,), (L, ê), (...",0,0,1,0,le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]",2.500000,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.428565,0.112297
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(V,), (u,), ( ,), (N,), (g,), (o,), (c,), ( ,...",0,0,2,0,vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...",3.000000,"[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.890835,0.339892


In [67]:
#(TRIGRAMS STUFF THAT IS NOT TO BE INCLUDED ANYMORE BECAUSE TOO MUCH DATA.)

# # Finding all possible transliterated characters for trigrams
# all_possible_chars_translit = create_lang_char_distribution(viet_df, 'fullname').keys()
# print('# unique characters with transliteration:', len(all_possible_chars_translit))

# # Creating all possible trigrams from transliterated characters
# initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
# print('Length of trigrams fdist:', len(initialized_trigrams))

# # Changing trigrams column to become transliterated
# viet_df['trigrams'] = viet_df['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# # Creating the trigrams frequency distribution for the entire Malay language
# trigram_fdist = create_lang_gram_distribution(initialized_trigrams, viet_df, 'trigrams')

# viet_df
# viet_df['indiv_trigrams_fdist'] = viet_df['trigrams'].apply(lambda entry: initialized_trigrams.copy())

# '''
# Function to be applied to each row of a DataFrame. Sets and returns a hashmap of the relative trigrams frequency distribution for the current example.

# trigrams_list: the list of trigrams for this current example.
# init_trigrams: a hashmap of all possible trigrams as the keys and all values set to 0.
# '''
# def set_indiv_trigram_dist(trigrams_list, init_trigrams):
#     trigrams_fdist_relative = init_trigrams
#     num_grams = len(trigrams_list)

#     for gram in trigrams_list:
#         trigrams_fdist_relative[gram] += 1 / num_grams

#     return trigrams_fdist_relative
# viet_df['indiv_trigrams_fdist'] = viet_df.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

# print(viet_df.loc[0, 'indiv_trigrams_fdist'][('t', 'u', ' ')])
# print(1 / len(viet_df.loc[0, 'trigrams'])) # manual calculation

# # # Checking 1st example
# # print(viet_df.loc[1, 'fullname'])
# # print(viet_df.loc[1, 'indiv_trigrams_fdist'][('s', 'i', 'h')])
# # print(1 / len(df_indo.loc[1, 'trigrams'])) # manual calculation

# # This cell cannot be run more than once!
# # Converting fdists to numpy arrays first so we can pass them into cosine_similarity
# viet_df['indiv_trigrams_fdist'] = viet_df['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
# trigram_fdist = np.fromiter(trigram_fdist.values(), dtype = float).reshape(1, -1)

# # Calculating cosine similarity
# viet_df['trigrams_cosine_sim'] = viet_df['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
# viet_df.head()

In [68]:
# cnchar_df.to_csv('cnchar_df.csv', index=False)
# cnrom_df.to_csv('cnrom_df.csv', index=False)
# viet_df.to_csv('viet_df.csv', index=False)
cnchar_df.to_pickle('pickled_dataframes/cnchar_df.pkl.gz', compression='gzip')
cnrom_df.to_pickle('pickled_dataframes/cnrom_df.pkl.gz', compression='gzip')
viet_df.to_pickle('pickled_dataframes/viet_df.pkl.gz', compression='gzip')

print('DONE')

DONE
