In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from alphabet_detector import AlphabetDetector
import unicodedata
from langdetect import detect
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
import pinyin
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity
import regex as re



nltk.download('stopwords')

cnchar_df = pd.read_excel('name_data/exigerData/EXGR_Chinese names(Character).xlsx')
cnrom_df = pd.read_excel("name_data/exigerData/EXGR_Chinese names(Romantized).xlsx")
viet_df = pd.read_excel("name_data/exigerData/EXGR_Vietnamese.xlsx")

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joliehuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#FUNCTIONS

#alphabet
def langname(name):
    if isinstance(name, str):
        result = []
        for char in name:
            try:
                char_name = unicodedata.name(char).split(' ')[0]
                result.append(char_name)
            except ValueError:
                # Handle characters without Unicode names
                result.append(char)
        return result
    else:
        return None  # Handle non-string or NaN values

#num_tokens (how many parts to the name?)
def calculate_token_length(name):
    if isinstance(name, str):
        # Check if the name contains Chinese characters
        if any('\u4e00' <= char <= '\u9fff' for char in name):
            # For Chinese names, count characters
            return len(name)
        else:
            # For non-Chinese names, split by spaces and count words
            return len(name.split())
    else:
        return None  # Handle non-string or NaN values

#char_ngrams
def generate_char_ngrams(text):
    if isinstance(text, str):
        # Tokenize the text into words
        words = text.split()
        
        # Create unigrams, bigrams, and trigrams
        unigrams = list(ngrams(text, 1))
        bigrams = list(ngrams(text, 2))
        trigrams = list(ngrams(text, 3))
        
        # Interpolated n-grams (combining unigrams, bigrams, and trigrams)
        interpolated_ngrams = unigrams + bigrams + trigrams
        
        return interpolated_ngrams
    else:
        return []

#name_length (length of the entire name string)
def get_name_length(fullname):
    if isinstance(fullname, str):
        return len(fullname)
    else:
        return np.nan 

#unigrams
def generate_char_unigrams(text):
    if isinstance(text, str):
        # Tokenize the text into characters
        characters = list(text)
        
        # Create unigrams
        unigrams = list(ngrams(characters, 1))
        
        return unigrams
    else:
        return []

#bigrams
def generate_char_bigrams(text):
    if isinstance(text, str):
        # Tokenize the text into characters
        characters = list(text)
        
        # Create unigrams
        bigrams = list(ngrams(characters, 2))
        
        return bigrams
    else:
        return []

#trigrams
def generate_char_trigrams(text):
    if isinstance(text, str):
        # Tokenize the text into characters
        characters = list(text)
        
        # Create unigrams
        trigrams = list(ngrams(characters, 3))
        
        return trigrams
    else:
        return []

# Function to transliterate a Chinese name
def transliterate_name(name):
    return pinyin.get(name, format="strip", delimiter=" ")


EXGR_Chinese names(Character).xlsx - CLEANING
===

In [3]:
pd.set_option('display.max_columns', None)
# cnchar_df

In [4]:
duplicates = cnchar_df['fullname'].duplicated()
# print(cnchar_df[duplicates])

In [5]:
cnchar_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# cnchar_df

In [6]:
# null_values = cnchar_df['fullname'].isnull()
# print(cnchar_df[null_values])
# NO NULL VALUES

DATA EXPLORATION

In [7]:
# cnchar_df['Notes'].unique()

**Maybe notes saying that it's a minority name/non-native name should be counted? Drop all the ones that say pseudonym/stage name? Maybe we can afford to drop certain examples of names, since we have so many of them...?

In [8]:
drop_notes = ['minority name', 'non-native', 'pseudonym', 'stage name',
       'should be侯赛因.江；minority name',
       '劉-family name of her husband;吳-her family name', 'satage name',
       '周-family name of her husband;梁-her family name', 'fake name',
       'pen name', '方-family name of her husband; 黄-her family name',
       '曹-family name of her husband;王-her family name',
       '朱-family name of her husband;李-her family name',
       '朱-family name of her husband;葉-her family name',
       '林-family name of her husband;鄭-her family name',
       '梁-family name of her husband;劉- her family name ',
       '梁-family name of her husband;高- her family name',
       'should be 泰迪.羅賓；non-native',
       '罗-family name of her husband;范-her family name',
       '范-family name of her husband;徐-her family name',
       '葉-family name of her husband；劉-her family name',
       '蘇-family name of her husband;周- her family name', 'buddhist',
       'family name should be in Chinese character 周',
       '陈-family name of her husband; 冯-her family name',
       '陳-family name of her husband;方-her family name',
       '高-family name of her husband;金-her family name',
       '黃-family name of her husband;馬-her family name']

cnchar_df = cnchar_df[~cnchar_df['Notes'].isin(drop_notes)]
# cnchar_df

In [9]:
# cnchar_df['Notes'].unique()

In [10]:
# cnchar_df['Unnamed: 6'].unique()

In [11]:
cnchar_df.drop(columns = ['Unnamed: 0', 'id', 'Family name', 'Given Name', 'Notes' ,'Unnamed: 6'], inplace = True)
# cnchar_df

Feature Engineering (Chinese Character)

In [12]:
cnchar_df['alphabet'] = cnchar_df['fullname'].apply(langname)
# cnchar_df

In [13]:
cnchar_df['word_length'] = cnchar_df['fullname'].apply(get_name_length)
# cnchar_df

In [14]:
cnchar_df['num_tokens'] = cnchar_df['fullname'].apply(calculate_token_length)
# cnchar_df

In [15]:
cnchar_df['char_ngrams'] = cnchar_df['fullname'].apply(generate_char_ngrams)
# cnchar_df

In [16]:
cnchar_df['unigrams'] = cnchar_df['fullname'].apply(generate_char_unigrams)
cnchar_df['bigrams'] = cnchar_df['fullname'].apply(generate_char_bigrams)
cnchar_df['trigrams'] = cnchar_df['fullname'].apply(generate_char_trigrams)
# cnchar_df

In [17]:
cnchar_df['period_freq'] = cnchar_df['fullname'].apply(lambda name: name.count('.'))
cnchar_df['dash_freq'] = cnchar_df['fullname'].apply(lambda name: name.count('-'))
cnchar_df['space_freq'] = cnchar_df['fullname'].apply(lambda name: name.count(' '))
cnchar_df['apostrophe_freq'] = cnchar_df['fullname'].apply(lambda name: name.count('\''))
# cnchar_df

In [18]:
cnchar_df['transliteration'] = cnchar_df['fullname'].apply(transliterate_name)
cnchar_df

Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration
0,丁一平,"[CJK, CJK, CJK]",3,3,"[(丁,), (一,), (平,), (丁, 一), (一, 平), (丁, 一, 平)]","[(丁,), (一,), (平,)]","[(丁, 一), (一, 平)]","[(丁, 一, 平)]",0,0,0,0,ding yi ping
1,丁世雄,"[CJK, CJK, CJK]",3,3,"[(丁,), (世,), (雄,), (丁, 世), (世, 雄), (丁, 世, 雄)]","[(丁,), (世,), (雄,)]","[(丁, 世), (世, 雄)]","[(丁, 世, 雄)]",0,0,0,0,ding shi xiong
2,丁亦昕,"[CJK, CJK, CJK]",3,3,"[(丁,), (亦,), (昕,), (丁, 亦), (亦, 昕), (丁, 亦, 昕)]","[(丁,), (亦,), (昕,)]","[(丁, 亦), (亦, 昕)]","[(丁, 亦, 昕)]",0,0,0,0,ding yi xin
3,丁仲礼,"[CJK, CJK, CJK]",3,3,"[(丁,), (仲,), (礼,), (丁, 仲), (仲, 礼), (丁, 仲, 礼)]","[(丁,), (仲,), (礼,)]","[(丁, 仲), (仲, 礼)]","[(丁, 仲, 礼)]",0,0,0,0,ding zhong li
4,丁伟,"[CJK, CJK]",2,2,"[(丁,), (伟,), (丁, 伟)]","[(丁,), (伟,)]","[(丁, 伟)]",[],0,0,0,0,ding wei
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12038,龚翔宇,"[CJK, CJK, CJK]",3,3,"[(龚,), (翔,), (宇,), (龚, 翔), (翔, 宇), (龚, 翔, 宇)]","[(龚,), (翔,), (宇,)]","[(龚, 翔), (翔, 宇)]","[(龚, 翔, 宇)]",0,0,0,0,gong xiang yu
12039,龚育之,"[CJK, CJK, CJK]",3,3,"[(龚,), (育,), (之,), (龚, 育), (育, 之), (龚, 育, 之)]","[(龚,), (育,), (之,)]","[(龚, 育), (育, 之)]","[(龚, 育, 之)]",0,0,0,0,gong yu zhi
12040,龚蓓苾,"[CJK, CJK, CJK]",3,3,"[(龚,), (蓓,), (苾,), (龚, 蓓), (蓓, 苾), (龚, 蓓, 苾)]","[(龚,), (蓓,), (苾,)]","[(龚, 蓓), (蓓, 苾)]","[(龚, 蓓, 苾)]",0,0,0,0,gong bei bi
12041,龚贤永,"[CJK, CJK, CJK]",3,3,"[(龚,), (贤,), (永,), (龚, 贤), (贤, 永), (龚, 贤, 永)]","[(龚,), (贤,), (永,)]","[(龚, 贤), (贤, 永)]","[(龚, 贤, 永)]",0,0,0,0,gong xian yong


In [19]:
cnchar_df['t.alphabet'] = cnchar_df['transliteration'].apply(langname)
# cnchar_df
cnchar_df['t.word_length'] = cnchar_df['transliteration'].apply(get_name_length)
# cnchar_df
cnchar_df['t.num_tokens'] = cnchar_df['transliteration'].apply(calculate_token_length)
# cnchar_df
cnchar_df['t.char_ngrams'] = cnchar_df['transliteration'].apply(generate_char_ngrams)
# cnchar_df
cnchar_df['t.unigrams'] = cnchar_df['transliteration'].apply(generate_char_unigrams)
cnchar_df['t.bigrams'] = cnchar_df['transliteration'].apply(generate_char_bigrams)
cnchar_df['t.trigrams'] = cnchar_df['transliteration'].apply(generate_char_trigrams)
# cnchar_df
cnchar_df['t.period_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count('.'))
cnchar_df['t.dash_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count('-'))
cnchar_df['t.space_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count(' '))
cnchar_df['t.apostrophe_freq'] = cnchar_df['transliteration'].apply(lambda name: name.count('\''))
# cnchar_df
cnchar_df

Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration,t.alphabet,t.word_length,t.num_tokens,t.char_ngrams,t.unigrams,t.bigrams,t.trigrams,t.period_freq,t.dash_freq,t.space_freq,t.apostrophe_freq
0,丁一平,"[CJK, CJK, CJK]",3,3,"[(丁,), (一,), (平,), (丁, 一), (一, 平), (丁, 一, 平)]","[(丁,), (一,), (平,)]","[(丁, 一), (一, 平)]","[(丁, 一, 平)]",0,0,0,0,ding yi ping,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0
1,丁世雄,"[CJK, CJK, CJK]",3,3,"[(丁,), (世,), (雄,), (丁, 世), (世, 雄), (丁, 世, 雄)]","[(丁,), (世,), (雄,)]","[(丁, 世), (世, 雄)]","[(丁, 世, 雄)]",0,0,0,0,ding shi xiong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,0,2,0
2,丁亦昕,"[CJK, CJK, CJK]",3,3,"[(丁,), (亦,), (昕,), (丁, 亦), (亦, 昕), (丁, 亦, 昕)]","[(丁,), (亦,), (昕,)]","[(丁, 亦), (亦, 昕)]","[(丁, 亦, 昕)]",0,0,0,0,ding yi xin,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0
3,丁仲礼,"[CJK, CJK, CJK]",3,3,"[(丁,), (仲,), (礼,), (丁, 仲), (仲, 礼), (丁, 仲, 礼)]","[(丁,), (仲,), (礼,)]","[(丁, 仲), (仲, 礼)]","[(丁, 仲, 礼)]",0,0,0,0,ding zhong li,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,0,2,0
4,丁伟,"[CJK, CJK]",2,2,"[(丁,), (伟,), (丁, 伟)]","[(丁,), (伟,)]","[(丁, 伟)]",[],0,0,0,0,ding wei,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,)]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12038,龚翔宇,"[CJK, CJK, CJK]",3,3,"[(龚,), (翔,), (宇,), (龚, 翔), (翔, 宇), (龚, 翔, 宇)]","[(龚,), (翔,), (宇,)]","[(龚, 翔), (翔, 宇)]","[(龚, 翔, 宇)]",0,0,0,0,gong xiang yu,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",13,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0
12039,龚育之,"[CJK, CJK, CJK]",3,3,"[(龚,), (育,), (之,), (龚, 育), (育, 之), (龚, 育, 之)]","[(龚,), (育,), (之,)]","[(龚, 育), (育, 之)]","[(龚, 育, 之)]",0,0,0,0,gong yu zhi,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",11,3,"[(g,), (o,), (n,), (g,), ( ,), (y,), (u,), ( ,...","[(g,), (o,), (n,), (g,), ( ,), (y,), (u,), ( ,...","[(g, o), (o, n), (n, g), (g, ), ( , y), (y, u...","[(g, o, n), (o, n, g), (n, g, ), (g, , y), (...",0,0,2,0
12040,龚蓓苾,"[CJK, CJK, CJK]",3,3,"[(龚,), (蓓,), (苾,), (龚, 蓓), (蓓, 苾), (龚, 蓓, 苾)]","[(龚,), (蓓,), (苾,)]","[(龚, 蓓), (蓓, 苾)]","[(龚, 蓓, 苾)]",0,0,0,0,gong bei bi,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",11,3,"[(g,), (o,), (n,), (g,), ( ,), (b,), (e,), (i,...","[(g,), (o,), (n,), (g,), ( ,), (b,), (e,), (i,...","[(g, o), (o, n), (n, g), (g, ), ( , b), (b, e...","[(g, o, n), (o, n, g), (n, g, ), (g, , b), (...",0,0,2,0
12041,龚贤永,"[CJK, CJK, CJK]",3,3,"[(龚,), (贤,), (永,), (龚, 贤), (贤, 永), (龚, 贤, 永)]","[(龚,), (贤,), (永,)]","[(龚, 贤), (贤, 永)]","[(龚, 贤, 永)]",0,0,0,0,gong xian yong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",14,3,"[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[(g,), (o,), (n,), (g,), ( ,), (x,), (i,), (a,...","[(g, o), (o, n), (n, g), (g, ), ( , x), (x, i...","[(g, o, n), (o, n, g), (n, g, ), (g, , x), (...",0,0,2,0


EXGR_Chinese names(Romantized).xlsx - CLEANING
===

In [20]:
# cnrom_df

In [21]:
cnrom_df.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

In [22]:
cnrom_df['Notes'].unique()

array(['non-native', nan], dtype=object)

In [23]:
duplicates = cnrom_df['fullname'].duplicated()
# print(cnrom_df[duplicates])

In [24]:
cnrom_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# cnrom_df

In [25]:
#Get rid of all examples with 'non-native', to try and further narrow df down to just Chinese names
cnrom_df.drop(cnrom_df[cnrom_df['Notes'] == 'non-native'].index, inplace=True)
# cnrom_df

In [26]:
cnrom_df['Notes'].unique()

array([nan], dtype=object)

In [27]:
cnrom_df['alphabet'] = cnrom_df['fullname'].apply(langname)
# cnrom_df

In [28]:
cnrom_df['word_length'] = cnrom_df['fullname'].apply(get_name_length)
# cnrom_df

In [29]:
cnrom_df['num_tokens'] = cnrom_df['fullname'].apply(calculate_token_length)
# cnrom_df

In [30]:
cnrom_df['char_ngrams'] = cnrom_df['fullname'].apply(generate_char_ngrams)
# cnrom_df

In [31]:
cnrom_df['unigrams'] = cnrom_df['fullname'].apply(generate_char_unigrams)
cnrom_df['bigrams'] = cnrom_df['fullname'].apply(generate_char_bigrams)
cnrom_df['trigrams'] = cnrom_df['fullname'].apply(generate_char_trigrams)
# cnrom_df

In [32]:
# cnrom_df['period_freq'] = cnrom_df['fullname'].apply(lambda name: name.count('.'))
# cnrom_df['dash_freq'] = cnrom_df['fullname'].apply(lambda name: name.count('-'))
# cnrom_df['space_freq'] = cnrom_df['fullname'].apply(lambda name: name.count(' '))
# cnrom_df['apostrophe_freq'] = cnrom_df['fullname'].apply(lambda name: name.count('\''))
cnrom_df

Unnamed: 0,fullname,Family name,Given Name,Notes,alphabet,word_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams
2,Aaron Kwok,,,,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.0,2.0,"[(A,), (a,), (r,), (o,), (n,), ( ,), (K,), (w,...","[(A,), (a,), (r,), (o,), (n,), ( ,), (K,), (w,...","[(A, a), (a, r), (r, o), (o, n), (n, ), ( , K...","[(A, a, r), (a, r, o), (r, o, n), (o, n, ), (..."
9,Adhe Tapontsang,,,,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15.0,2.0,"[(A,), (d,), (h,), (e,), ( ,), (T,), (a,), (p,...","[(A,), (d,), (h,), (e,), ( ,), (T,), (a,), (p,...","[(A, d), (d, h), (h, e), (e, ), ( , T), (T, a...","[(A, d, h), (d, h, e), (h, e, ), (e, , T), (..."
10,Ai Baojun,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9.0,2.0,"[(A,), (i,), ( ,), (B,), (a,), (o,), (j,), (u,...","[(A,), (i,), ( ,), (B,), (a,), (o,), (j,), (u,...","[(A, i), (i, ), ( , B), (B, a), (a, o), (o, j...","[(A, i, ), (i, , B), ( , B, a), (B, a, o), (..."
11,Ai Guoxiang,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11.0,2.0,"[(A,), (i,), ( ,), (G,), (u,), (o,), (x,), (i,...","[(A,), (i,), ( ,), (G,), (u,), (o,), (x,), (i,...","[(A, i), (i, ), ( , G), (G, u), (u, o), (o, x...","[(A, i, ), (i, , G), ( , G, u), (G, u, o), (..."
12,Ai Husheng,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10.0,2.0,"[(A,), (i,), ( ,), (H,), (u,), (s,), (h,), (e,...","[(A,), (i,), ( ,), (H,), (u,), (s,), (h,), (e,...","[(A, i), (i, ), ( , H), (H, u), (u, s), (s, h...","[(A, i, ), (i, , H), ( , H, u), (H, u, s), (..."
...,...,...,...,...,...,...,...,...,...,...,...
12009,Zu Xiaosun,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10.0,2.0,"[(Z,), (u,), ( ,), (X,), (i,), (a,), (o,), (s,...","[(Z,), (u,), ( ,), (X,), (i,), (a,), (o,), (s,...","[(Z, u), (u, ), ( , X), (X, i), (i, a), (a, o...","[(Z, u, ), (u, , X), ( , X, i), (X, i, a), (..."
12010,Zuo Shusheng,,,,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12.0,2.0,"[(Z,), (u,), (o,), ( ,), (S,), (h,), (u,), (s,...","[(Z,), (u,), (o,), ( ,), (S,), (h,), (u,), (s,...","[(Z, u), (u, o), (o, ), ( , S), (S, h), (h, u...","[(Z, u, o), (u, o, ), (o, , S), ( , S, h), (..."
12011,Zuo Xiaoqing,,,,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12.0,2.0,"[(Z,), (u,), (o,), ( ,), (X,), (i,), (a,), (o,...","[(Z,), (u,), (o,), ( ,), (X,), (i,), (a,), (o,...","[(Z, u), (u, o), (o, ), ( , X), (X, i), (i, a...","[(Z, u, o), (u, o, ), (o, , X), ( , X, i), (..."
12012,Zuo Yiteng,,,,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",10.0,2.0,"[(Z,), (u,), (o,), ( ,), (Y,), (i,), (t,), (e,...","[(Z,), (u,), (o,), ( ,), (Y,), (i,), (t,), (e,...","[(Z, u), (u, o), (o, ), ( , Y), (Y, i), (i, t...","[(Z, u, o), (u, o, ), (o, , Y), ( , Y, i), (..."


EXGR_Vietnamese.xlsx - CLEANING
===

In [33]:
viet_df.drop(columns = ['Unnamed: 0', 'fullname','Family name','Given name'], inplace = True)
viet_df = viet_df.rename(columns={'id': 'fullname'})

In [34]:
viet_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# viet_df

In [35]:
viet_df['fullname'] = viet_df['fullname'].str.lower()
# viet_df

In [36]:
# # Assuming you have a DataFrame named viet_df
# unique_chars = set("".join(viet_df['transliteration']))

# # Convert the set of unique characters back to a sorted list for readability
# unique_chars_list = sorted(list(unique_chars))

# # Print the unique characters
# print(unique_chars_list)

In [37]:
viet_df['alphabet'] = viet_df['fullname'].apply(langname)
# viet_df

In [38]:
viet_df['word_length'] = viet_df['fullname'].apply(get_name_length)
# viet_df

In [39]:
viet_df['num_tokens'] = viet_df['fullname'].apply(calculate_token_length)
# viet_df

In [40]:
viet_df['char_ngrams'] = viet_df['fullname'].apply(generate_char_ngrams)
# viet_df

In [41]:
# viet_df['period_freq'] = viet_df['fullname'].apply(lambda name: name.count('.'))
# viet_df['dash_freq'] = viet_df['fullname'].apply(lambda name: name.count('-'))
# viet_df['space_freq'] = viet_df['fullname'].apply(lambda name: name.count(' '))
# viet_df['apostrophe_freq'] = viet_df['fullname'].apply(lambda name: name.count('\''))

In [42]:
viet_df['transliteration'] = viet_df['fullname'].apply(lambda name: unidecode(name))
viet_df[viet_df['fullname'] != viet_df['transliteration']][['fullname', 'transliteration']]

Unnamed: 0,fullname,transliteration
0,từ hoàng thông,tu hoang thong
1,nguyễn thị phương thảo,nguyen thi phuong thao
2,nguyễn mạnh dũng (cầu thủ bóng đá sinh 1981),nguyen manh dung (cau thu bong da sinh 1981)
3,nick út,nick ut
4,cao văn lầu,cao van lau
...,...,...
3173,bùi tiến dũng (thái bình),bui tien dung (thai binh)
3182,tòng thị phóng,tong thi phong
3183,lê hồng minh,le hong minh
3187,đinh thế huynh,dinh the huynh


In [43]:
# unique_chars = set("".join(viet_df['transliteration']))
# unique_chars_list = sorted(list(unique_chars))
# print(unique_chars_list)

In [44]:
characters_to_check = ["'", '(', ')', ',', '-', '.', '1', '2', '5', '7', '8', '9']
pattern = '|'.join(map(re.escape, characters_to_check))
viet_df = viet_df[~viet_df['transliteration'].str.contains(pattern)]
viet_df = viet_df.reset_index(drop=True)
unique_chars = set("".join(viet_df['transliteration']))
unique_chars_list = sorted(list(unique_chars))
print(unique_chars_list)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [45]:
#NOTE: DOING FOR TRANSLITERATION BC THERE'S TOO MANY ACCENTED CHARS. IN VIET LANGUAGE
#I changed to fullname - now that we know we're getting rid of trigrams

viet_df['unigrams'] = viet_df['fullname'].apply(lambda name: list(name))
viet_df['bigrams'] = viet_df['fullname'].apply(lambda name: list(ngrams(list(name), 2)))
viet_df['trigrams'] = viet_df['fullname'].apply(lambda name: list(ngrams(list(name), 3)))
# viet_df

In [46]:
viet_df.shape

(2290, 9)

In [47]:
# Assuming you have a DataFrame named viet_df
viet_df['transliteration'] = viet_df['transliteration'].str.lower()

In [48]:
def create_character_frequency_hashmap(df, names_col):
    char_freqs = {}
    for name in df[names_col]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
    return char_freqs

In [49]:
viet_uni = create_character_frequency_hashmap(viet_df, 'unigrams')
print(viet_uni)

{'t': 1609, 'ừ': 14, ' ': 4311, 'h': 2849, 'o': 747, 'à': 234, 'n': 4586, 'g': 1909, 'ô': 167, 'u': 1742, 'y': 856, 'ễ': 384, 'ị': 138, 'p': 468, 'ư': 256, 'ơ': 183, 'ả': 77, 'i': 1072, 'c': 709, 'k': 231, 'ú': 73, 'a': 1385, 'v': 568, 'ă': 219, 'l': 540, 'ầ': 162, 'ạ': 159, 'â': 165, 'd': 314, 'e': 490, 'đ': 260, 'ọ': 84, 'q': 196, 'b': 192, 'í': 55, 'r': 509, 'm': 565, 's': 147, 'ế': 99, 'ỳ': 38, 'w': 3, 'ù': 73, 'ự': 9, 'f': 12, 'ệ': 87, 'á': 120, 'é': 5, 'ũ': 62, 'ờ': 35, 'ề': 37, 'ồ': 59, 'ố': 39, 'ỗ': 25, 'ỹ': 17, 'ý': 43, 'ã': 10, 'ê': 255, 'x': 96, 'ặ': 29, 'ứ': 56, 'ủ': 5, 'ổ': 6, 'ằ': 15, 'ể': 12, 'ấ': 57, 'ắ': 28, 'ậ': 43, 'ì': 56, 'ò': 16, 'ợ': 20, 'ð': 5, 'z': 4, 'ĩ': 24, 'j': 15, 'ử': 16, 'ữ': 34, 'ộ': 19, 'ỷ': 3, 'ớ': 12, 'ụ': 27, 'õ': 21, 'ẩ': 10, 'ó': 3, 'ć': 1, 'ỵ': 3, 'ẫ': 2, 'ỉ': 3, 'ở': 3, 'ǹ': 1, 'ỏ': 1, 'ç': 1, 'ñ': 1}


In [50]:
viet_bi = create_character_frequency_hashmap(viet_df, 'bigrams')
print(viet_bi)

{('t', 'ừ'): 10, ('ừ', ' '): 4, (' ', 'h'): 481, ('h', 'o'): 249, ('o', 'à'): 127, ('à', 'n'): 147, ('n', 'g'): 1846, ('g', ' '): 570, (' ', 't'): 1041, ('t', 'h'): 608, ('h', 'ô'): 11, ('ô', 'n'): 91, ('g', 'u'): 591, ('u', 'y'): 758, ('y', 'ễ'): 371, ('ễ', 'n'): 375, ('n', ' '): 1417, ('h', 'ị'): 113, ('ị', ' '): 101, (' ', 'p'): 202, ('p', 'h'): 410, ('h', 'ư'): 103, ('ư', 'ơ'): 145, ('ơ', 'n'): 176, ('h', 'ả'): 43, ('ả', 'o'): 29, ('n', 'i'): 18, ('i', 'c'): 24, ('c', 'k'): 3, ('k', ' '): 2, (' ', 'ú'): 1, ('ú', 't'): 1, ('c', 'a'): 49, ('a', 'o'): 71, ('o', ' '): 116, (' ', 'v'): 435, ('v', 'ă'): 203, ('ă', 'n'): 218, (' ', 'l'): 232, ('l', 'ầ'): 1, ('ầ', 'u'): 5, ('t', 'ạ'): 5, ('ạ', ' '): 5, ('h', 'u'): 334, ('u', ' '): 170, ('h', 'â'): 31, ('â', 'u'): 19, ('d', 'i'): 50, ('i', 'n'): 262, ('n', 'h'): 797, ('h', ' '): 461, ('h', 'e'): 12, ('e', ' '): 94, ('y', 'n'): 23, ('đ', 'ô'): 7, (' ', 'n'): 336, ('h', 'i'): 223, ('y', 'e'): 211, ('e', 'n'): 257, ('l', 'a'): 58, ('a', 'n'): 

In [51]:
viet_tri = create_character_frequency_hashmap(viet_df, 'trigrams')
print(viet_tri)

{('t', 'ừ', ' '): 4, ('ừ', ' ', 'h'): 2, (' ', 'h', 'o'): 103, ('h', 'o', 'à'): 107, ('o', 'à', 'n'): 116, ('à', 'n', 'g'): 94, ('n', 'g', ' '): 570, ('g', ' ', 't'): 146, (' ', 't', 'h'): 545, ('t', 'h', 'ô'): 3, ('h', 'ô', 'n'): 5, ('ô', 'n', 'g'): 76, ('n', 'g', 'u'): 591, ('g', 'u', 'y'): 588, ('u', 'y', 'ễ'): 371, ('y', 'ễ', 'n'): 371, ('ễ', 'n', ' '): 367, ('n', ' ', 't'): 347, ('t', 'h', 'ị'): 110, ('h', 'ị', ' '): 101, ('ị', ' ', 'p'): 5, (' ', 'p', 'h'): 199, ('p', 'h', 'ư'): 43, ('h', 'ư', 'ơ'): 56, ('ư', 'ơ', 'n'): 145, ('ơ', 'n', 'g'): 144, ('t', 'h', 'ả'): 14, ('h', 'ả', 'o'): 14, ('n', 'i', 'c'): 4, ('i', 'c', 'k'): 2, ('c', 'k', ' '): 2, ('k', ' ', 'ú'): 1, (' ', 'ú', 't'): 1, ('c', 'a', 'o'): 35, ('a', 'o', ' '): 40, ('o', ' ', 'v'): 16, (' ', 'v', 'ă'): 195, ('v', 'ă', 'n'): 203, ('ă', 'n', ' '): 198, ('n', ' ', 'l'): 61, (' ', 'l', 'ầ'): 1, ('l', 'ầ', 'u'): 1, ('t', 'ạ', ' '): 5, ('ạ', ' ', 't'): 3, ('t', 'h', 'u'): 76, ('h', 'u', ' '): 40, ('u', ' ', 't'): 37, ('t', 

In [52]:
viet_unifreqdist = nltk.FreqDist(viet_uni).most_common(30)
# viet_unifreqdist

In [53]:
# viet_unifd = pd.DataFrame.from_dict(viet_unifreqdist)
# viet_unifd.rename(columns={0: 'unigram', 1: 'count'}, inplace=True)
# viet_unifd.plot(x="unigram", y="count", kind="bar") 

In [54]:
# viet_bifreqdist = nltk.FreqDist(viet_bi).most_common(30)
# viet_bifd = pd.DataFrame.from_dict(viet_bifreqdist)
# viet_bifd.rename(columns={0: 'bigram', 1: 'count'}, inplace=True)
# viet_bifd.plot(x="bigram", y="count", kind="bar") 

In [55]:
# viet_trifreqdist = nltk.FreqDist(viet_tri).most_common(30)
# viet_trifd = pd.DataFrame.from_dict(viet_trifreqdist)
# viet_trifd.rename(columns={0: 'trigram', 1: 'count'}, inplace=True)
# viet_trifd.plot(x="trigram", y="count", kind="bar") 

In [56]:
#todo: do it with relative frequency out of the whole dataset

In [57]:
#How many total unigrams/bigrams/trigrams across the whole dataset? - viet_df

In [58]:
# all_unigrams = [unigram for row in viet_df['unigrams'] for unigram in row]
# total_unigrams = len(all_unigrams)
# print(f"Total number of unigrams (viet_df): {total_unigrams}")

In [59]:
# all_bigrams = [bigram for row in viet_df['bigrams'] for bigram in row]
# total_bigrams = len(all_bigrams)
# print(f"Total number of bigrams (viet_df): {total_bigrams}")

In [60]:
# all_trigrams = [trigram for row in viet_df['trigrams'] for trigram in row]
# total_trigrams = len(all_trigrams)
# print(f"Total number of trigrams (viet_df): {total_trigrams}")

In [61]:
# def create_gram_frequency_hashmap(df, col_name):
#     gram_freqs = {}
#     total_grams = 0

#     for grams_list in df[col_name]:
#         for gram in grams_list:
#             if len(gram) == 1 or len(gram) == 2 or len(gram) == 3:
#                 if gram not in gram_freqs.keys():
#                     gram_freqs[gram] = 1
#                 else:
#                     gram_freqs[gram] += 1
#                 total_grams += 1

#     gram_freqs_relative = {gram: count / total_grams for gram, count in gram_freqs.items()}

#     return gram_freqs_relative

# viet_unifreqdist_relative = create_gram_frequency_hashmap(viet_df, 'unigrams')
# viet_bifreqdist_relative = create_gram_frequency_hashmap(viet_df, 'bigrams')
# viet_trifreqdist_relative = create_gram_frequency_hashmap(viet_df, 'trigrams')

# def plot_gram_frequency(gram_freq_dist, title, total_grams):
#     gram_freq_dist_sorted = sorted(gram_freq_dist.items(), key=lambda x: x[1], reverse=True)
#     gram_df_relative = pd.DataFrame(gram_freq_dist_sorted, columns=['gram', 'relative_frequency'])
#     top_30_gram_df_relative = gram_df_relative[:30]
#     ax = top_30_gram_df_relative.set_index('gram').plot(kind="bar", title=title)
#     ax.legend([f"Total number of {title.split()[0].capitalize()} ({total_grams})"])
#     plt.xlabel('Gram')
#     plt.ylabel('Relative Frequency')
#     plt.show()

# total_unigrams = 32024
# total_bigrams = 29654
# total_trigrams = 27284

# plot_gram_frequency(viet_unifreqdist_relative, 'Unigrams Relative Frequency', total_unigrams)
# plot_gram_frequency(viet_bifreqdist_relative, 'Bigrams Relative Frequency', total_bigrams)
# plot_gram_frequency(viet_trifreqdist_relative, 'Trigrams Relative Frequency', total_trigrams)

NEW FREQUENCY DISTRIBUTION + COSINE SIMILARITY WORK
---

In [62]:
# viet_df['fullname'] = viet_df['fullname'].str.lower()
# viet_df

In [63]:
#ALL NEW FREQ DIST STUFF + COSINE

In [64]:
def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams

def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

In [65]:
unigram_fdist = create_lang_char_distribution(viet_df, 'fullname')
print(len(unigram_fdist))
# unigram_fdist

92


In [66]:
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', 'à'): 0,
 (' ', 'á'): 0,
 (' ', 'â'): 0,
 (' ', 'ã'): 0,
 (' ', 'ç'): 0,
 (' ', 'é'): 0,
 (' ', 'ê'): 0,
 (' ', 'ì'): 0,
 (' ', 'í'): 0,
 (' ', 'ð'): 0,
 (' ', 'ñ'): 0,
 (' ', 'ò'): 0,
 (' ', 'ó'): 0,
 (' ', 'ô'): 0,
 (' ', 'õ'): 0,
 (' ', 'ù'): 0,
 (' ', 'ú'): 0,
 (' ', 'ý'): 0,
 (' ', 'ă'): 0,
 (' ', 'ć'): 0,
 (' ', 'đ'): 0,
 (' ', 'ĩ'): 0,
 (' ', 'ũ'): 0,
 (' ', 'ơ'): 0,
 (' ', 'ư'): 0,
 (' ', 'ǹ'): 0,
 (' ', 'ạ'): 0,
 (' ', 'ả'): 0,
 (' ', 'ấ'): 0,
 (' ', 'ầ'): 0,
 (' ', 'ẩ'): 0,
 (' ', 'ẫ'): 0,
 (' ', 'ậ'): 0,
 (' ', 'ắ'): 0,
 (' ', 'ằ'): 0,
 (' ', '

In [67]:
# Creating the bigrams frequency distribution for the entire Indonesian language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, viet_df, 'bigrams')
bigram_fdist

{(' ', ' '): 0.0,
 (' ', 'a'): 0.0028203205883402947,
 (' ', 'b'): 0.004212630752204491,
 (' ', 'c'): 0.007604155510335225,
 (' ', 'd'): 0.007247152904216201,
 (' ', 'e'): 0.00017850130305951234,
 (' ', 'f'): 0.0,
 (' ', 'g'): 0.0013923101638641962,
 (' ', 'h'): 0.01717182535432509,
 (' ', 'i'): 0.00017850130305951234,
 (' ', 'j'): 0.0,
 (' ', 'k'): 0.006747349255649566,
 (' ', 'l'): 0.008282460461961372,
 (' ', 'm'): 0.006069044304023419,
 (' ', 'n'): 0.01199528756559923,
 (' ', 'o'): 0.0002142015636714148,
 (' ', 'p'): 0.0072114526436042985,
 (' ', 'q'): 0.006390346649530541,
 (' ', 'r'): 0.00017850130305951234,
 (' ', 's'): 0.0033201242369069292,
 (' ', 't'): 0.03716397129699047,
 (' ', 'u'): 0.0002142015636714148,
 (' ', 'v'): 0.015529613366177574,
 (' ', 'w'): 0.0,
 (' ', 'x'): 0.0026418192852807826,
 (' ', 'y'): 0.0004641033879547321,
 (' ', 'z'): 0.0,
 (' ', 'à'): 0.0,
 (' ', 'á'): 0.0002499018242833173,
 (' ', 'â'): 3.570026061190247e-05,
 (' ', 'ã'): 0.0,
 (' ', 'ç'): 0.0,
 ('

In [68]:
# Now: Individual Relative Frequency Distributions

In [69]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
# initialized_unigrams

In [70]:
# viet_df.iloc[0]

In [71]:
def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative

# UNIGRAMS individual frequency distributions
viet_df['indiv_unigrams_fdist'] = viet_df['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# checking that the functin worked for our first example, 'supriyadi'
print(viet_df.iloc[0]['indiv_unigrams_fdist'])

viet_df.tail()

#tu hoàng thông

{' ': 0.14285714285714285, 'a': 0, 'b': 0, 'c': 0, 'd': 0, 'e': 0, 'f': 0, 'g': 0.14285714285714285, 'h': 0.14285714285714285, 'i': 0, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 0.14285714285714285, 'o': 0.07142857142857142, 'p': 0, 'q': 0, 'r': 0, 's': 0, 't': 0.14285714285714285, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0, 'à': 0.07142857142857142, 'á': 0, 'â': 0, 'ã': 0, 'ç': 0, 'é': 0, 'ê': 0, 'ì': 0, 'í': 0, 'ð': 0, 'ñ': 0, 'ò': 0, 'ó': 0, 'ô': 0.07142857142857142, 'õ': 0, 'ù': 0, 'ú': 0, 'ý': 0, 'ă': 0, 'ć': 0, 'đ': 0, 'ĩ': 0, 'ũ': 0, 'ơ': 0, 'ư': 0, 'ǹ': 0, 'ạ': 0, 'ả': 0, 'ấ': 0, 'ầ': 0, 'ẩ': 0, 'ẫ': 0, 'ậ': 0, 'ắ': 0, 'ằ': 0, 'ặ': 0, 'ế': 0, 'ề': 0, 'ể': 0, 'ễ': 0, 'ệ': 0, 'ỉ': 0, 'ị': 0, 'ọ': 0, 'ỏ': 0, 'ố': 0, 'ồ': 0, 'ổ': 0, 'ỗ': 0, 'ộ': 0, 'ớ': 0, 'ờ': 0, 'ở': 0, 'ợ': 0, 'ụ': 0, 'ủ': 0, 'ứ': 0, 'ừ': 0.07142857142857142, 'ử': 0, 'ữ': 0, 'ự': 0, 'ỳ': 0, 'ỵ': 0, 'ỷ': 0, 'ỹ': 0}


Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,transliteration,unigrams,bigrams,trigrams,indiv_unigrams_fdist
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (d,...",nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","{' ': 0.13333333333333333, 'a': 0, 'b': 0, 'c'..."
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (p,...",nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","{' ': 0.125, 'a': 0.0625, 'b': 0, 'c': 0.0625,..."
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(l,), (ê,), ( ,), (p,), (h,), (ổ,), (l, ê), (...",le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]","{' ': 0.16666666666666666, 'a': 0, 'b': 0, 'c'..."
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(v,), (u,), ( ,), (n,), (g,), (o,), (c,), ( ,...",vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...","{' ': 0.18181818181818182, 'a': 0.090909090909..."
2289,hoang ke viem,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",13,3,"[(h,), (o,), (a,), (n,), (g,), ( ,), (k,), (e,...",hoang ke viem,"[h, o, a, n, g, , k, e, , v, i, e, m]","[(h, o), (o, a), (a, n), (n, g), (g, ), ( , k...","[(h, o, a), (o, a, n), (a, n, g), (n, g, ), (...","{' ': 0.15384615384615385, 'a': 0.076923076923..."


In [72]:
# BIGRAMS individual frequency distributions
viet_df['indiv_bigrams_fdist'] = viet_df['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))
print(viet_df.iloc[0]['indiv_bigrams_fdist'][('t', 'u')])
print(1 / len(viet_df.iloc[0]['bigrams']))

0
0.07692307692307693


In [73]:
# # Finding all possible transliterated characters for trigrams
# all_possible_chars_translit = create_lang_char_distribution(viet_df, 'fullname').keys()
# print('# unique characters with transliteration:', len(all_possible_chars_translit))

# # Creating all possible trigrams from transliterated characters
# initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
# print('Length of trigrams fdist:', len(initialized_trigrams))

# # Changing trigrams column to become transliterated
# viet_df['trigrams'] = viet_df['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# # Creating the trigrams frequency distribution for the entire Malay language
# trigram_fdist = create_lang_gram_distribution(initialized_trigrams, viet_df, 'trigrams')

In [74]:
# viet_df
# viet_df['indiv_trigrams_fdist'] = viet_df['trigrams'].apply(lambda entry: initialized_trigrams.copy())

In [75]:
# '''
# Function to be applied to each row of a DataFrame. Sets and returns a hashmap of the relative trigrams frequency distribution for the current example.

# trigrams_list: the list of trigrams for this current example.
# init_trigrams: a hashmap of all possible trigrams as the keys and all values set to 0.
# '''
# def set_indiv_trigram_dist(trigrams_list, init_trigrams):
#     trigrams_fdist_relative = init_trigrams
#     num_grams = len(trigrams_list)

#     for gram in trigrams_list:
#         trigrams_fdist_relative[gram] += 1 / num_grams

#     return trigrams_fdist_relative
# viet_df['indiv_trigrams_fdist'] = viet_df.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

In [76]:
# print(viet_df.loc[0, 'indiv_trigrams_fdist'][('t', 'u', ' ')])
# print(1 / len(viet_df.loc[0, 'trigrams'])) # manual calculation

# # # Checking 1st example
# # print(viet_df.loc[1, 'fullname'])
# # print(viet_df.loc[1, 'indiv_trigrams_fdist'][('s', 'i', 'h')])
# # print(1 / len(df_indo.loc[1, 'trigrams'])) # manual calculation

In [77]:
#TODO: COSINE SIMILARITY for each of viet things

In [78]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
viet_df['indiv_unigrams_fdist'] = viet_df['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [79]:
# Calculating cosine similarity
viet_df['unigrams_cosine_sim'] = viet_df['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])
viet_df

Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,transliteration,unigrams,bigrams,trigrams,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(t,), (ừ,), ( ,), (h,), (o,), (à,), (n,), (g,...",tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...","[[0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.0...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.805625
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(n,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (t,...",nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...","[[0.13636363636363635, 0.0, 0.0, 0.0, 0.0, 0.0...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.884792
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(n,), (i,), (c,), (k,), ( ,), (ú,), (t,), (n,...",nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...","[[0.14285714285714285, 0.0, 0.0, 0.14285714285...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.592690
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(c,), (a,), (o,), ( ,), (v,), (ă,), (n,), ( ,...",cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...","[[0.18181818181818182, 0.09090909090909091, 0....","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.665965
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(t,), (ạ,), ( ,), (t,), (h,), (u,), ( ,), (t,...",ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...","[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.596114
...,...,...,...,...,...,...,...,...,...,...,...,...
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (d,...",nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","[[0.13333333333333333, 0.0, 0.0, 0.06666666666...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.765422
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (p,...",nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","[[0.125, 0.0625, 0.0, 0.0625, 0.0, 0.0625, 0.0...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.901822
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(l,), (ê,), ( ,), (p,), (h,), (ổ,), (l, ê), (...",le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]","[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.428565
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(v,), (u,), ( ,), (n,), (g,), (o,), (c,), ( ,...",vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...","[[0.18181818181818182, 0.09090909090909091, 0....","{(' ', ' '): 0, (' ', 'a'): 0, (' ', 'b'): 0, ...",0.890835


In [80]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
viet_df['indiv_bigrams_fdist'] = viet_df['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)


In [81]:
# Calculating cosine similarity
viet_df['bigrams_cosine_sim'] = viet_df['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])
viet_df

Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,transliteration,unigrams,bigrams,trigrams,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(t,), (ừ,), ( ,), (h,), (o,), (à,), (n,), (g,...",tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...","[[0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07...",0.805625,0.508198
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(n,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (t,...",nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...","[[0.13636363636363635, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.884792,0.667716
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(n,), (i,), (c,), (k,), ( ,), (ú,), (t,), (n,...",nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...","[[0.14285714285714285, 0.0, 0.0, 0.14285714285...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.592690,0.005600
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(c,), (a,), (o,), ( ,), (v,), (ă,), (n,), ( ,...",cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...","[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665965,0.243176
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(t,), (ạ,), ( ,), (t,), (h,), (u,), ( ,), (t,...",ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...","[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.596114,0.288942
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (d,...",nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","[[0.13333333333333333, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.07142857142857142, 0.0...",0.765422,0.440203
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (p,...",nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","[[0.125, 0.0625, 0.0, 0.0625, 0.0, 0.0625, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.901822,0.607800
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(l,), (ê,), ( ,), (p,), (h,), (ổ,), (l, ê), (...",le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]","[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.428565,0.112297
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(v,), (u,), ( ,), (n,), (g,), (o,), (c,), ( ,...",vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...","[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.890835,0.339892


In [82]:
# # This cell cannot be run more than once!
# # Converting fdists to numpy arrays first so we can pass them into cosine_similarity
# viet_df['indiv_trigrams_fdist'] = viet_df['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
# trigram_fdist = np.fromiter(trigram_fdist.values(), dtype = float).reshape(1, -1)

In [83]:
# # Calculating cosine similarity
# viet_df['trigrams_cosine_sim'] = viet_df['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
# viet_df.head()

In [84]:
viet_df

Unnamed: 0,fullname,alphabet,word_length,num_tokens,char_ngrams,transliteration,unigrams,bigrams,trigrams,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(t,), (ừ,), ( ,), (h,), (o,), (à,), (n,), (g,...",tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...","[[0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07...",0.805625,0.508198
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(n,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (t,...",nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...","[[0.13636363636363635, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.884792,0.667716
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(n,), (i,), (c,), (k,), ( ,), (ú,), (t,), (n,...",nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...","[[0.14285714285714285, 0.0, 0.0, 0.14285714285...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.592690,0.005600
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(c,), (a,), (o,), ( ,), (v,), (ă,), (n,), ( ,...",cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...","[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665965,0.243176
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(t,), (ạ,), ( ,), (t,), (h,), (u,), ( ,), (t,...",ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...","[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.596114,0.288942
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,nguyen duc kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (d,...",nguyen duc kien,"[n, g, u, y, e, n, , d, u, c, , k, i, e, n]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","[[0.13333333333333333, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.07142857142857142, 0.0...",0.765422,0.440203
2286,nguyen phuc thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(n,), (g,), (u,), (y,), (e,), (n,), ( ,), (p,...",nguyen phuc thai,"[n, g, u, y, e, n, , p, h, u, c, , t, h, a, i]","[(n, g), (g, u), (u, y), (y, e), (e, n), (n, ...","[(n, g, u), (g, u, y), (u, y, e), (y, e, n), (...","[[0.125, 0.0625, 0.0, 0.0625, 0.0, 0.0625, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.901822,0.607800
2287,lê phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(l,), (ê,), ( ,), (p,), (h,), (ổ,), (l, ê), (...",le pho,"[l, ê, , p, h, ổ]","[(l, ê), (ê, ), ( , p), (p, h), (h, ổ)]","[(l, ê, ), (ê, , p), ( , p, h), (p, h, ổ)]","[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.428565,0.112297
2288,vu ngoc nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(v,), (u,), ( ,), (n,), (g,), (o,), (c,), ( ,...",vu ngoc nha,"[v, u, , n, g, o, c, , n, h, a]","[(v, u), (u, ), ( , n), (n, g), (g, o), (o, c...","[(v, u, ), (u, , n), ( , n, g), (n, g, o), (...","[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.890835,0.339892


In [87]:
viet_df.to_csv('viet_df.csv', index=False)