In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from alphabet_detector import AlphabetDetector
import unicodedata
from langdetect import detect
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords

nltk.download('stopwords')

cnchar_df = pd.read_excel('name_data/exigerData/EXGR_Chinese names(Character).xlsx')
cnrom_df = pd.read_excel("name_data/exigerData/EXGR_Chinese names(Romantized).xlsx")
viet_df = pd.read_excel("name_data/exigerData/EXGR_Vietnamese.xlsx")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joliehuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Todo:
- clean: chinese char, chinese rom, viet

In [2]:
#FUNCTIONS
def langname(name):
    if isinstance(name, str):
        result = []
        for char in name:
            try:
                char_name = unicodedata.name(char).split(' ')[0]
                result.append(char_name)
            except ValueError:
                # Handle characters without Unicode names
                result.append(char)
        return result
    else:
        return None  # Handle non-string or NaN values


def calculate_token_length(name):
    if isinstance(name, str):
        # Check if the name contains Chinese characters
        if any('\u4e00' <= char <= '\u9fff' for char in name):
            # For Chinese names, count characters
            return len(name)
        else:
            # For non-Chinese names, split by spaces and count words
            return len(name.split())
    else:
        return None  # Handle non-string or NaN values


def generate_char_ngrams(text):
    if isinstance(text, str):
        # Tokenize the text into words
        words = text.split()
        
        # Create unigrams, bigrams, and trigrams
        unigrams = list(ngrams(text, 1))
        bigrams = list(ngrams(text, 2))
        trigrams = list(ngrams(text, 3))
        
        # Interpolated n-grams (combining unigrams, bigrams, and trigrams)
        interpolated_ngrams = unigrams + bigrams + trigrams
        
        return interpolated_ngrams
    else:
        return []

def get_name_length(fullname):
    if isinstance(fullname, str):
        return len(fullname)
    else:
        return np.nan 


EXGR_Chinese names(Character).xlsx - CLEANING
===

In [3]:
pd.set_option('display.max_columns', None)
# cnchar_df

In [4]:
duplicates = cnchar_df['fullname'].duplicated()
# print(cnchar_df[duplicates])

In [5]:
cnchar_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# cnchar_df

In [6]:
# null_values = cnchar_df['fullname'].isnull()
# print(cnchar_df[null_values])
# NO NULL VALUES

DATA EXPLORATION

In [7]:
# cnchar_df['Notes'].unique()

**Maybe notes saying that it's a minority name/non-native name should be counted? Drop all the ones that say pseudonym/stage name? Maybe we can afford to drop certain examples of names, since we have so many of them...?

In [8]:
drop_notes = ['minority name', 'non-native', 'pseudonym', 'stage name',
       'should be侯赛因.江；minority name',
       '劉-family name of her husband;吳-her family name', 'satage name',
       '周-family name of her husband;梁-her family name', 'fake name',
       'pen name', '方-family name of her husband; 黄-her family name',
       '曹-family name of her husband;王-her family name',
       '朱-family name of her husband;李-her family name',
       '朱-family name of her husband;葉-her family name',
       '林-family name of her husband;鄭-her family name',
       '梁-family name of her husband;劉- her family name ',
       '梁-family name of her husband;高- her family name',
       'should be 泰迪.羅賓；non-native',
       '罗-family name of her husband;范-her family name',
       '范-family name of her husband;徐-her family name',
       '葉-family name of her husband；劉-her family name',
       '蘇-family name of her husband;周- her family name', 'buddhist',
       'family name should be in Chinese character 周',
       '陈-family name of her husband; 冯-her family name',
       '陳-family name of her husband;方-her family name',
       '高-family name of her husband;金-her family name',
       '黃-family name of her husband;馬-her family name']

cnchar_df = cnchar_df[~cnchar_df['Notes'].isin(drop_notes)]
# cnchar_df

In [9]:
cnchar_df['Notes'].unique()

array([nan], dtype=object)

In [10]:
cnchar_df['Unnamed: 6'].unique()

array(['**Some commom compound Chinese surnames:东方，端木，独孤，公孙，上官，令狐，慕容，欧阳，司马，司徒，夏侯，西门，长孙，诸葛。',
       nan], dtype=object)

In [11]:
cnchar_df.drop(columns = ['Unnamed: 0', 'id', 'Family name', 'Given Name', 'Notes' ,'Unnamed: 6'], inplace = True)
# cnchar_df

Feature Engineering (Chinese Character)

In [12]:
cnchar_df['alphabet'] = cnchar_df['fullname'].apply(langname)
# cnchar_df

In [13]:
cnchar_df['word_length'] = cnchar_df['fullname'].apply(get_name_length)
# cnchar_df

In [14]:
cnchar_df['token_length'] = cnchar_df['fullname'].apply(calculate_token_length)
# cnchar_df

In [15]:
cnchar_df['char_ngrams'] = cnchar_df['fullname'].apply(generate_char_ngrams)
cnchar_df

Unnamed: 0,fullname,alphabet,word_length,token_length,char_ngrams
0,丁一平,"[CJK, CJK, CJK]",3,3,"[(丁,), (一,), (平,), (丁, 一), (一, 平), (丁, 一, 平)]"
1,丁世雄,"[CJK, CJK, CJK]",3,3,"[(丁,), (世,), (雄,), (丁, 世), (世, 雄), (丁, 世, 雄)]"
2,丁亦昕,"[CJK, CJK, CJK]",3,3,"[(丁,), (亦,), (昕,), (丁, 亦), (亦, 昕), (丁, 亦, 昕)]"
3,丁仲礼,"[CJK, CJK, CJK]",3,3,"[(丁,), (仲,), (礼,), (丁, 仲), (仲, 礼), (丁, 仲, 礼)]"
4,丁伟,"[CJK, CJK]",2,2,"[(丁,), (伟,), (丁, 伟)]"
...,...,...,...,...,...
12038,龚翔宇,"[CJK, CJK, CJK]",3,3,"[(龚,), (翔,), (宇,), (龚, 翔), (翔, 宇), (龚, 翔, 宇)]"
12039,龚育之,"[CJK, CJK, CJK]",3,3,"[(龚,), (育,), (之,), (龚, 育), (育, 之), (龚, 育, 之)]"
12040,龚蓓苾,"[CJK, CJK, CJK]",3,3,"[(龚,), (蓓,), (苾,), (龚, 蓓), (蓓, 苾), (龚, 蓓, 苾)]"
12041,龚贤永,"[CJK, CJK, CJK]",3,3,"[(龚,), (贤,), (永,), (龚, 贤), (贤, 永), (龚, 贤, 永)]"


EXGR_Chinese names(Romantized).xlsx - CLEANING
===

In [16]:
cnrom_df

Unnamed: 0.1,Unnamed: 0,id,fullname,Family name,Given Name,Notes
0,1806.0,http://www.wikidata.org/entity/Q2861472,8th Arjia Rinpoche,,,non-native
1,9519.0,http://www.wikidata.org/entity/Q2107375,A Lamusi,,,non-native
2,2157.0,http://www.wikidata.org/entity/Q16872,Aaron Kwok,,,
3,22359.0,http://www.wikidata.org/entity/Q50366858,Abduhamit Abdugheni,,,non-native
4,20305.0,http://www.wikidata.org/entity/Q2821416,Abdul Haq,,,non-native
...,...,...,...,...,...,...
12009,22920.0,http://www.wikidata.org/entity/Q8074892,Zu Xiaosun,,,
12010,436.0,http://www.wikidata.org/entity/Q8075316,Zuo Shusheng,,,
12011,5871.0,http://www.wikidata.org/entity/Q9090197,Zuo Xiaoqing,,,
12012,9205.0,http://www.wikidata.org/entity/Q24006407,Zuo Yiteng,,,


In [17]:
cnrom_df.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

In [18]:
cnrom_df['Notes'].unique()

array(['non-native', nan], dtype=object)

In [19]:
duplicates = cnrom_df['fullname'].duplicated()
# print(cnrom_df[duplicates])

In [20]:
cnrom_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
# cnrom_df

In [21]:
#Get rid of all examples with 'non-native', to try and further narrow df down to just Chinese names
cnrom_df.drop(cnrom_df[cnrom_df['Notes'] == 'non-native'].index, inplace=True)
# cnrom_df

In [22]:
cnrom_df['Notes'].unique()

array([nan], dtype=object)

In [23]:
cnrom_df['alphabet'] = cnrom_df['fullname'].apply(langname)
# cnrom_df

In [24]:
cnrom_df['word_length'] = cnrom_df['fullname'].apply(get_name_length)
# cnrom_df

In [25]:
cnrom_df['token_length'] = cnrom_df['fullname'].apply(calculate_token_length)
# cnrom_df

In [26]:
cnrom_df['char_ngrams'] = cnrom_df['fullname'].apply(generate_char_ngrams)
cnrom_df

Unnamed: 0,fullname,Family name,Given Name,Notes,alphabet,word_length,token_length,char_ngrams
2,Aaron Kwok,,,,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.0,2.0,"[(A,), (a,), (r,), (o,), (n,), ( ,), (K,), (w,..."
9,Adhe Tapontsang,,,,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",15.0,2.0,"[(A,), (d,), (h,), (e,), ( ,), (T,), (a,), (p,..."
10,Ai Baojun,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",9.0,2.0,"[(A,), (i,), ( ,), (B,), (a,), (o,), (j,), (u,..."
11,Ai Guoxiang,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11.0,2.0,"[(A,), (i,), ( ,), (G,), (u,), (o,), (x,), (i,..."
12,Ai Husheng,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10.0,2.0,"[(A,), (i,), ( ,), (H,), (u,), (s,), (h,), (e,..."
...,...,...,...,...,...,...,...,...
12009,Zu Xiaosun,,,,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",10.0,2.0,"[(Z,), (u,), ( ,), (X,), (i,), (a,), (o,), (s,..."
12010,Zuo Shusheng,,,,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12.0,2.0,"[(Z,), (u,), (o,), ( ,), (S,), (h,), (u,), (s,..."
12011,Zuo Xiaoqing,,,,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",12.0,2.0,"[(Z,), (u,), (o,), ( ,), (X,), (i,), (a,), (o,..."
12012,Zuo Yiteng,,,,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",10.0,2.0,"[(Z,), (u,), (o,), ( ,), (Y,), (i,), (t,), (e,..."


EXGR_Vietnamese.xlsx - CLEANING
===

In [27]:
viet_df

Unnamed: 0.1,Unnamed: 0,id,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q4466941,Từ Hoàng Thông,,,
1,http://www.wikidata.org/entity/Q27793245,Nguyễn Thị Phương Thảo,,,
2,http://www.wikidata.org/entity/Q10528061,Nguyễn Mạnh Dũng (cầu thủ bóng đá sinh 1981),,,
3,http://www.wikidata.org/entity/Q362337,Nick Út,,,
4,http://www.wikidata.org/entity/Q5034332,Cao Văn Lầu,,,
...,...,...,...,...,...
3195,http://www.wikidata.org/entity/Q1033947,Lê Chân Tông,,,
3196,http://www.wikidata.org/entity/Q52161415,Hồ Sỹ Giáp,,,
3197,http://www.wikidata.org/entity/Q7023137,Nguyễn Văn Thương,,,
3198,http://www.wikidata.org/entity/Q5923096,Hoàng Tích Chu,,,


In [28]:
viet_df.drop(columns = ['Unnamed: 0', 'fullname','Family name','Given name'], inplace = True)
viet_df = viet_df.rename(columns={'id': 'fullname'})

In [29]:
viet_df.drop_duplicates(subset=['fullname'], keep='first', inplace=True)
viet_df

Unnamed: 0,fullname
0,Từ Hoàng Thông
1,Nguyễn Thị Phương Thảo
2,Nguyễn Mạnh Dũng (cầu thủ bóng đá sinh 1981)
3,Nick Út
4,Cao Văn Lầu
...,...
3189,Nguyen Duc Kien
3190,Nguyen Phuc Thai
3192,Lê Phổ
3193,Vu Ngoc Nha


In [30]:
viet_df['alphabet'] = viet_df['fullname'].apply(langname)
# viet_df

In [31]:
viet_df['word_length'] = viet_df['fullname'].apply(get_name_length)
# viet_df

In [32]:
viet_df['token_length'] = viet_df['fullname'].apply(calculate_token_length)
# viet_df

In [33]:
viet_df['char_ngrams'] = viet_df['fullname'].apply(generate_char_ngrams)
viet_df

Unnamed: 0,fullname,alphabet,word_length,token_length,char_ngrams
0,Từ Hoàng Thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(T,), (ừ,), ( ,), (H,), (o,), (à,), (n,), (g,..."
1,Nguyễn Thị Phương Thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(N,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (T,..."
2,Nguyễn Mạnh Dũng (cầu thủ bóng đá sinh 1981),"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",44,9,"[(N,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (M,..."
3,Nick Út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(N,), (i,), (c,), (k,), ( ,), (Ú,), (t,), (N,..."
4,Cao Văn Lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(C,), (a,), (o,), ( ,), (V,), (ă,), (n,), ( ,..."
...,...,...,...,...,...
3189,Nguyen Duc Kien,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",15,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (D,..."
3190,Nguyen Phuc Thai,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",16,3,"[(N,), (g,), (u,), (y,), (e,), (n,), ( ,), (P,..."
3192,Lê Phổ,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN]",6,2,"[(L,), (ê,), ( ,), (P,), (h,), (ổ,), (L, ê), (..."
3193,Vu Ngoc Nha,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",11,3,"[(V,), (u,), ( ,), (N,), (g,), (o,), (c,), ( ,..."
