In [1]:
import pandas as pd
import numpy as np
import unicodedata
import nltk
from nltk.util import ngrams

In [2]:
df  =  pd.read_excel('name_data/exigerData/EXGR_Korean names.xlsx')
print(df)

       Unnamed: 0                                        id       fullname  \
0               0    http://www.wikidata.org/entity/Q484396  Park Joo-bong   
1               1  http://www.wikidata.org/entity/Q55728351  KIM Jong hoon   
2               2  http://www.wikidata.org/entity/Q11266766            이민혁   
3               3  http://www.wikidata.org/entity/Q47492159         Lee Ho   
4               4   http://www.wikidata.org/entity/Q1075756            최민호   
...           ...                                       ...            ...   
21197       21197  http://www.wikidata.org/entity/Q11267007     Lee Han-wi   
21198       21198  http://www.wikidata.org/entity/Q12586843   Gil Jung-woo   
21199       21199    http://www.wikidata.org/entity/Q224639            이정희   
21200       21200  http://www.wikidata.org/entity/Q12586585            금태섭   
21201       21201  http://www.wikidata.org/entity/Q18684836      Oh Ui-sik   

       Family name  Given name  
0              NaN         NaN

In [3]:
#checking the shape(21202, 5)
df.shape

(21202, 5)

In [4]:
#checking the head 
df.head()

Unnamed: 0.1,Unnamed: 0,id,fullname,Family name,Given name
0,0,http://www.wikidata.org/entity/Q484396,Park Joo-bong,,
1,1,http://www.wikidata.org/entity/Q55728351,KIM Jong hoon,,
2,2,http://www.wikidata.org/entity/Q11266766,이민혁,,
3,3,http://www.wikidata.org/entity/Q47492159,Lee Ho,,
4,4,http://www.wikidata.org/entity/Q1075756,최민호,,


In [5]:
#checking how many null entries each columns have( 21202 for family and given name)
nan_count = np.sum(df.isnull(), axis = 0)
nan_count

Unnamed: 0         0
id                 0
fullname           0
Family name    21202
Given name     21202
dtype: int64

In [6]:
#checking if there's any duplicates(yes)
df['fullname'].duplicated().any()

True

In [7]:
#drop them from the fullname  and checking if nullentries went down(form 21202 to 19520)
df2 =df.drop_duplicates(subset=['fullname'])

nan_count = np.sum(df2.isnull(), axis = 0)
nan_count

Unnamed: 0         0
id                 0
fullname           0
Family name    19520
Given name     19520
dtype: int64

In [8]:
#now the duplicates are gone
df2['fullname'].duplicated().any()

False

In [14]:
#Drop columns that won't be used in df2
print(df2.columns)
df2 = df2.drop(columns=['Unnamed: 0', 'id', 'Family name', 'Given name'])


Index(['Unnamed: 0', 'id', 'fullname', 'Family name', 'Given name'], dtype='object')


In [15]:
print(df2)

            fullname
0      Park Joo-bong
1      KIM Jong hoon
2                이민혁
3             Lee Ho
4                최민호
...              ...
21197     Lee Han-wi
21198   Gil Jung-woo
21199            이정희
21200            금태섭
21201      Oh Ui-sik

[19520 rows x 1 columns]


Now For Feature Engineering 

In [22]:
#determining the lan
def Korean_lan(name):
    if isinstance(name, str):
        return [unicodedata.name(char).split(' ')[0] for char in name]
    else:
        return None

# Apply the function only to rows where 'fullname' is a string
df2['determine_alphabet'] = df2['fullname'].apply(Korean_lan)
print(df2['determine_alphabet'])

0        [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
1        [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
2                                 [HANGUL, HANGUL, HANGUL]
3               [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]
4                                 [HANGUL, HANGUL, HANGUL]
                               ...                        
21197    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
21198    [LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...
21199                             [HANGUL, HANGUL, HANGUL]
21200                             [HANGUL, HANGUL, HANGUL]
21201    [LATIN, LATIN, SPACE, LATIN, LATIN, HYPHEN-MIN...
Name: determine_alphabet, Length: 19520, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['determine_alphabet'] = df2['fullname'].apply(Korean_lan)


In [23]:
#featuring the char_ngrams. un
def get_ngrams(text, n):
    if isinstance(text,str):
        name = list(text)
    ngrams_list =  list(ngrams(list(text), n))
    return ngrams_list

df2["unigrams"] = df2['fullname'].apply(lambda name: list(name) if isinstance(name, str) else [])
df2["bigrams"] = df2['fullname'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])
df2["trigrams"] = df2['fullname'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])

df2['char_ngrams'] = df2["unigrams"] + df2["bigrams"] + df2["trigrams"]

In [26]:
#print(df2["unigrams"])
print(df2["bigrams"])
df2["trigrams"]
print{}

0        [(P, a), (a, r), (r, k), (k,  ), ( , J), (J, o...
1        [(K, I), (I, M), (M,  ), ( , J), (J, o), (o, n...
2                                         [(이, 민), (민, 혁)]
3                 [(L, e), (e, e), (e,  ), ( , H), (H, o)]
4                                         [(최, 민), (민, 호)]
                               ...                        
21197    [(L, e), (e, e), (e,  ), ( , H), (H, a), (a, n...
21198    [(G, i), (i, l), (l,  ), ( , J), (J, u), (u, n...
21199                                     [(이, 정), (정, 희)]
21200                                     [(금, 태), (태, 섭)]
21201    [(O, h), (h,  ), ( , U), (U, i), (i, -), (-, s...
Name: bigrams, Length: 19520, dtype: object


0        [(P, a, r), (a, r, k), (r, k,  ), (k,  , J), (...
1        [(K, I, M), (I, M,  ), (M,  , J), ( , J, o), (...
2                                              [(이, 민, 혁)]
3             [(L, e, e), (e, e,  ), (e,  , H), ( , H, o)]
4                                              [(최, 민, 호)]
                               ...                        
21197    [(L, e, e), (e, e,  ), (e,  , H), ( , H, a), (...
21198    [(G, i, l), (i, l,  ), (l,  , J), ( , J, u), (...
21199                                          [(이, 정, 희)]
21200                                          [(금, 태, 섭)]
21201    [(O, h,  ), (h,  , U), ( , U, i), (U, i, -), (...
Name: trigrams, Length: 19520, dtype: object

In [27]:
#feature for token length 
def token_length(name):
    if isinstance(name,str):
       return len(name.split())
    else:
       return None

df2['token_len'] = df2['fullname'].apply(token_length)
print(df2['token_len'])

0        2.0
1        3.0
2        1.0
3        2.0
4        1.0
        ... 
21197    2.0
21198    2.0
21199    1.0
21200    1.0
21201    2.0
Name: token_len, Length: 19520, dtype: float64


In [30]:
df2['period_freq'] = df2['fullname'].apply(lambda name: name.count('.') if isinstance(name, str) else [])
df2['dash_freq'] = df2['fullname'].apply(lambda name: name.count('-') if isinstance(name, str) else [])
df2['space_freq'] = df2['fullname'].apply(lambda name: name.count(' ' )if isinstance(name, str) else [])

In [31]:
df2


Unnamed: 0,fullname,unigrams,bigrams,trigrams,char_ngrams,token_len,period_freq,dash_freq,space_freq
0,Park Joo-bong,"[P, a, r, k, , J, o, o, -, b, o, n, g]","[(P, a), (a, r), (r, k), (k, ), ( , J), (J, o...","[(P, a, r), (a, r, k), (r, k, ), (k, , J), (...","[P, a, r, k, , J, o, o, -, b, o, n, g, (P, a)...",2.0,0,1,1
1,KIM Jong hoon,"[K, I, M, , J, o, n, g, , h, o, o, n]","[(K, I), (I, M), (M, ), ( , J), (J, o), (o, n...","[(K, I, M), (I, M, ), (M, , J), ( , J, o), (...","[K, I, M, , J, o, n, g, , h, o, o, n, (K, I)...",3.0,0,0,2
2,이민혁,"[이, 민, 혁]","[(이, 민), (민, 혁)]","[(이, 민, 혁)]","[이, 민, 혁, (이, 민), (민, 혁), (이, 민, 혁)]",1.0,0,0,0
3,Lee Ho,"[L, e, e, , H, o]","[(L, e), (e, e), (e, ), ( , H), (H, o)]","[(L, e, e), (e, e, ), (e, , H), ( , H, o)]","[L, e, e, , H, o, (L, e), (e, e), (e, ), ( ,...",2.0,0,0,1
4,최민호,"[최, 민, 호]","[(최, 민), (민, 호)]","[(최, 민, 호)]","[최, 민, 호, (최, 민), (민, 호), (최, 민, 호)]",1.0,0,0,0
...,...,...,...,...,...,...,...,...,...
21197,Lee Han-wi,"[L, e, e, , H, a, n, -, w, i]","[(L, e), (e, e), (e, ), ( , H), (H, a), (a, n...","[(L, e, e), (e, e, ), (e, , H), ( , H, a), (...","[L, e, e, , H, a, n, -, w, i, (L, e), (e, e),...",2.0,0,1,1
21198,Gil Jung-woo,"[G, i, l, , J, u, n, g, -, w, o, o]","[(G, i), (i, l), (l, ), ( , J), (J, u), (u, n...","[(G, i, l), (i, l, ), (l, , J), ( , J, u), (...","[G, i, l, , J, u, n, g, -, w, o, o, (G, i), (...",2.0,0,1,1
21199,이정희,"[이, 정, 희]","[(이, 정), (정, 희)]","[(이, 정, 희)]","[이, 정, 희, (이, 정), (정, 희), (이, 정, 희)]",1.0,0,0,0
21200,금태섭,"[금, 태, 섭]","[(금, 태), (태, 섭)]","[(금, 태, 섭)]","[금, 태, 섭, (금, 태), (태, 섭), (금, 태, 섭)]",1.0,0,0,0
