In [1]:
import pandas as pd
import numpy as np

# Feature engineering
import unicodedata
from nltk import ngrams
from unidecode import unidecode

## Data Cleaning

### Cleaning Indonesian Dataset

We have two Indonesian datasets, so let's compare them:

In [2]:
# Inspecting first Indonesian dataset
df_indo_1 = pd.read_excel('name_data/exigerData/EXGR_Indonesian names.xlsx')
df_indo_1.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q7313790,Bertrand Antolin,,
1,http://www.wikidata.org/entity/Q15138877,Ray Rizal,,
2,http://www.wikidata.org/entity/Q17411237,Samsuridjal Djauzi,,
3,http://www.wikidata.org/entity/Q12497619,Max Arifin,,
4,http://www.wikidata.org/entity/Q7475963,Donita,,


In [3]:
# Inspecting second Indonesian dataset
df_indo_2 = pd.read_excel('name_data/exigerData/EXGR_Indonesian names-2.xlsx')
df_indo_2.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name,Designation,Comment,Unnamed: 6,Unnamed: 7
0,http://www.wikidata.org/entity/Q7475963,Donita,,Donita,,It is common for Indonesians to have only 1 gi...,,1
1,http://www.wikidata.org/entity/Q7645143,Supriyadi,,Supriyadi,,,,1
2,http://www.wikidata.org/entity/Q7844727,Triyaningsih,,Triyaningsih,,,,1
3,http://www.wikidata.org/entity/Q12515781,Soerjadi,,Soerjadi,,,,1
4,http://www.wikidata.org/entity/Q12523244,Undunsyah,,Undunsyah,,,,1


Are the names in the datasets the same?

In [4]:
try:
    pd.testing.assert_series_equal(df_indo_1['fullname'], df_indo_2['fullname'], check_like = True)
except:
    print("Names are not the same")

Names are not the same


We will investigate this further later. For now, let's look at the comments in `df_indo_2`:

In [5]:
print(df_indo_2['Comment'].unique()[:5])
print('\nNumber of names with comments:', len(df_indo_2['Comment'].unique()))
df_indo_2.shape

['It is common for Indonesians to have only 1 given name without family name.'
 nan 'Real name: Mike Lucock'
 'This is a Javanese ancient name. The name is not used anymore. This person lived in 1028 - 1035.'
 'Royal family name of Java identified by the designation name.']

Number of names with comments: 34


(21727, 8)

Since these names seem uncommon exceptions and there are only a few of them, we will remove them to make the data cleaner:

In [6]:
df_indo_2 = df_indo_2[pd.isnull(df_indo_2['Comment'])]
df_indo_2.reset_index()
print(df_indo_2.shape)
df_indo_2['Comment'].unique()

(21689, 8)


array([nan], dtype=object)

Making dataframes consist only of the `fullname` column:

In [7]:
df_indo_1 = pd.DataFrame(df_indo_1['fullname'])
df_indo_2 = pd.DataFrame(df_indo_2['fullname'])
print('df_indo_1 shape:', df_indo_1.shape)
print('df_indo_2 shape:', df_indo_2.shape)

df_indo_1 shape: (21730, 1)
df_indo_2 shape: (21689, 1)


Combining both dataframes into `df_indo`:

In [8]:
# Combining datasets
df_indo = pd.concat([df_indo_1, df_indo_2], ignore_index = True)
print('Shape of combined data:', df_indo.shape)
df_indo.head()

Shape of combined data: (43419, 1)


Unnamed: 0,fullname
0,Bertrand Antolin
1,Ray Rizal
2,Samsuridjal Djauzi
3,Max Arifin
4,Donita


Looking for duplicate entries:

In [9]:
print(len(df_indo[df_indo.duplicated()]))

32085


Since around 3/4 of the combined data consists of duplicate entries, the datasets are very similar. Let's just use `df_indo_2` since we removed the names with comments from it:

In [10]:
df_indo = df_indo_2
print(df_indo.shape)
df_indo.head()

(21689, 1)


Unnamed: 0,fullname
1,Supriyadi
2,Triyaningsih
3,Soerjadi
4,Undunsyah
5,Soeripto


Are there duplicates?

In [11]:
len(df_indo[df_indo.duplicated()])

10376

Let's drop them:

In [12]:
# Dropping duplicates
df_indo.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_indo.duplicated()))
df_indo.shape

False


(11313, 1)

Let's see if there are any null entries for `fullname`:

In [13]:
np.any(df_indo['fullname'].isnull())

False

What about entries that aren't alphanumeric aside from spaces?

In [14]:
non_alnum_names_indo = [name for name in df_indo['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_indo))
non_alnum_names_indo[-10:]

590


['I. B. Rai Dharmawijaya Mantra',
 'Mansoer Daoed Dt. Palimo Kayo',
 'Habib Anis bin Alwi al-Habsyi',
 'Abu Bakar (bupati Bandung Barat)',
 'Muhammad Arsyad (pemain sepak bola)',
 'Muhammad Hamzah (personil The Fly)',
 'A. A. Ngurah Oka Ratmadi',
 'Firmansyah (pemain sepak bola kelahiran 1995)',
 'Mayjen TNI (Purn) Thomas Albert Umboh',
 'Prof. Dr. Ir. Antonius Suwanto, M.Sc']

Some names have parentheses. Let's see exactly how many:

In [15]:
paren_names_indo = [name for name in df_indo['fullname'] if name.find('(') != -1]
print(len(paren_names_indo))
paren_names_indo[:5]

67


['Basuki (pelawak)',
 'Mulyadi (politisi)',
 'Fatahillah (politisi)',
 'Trinity (penulis)',
 'Mentari (aktris)']

Removing the parentheses from the names and seeing how many names are non-alphanumeric:

In [16]:
df_indo['fullname'] = df_indo['fullname'].apply(lambda name: name if name.find('(') == -1 else name[:name.find('(') - 1])
non_alnum_names_indo = [name for name in df_indo['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_indo))

527


The number of names left isn't exactly 590 - 67 = 523 because some names with parentheses had other punctuation in it. Let's double check that there are no names with parentheses in the `fullname` column:

In [17]:
print([name for name in df_indo['fullname'] if name.find('(') != -1])
print([name for name in df_indo['fullname'] if name.find(')') != -1])

[]
[]


It's possible that we have some duplicate names after removing parentheses, so let's check for that:

In [18]:
print(len(df_indo[df_indo.duplicated()]))
print(df_indo.shape)
df_indo[df_indo.duplicated()]

63
(11313, 1)


Unnamed: 0,fullname
1676,Basuki
1810,Mulyadi
2716,Fatahillah
3080,Trinity
4798,Mentari
...,...
11285,Abu Bakar
11286,Muhammad Arsyad
11288,Muhammad Hamzah
11300,Firmansyah


Removing leading and trailing whitespace from names just in case:

In [19]:
df_indo['fullname'] = df_indo['fullname'].apply(str.strip)

Removing duplicates again:

In [20]:
df_indo.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_indo.duplicated()))
df_indo.shape

False


(11246, 1)

### Cleaning Malay Dataset

Let's look at the Malay dataset now:

In [21]:
df_malay = pd.read_excel('name_data/exigerData/EXGR_Malay names.xlsx')
df_malay.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q4769705,Annuar Rapaee,,
1,http://www.wikidata.org/entity/Q31186972,Tash Yong,,
2,http://www.wikidata.org/entity/Q468519,Fatmawati,,
3,http://www.wikidata.org/entity/Q4736793,Alto Linus,,
4,http://www.wikidata.org/entity/Q28837179,Mohamad Izzat Abdul Halil,,


Restricting the dataframe to just the `fullname` column:

In [22]:
df_malay = pd.DataFrame(df_malay['fullname'])
df_malay.head()

Unnamed: 0,fullname
0,Annuar Rapaee
1,Tash Yong
2,Fatmawati
3,Alto Linus
4,Mohamad Izzat Abdul Halil


Handling duplicate values:

In [23]:
# Checking for duplicate entries
print(len(df_malay[df_malay.duplicated()]))
df_malay.shape

2016


(4930, 1)

In [24]:
# Removing duplicate entries
df_malay.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_malay.duplicated()))
df_malay.shape

False


(2914, 1)

Are there any null entries in `fullname`?

In [25]:
print(np.any(df_malay['fullname'].isnull()))

False


Are any names non-alphanumeric aside from spaces?

In [26]:
non_alnum_names_malay = [name for name in df_malay['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_malay))
non_alnum_names_malay[-10:]

200


['Adenan Hj. Satem',
 'P. Balasubramaniam',
 "Abdullah Ma'ayat Shah",
 "Mohd Shahril Sa'ari",
 'E. E. C. Thuraisingham',
 'Ibrahim Ali (Malaysia)',
 'Faradina Mohd. Nadzir',
 'Aril (AF 7)',
 'Sultan Hisamuddin Alam Shah Al-Haj ibni Almarhum Sultan Alaeddin Sulaiman Shah',
 "Ismail Faruqi Asha'ri"]

Again, we have parentheses at the end, so let's remove them:

In [27]:
df_malay['fullname'] = df_malay['fullname'].apply(lambda name: name if name.find('(') == -1 else name[:name.find('(') - 1])
non_alnum_names_malay = [name for name in df_malay['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_malay))

192


Removing leading and trailing whitespace from names just in case:

In [28]:
df_malay['fullname'] = df_malay['fullname'].apply(str.strip)

Handling duplicates:

In [29]:
# Finding number of duplicates
print(len(df_malay[df_malay.duplicated()]))
df_malay.shape

6


(2914, 1)

In [30]:
# Removing duplicates
df_malay.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_malay.duplicated()))
df_malay.shape

False


(2908, 1)

## Feature Engineering

### Indonesian Dataset Feature Engineering

Here are the features we want to include:
1. `alphabet` - which alphabet is being used - unicode library
2. `char_ngrams` - character n-grams (unigram, bigram, trigram, interpolated) - `nltk.utils.ngrams()`
3. `word_ngrams` - word n-grams
4. `a_hat_freq` - binary feature: does it exist in the given name? - if too much work to have EACH accent character as a feature, can combine
5. `name_length` - length of name (total number of letters in full name)
6. `avg_token_length` - average length of a token (the avg number of characters in each word of the name)
7. `num_tokens` - how many “words” are in each name
8. `transliteration` - transliteration
9. `period_freq`, `dash_freq`, `apostrophe_freq` - frequency of punctuation
10. `space_freq` - how many spaces are in each name

Let's start with the Indonesian dataset.

In [31]:
# 1. alphabet
def get_alphabet(name):
    name_as_alphabet_list = []
    for char in name:
        name_as_alphabet_list.append(unicodedata.name(char).split(' ')[0])
    return name_as_alphabet_list

df_indo['alphabet'] = df_indo['fullname'].apply(get_alphabet)
df_indo.head()

Unnamed: 0,fullname,alphabet
0,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
1,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
2,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
3,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
4,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."


In [32]:
# 2. unigrams, bigrams, trigrams, char_ngrams
df_indo['unigrams'] = df_indo['fullname'].apply(lambda name: list(name))
df_indo['bigrams'] = df_indo['fullname'].apply(lambda name: list(ngrams(list(name), 2)))
df_indo['trigrams'] = df_indo['fullname'].apply(lambda name: list(ngrams(list(name), 3)))
df_indo['char_ngrams'] = df_indo['unigrams'] + df_indo['bigrams'] + df_indo['trigrams']
df_indo.head()

Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams
0,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[S, u, p, r, i, y, a, d, i]","[(S, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(S, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[S, u, p, r, i, y, a, d, i, (S, u), (u, p), (p..."
1,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[T, r, i, y, a, n, i, n, g, s, i, h]","[(T, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(T, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[T, r, i, y, a, n, i, n, g, s, i, h, (T, r), (..."
2,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[S, o, e, r, j, a, d, i]","[(S, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(S, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[S, o, e, r, j, a, d, i, (S, o), (o, e), (e, r..."
3,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[U, n, d, u, n, s, y, a, h]","[(U, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(U, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[U, n, d, u, n, s, y, a, h, (U, n), (n, d), (d..."
4,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[S, o, e, r, i, p, t, o]","[(S, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(S, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[S, o, e, r, i, p, t, o, (S, o), (o, e), (e, r..."


In [33]:
# 3. word_ngrams
df_indo['word_ngrams'] = df_indo['fullname'].apply(lambda name: name.split())
df_indo.tail()

Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams
11241,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[(E, d), (d, w), (w, e), (e, l), (l, ), ( , Y...","[(E, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[Edwel, Yusri, Datuak, Rajo, Gampo, Alam]"
11242,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[(S, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(S, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[Sultan, Amaluddin, Al, Sani, Perkasa, Alamsyah]"
11243,"Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[(P, r), (r, o), (o, f), (f, .), (., ), ( , D...","[(P, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[Prof., Dr., Ir., Antonius, Suwanto,, M.Sc]"
11244,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[(K, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(K, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[Kanjeng, Raden, Tumengung, Mas, Ariya, Purnam..."
11245,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[(L, o), (o, r), (r, d), (d, ), ( , K), (K, a...","[(L, o, r), (o, r, d), (r, d, ), (d, , K), (...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[Lord, Kanjeng, Baginda, Atep, Bin, Lord, Dari..."


We will skip #4, `a_hat_freq`, for now because it is difficult to account for across multiple languages.

In [34]:
# 5. name_length
df_indo['name_length'] = df_indo['fullname'].apply(len)
df_indo.head()

Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length
0,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[S, u, p, r, i, y, a, d, i]","[(S, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(S, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[S, u, p, r, i, y, a, d, i, (S, u), (u, p), (p...",[Supriyadi],9
1,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[T, r, i, y, a, n, i, n, g, s, i, h]","[(T, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(T, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[T, r, i, y, a, n, i, n, g, s, i, h, (T, r), (...",[Triyaningsih],12
2,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[S, o, e, r, j, a, d, i]","[(S, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(S, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[S, o, e, r, j, a, d, i, (S, o), (o, e), (e, r...",[Soerjadi],8
3,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[U, n, d, u, n, s, y, a, h]","[(U, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(U, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[U, n, d, u, n, s, y, a, h, (U, n), (n, d), (d...",[Undunsyah],9
4,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[S, o, e, r, i, p, t, o]","[(S, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(S, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[S, o, e, r, i, p, t, o, (S, o), (o, e), (e, r...",[Soeripto],8


In [35]:
# 6. avg_token_length
tokens = df_indo['fullname'].apply(lambda name: name.split(' '))
print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
print(token_lengths[-5:])
df_indo['avg_token_length'] = token_lengths.apply(np.mean)
df_indo.tail()

11241            [Edwel, Yusri, Datuak, Rajo, Gampo, Alam]
11242     [Sultan, Amaluddin, Al, Sani, Perkasa, Alamsyah]
11243          [Prof., Dr., Ir., Antonius, Suwanto,, M.Sc]
11244    [Kanjeng, Raden, Tumengung, Mas, Ariya, Purnam...
11245    [Lord, Kanjeng, Baginda, Atep, Bin, Lord, Dari...
Name: fullname, dtype: object 

11241                      [5, 5, 6, 4, 5, 4]
11242                      [6, 9, 2, 4, 7, 8]
11243                      [5, 3, 3, 8, 8, 4]
11244                  [7, 5, 9, 3, 5, 7, 11]
11245    [4, 7, 7, 4, 3, 4, 4, 6, 4, 7, 7, 8]
Name: fullname, dtype: object


Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length
11241,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[(E, d), (d, w), (w, e), (e, l), (l, ), ( , Y...","[(E, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[Edwel, Yusri, Datuak, Rajo, Gampo, Alam]",34,4.833333
11242,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[(S, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(S, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[Sultan, Amaluddin, Al, Sani, Perkasa, Alamsyah]",41,6.0
11243,"Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[(P, r), (r, o), (o, f), (f, .), (., ), ( , D...","[(P, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[Prof., Dr., Ir., Antonius, Suwanto,, M.Sc]",36,5.166667
11244,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[(K, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(K, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[Kanjeng, Raden, Tumengung, Mas, Ariya, Purnam...",53,6.714286
11245,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[(L, o), (o, r), (r, d), (d, ), ( , K), (K, a...","[(L, o, r), (o, r, d), (r, d, ), (d, , K), (...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[Lord, Kanjeng, Baginda, Atep, Bin, Lord, Dari...",76,5.416667


Finding `num_tokens`:

In [36]:
# 7. num_tokens
df_indo['num_tokens'] = df_indo['fullname'].apply(lambda name: len(name.split(' ')))
df_indo.tail()

Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens
11241,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[(E, d), (d, w), (w, e), (e, l), (l, ), ( , Y...","[(E, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[Edwel, Yusri, Datuak, Rajo, Gampo, Alam]",34,4.833333,6
11242,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[(S, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(S, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[Sultan, Amaluddin, Al, Sani, Perkasa, Alamsyah]",41,6.0,6
11243,"Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[(P, r), (r, o), (o, f), (f, .), (., ), ( , D...","[(P, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[Prof., Dr., Ir., Antonius, Suwanto,, M.Sc]",36,5.166667,6
11244,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[(K, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(K, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[Kanjeng, Raden, Tumengung, Mas, Ariya, Purnam...",53,6.714286,7
11245,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[(L, o), (o, r), (r, d), (d, ), ( , K), (K, a...","[(L, o, r), (o, r, d), (r, d, ), (d, , K), (...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[Lord, Kanjeng, Baginda, Atep, Bin, Lord, Dari...",76,5.416667,12


In [37]:
# 8. transliteration
df_indo['transliteration'] = df_indo['fullname'].apply(lambda name: unidecode(name))
df_indo[df_indo['fullname'] != df_indo['transliteration']][['fullname', 'transliteration']]

Unnamed: 0,fullname,transliteration
1010,Cristian Gonzáles,Cristian Gonzales
1140,Śri Ajñadewi,Sri Ajnadewi
1211,Ipe Ma’aroef,Ipe Ma'aroef
2868,Fauzie As’ad,Fauzie As'ad
3181,Kiki María,Kiki Maria
3263,Abdullah Wasi’an,Abdullah Wasi'an
3302,Empu Prapañca,Empu Prapanca
4597,Manuel Carrascalão,Manuel Carrascalao
6796,Jaëll Hattu,Jaell Hattu
6805,Berthold Damshäuser,Berthold Damshauser


In [38]:
# 9. period_freq, dash_freq, apostrophe_freq
df_indo['period_freq'] = df_indo['fullname'].apply(lambda name: name.count('.'))
df_indo['dash_freq'] = df_indo['fullname'].apply(lambda name: name.count('-'))
df_indo['apostrophe_freq'] = df_indo['fullname'].apply(lambda name: name.count('\''))
df_indo.tail()

Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq
11241,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[(E, d), (d, w), (w, e), (e, l), (l, ), ( , Y...","[(E, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[Edwel, Yusri, Datuak, Rajo, Gampo, Alam]",34,4.833333,6,Edwel Yusri Datuak Rajo Gampo Alam,0,0,0
11242,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[(S, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(S, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[Sultan, Amaluddin, Al, Sani, Perkasa, Alamsyah]",41,6.0,6,Sultan Amaluddin Al Sani Perkasa Alamsyah,0,0,0
11243,"Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[(P, r), (r, o), (o, f), (f, .), (., ), ( , D...","[(P, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[Prof., Dr., Ir., Antonius, Suwanto,, M.Sc]",36,5.166667,6,"Prof. Dr. Ir. Antonius Suwanto, M.Sc",4,0,0
11244,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[(K, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(K, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[Kanjeng, Raden, Tumengung, Mas, Ariya, Purnam...",53,6.714286,7,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,0,0,0
11245,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[(L, o), (o, r), (r, d), (d, ), ( , K), (K, a...","[(L, o, r), (o, r, d), (r, d, ), (d, , K), (...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[Lord, Kanjeng, Baginda, Atep, Bin, Lord, Dari...",76,5.416667,12,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,0,0,0


In [39]:
# 10. space_freq
df_indo['space_freq'] = df_indo['fullname'].apply(lambda name: name.count(' '))
df_indo.tail()

Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq,space_freq
11241,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[(E, d), (d, w), (w, e), (e, l), (l, ), ( , Y...","[(E, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[E, d, w, e, l, , Y, u, s, r, i, , D, a, t, ...","[Edwel, Yusri, Datuak, Rajo, Gampo, Alam]",34,4.833333,6,Edwel Yusri Datuak Rajo Gampo Alam,0,0,0,5
11242,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[(S, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(S, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[S, u, l, t, a, n, , A, m, a, l, u, d, d, i, ...","[Sultan, Amaluddin, Al, Sani, Perkasa, Alamsyah]",41,6.0,6,Sultan Amaluddin Al Sani Perkasa Alamsyah,0,0,0,5
11243,"Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[(P, r), (r, o), (o, f), (f, .), (., ), ( , D...","[(P, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[P, r, o, f, ., , D, r, ., , I, r, ., , A, ...","[Prof., Dr., Ir., Antonius, Suwanto,, M.Sc]",36,5.166667,6,"Prof. Dr. Ir. Antonius Suwanto, M.Sc",4,0,0,5
11244,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[(K, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(K, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[K, a, n, j, e, n, g, , R, a, d, e, n, , T, ...","[Kanjeng, Raden, Tumengung, Mas, Ariya, Purnam...",53,6.714286,7,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,0,0,0,6
11245,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[(L, o), (o, r), (r, d), (d, ), ( , K), (K, a...","[(L, o, r), (o, r, d), (r, d, ), (d, , K), (...","[L, o, r, d, , K, a, n, j, e, n, g, , B, a, ...","[Lord, Kanjeng, Baginda, Atep, Bin, Lord, Dari...",76,5.416667,12,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,0,0,0,11


### Malay Dataset Feature Engineering

We will implement the same features for the Malay dataset.

In [40]:
df_malay.head()

Unnamed: 0,fullname
0,Annuar Rapaee
1,Tash Yong
2,Fatmawati
3,Alto Linus
4,Mohamad Izzat Abdul Halil


In [41]:
# 1. alphabet
df_malay['alphabet'] = df_malay['fullname'].apply(get_alphabet)

# 2. unigrams, bigrams, trigrams, char_ngrams
df_malay['unigrams'] = df_malay['fullname'].apply(lambda name: list(name))
df_malay['bigrams'] = df_malay['fullname'].apply(lambda name: list(ngrams(list(name), 2)))
df_malay['trigrams'] = df_malay['fullname'].apply(lambda name: list(ngrams(list(name), 3)))
df_malay['char_ngrams'] = df_malay['unigrams'] + df_malay['bigrams'] + df_malay['trigrams']

# 3. word_ngrams
df_malay['word_ngrams'] = df_malay['fullname'].apply(lambda name: name.split())

# 4. a_hat_freq - skipped

# 5. name_length
df_malay['name_length'] = df_malay['fullname'].apply(len)

# 6. avg_token_length
tokens = df_malay['fullname'].apply(lambda name: name.split(' '))
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
df_malay['avg_token_length'] = token_lengths.apply(np.mean)

# 7. num_tokens
df_malay['num_tokens'] = df_malay['fullname'].apply(lambda name: len(name.split(' ')))

# 8. transliteration
df_malay['transliteration'] = df_malay['fullname'].apply(lambda name: unidecode(name))
print("Names with special characters:")
print(pd.DataFrame(df_malay[df_malay['fullname'] != df_malay['transliteration']][['fullname', 'transliteration']]))

# 9. period_freq, dash_freq, apostrophe_freq
df_malay['period_freq'] = df_malay['fullname'].apply(lambda name: name.count('.'))
df_malay['dash_freq'] = df_malay['fullname'].apply(lambda name: name.count('-'))
df_malay['apostrophe_freq'] = df_malay['fullname'].apply(lambda name: name.count('\''))

# 10. space_freq
df_malay['space_freq'] = df_malay['fullname'].apply(lambda name: name.count(' '))

df_malay.head()

Names with special characters:
                                               fullname  \
1070                    Tunku Imran ibni Tuanku Ja’afar   
1992  Sultan Ahmad Shah Al-Musta’in Billah ibni Alma...   
2134   Tuanku Ja’afar ibni Almarhum Tuanku Abdul Rahman   
2259                                     Junior Eldstål   
2728                   Sultan Abdul Halim Mu’adzam Shah   

                                        transliteration  
1070                    Tunku Imran ibni Tuanku Ja'afar  
1992  Sultan Ahmad Shah Al-Musta'in Billah ibni Alma...  
2134   Tuanku Ja'afar ibni Almarhum Tuanku Abdul Rahman  
2259                                     Junior Eldstal  
2728                   Sultan Abdul Halim Mu'adzam Shah  


Unnamed: 0,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq,space_freq
0,Annuar Rapaee,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[A, n, n, u, a, r, , R, a, p, a, e, e]","[(A, n), (n, n), (n, u), (u, a), (a, r), (r, ...","[(A, n, n), (n, n, u), (n, u, a), (u, a, r), (...","[A, n, n, u, a, r, , R, a, p, a, e, e, (A, n)...","[Annuar, Rapaee]",13,6.0,2,Annuar Rapaee,0,0,0,1
1,Tash Yong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[T, a, s, h, , Y, o, n, g]","[(T, a), (a, s), (s, h), (h, ), ( , Y), (Y, o...","[(T, a, s), (a, s, h), (s, h, ), (h, , Y), (...","[T, a, s, h, , Y, o, n, g, (T, a), (a, s), (s...","[Tash, Yong]",9,4.0,2,Tash Yong,0,0,0,1
2,Fatmawati,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[F, a, t, m, a, w, a, t, i]","[(F, a), (a, t), (t, m), (m, a), (a, w), (w, a...","[(F, a, t), (a, t, m), (t, m, a), (m, a, w), (...","[F, a, t, m, a, w, a, t, i, (F, a), (a, t), (t...",[Fatmawati],9,9.0,1,Fatmawati,0,0,0,0
3,Alto Linus,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[A, l, t, o, , L, i, n, u, s]","[(A, l), (l, t), (t, o), (o, ), ( , L), (L, i...","[(A, l, t), (l, t, o), (t, o, ), (o, , L), (...","[A, l, t, o, , L, i, n, u, s, (A, l), (l, t),...","[Alto, Linus]",10,4.5,2,Alto Linus,0,0,0,1
4,Mohamad Izzat Abdul Halil,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[M, o, h, a, m, a, d, , I, z, z, a, t, , A, ...","[(M, o), (o, h), (h, a), (a, m), (m, a), (a, d...","[(M, o, h), (o, h, a), (h, a, m), (a, m, a), (...","[M, o, h, a, m, a, d, , I, z, z, a, t, , A, ...","[Mohamad, Izzat, Abdul, Halil]",25,5.5,4,Mohamad Izzat Abdul Halil,0,0,0,3
