In [1]:
import pandas as pd
import numpy as np

### Cleaning Indonesian Dataset

We have two Indonesian datasets, so let's compare them:

In [2]:
# Inspecting first Indonesian dataset
pd.read_excel('name_data/exigerData/EXGR_Indonesian names.xlsx').head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q7313790,Bertrand Antolin,,
1,http://www.wikidata.org/entity/Q15138877,Ray Rizal,,
2,http://www.wikidata.org/entity/Q17411237,Samsuridjal Djauzi,,
3,http://www.wikidata.org/entity/Q12497619,Max Arifin,,
4,http://www.wikidata.org/entity/Q7475963,Donita,,


In [3]:
# Inspecting second Indonesian dataset
df_indo = pd.read_excel('name_data/exigerData/EXGR_Indonesian names-2.xlsx')
df_indo.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name,Designation,Comment,Unnamed: 6,Unnamed: 7
0,http://www.wikidata.org/entity/Q7475963,Donita,,Donita,,It is common for Indonesians to have only 1 gi...,,1
1,http://www.wikidata.org/entity/Q7645143,Supriyadi,,Supriyadi,,,,1
2,http://www.wikidata.org/entity/Q7844727,Triyaningsih,,Triyaningsih,,,,1
3,http://www.wikidata.org/entity/Q12515781,Soerjadi,,Soerjadi,,,,1
4,http://www.wikidata.org/entity/Q12523244,Undunsyah,,Undunsyah,,,,1


We will use the 2nd Indonesian dataset, `EXGR_Indonesian names-2.xlsx`, since it has more information than the first, `EXGR_Indonesian names.xlsx`. (We aren't using the extra info now but maybe we will later.) Let's drop the extra columns:

In [4]:
df_indo.drop(columns = ['Unnamed: 0', 'Designation', 'Comment', 'Unnamed: 6', 'Unnamed: 7'], inplace = True)
df_indo.head()

Unnamed: 0,fullname,Family name,Given name
0,Donita,,Donita
1,Supriyadi,,Supriyadi
2,Triyaningsih,,Triyaningsih
3,Soerjadi,,Soerjadi
4,Undunsyah,,Undunsyah


Let's see if there are any null entries for `fullname`:

In [5]:
np.any(df_indo['fullname'].isnull())

False

What about `Given name` and `Family name`?

In [6]:
print(np.any(df_indo['Given name'].isnull()))
df_indo[df_indo['Given name'].isnull()]

True


Unnamed: 0,fullname,Family name,Given name
10364,RO Tambunan,,
10365,Setiawan Dalimartha,,
10366,Ismet Iskandar,,
10367,Oktovianus Maniani,,
10368,Cucu Hidayat,,
...,...,...,...
21722,Sultan Amaluddin Al Sani Perkasa Alamsyah,,
21723,Maria Genoveva Natalia Desy Purnamasari Gunawan,,
21724,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,,
21725,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,,


In [7]:
df_indo[df_indo['Family name'].isnull()]

Unnamed: 0,fullname,Family name,Given name
0,Donita,,Donita
1,Supriyadi,,Supriyadi
2,Triyaningsih,,Triyaningsih
3,Soerjadi,,Soerjadi
4,Undunsyah,,Undunsyah
...,...,...,...
21722,Sultan Amaluddin Al Sani Perkasa Alamsyah,,
21723,Maria Genoveva Natalia Desy Purnamasari Gunawan,,
21724,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,,
21725,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,,


In [8]:
df_indo.shape

(21727, 3)

Around half of the entries have null values for `Given name` even though `fullname` is not null, and most of the entries for `Family name` are null, so we'll have to fill those in later.

Next, let's look for duplicate entries:

In [9]:
print(len(df_indo[df_indo.duplicated()]))

6646


Since we have duplicate entries, let's remove them:

In [10]:
df_indo.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_indo.duplicated()))
df_indo.shape

False


(15081, 3)

What about entries that aren't alphanumeric aside from spaces?

In [11]:
non_alnum_names_indo = [name for name in df_indo['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_indo))
non_alnum_names_indo

689


["Asma'i",
 'A.Tee',
 "Sa'duddin",
 'A.Rifai',
 'F.Rahardi',
 'Anne.J.Coto',
 'T-Sha',
 'KPH.Suryakusuma',
 "Syam'un",
 'Sangrama-Vijayottunggawarman',
 'Iwa-K',
 "Anang Ma'ruf",
 'T.B. Silalahi',
 "Daud Beureu'eh",
 'Ipe Ma’aroef',
 'Dorodjatun Kuntjoro-Jakti',
 'E.S. Ito',
 "Soedjana Sapi'ie",
 'M. Dhofir',
 "Mun'im Idris",
 'M. Syaaf',
 "As'ad Humam",
 'R.A. Srimulat',
 'R.M. Suwandi',
 'M. Syarifuddin',
 'Marie-Claire Barth',
 'P.G.O Noordraven',
 'Basuki (pelawak)',
 'K.P.H. Notoprojo',
 'R. Samadikun',
 'Mulyadi (politisi)',
 'M. Kamri',
 'H. Suwardi',
 'R. Soebekti',
 'R. Hartono',
 'Moch. Badrus',
 'J. Panglaykim',
 'Amran S.N.',
 "Achmad Asj'ari",
 'Mira W.',
 "Ma'ruf Amin",
 'S. Rukiah',
 'BM. Syamsudin',
 'M.A. Rachman',
 'R. Iskak',
 'H. Salzwedel',
 'RJ. Katamsi',
 'R. Soebijakto',
 'E.A. Mangindaan',
 'M. Nasroen',
 "Abdullah Syafi'i",
 'Fatahillah (politisi)',
 'Li-Young Lee',
 'Fauzie As’ad',
 'M. Shariefuddin',
 "Achmad Rifa'i",
 'Trinity (penulis)',
 'S.A.E. Nababan',

While many of these simply have other punctuation, some contain parentheses at the end indicating the individual's job or something else. Since we wouldn't want to count the parentheses as the surname, let's remove them:

In [12]:
df_indo['fullname'] = df_indo['fullname'].apply(lambda name: name if name.find('(') == -1 else name[:name.find('(')])
non_alnum_names_indo = [name for name in df_indo['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_indo))
non_alnum_names_indo

626


["Asma'i",
 'A.Tee',
 "Sa'duddin",
 'A.Rifai',
 'F.Rahardi',
 'Anne.J.Coto',
 'T-Sha',
 'KPH.Suryakusuma',
 "Syam'un",
 'Sangrama-Vijayottunggawarman',
 'Iwa-K',
 "Anang Ma'ruf",
 'T.B. Silalahi',
 "Daud Beureu'eh",
 'Ipe Ma’aroef',
 'Dorodjatun Kuntjoro-Jakti',
 'E.S. Ito',
 "Soedjana Sapi'ie",
 'M. Dhofir',
 "Mun'im Idris",
 'M. Syaaf',
 "As'ad Humam",
 'R.A. Srimulat',
 'R.M. Suwandi',
 'M. Syarifuddin',
 'Marie-Claire Barth',
 'P.G.O Noordraven',
 'K.P.H. Notoprojo',
 'R. Samadikun',
 'M. Kamri',
 'H. Suwardi',
 'R. Soebekti',
 'R. Hartono',
 'Moch. Badrus',
 'J. Panglaykim',
 'Amran S.N.',
 "Achmad Asj'ari",
 'Mira W.',
 "Ma'ruf Amin",
 'S. Rukiah',
 'BM. Syamsudin',
 'M.A. Rachman',
 'R. Iskak',
 'H. Salzwedel',
 'RJ. Katamsi',
 'R. Soebijakto',
 'E.A. Mangindaan',
 'M. Nasroen',
 "Abdullah Syafi'i",
 'Li-Young Lee',
 'Fauzie As’ad',
 'M. Shariefuddin',
 "Achmad Rifa'i",
 'S.A.E. Nababan',
 "Da'i Bachtiar",
 'M. Zainudin',
 'Ishadi S.K.',
 'S. Bagio',
 'Abdullah Wasi’an',
 'M. Sa

(There may be titles that we have to remove. Come back to this later)

Now, let's separate the names into their family and given names. For now, we'll define the given name as the part of the name before the first space and the family name as the part of the name following the last space. Let's first assign values to `Given name`:

In [13]:
df_indo['Given name'] = df_indo['fullname'].apply(lambda name: name if name.find(' ') == -1 else name[:name.find(' ')])
df_indo

Unnamed: 0,fullname,Family name,Given name
0,Donita,,Donita
1,Supriyadi,,Supriyadi
2,Triyaningsih,,Triyaningsih
3,Soerjadi,,Soerjadi
4,Undunsyah,,Undunsyah
...,...,...,...
15076,Mayjen TNI,,Mayjen
15077,Sultan Amaluddin Al Sani Perkasa Alamsyah,,Sultan
15078,"Prof. Dr. Ir. Antonius Suwanto, M.Sc",,Prof.
15079,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,,Kanjeng


Assigning values to `Family name`:

In [14]:
df_indo['Family name'] = df_indo['fullname'].apply(lambda name: np.nan if name.find(' ') == -1 else name[name.rfind(' ')+1:])
df_indo

Unnamed: 0,fullname,Family name,Given name
0,Donita,,Donita
1,Supriyadi,,Supriyadi
2,Triyaningsih,,Triyaningsih
3,Soerjadi,,Soerjadi
4,Undunsyah,,Undunsyah
...,...,...,...
15076,Mayjen TNI,,Mayjen
15077,Sultan Amaluddin Al Sani Perkasa Alamsyah,Alamsyah,Sultan
15078,"Prof. Dr. Ir. Antonius Suwanto, M.Sc",M.Sc,Prof.
15079,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,Hadiningrat,Kanjeng


(Maybe we should have the middle names too?)

### Cleaning Malay Dataset

Let's look at the Malay dataset now:

In [15]:
df_malay = pd.read_excel('name_data/exigerData/EXGR_Malay names.xlsx')
df_malay.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q4769705,Annuar Rapaee,,
1,http://www.wikidata.org/entity/Q31186972,Tash Yong,,
2,http://www.wikidata.org/entity/Q468519,Fatmawati,,
3,http://www.wikidata.org/entity/Q4736793,Alto Linus,,
4,http://www.wikidata.org/entity/Q28837179,Mohamad Izzat Abdul Halil,,


Removing unnamed column:

In [16]:
df_malay.drop(columns = 'Unnamed: 0', inplace = True)
df_malay.head()

Unnamed: 0,fullname,Family name,Given name
0,Annuar Rapaee,,
1,Tash Yong,,
2,Fatmawati,,
3,Alto Linus,,
4,Mohamad Izzat Abdul Halil,,


Handling duplicate values:

In [17]:
# Checking for duplicate entries
print(len(df_malay.duplicated()))

4930


In [18]:
# Removing duplicate entries
df_malay.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_malay.duplicated()))

False


Are any names non-alphanumeric aside from spaces?

In [19]:
non_alnum_names_malay = [name for name in df_malay['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_malay))
non_alnum_names_malay

200


['Ling Ban San @ Teng Boon Soon',
 'Zulkifeli Mohd. Zin',
 'Raja Tun Uda Al-Haj bin Raja Muhammad',
 'P. Ramlee',
 'S. Hamid',
 'Rueben Thevandran a/l Ramanath',
 'Nora (penyanyi)',
 'Tunku Alif Hussein Saifuddin Al-Amin',
 'Soekarno M. Noer',
 'M. Kulasegaran V. Murugeson',
 'M.Asojan a/l Muniyandy',
 'K. Rajagopal',
 'Afifi al-Akiti',
 "Mohd Shahril Saa'ri",
 'Md. Jais Bin Haji Sarday',
 'K. Gurusamy',
 'Sivarasa K. Rasiah',
 'Abdullah C.D',
 'Raveentharan a/l V. Subramaniam',
 'Jajang C. Noer',
 'Devamany S. Krishnasamy',
 'Imam Bonjol,Tuanku',
 'K. Sasi Kumar',
 'N. Surendran',
 'Zainal Abidin Ahmad (PKR)',
 'Wan Syuhada Md. Amin',
 'M. Manogaran',
 'S. Manikumar',
 'Rustam A. Sani',
 'Zulkifli Mohamad Al-Bakri',
 'Manoharan A/L Malayalam',
 'Wan Mohammad Khair-il Anuar Wan Ahmad',
 'M. G. Pandithan',
 'Abdullah @ Md. Khalid bin Md. Ali',
 'P. Uthayakumar',
 'G. Palanivel',
 'P. G. Lim',
 'S. Subramaniam',
 'Sultan Abdul Jalil Nasruddin Muhtaram Shah ibni al-Marhum Sultan Idris Mur

Again, we have parentheses at the end, so let's remove them:

In [20]:
df_malay['fullname'] = df_malay['fullname'].apply(lambda name: name if name.find('(') == -1 else name[:name.find('(')])
non_alnum_names_malay = [name for name in df_malay['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_malay))
non_alnum_names_malay

192


['Ling Ban San @ Teng Boon Soon',
 'Zulkifeli Mohd. Zin',
 'Raja Tun Uda Al-Haj bin Raja Muhammad',
 'P. Ramlee',
 'S. Hamid',
 'Rueben Thevandran a/l Ramanath',
 'Tunku Alif Hussein Saifuddin Al-Amin',
 'Soekarno M. Noer',
 'M. Kulasegaran V. Murugeson',
 'M.Asojan a/l Muniyandy',
 'K. Rajagopal',
 'Afifi al-Akiti',
 "Mohd Shahril Saa'ri",
 'Md. Jais Bin Haji Sarday',
 'K. Gurusamy',
 'Sivarasa K. Rasiah',
 'Abdullah C.D',
 'Raveentharan a/l V. Subramaniam',
 'Jajang C. Noer',
 'Devamany S. Krishnasamy',
 'Imam Bonjol,Tuanku',
 'K. Sasi Kumar',
 'N. Surendran',
 'Wan Syuhada Md. Amin',
 'M. Manogaran',
 'S. Manikumar',
 'Rustam A. Sani',
 'Zulkifli Mohamad Al-Bakri',
 'Manoharan A/L Malayalam',
 'Wan Mohammad Khair-il Anuar Wan Ahmad',
 'M. G. Pandithan',
 'Abdullah @ Md. Khalid bin Md. Ali',
 'P. Uthayakumar',
 'G. Palanivel',
 'P. G. Lim',
 'S. Subramaniam',
 'Sultan Abdul Jalil Nasruddin Muhtaram Shah ibni al-Marhum Sultan Idris Murshidul Aadzam Shah',
 'Wong B. K.',
 'M.P. Drahama

(Handle titles?)

Are there any null entries in `fullname`?

In [21]:
print(np.any(df_malay['fullname'].isnull()))

False


Let's assign values to `Family name` and `Given name`, following the same criteria we did before for the Indonesian names:

In [22]:
df_malay['Given name'] = df_malay['fullname'].apply(lambda name: name if name.find(' ') == -1 else name[:name.find(' ')])
df_malay

Unnamed: 0,fullname,Family name,Given name
0,Annuar Rapaee,,Annuar
1,Tash Yong,,Tash
2,Fatmawati,,Fatmawati
3,Alto Linus,,Alto
4,Mohamad Izzat Abdul Halil,,Mohamad
...,...,...,...
2909,Salmah binti Ismail,,Salmah
2910,Abdul Aziz Sheikh Fadzir,,Abdul
2911,Sultan Ibrahim ibni Sultan Abu Bakar,,Sultan
2912,Ahmad Yaakob,,Ahmad


In [23]:
df_malay['Family name'] = df_malay['fullname'].apply(lambda name: np.nan if name.find(' ') == -1 else name[name.rfind(' ')+1:])
df_malay

Unnamed: 0,fullname,Family name,Given name
0,Annuar Rapaee,Rapaee,Annuar
1,Tash Yong,Yong,Tash
2,Fatmawati,,Fatmawati
3,Alto Linus,Linus,Alto
4,Mohamad Izzat Abdul Halil,Halil,Mohamad
...,...,...,...
2909,Salmah binti Ismail,Ismail,Salmah
2910,Abdul Aziz Sheikh Fadzir,Fadzir,Abdul
2911,Sultan Ibrahim ibni Sultan Abu Bakar,Bakar,Sultan
2912,Ahmad Yaakob,Yaakob,Ahmad
