# Indonesian and Malay Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# Feature engineering
import unicodedata
from nltk import ngrams
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity

### Cleaning Indonesian Dataset

We have two Indonesian datasets, so let's compare them:

In [2]:
# Inspecting first Indonesian dataset
df_indo_1 = pd.read_excel('../name_data/exiger_datasets/EXGR_Indonesian names.xlsx')
df_indo_1.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q7313790,Bertrand Antolin,,
1,http://www.wikidata.org/entity/Q15138877,Ray Rizal,,
2,http://www.wikidata.org/entity/Q17411237,Samsuridjal Djauzi,,
3,http://www.wikidata.org/entity/Q12497619,Max Arifin,,
4,http://www.wikidata.org/entity/Q7475963,Donita,,


In [3]:
# Inspecting second Indonesian dataset
df_indo_2 = pd.read_excel('../name_data/exiger_datasets/EXGR_Indonesian names-2.xlsx')
df_indo_2.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name,Designation,Comment,Unnamed: 6,Unnamed: 7
0,http://www.wikidata.org/entity/Q7475963,Donita,,Donita,,It is common for Indonesians to have only 1 gi...,,1
1,http://www.wikidata.org/entity/Q7645143,Supriyadi,,Supriyadi,,,,1
2,http://www.wikidata.org/entity/Q7844727,Triyaningsih,,Triyaningsih,,,,1
3,http://www.wikidata.org/entity/Q12515781,Soerjadi,,Soerjadi,,,,1
4,http://www.wikidata.org/entity/Q12523244,Undunsyah,,Undunsyah,,,,1


Are the names in the datasets the same?

In [4]:
try:
    pd.testing.assert_series_equal(df_indo_1['fullname'], df_indo_2['fullname'], check_like = True)
except:
    print("Names are not the same")

Names are not the same


We will investigate this further later. For now, let's look at the comments in `df_indo_2`:

In [5]:
print(df_indo_2['Comment'].unique()[:5])
print('\nNumber of names with comments:', len(df_indo_2['Comment'].unique()))
df_indo_2.shape

['It is common for Indonesians to have only 1 given name without family name.'
 nan 'Real name: Mike Lucock'
 'This is a Javanese ancient name. The name is not used anymore. This person lived in 1028 - 1035.'
 'Royal family name of Java identified by the designation name.']

Number of names with comments: 34


(21727, 8)

Since these names seem uncommon exceptions and there are only a few of them, we will remove them to make the data cleaner:

In [6]:
df_indo_2 = df_indo_2[pd.isnull(df_indo_2['Comment'])]
df_indo_2.reset_index()
print(df_indo_2.shape)
df_indo_2['Comment'].unique()

(21689, 8)


array([nan], dtype=object)

Making dataframes consist only of the `fullname` column:

In [7]:
df_indo_1 = pd.DataFrame(df_indo_1['fullname'])
df_indo_2 = pd.DataFrame(df_indo_2['fullname'])
print('df_indo_1 shape:', df_indo_1.shape)
print('df_indo_2 shape:', df_indo_2.shape)

df_indo_1 shape: (21730, 1)
df_indo_2 shape: (21689, 1)


Combining both dataframes into `df_indo`:

In [8]:
# Combining datasets
df_indo = pd.concat([df_indo_1, df_indo_2], ignore_index = True)
print('Shape of combined data:', df_indo.shape)
df_indo.head()

Shape of combined data: (43419, 1)


Unnamed: 0,fullname
0,Bertrand Antolin
1,Ray Rizal
2,Samsuridjal Djauzi
3,Max Arifin
4,Donita


Looking for duplicate entries:

In [9]:
print(len(df_indo[df_indo.duplicated()]))

32085


Since around 3/4 of the combined data consists of duplicate entries, the datasets are very similar. Let's just use `df_indo_2` since we removed the names with comments from it:

In [10]:
df_indo = df_indo_2
print(df_indo.shape)
df_indo.head()

(21689, 1)


Unnamed: 0,fullname
1,Supriyadi
2,Triyaningsih
3,Soerjadi
4,Undunsyah
5,Soeripto


Are there duplicates?

In [11]:
len(df_indo[df_indo.duplicated()])

10376

Let's drop them:

In [12]:
# Dropping duplicates
df_indo.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_indo.duplicated()))
df_indo.shape

False


(11313, 1)

Let's see if there are any null entries for `fullname`:

In [13]:
np.any(df_indo['fullname'].isnull())

False

What about entries that aren't alphanumeric aside from spaces?

In [14]:
non_alnum_names_indo = [name for name in df_indo['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_indo))
non_alnum_names_indo[-10:]

590


['I. B. Rai Dharmawijaya Mantra',
 'Mansoer Daoed Dt. Palimo Kayo',
 'Habib Anis bin Alwi al-Habsyi',
 'Abu Bakar (bupati Bandung Barat)',
 'Muhammad Arsyad (pemain sepak bola)',
 'Muhammad Hamzah (personil The Fly)',
 'A. A. Ngurah Oka Ratmadi',
 'Firmansyah (pemain sepak bola kelahiran 1995)',
 'Mayjen TNI (Purn) Thomas Albert Umboh',
 'Prof. Dr. Ir. Antonius Suwanto, M.Sc']

Some names have parentheses. Let's see exactly how many:

In [15]:
paren_names_indo = [name for name in df_indo['fullname'] if name.find('(') != -1]
print(len(paren_names_indo))
paren_names_indo[:5]

67


['Basuki (pelawak)',
 'Mulyadi (politisi)',
 'Fatahillah (politisi)',
 'Trinity (penulis)',
 'Mentari (aktris)']

Removing the parentheses from the names and seeing how many names are non-alphanumeric:

In [16]:
df_indo['fullname'] = df_indo['fullname'].apply(lambda name: name if name.find('(') == -1 else name[:name.find('(') - 1])
non_alnum_names_indo = [name for name in df_indo['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_indo))

527


The number of names left isn't exactly 590 - 67 = 523 because some names with parentheses had other punctuation in it. Let's double check that there are no names with parentheses in the `fullname` column:

In [17]:
print([name for name in df_indo['fullname'] if name.find('(') != -1])
print([name for name in df_indo['fullname'] if name.find(')') != -1])

[]
[]


Let's make every name lowercase to help us with feature engineering. We will first save the original `fullname` column into a new column, `original_fullname`, and then turn `fullname` into lowercase:

In [18]:
df_indo['original_fullname'] = df_indo['fullname']
df_indo['fullname'] = df_indo['fullname'].apply(str.lower)
df_indo

Unnamed: 0,fullname,original_fullname
0,supriyadi,Supriyadi
1,triyaningsih,Triyaningsih
2,soerjadi,Soerjadi
3,undunsyah,Undunsyah
4,soeripto,Soeripto
...,...,...
11308,mayjen tni,Mayjen TNI
11309,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah
11310,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc"
11311,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...


After looking at character frequencies for ngram frequency distributions (a later step), we discovered that a few names contain a number, `'/'`, or `'"'`. Let's confirm this:

In [19]:
print(df_indo.shape)
df_indo[df_indo['fullname'].apply(lambda name: any(char.isdigit() or char == '/' or char == '"' for char in name))]

(11313, 2)


Unnamed: 0,fullname,original_fullname
5092,ratmi b-29,Ratmi B-29
7934,andy /rif,Andy /rif
8610,"isaak ""tjaak"" pattiwael","Isaak ""Tjaak"" Pattiwael"
11109,"hendrikus v. ""henk"" zomers","Hendrikus V. ""Henk"" Zomers"


We will remove them to improve the runtime of trigrams individual frequency distributions:

In [20]:
# There are definitely better ways to do this
df_indo = df_indo[[name.find('2') == -1 and name.find('9') == -1 for name in df_indo['fullname']]]
df_indo = df_indo[[name.find('/') == -1 for name in df_indo['fullname']]]
df_indo = df_indo[[name.find('"') == -1 for name in df_indo['fullname']]]

df_indo.shape

(11309, 2)

It's possible that we have some duplicate names after doing the above steps, so let's check for that:

In [21]:
print(len(df_indo[df_indo.duplicated()]))
print(df_indo.shape)
df_indo[df_indo.duplicated()]

63
(11309, 2)


Unnamed: 0,fullname,original_fullname
1676,basuki,Basuki
1810,mulyadi,Mulyadi
2716,fatahillah,Fatahillah
3080,trinity,Trinity
4798,mentari,Mentari
...,...,...
11285,abu bakar,Abu Bakar
11286,muhammad arsyad,Muhammad Arsyad
11288,muhammad hamzah,Muhammad Hamzah
11300,firmansyah,Firmansyah


Removing leading and trailing whitespace from names just in case:

In [22]:
df_indo['fullname'] = df_indo['fullname'].apply(str.strip)

Removing duplicates again:

In [23]:
df_indo.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_indo.duplicated()))
df_indo.shape

False


(11246, 2)

### Cleaning Malay Dataset

Let's look at the Malay dataset now:

In [24]:
df_malay = pd.read_excel('../name_data/exiger_datasets/EXGR_Malay names.xlsx')
df_malay.head()

Unnamed: 0.1,Unnamed: 0,fullname,Family name,Given name
0,http://www.wikidata.org/entity/Q4769705,Annuar Rapaee,,
1,http://www.wikidata.org/entity/Q31186972,Tash Yong,,
2,http://www.wikidata.org/entity/Q468519,Fatmawati,,
3,http://www.wikidata.org/entity/Q4736793,Alto Linus,,
4,http://www.wikidata.org/entity/Q28837179,Mohamad Izzat Abdul Halil,,


Restricting the dataframe to just the `fullname` column:

In [25]:
df_malay = pd.DataFrame(df_malay['fullname'])
df_malay.head()

Unnamed: 0,fullname
0,Annuar Rapaee
1,Tash Yong
2,Fatmawati
3,Alto Linus
4,Mohamad Izzat Abdul Halil


Handling duplicate values:

In [26]:
# Checking for duplicate entries
print(len(df_malay[df_malay.duplicated()]))
df_malay.shape

2016


(4930, 1)

In [27]:
# Removing duplicate entries
df_malay.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_malay.duplicated()))
df_malay.shape

False


(2914, 1)

Are there any null entries in `fullname`?

In [28]:
print(np.any(df_malay['fullname'].isnull()))

False


Are any names non-alphanumeric aside from spaces?

In [29]:
non_alnum_names_malay = [name for name in df_malay['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_malay))
non_alnum_names_malay[-10:]

200


['Adenan Hj. Satem',
 'P. Balasubramaniam',
 "Abdullah Ma'ayat Shah",
 "Mohd Shahril Sa'ari",
 'E. E. C. Thuraisingham',
 'Ibrahim Ali (Malaysia)',
 'Faradina Mohd. Nadzir',
 'Aril (AF 7)',
 'Sultan Hisamuddin Alam Shah Al-Haj ibni Almarhum Sultan Alaeddin Sulaiman Shah',
 "Ismail Faruqi Asha'ri"]

Again, we have parentheses at the end, so let's remove them:

In [30]:
df_malay['fullname'] = df_malay['fullname'].apply(lambda name: name if name.find('(') == -1 else name[:name.find('(') - 1])
non_alnum_names_malay = [name for name in df_malay['fullname'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_malay))

192


Turning names lowercase:

In [31]:
df_malay['original_fullname'] = df_malay['fullname']
df_malay['fullname'] = df_malay['fullname'].apply(str.lower)
df_malay

Unnamed: 0,fullname,original_fullname
0,annuar rapaee,Annuar Rapaee
1,tash yong,Tash Yong
2,fatmawati,Fatmawati
3,alto linus,Alto Linus
4,mohamad izzat abdul halil,Mohamad Izzat Abdul Halil
...,...,...
2909,salmah binti ismail,Salmah binti Ismail
2910,abdul aziz sheikh fadzir,Abdul Aziz Sheikh Fadzir
2911,sultan ibrahim ibni sultan abu bakar,Sultan Ibrahim ibni Sultan Abu Bakar
2912,ahmad yaakob,Ahmad Yaakob


Removing leading and trailing whitespace from names just in case:

In [32]:
df_malay['fullname'] = df_malay['fullname'].apply(str.strip)

Handling duplicates:

In [33]:
# Finding number of duplicates
print(len(df_malay[df_malay.duplicated()]))
df_malay.shape

6


(2914, 2)

In [34]:
# Removing duplicates
df_malay.drop_duplicates(inplace = True, ignore_index = True)
print(np.any(df_malay.duplicated()))
df_malay.shape

False


(2908, 2)

## Feature Engineering

### Indonesian Dataset Feature Engineering

Here are the features we want to include:
1. `alphabet` - which alphabet is being used - unicode library
2. `char_ngrams` - character n-grams (unigram, bigram, trigram, interpolated) - `nltk.utils.ngrams()`
3. `word_ngrams` - word n-grams
4. `a_hat_freq` - binary feature: does it exist in the given name? - if too much work to have EACH accent character as a feature, can combine
5. `name_length` - length of name (total number of letters in full name)
6. `avg_token_length` - average length of a token (the avg number of characters in each word of the name)
7. `num_tokens` - how many “words” are in each name
8. `transliteration` - transliteration
9. `period_freq`, `dash_freq`, `apostrophe_freq` - frequency of punctuation
10. `space_freq` - how many spaces are in each name

Let's start with the Indonesian dataset.

In [35]:
# 1. alphabet
def get_alphabet(name):
    name_as_alphabet_list = []
    for char in name:
        name_as_alphabet_list.append(unicodedata.name(char).split(' ')[0])
    return name_as_alphabet_list

df_indo['alphabet'] = df_indo['fullname'].apply(get_alphabet)
df_indo.head()

Unnamed: 0,fullname,original_fullname,alphabet
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT..."


In [36]:
# 2. unigrams, bigrams, trigrams, char_ngrams
df_indo['unigrams'] = df_indo['fullname'].apply(lambda name: list(name))
df_indo['bigrams'] = df_indo['fullname'].apply(lambda name: list(ngrams(list(name), 2)))
df_indo['trigrams'] = df_indo['fullname'].apply(lambda name: list(ngrams(list(name), 3)))
df_indo['char_ngrams'] = df_indo['unigrams'] + df_indo['bigrams'] + df_indo['trigrams']
df_indo.head()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p..."
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (..."
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r..."
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d..."
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r..."


In [37]:
# 3. word_ngrams
df_indo['word_ngrams'] = df_indo['fullname'].apply(lambda name: name.split())
df_indo.tail()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams
11241,edwel yusri datuak rajo gampo alam,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[(e, d), (d, w), (w, e), (e, l), (l, ), ( , y...","[(e, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[edwel, yusri, datuak, rajo, gampo, alam]"
11242,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[(s, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(s, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[sultan, amaluddin, al, sani, perkasa, alamsyah]"
11243,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[(p, r), (r, o), (o, f), (f, .), (., ), ( , d...","[(p, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[prof., dr., ir., antonius, suwanto,, m.sc]"
11244,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[(k, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(k, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[kanjeng, raden, tumengung, mas, ariya, purnam..."
11245,lord kanjeng baginda atep bin lord dari segala...,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[(l, o), (o, r), (r, d), (d, ), ( , k), (k, a...","[(l, o, r), (o, r, d), (r, d, ), (d, , k), (...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[lord, kanjeng, baginda, atep, bin, lord, dari..."


We will skip #4, `a_hat_freq`, for now because it is difficult to account for across multiple languages.

In [38]:
# 5. name_length
df_indo['name_length'] = df_indo['fullname'].apply(len)
df_indo.head()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8


In [39]:
# 6. avg_token_length
tokens = df_indo['fullname'].apply(lambda name: name.split(' '))
print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
print(token_lengths[-5:])
df_indo['avg_token_length'] = token_lengths.apply(np.mean)
df_indo.tail()

11241            [edwel, yusri, datuak, rajo, gampo, alam]
11242     [sultan, amaluddin, al, sani, perkasa, alamsyah]
11243          [prof., dr., ir., antonius, suwanto,, m.sc]
11244    [kanjeng, raden, tumengung, mas, ariya, purnam...
11245    [lord, kanjeng, baginda, atep, bin, lord, dari...
Name: fullname, dtype: object 

11241                      [5, 5, 6, 4, 5, 4]
11242                      [6, 9, 2, 4, 7, 8]
11243                      [5, 3, 3, 8, 8, 4]
11244                  [7, 5, 9, 3, 5, 7, 11]
11245    [4, 7, 7, 4, 3, 4, 4, 6, 4, 7, 7, 8]
Name: fullname, dtype: object


Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length
11241,edwel yusri datuak rajo gampo alam,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[(e, d), (d, w), (w, e), (e, l), (l, ), ( , y...","[(e, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[edwel, yusri, datuak, rajo, gampo, alam]",34,4.833333
11242,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[(s, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(s, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[sultan, amaluddin, al, sani, perkasa, alamsyah]",41,6.0
11243,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[(p, r), (r, o), (o, f), (f, .), (., ), ( , d...","[(p, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[prof., dr., ir., antonius, suwanto,, m.sc]",36,5.166667
11244,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[(k, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(k, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[kanjeng, raden, tumengung, mas, ariya, purnam...",53,6.714286
11245,lord kanjeng baginda atep bin lord dari segala...,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[(l, o), (o, r), (r, d), (d, ), ( , k), (k, a...","[(l, o, r), (o, r, d), (r, d, ), (d, , k), (...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[lord, kanjeng, baginda, atep, bin, lord, dari...",76,5.416667


Finding `num_tokens`:

In [40]:
# 7. num_tokens
df_indo['num_tokens'] = df_indo['fullname'].apply(lambda name: len(name.split(' ')))
df_indo.tail()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens
11241,edwel yusri datuak rajo gampo alam,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[(e, d), (d, w), (w, e), (e, l), (l, ), ( , y...","[(e, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[edwel, yusri, datuak, rajo, gampo, alam]",34,4.833333,6
11242,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[(s, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(s, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[sultan, amaluddin, al, sani, perkasa, alamsyah]",41,6.0,6
11243,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[(p, r), (r, o), (o, f), (f, .), (., ), ( , d...","[(p, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[prof., dr., ir., antonius, suwanto,, m.sc]",36,5.166667,6
11244,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[(k, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(k, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[kanjeng, raden, tumengung, mas, ariya, purnam...",53,6.714286,7
11245,lord kanjeng baginda atep bin lord dari segala...,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[(l, o), (o, r), (r, d), (d, ), ( , k), (k, a...","[(l, o, r), (o, r, d), (r, d, ), (d, , k), (...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[lord, kanjeng, baginda, atep, bin, lord, dari...",76,5.416667,12


In [41]:
# 8. transliteration
df_indo['transliteration'] = df_indo['fullname'].apply(lambda name: unidecode(name))
df_indo[df_indo['fullname'] != df_indo['transliteration']][['fullname', 'transliteration']]

Unnamed: 0,fullname,transliteration
1010,cristian gonzáles,cristian gonzales
1140,śri ajñadewi,sri ajnadewi
1211,ipe ma’aroef,ipe ma'aroef
2868,fauzie as’ad,fauzie as'ad
3181,kiki maría,kiki maria
3263,abdullah wasi’an,abdullah wasi'an
3302,empu prapañca,empu prapanca
4598,manuel carrascalão,manuel carrascalao
6798,jaëll hattu,jaell hattu
6807,berthold damshäuser,berthold damshauser


In [42]:
# 9. period_freq, dash_freq, apostrophe_freq
df_indo['period_freq'] = df_indo['fullname'].apply(lambda name: name.count('.'))
df_indo['dash_freq'] = df_indo['fullname'].apply(lambda name: name.count('-'))
df_indo['apostrophe_freq'] = df_indo['fullname'].apply(lambda name: name.count('\''))
df_indo.tail()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq
11241,edwel yusri datuak rajo gampo alam,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[(e, d), (d, w), (w, e), (e, l), (l, ), ( , y...","[(e, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[edwel, yusri, datuak, rajo, gampo, alam]",34,4.833333,6,edwel yusri datuak rajo gampo alam,0,0,0
11242,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[(s, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(s, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[sultan, amaluddin, al, sani, perkasa, alamsyah]",41,6.0,6,sultan amaluddin al sani perkasa alamsyah,0,0,0
11243,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[(p, r), (r, o), (o, f), (f, .), (., ), ( , d...","[(p, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[prof., dr., ir., antonius, suwanto,, m.sc]",36,5.166667,6,"prof. dr. ir. antonius suwanto, m.sc",4,0,0
11244,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[(k, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(k, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[kanjeng, raden, tumengung, mas, ariya, purnam...",53,6.714286,7,kanjeng raden tumengung mas ariya purnama hadi...,0,0,0
11245,lord kanjeng baginda atep bin lord dari segala...,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[(l, o), (o, r), (r, d), (d, ), ( , k), (k, a...","[(l, o, r), (o, r, d), (r, d, ), (d, , k), (...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[lord, kanjeng, baginda, atep, bin, lord, dari...",76,5.416667,12,lord kanjeng baginda atep bin lord dari segala...,0,0,0


In [43]:
# 10. space_freq
df_indo['space_freq'] = df_indo['fullname'].apply(lambda name: name.count(' '))
df_indo.tail()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq,space_freq
11241,edwel yusri datuak rajo gampo alam,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[(e, d), (d, w), (w, e), (e, l), (l, ), ( , y...","[(e, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[edwel, yusri, datuak, rajo, gampo, alam]",34,4.833333,6,edwel yusri datuak rajo gampo alam,0,0,0,5
11242,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[(s, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(s, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[sultan, amaluddin, al, sani, perkasa, alamsyah]",41,6.0,6,sultan amaluddin al sani perkasa alamsyah,0,0,0,5
11243,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[(p, r), (r, o), (o, f), (f, .), (., ), ( , d...","[(p, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[prof., dr., ir., antonius, suwanto,, m.sc]",36,5.166667,6,"prof. dr. ir. antonius suwanto, m.sc",4,0,0,5
11244,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[(k, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(k, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[kanjeng, raden, tumengung, mas, ariya, purnam...",53,6.714286,7,kanjeng raden tumengung mas ariya purnama hadi...,0,0,0,6
11245,lord kanjeng baginda atep bin lord dari segala...,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[(l, o), (o, r), (r, d), (d, ), ( , k), (k, a...","[(l, o, r), (o, r, d), (r, d, ), (d, , k), (...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[lord, kanjeng, baginda, atep, bin, lord, dari...",76,5.416667,12,lord kanjeng baginda atep bin lord dari segala...,0,0,0,11


### Malay Dataset Feature Engineering

We will implement the same features for the Malay dataset.

In [44]:
df_malay.head()

Unnamed: 0,fullname,original_fullname
0,annuar rapaee,Annuar Rapaee
1,tash yong,Tash Yong
2,fatmawati,Fatmawati
3,alto linus,Alto Linus
4,mohamad izzat abdul halil,Mohamad Izzat Abdul Halil


In [45]:
# 1. alphabet
df_malay['alphabet'] = df_malay['fullname'].apply(get_alphabet)

# 2. unigrams, bigrams, trigrams, char_ngrams
df_malay['unigrams'] = df_malay['fullname'].apply(lambda name: list(name))
df_malay['bigrams'] = df_malay['fullname'].apply(lambda name: list(ngrams(list(name), 2)))
df_malay['trigrams'] = df_malay['fullname'].apply(lambda name: list(ngrams(list(name), 3)))
df_malay['char_ngrams'] = df_malay['unigrams'] + df_malay['bigrams'] + df_malay['trigrams']

# 3. word_ngrams
df_malay['word_ngrams'] = df_malay['fullname'].apply(lambda name: name.split())

# 4. a_hat_freq - skipped

# 5. name_length
df_malay['name_length'] = df_malay['fullname'].apply(len)

# 6. avg_token_length
tokens = df_malay['fullname'].apply(lambda name: name.split(' '))
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
df_malay['avg_token_length'] = token_lengths.apply(np.mean)

# 7. num_tokens
df_malay['num_tokens'] = df_malay['fullname'].apply(lambda name: len(name.split(' ')))

# 8. transliteration
df_malay['transliteration'] = df_malay['fullname'].apply(lambda name: unidecode(name))
print("Names with special characters:")
print(pd.DataFrame(df_malay[df_malay['fullname'] != df_malay['transliteration']][['fullname', 'transliteration']]))

# 9. period_freq, dash_freq, apostrophe_freq
df_malay['period_freq'] = df_malay['fullname'].apply(lambda name: name.count('.'))
df_malay['dash_freq'] = df_malay['fullname'].apply(lambda name: name.count('-'))
df_malay['apostrophe_freq'] = df_malay['fullname'].apply(lambda name: name.count('\''))

# 10. space_freq
df_malay['space_freq'] = df_malay['fullname'].apply(lambda name: name.count(' '))

df_malay.head()

Names with special characters:
                                               fullname  \
1070                    tunku imran ibni tuanku ja’afar   
1992  sultan ahmad shah al-musta’in billah ibni alma...   
2134   tuanku ja’afar ibni almarhum tuanku abdul rahman   
2259                                     junior eldstål   
2728                   sultan abdul halim mu’adzam shah   

                                        transliteration  
1070                    tunku imran ibni tuanku ja'afar  
1992  sultan ahmad shah al-musta'in billah ibni alma...  
2134   tuanku ja'afar ibni almarhum tuanku abdul rahman  
2259                                     junior eldstal  
2728                   sultan abdul halim mu'adzam shah  


Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq,space_freq
0,annuar rapaee,Annuar Rapaee,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[a, n, n, u, a, r, , r, a, p, a, e, e]","[(a, n), (n, n), (n, u), (u, a), (a, r), (r, ...","[(a, n, n), (n, n, u), (n, u, a), (u, a, r), (...","[a, n, n, u, a, r, , r, a, p, a, e, e, (a, n)...","[annuar, rapaee]",13,6.0,2,annuar rapaee,0,0,0,1
1,tash yong,Tash Yong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[t, a, s, h, , y, o, n, g]","[(t, a), (a, s), (s, h), (h, ), ( , y), (y, o...","[(t, a, s), (a, s, h), (s, h, ), (h, , y), (...","[t, a, s, h, , y, o, n, g, (t, a), (a, s), (s...","[tash, yong]",9,4.0,2,tash yong,0,0,0,1
2,fatmawati,Fatmawati,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[f, a, t, m, a, w, a, t, i]","[(f, a), (a, t), (t, m), (m, a), (a, w), (w, a...","[(f, a, t), (a, t, m), (t, m, a), (m, a, w), (...","[f, a, t, m, a, w, a, t, i, (f, a), (a, t), (t...",[fatmawati],9,9.0,1,fatmawati,0,0,0,0
3,alto linus,Alto Linus,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[a, l, t, o, , l, i, n, u, s]","[(a, l), (l, t), (t, o), (o, ), ( , l), (l, i...","[(a, l, t), (l, t, o), (t, o, ), (o, , l), (...","[a, l, t, o, , l, i, n, u, s, (a, l), (l, t),...","[alto, linus]",10,4.5,2,alto linus,0,0,0,1
4,mohamad izzat abdul halil,Mohamad Izzat Abdul Halil,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, o, h, a, m, a, d, , i, z, z, a, t, , a, ...","[(m, o), (o, h), (h, a), (a, m), (m, a), (a, d...","[(m, o, h), (o, h, a), (h, a, m), (a, m, a), (...","[m, o, h, a, m, a, d, , i, z, z, a, t, , a, ...","[mohamad, izzat, abdul, halil]",25,5.5,4,mohamad izzat abdul halil,0,0,0,3


### Indonesian Dataset: `ngram` Frequency Distributions

**WARNING: DO NOT RUN THE CELLS IN THIS SECTION MORE THAN ONCE!**

Each run increments the frequency distributions, so multiple runs will make the distributions inaccurate. If you need to re-run any cells, please restart the kernel and run all cells.

Our tasks in this section are to:
1. Create `ngram` relative frequency distributions across the entire **language**.
2. Create `ngram` relative frequency distributions for each **individual example**.
3. Compare the distributions.


#### 1. Language Relative Frequency Distributions

We will first make a function for character/unigram frequency distribution that will return a hashmap.
- The keys will be composed of every unique character that appears in this dataset, which we will use to create every possible bigram/trigram for those corresponding distributions.
- Each value is equal to the number of times the character appears in the dataset divided by the total number of characters in the dataset.

In [46]:
'''
Function that returns the relative frequency distribution for characters, aka unigrams, across the entire language.
Returns a hashmap sorted by the ASCII values of the keys in ascending order.

df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

Creating the unigrams frequency distribution:

In [47]:
# Creating the unigrams frequency distribution for the entire Indonesian language
unigram_fdist = create_lang_char_distribution(df_indo, 'fullname')
print(len(unigram_fdist))
unigram_fdist

44


{' ': 0.08276406157308593,
 "'": 0.0003547118577727765,
 ',': 1.2231443371475051e-05,
 '-': 0.0003791747445157266,
 '.': 0.0031862909982692507,
 'a': 0.1667268045965764,
 'b': 0.016616415820148858,
 'c': 0.009253086910520876,
 'd': 0.0441493948493392,
 'e': 0.04197219792921664,
 'f': 0.01045176836092543,
 'g': 0.017894601652468,
 'h': 0.03426638860518735,
 'i': 0.08912441212625295,
 'j': 0.013075412964106829,
 'k': 0.019111630267929766,
 'l': 0.03099447750331778,
 'm': 0.04122607988355666,
 'n': 0.07453230018408322,
 'o': 0.04553154795031588,
 'p': 0.012372104970247015,
 'q': 0.0007277708806027655,
 'r': 0.06448416945441647,
 's': 0.05043635674227737,
 't': 0.0384128479081174,
 'u': 0.04299963917242054,
 'v': 0.004568444099245931,
 'w': 0.013356736161650756,
 'x': 0.00059322500351654,
 'y': 0.024420076691149938,
 'z': 0.005871092818308024,
 'á': 1.2231443371475051e-05,
 'ã': 1.8347165057212577e-05,
 'ä': 6.1157216857375255e-06,
 'ç': 6.1157216857375255e-06,
 'é': 6.1157216857375255e-06

We now have every character that occurs in the Indonesian dataset, so we can use this to create all possible bigrams and trigrams. We will start with bigrams:

In [48]:
'''
Function that returns all possible bigrams as a hashmap. Each possible bigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams
    

In [49]:
# Initializing all possible bigrams using all possible characters from unigrams frequency distribution
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', "'"): 0,
 (' ', ','): 0,
 (' ', '-'): 0,
 (' ', '.'): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', 'á'): 0,
 (' ', 'ã'): 0,
 (' ', 'ä'): 0,
 (' ', 'ç'): 0,
 (' ', 'é'): 0,
 (' ', 'ë'): 0,
 (' ', 'í'): 0,
 (' ', 'ñ'): 0,
 (' ', 'ö'): 0,
 (' ', 'ú'): 0,
 (' ', 'ü'): 0,
 (' ', 'ś'): 0,
 (' ', '’'): 0,
 ("'", ' '): 0,
 ("'", "'"): 0,
 ("'", ','): 0,
 ("'", '-'): 0,
 ("'", '.'): 0,
 ("'", 'a'): 0,
 ("'", 'b'): 0,
 ("'", 'c'): 0,
 ("'", 'd'): 0,
 ("'", 'e'): 0,
 ("'", 'f'): 0,
 ("'", 'g'): 0,
 ("'", 'h'): 0,
 ("'", 'i'): 0,
 ("'", 'j'): 0,
 ("'", 'k'): 0,
 ("'", 'l'): 0,
 ("'", 'm'): 0,
 ("'", '

Now that we have an initialized hashmap of all possible bigrams, we can look at our `bigrams` column and increment the hashmap accordingly. We will create a function that can do this for both bigrams and trigrams.

In [50]:
'''
Function that returns the relative frequency distribution for -grams (bigrams, trigrams, etc.) across the entire language.
Returns a hashmap.

initialized_grams: a hashmap with all possible -grams as keys and all values initialized to 0. This parameter is copied in the function.
df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

Creating the bigrams frequency distribution:

In [51]:
# Creating the bigrams frequency distribution for the entire Indonesian language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df_indo, 'bigrams')
bigram_fdist

{(' ', ' '): 6.5674111921821535e-06,
 (' ', "'"): 6.5674111921821535e-06,
 (' ', ','): 0.0,
 (' ', '-'): 0.0,
 (' ', '.'): 0.0,
 (' ', 'a'): 0.009995599834501238,
 (' ', 'b'): 0.0037368569683516456,
 (' ', 'c'): 0.0018717121897719139,
 (' ', 'd'): 0.004649727124064965,
 (' ', 'e'): 0.001326617060820795,
 (' ', 'f'): 0.0021672456934201108,
 (' ', 'g'): 0.0021541108710357464,
 (' ', 'h'): 0.0049912325060584364,
 (' ', 'i'): 0.002528453308990129,
 (' ', 'j'): 0.0016615550316220848,
 (' ', 'k'): 0.004321356564455857,
 (' ', 'l'): 0.002390537673954304,
 (' ', 'm'): 0.00688921434059908,
 (' ', 'n'): 0.0033625145303972626,
 (' ', 'o'): 0.0008012241654462227,
 (' ', 'p'): 0.005569164690970466,
 (' ', 'q'): 0.00011821340145927877,
 (' ', 'r'): 0.004997799917250619,
 (' ', 's'): 0.014054259951269809,
 (' ', 't'): 0.003802531080273467,
 (' ', 'u'): 0.0008209263990227692,
 (' ', 'v'): 0.000597634418488576,
 (' ', 'w'): 0.0033559471192050805,
 (' ', 'x'): 7.880893430618584e-05,
 (' ', 'y'): 0.00166

We define a similar method for trigrams:

In [52]:
'''
Function that returns all possible trigrams as a hashmap. Each possible trigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

However, we are going to use **transliterated trigrams**. Without transliteration, the length of each individual frequency distribution is around 110,000. Since each example needs a copy, too much memory is used and the kernel crashes. With transliteration, the length of each becomes 42,000, which allows the code to run.

In [53]:
# Finding all possible transliterated characters
all_possible_chars_translit = create_lang_char_distribution(df_indo, 'transliteration').keys()
print(len(all_possible_chars_translit))

# Creating all possible trigrams from transliterated characters
initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
print(len(initialized_trigrams))
initialized_trigrams

31
29791


{(' ', ' ', ' '): 0,
 (' ', ' ', "'"): 0,
 (' ', ' ', ','): 0,
 (' ', ' ', '-'): 0,
 (' ', ' ', '.'): 0,
 (' ', ' ', 'a'): 0,
 (' ', ' ', 'b'): 0,
 (' ', ' ', 'c'): 0,
 (' ', ' ', 'd'): 0,
 (' ', ' ', 'e'): 0,
 (' ', ' ', 'f'): 0,
 (' ', ' ', 'g'): 0,
 (' ', ' ', 'h'): 0,
 (' ', ' ', 'i'): 0,
 (' ', ' ', 'j'): 0,
 (' ', ' ', 'k'): 0,
 (' ', ' ', 'l'): 0,
 (' ', ' ', 'm'): 0,
 (' ', ' ', 'n'): 0,
 (' ', ' ', 'o'): 0,
 (' ', ' ', 'p'): 0,
 (' ', ' ', 'q'): 0,
 (' ', ' ', 'r'): 0,
 (' ', ' ', 's'): 0,
 (' ', ' ', 't'): 0,
 (' ', ' ', 'u'): 0,
 (' ', ' ', 'v'): 0,
 (' ', ' ', 'w'): 0,
 (' ', ' ', 'x'): 0,
 (' ', ' ', 'y'): 0,
 (' ', ' ', 'z'): 0,
 (' ', "'", ' '): 0,
 (' ', "'", "'"): 0,
 (' ', "'", ','): 0,
 (' ', "'", '-'): 0,
 (' ', "'", '.'): 0,
 (' ', "'", 'a'): 0,
 (' ', "'", 'b'): 0,
 (' ', "'", 'c'): 0,
 (' ', "'", 'd'): 0,
 (' ', "'", 'e'): 0,
 (' ', "'", 'f'): 0,
 (' ', "'", 'g'): 0,
 (' ', "'", 'h'): 0,
 (' ', "'", 'i'): 0,
 (' ', "'", 'j'): 0,
 (' ', "'", 'k'): 0,
 (' ', "'", '

In [54]:
# Changing trigrams column to become transliterated
df_indo['trigrams'] = df_indo['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# Creating the trigrams frequency distribution for the entire Indonesian language
trigram_fdist = create_lang_gram_distribution(initialized_trigrams, df_indo, 'trigrams')
trigram_fdist

{(' ', ' ', ' '): 0.0,
 (' ', ' ', "'"): 0.0,
 (' ', ' ', ','): 0.0,
 (' ', ' ', '-'): 0.0,
 (' ', ' ', '.'): 0.0,
 (' ', ' ', 'a'): 0.0,
 (' ', ' ', 'b'): 0.0,
 (' ', ' ', 'c'): 0.0,
 (' ', ' ', 'd'): 0.0,
 (' ', ' ', 'e'): 0.0,
 (' ', ' ', 'f'): 0.0,
 (' ', ' ', 'g'): 0.0,
 (' ', ' ', 'h'): 0.0,
 (' ', ' ', 'i'): 0.0,
 (' ', ' ', 'j'): 0.0,
 (' ', ' ', 'k'): 0.0,
 (' ', ' ', 'l'): 0.0,
 (' ', ' ', 'm'): 7.091142453960758e-06,
 (' ', ' ', 'n'): 0.0,
 (' ', ' ', 'o'): 0.0,
 (' ', ' ', 'p'): 0.0,
 (' ', ' ', 'q'): 0.0,
 (' ', ' ', 'r'): 0.0,
 (' ', ' ', 's'): 0.0,
 (' ', ' ', 't'): 0.0,
 (' ', ' ', 'u'): 0.0,
 (' ', ' ', 'v'): 0.0,
 (' ', ' ', 'w'): 0.0,
 (' ', ' ', 'x'): 0.0,
 (' ', ' ', 'y'): 0.0,
 (' ', ' ', 'z'): 0.0,
 (' ', "'", ' '): 0.0,
 (' ', "'", "'"): 0.0,
 (' ', "'", ','): 0.0,
 (' ', "'", '-'): 0.0,
 (' ', "'", '.'): 0.0,
 (' ', "'", 'a'): 7.091142453960758e-06,
 (' ', "'", 'b'): 0.0,
 (' ', "'", 'c'): 0.0,
 (' ', "'", 'd'): 0.0,
 (' ', "'", 'e'): 0.0,
 (' ', "'", 'f'): 0.0

#### 2. Individual Relative Frequency Distributions

We can use the `initialized_bigrams` and `initialized_trigrams` hashmaps to create individual frequency distributions for each example. First, let's make an `initialized_unigrams` so we can do the same for unigrams:

In [55]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
initialized_unigrams

{' ': 0,
 "'": 0,
 ',': 0,
 '-': 0,
 '.': 0,
 'a': 0,
 'b': 0,
 'c': 0,
 'd': 0,
 'e': 0,
 'f': 0,
 'g': 0,
 'h': 0,
 'i': 0,
 'j': 0,
 'k': 0,
 'l': 0,
 'm': 0,
 'n': 0,
 'o': 0,
 'p': 0,
 'q': 0,
 'r': 0,
 's': 0,
 't': 0,
 'u': 0,
 'v': 0,
 'w': 0,
 'x': 0,
 'y': 0,
 'z': 0,
 'á': 0,
 'ã': 0,
 'ä': 0,
 'ç': 0,
 'é': 0,
 'ë': 0,
 'í': 0,
 'ñ': 0,
 'ö': 0,
 'ú': 0,
 'ü': 0,
 'ś': 0,
 '’': 0}

Now let's define a function for individual frequency distributions. The idea is that we should be able to `apply` this function to the `unigrams`, `bigrams`, or `trigrams` column, so we are taking in a list of -grams corresponding to one example. We should also take in an initialized hashmap corresponding to the type of ngram, and we are returning this hashmap but with the relative frequency of the example's -grams.

In [56]:
'''
Function to be applied to an ngrams column. Returns a hashmap of the relative frequency distribution for the current example.

grams_list: the list of -grams for this current example.
initialized_grams: a hashmap of all possible unigrams, bigrams, or trigrams as the keys and all values set to 0. This parameter is copied in the function.
'''
def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative

Creating individual frequency distributions for unigrams and bigrams:

In [57]:
# UNIGRAMS individual frequency distributions
df_indo['indiv_unigrams_fdist'] = df_indo['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# checking that the functin worked for our first example, 'supriyadi'
print(df_indo.iloc[0]['indiv_unigrams_fdist'])

df_indo.tail()

{' ': 0, "'": 0, ',': 0, '-': 0, '.': 0, 'a': 0.1111111111111111, 'b': 0, 'c': 0, 'd': 0.1111111111111111, 'e': 0, 'f': 0, 'g': 0, 'h': 0, 'i': 0.2222222222222222, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 0, 'o': 0, 'p': 0.1111111111111111, 'q': 0, 'r': 0.1111111111111111, 's': 0.1111111111111111, 't': 0, 'u': 0.1111111111111111, 'v': 0, 'w': 0, 'x': 0, 'y': 0.1111111111111111, 'z': 0, 'á': 0, 'ã': 0, 'ä': 0, 'ç': 0, 'é': 0, 'ë': 0, 'í': 0, 'ñ': 0, 'ö': 0, 'ú': 0, 'ü': 0, 'ś': 0, '’': 0}


Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq,space_freq,indiv_unigrams_fdist
11241,edwel yusri datuak rajo gampo alam,Edwel Yusri Datuak Rajo Gampo Alam,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[(e, d), (d, w), (w, e), (e, l), (l, ), ( , y...","[(e, d, w), (d, w, e), (w, e, l), (e, l, ), (...","[e, d, w, e, l, , y, u, s, r, i, , d, a, t, ...","[edwel, yusri, datuak, rajo, gampo, alam]",34,4.833333,6,edwel yusri datuak rajo gampo alam,0,0,0,5,"{' ': 0.14705882352941177, ''': 0, ',': 0, '-'..."
11242,sultan amaluddin al sani perkasa alamsyah,Sultan Amaluddin Al Sani Perkasa Alamsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[(s, u), (u, l), (l, t), (t, a), (a, n), (n, ...","[(s, u, l), (u, l, t), (l, t, a), (t, a, n), (...","[s, u, l, t, a, n, , a, m, a, l, u, d, d, i, ...","[sultan, amaluddin, al, sani, perkasa, alamsyah]",41,6.0,6,sultan amaluddin al sani perkasa alamsyah,0,0,0,5,"{' ': 0.12195121951219512, ''': 0, ',': 0, '-'..."
11243,"prof. dr. ir. antonius suwanto, m.sc","Prof. Dr. Ir. Antonius Suwanto, M.Sc","[LATIN, LATIN, LATIN, LATIN, FULL, SPACE, LATI...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[(p, r), (r, o), (o, f), (f, .), (., ), ( , d...","[(p, r, o), (r, o, f), (o, f, .), (f, ., ), (...","[p, r, o, f, ., , d, r, ., , i, r, ., , a, ...","[prof., dr., ir., antonius, suwanto,, m.sc]",36,5.166667,6,"prof. dr. ir. antonius suwanto, m.sc",4,0,0,5,"{' ': 0.1388888888888889, ''': 0, ',': 0.02777..."
11244,kanjeng raden tumengung mas ariya purnama hadi...,Kanjeng Raden Tumengung Mas Ariya Purnama Hadi...,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[(k, a), (a, n), (n, j), (j, e), (e, n), (n, g...","[(k, a, n), (a, n, j), (n, j, e), (j, e, n), (...","[k, a, n, j, e, n, g, , r, a, d, e, n, , t, ...","[kanjeng, raden, tumengung, mas, ariya, purnam...",53,6.714286,7,kanjeng raden tumengung mas ariya purnama hadi...,0,0,0,6,"{' ': 0.11320754716981131, ''': 0, ',': 0, '-'..."
11245,lord kanjeng baginda atep bin lord dari segala...,Lord Kanjeng Baginda Atep Bin Lord Dari Segala...,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[(l, o), (o, r), (r, d), (d, ), ( , k), (k, a...","[(l, o, r), (o, r, d), (r, d, ), (d, , k), (...","[l, o, r, d, , k, a, n, j, e, n, g, , b, a, ...","[lord, kanjeng, baginda, atep, bin, lord, dari...",76,5.416667,12,lord kanjeng baginda atep bin lord dari segala...,0,0,0,11,"{' ': 0.14473684210526316, ''': 0, ',': 0, '-'..."


In [58]:
# BIGRAMS individual frequency distributions
df_indo['indiv_bigrams_fdist'] = df_indo['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

# checking that it works for 'supriyadi'
print(df_indo.iloc[0]['indiv_bigrams_fdist'][('s', 'u')])
print(1 / len(df_indo.iloc[0]['bigrams']))

df_indo.head()

0.125
0.125


Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,num_tokens,transliteration,period_freq,dash_freq,apostrophe_freq,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9,9.0,1,supriyadi,0,0,0,0,"{' ': 0, ''': 0, ',': 0, '-': 0, '.': 0, 'a': ...","{(' ', ' '): 0, (' ', '''): 0, (' ', ','): 0, ..."
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12,12.0,1,triyaningsih,0,0,0,0,"{' ': 0, ''': 0, ',': 0, '-': 0, '.': 0, 'a': ...","{(' ', ' '): 0, (' ', '''): 0, (' ', ','): 0, ..."
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8,8.0,1,soerjadi,0,0,0,0,"{' ': 0, ''': 0, ',': 0, '-': 0, '.': 0, 'a': ...","{(' ', ' '): 0, (' ', '''): 0, (' ', ','): 0, ..."
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9,9.0,1,undunsyah,0,0,0,0,"{' ': 0, ''': 0, ',': 0, '-': 0, '.': 0, 'a': ...","{(' ', ' '): 0, (' ', '''): 0, (' ', ','): 0, ..."
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8,8.0,1,soeripto,0,0,0,0,"{' ': 0, ''': 0, ',': 0, '-': 0, '.': 0, 'a': ...","{(' ', ' '): 0, (' ', '''): 0, (' ', ','): 0, ..."


For trigrams, we must first change the `trigrams` column to transliterated trigrams. This ensures that in the function `set_indiv_trigram_dist()` defined a few cells below, we won't look up a trigram with special characters since it would not exist in the initialized distribution (`init_trigrams`), which we transliterated.

**CAUTION**: The following two cells will take longer to run. With unigrams and bigrams we were able to use `create_indiv_gram_distribution()`, but I separated the copying and incrementing parts of the function into two cells to make it easier for the kernel to run.

In [59]:
df_indo['indiv_trigrams_fdist'] = df_indo['trigrams'].apply(lambda entry: initialized_trigrams.copy())

In [60]:
'''
Function to be applied to each row of a DataFrame. Sets and returns a hashmap of the relative trigrams frequency distribution for the current example.

trigrams_list: the list of trigrams for this current example.
init_trigrams: a hashmap of all possible trigrams as the keys and all values set to 0.
'''
def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    trigrams_fdist_relative = init_trigrams
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

# TRIGRAMS individual frequency distributions
df_indo['indiv_trigrams_fdist'] = df_indo.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

Checking some examples to verify that it worked:

In [61]:
# Checking 0th example
print(df_indo.loc[0, 'indiv_trigrams_fdist'][('s', 'u', 'p')])
print(1 / len(df_indo.loc[0, 'trigrams'])) # manual calculation

# Checking 1st example
print(df_indo.loc[1, 'fullname'])
print(df_indo.loc[1, 'indiv_trigrams_fdist'][('s', 'i', 'h')])
print(1 / len(df_indo.loc[1, 'trigrams'])) # manual calculation

0.14285714285714285
0.14285714285714285
triyaningsih
0.1
0.1


#### 3. Comparing Distributions

Finding unigrams cosine similarity:

In [62]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df_indo['indiv_unigrams_fdist'] = df_indo['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [63]:
# Calculating cosine similarity
df_indo['unigrams_cosine_sim'] = df_indo['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])

Finding bigrams cosine similarity:

In [64]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df_indo['indiv_bigrams_fdist'] = df_indo['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

In [65]:
# Calculating cosine similarity
df_indo['bigrams_cosine_sim'] = df_indo['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])

**CAUTION**: The following cell may take longer to run. Finding trigrams cosine similarity:

In [66]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df_indo['indiv_trigrams_fdist'] = df_indo['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
trigram_fdist = np.fromiter(trigram_fdist.values(), dtype = float).reshape(1, -1)

In [67]:
# Calculating cosine similarity
df_indo['trigrams_cosine_sim'] = df_indo['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
df_indo.head()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,period_freq,dash_freq,apostrophe_freq,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9,9.0,...,0,0,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.664809,0.25064,0.085949
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12,12.0,...,0,0,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.08333333333333333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.686625,0.353292,0.117226
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8,8.0,...,0,0,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0, 0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.688312,0.197139,0.090295
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9,9.0,...,0,0,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.581396,0.155386,0.060083
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8,8.0,...,0,0,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.463215,0.176917,0.052811


### Malay Dataset: `ngram` Frequency Distributions

We will perform the same operations on the Malay dataset. We will reuse our variables to ease the memory load.

#### 1. Language Relative Frequency Distributions

In [68]:
# -------------- 1. Language relative frequency distributions --------------

# ----- UNIGRAMS -----

# Creating the unigrams frequency distribution for the entire Malay language
unigram_fdist = create_lang_char_distribution(df_malay, 'fullname')
print('# unique characters without transliteration:', len(unigram_fdist))

# Initializing all possible bigrams using all possible characters from unigrams frequency distribution
# We are reusing this variable
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())

# ----- BIGRAMS -----

# Creating the bigrams frequency distribution for the entire Malay language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df_malay, 'bigrams')

# ----- TRIGRAMS -----

# Finding all possible transliterated characters for trigrams
all_possible_chars_translit = create_lang_char_distribution(df_malay, 'transliteration').keys()
print('# unique characters with transliteration:', len(all_possible_chars_translit))

# Creating all possible trigrams from transliterated characters
initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
print('Length of trigrams fdist:', len(initialized_trigrams))

# Changing trigrams column to become transliterated
df_malay['trigrams'] = df_malay['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# Creating the trigrams frequency distribution for the entire Malay language
trigram_fdist = create_lang_gram_distribution(initialized_trigrams, df_malay, 'trigrams')

# unique characters without transliteration: 37
# unique characters with transliteration: 35
Length of trigrams fdist: 42875


#### 2. Individual Relative Frequency Distributions

In [69]:
# -------------- 2. Individual relative frequency distributions --------------

# ----- UNIGRAMS -----

initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}

# UNIGRAMS individual frequency distributions
df_malay['indiv_unigrams_fdist'] = df_malay['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# ----- BIGRAMS -----

# BIGRAMS individual frequency distributions
df_malay['indiv_bigrams_fdist'] = df_malay['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

In [70]:
# ----- TRIGRAMS -----

df_malay['indiv_trigrams_fdist'] = df_malay['trigrams'].apply(lambda entry: initialized_trigrams.copy())

In [71]:
# TRIGRAMS individual frequency distributions
df_malay['indiv_trigrams_fdist'] = df_malay.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

#### 3. Comparing Distributions

In [72]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df_malay['indiv_unigrams_fdist'] = df_malay['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [73]:
# Calculating cosine similarity
df_malay['unigrams_cosine_sim'] = df_malay['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])

Finding bigrams cosine similarity:

In [74]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df_malay['indiv_bigrams_fdist'] = df_malay['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

In [75]:
# Calculating cosine similarity
df_malay['bigrams_cosine_sim'] = df_malay['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])

Finding trigrams cosine similarity:

In [76]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df_malay['indiv_trigrams_fdist'] = df_malay['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
trigram_fdist = np.fromiter(trigram_fdist.values(), dtype = float).reshape(1, -1)

In [77]:
# Calculating cosine similarity
df_malay['trigrams_cosine_sim'] = df_malay['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
df_malay.head()

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,period_freq,dash_freq,apostrophe_freq,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim
0,annuar rapaee,Annuar Rapaee,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[a, n, n, u, a, r, , r, a, p, a, e, e]","[(a, n), (n, n), (n, u), (u, a), (a, r), (r, ...","[(a, n, n), (n, n, u), (n, u, a), (u, a, r), (...","[a, n, n, u, a, r, , r, a, p, a, e, e, (a, n)...","[annuar, rapaee]",13,6.0,...,0,0,0,1,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.740095,0.253373,0.063757
1,tash yong,Tash Yong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[t, a, s, h, , y, o, n, g]","[(t, a), (a, s), (s, h), (h, ), ( , y), (y, o...","[(t, a, s), (a, s, h), (s, h, ), (h, , y), (...","[t, a, s, h, , y, o, n, g, (t, a), (a, s), (s...","[tash, yong]",9,4.0,...,0,0,0,1,"[[0.1111111111111111, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.672545,0.215969,0.073571
2,fatmawati,Fatmawati,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[f, a, t, m, a, w, a, t, i]","[(f, a), (a, t), (t, m), (m, a), (a, w), (w, a...","[(f, a, t), (a, t, m), (t, m, a), (m, a, w), (...","[f, a, t, m, a, w, a, t, i, (f, a), (a, t), (t...",[fatmawati],9,9.0,...,0,0,0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.614505,0.152035,0.022338
3,alto linus,Alto Linus,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[a, l, t, o, , l, i, n, u, s]","[(a, l), (l, t), (t, o), (o, ), ( , l), (l, i...","[(a, l, t), (l, t, o), (t, o, ), (o, , l), (...","[a, l, t, o, , l, i, n, u, s, (a, l), (l, t),...","[alto, linus]",10,4.5,...,0,0,0,1,"[[0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.70378,0.19172,0.028755
4,mohamad izzat abdul halil,Mohamad Izzat Abdul Halil,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, o, h, a, m, a, d, , i, z, z, a, t, , a, ...","[(m, o), (o, h), (h, a), (a, m), (m, a), (a, d...","[(m, o, h), (o, h, a), (h, a, m), (a, m, a), (...","[m, o, h, a, m, a, d, , i, z, z, a, t, , a, ...","[mohamad, izzat, abdul, halil]",25,5.5,...,0,0,0,3,"[[0.12, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.865755,0.520675,0.377271


### Exporting cleaned datasets

In [79]:
df_indo.to_pickle('../pickled_dataframes/df_indo.pkl.gz', compression='gzip')
df_malay.to_pickle('../pickled_dataframes/df_malay.pkl.gz', compression='gzip')