# Turkish Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.util import ngrams
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Reading in the Turkish nams 
df_turk  =  pd.read_excel('../name_data/exiger_datasets/EXGR_Turkish names2.xlsx')
print(df_turk)

                                            id                      label_tr  \
0       http://www.wikidata.org/entity/Q282782                          Ceza   
1       http://www.wikidata.org/entity/Q288019                        Gülşen   
2       http://www.wikidata.org/entity/Q236047                        Hadise   
3       http://www.wikidata.org/entity/Q236920                         Emrah   
4       http://www.wikidata.org/entity/Q182948                        Göksel   
...                                        ...                           ...   
18769  http://www.wikidata.org/entity/Q6053953     Elif Nur Bozkurt Tandoğan   
18770  http://www.wikidata.org/entity/Q6085044    Muhammed Ali Fatih Erbakan   
18771  http://www.wikidata.org/entity/Q6041731      İsmail Hamit Özer Derbil   
18772  http://www.wikidata.org/entity/Q6071196        Vedat Ali Özkan Kayacı   
18773     http://www.wikidata.org/entity/Q5031  Elder Paisios of Mount Athos   

      comment          Given name Middl

In [3]:
#printing out the shape(18774,8)
df_turk.shape

(18774, 8)

In [4]:
#Printing out the head
df_turk.head()

Unnamed: 0,id,label_tr,comment,Given name,Middle Name,Family name,Maiden Name,Unnamed: 7
0,http://www.wikidata.org/entity/Q282782,Ceza,x,,,,,1
1,http://www.wikidata.org/entity/Q288019,Gülşen,x,,,,,1
2,http://www.wikidata.org/entity/Q236047,Hadise,x,,,,,1
3,http://www.wikidata.org/entity/Q236920,Emrah,x,,,,,1
4,http://www.wikidata.org/entity/Q182948,Göksel,x,,,,,1


In [5]:
#checking if there is any null entries in the datasets(yes)
df_turk.isnull().values.any()

True

In [6]:
# checking what's in the comment columns
df_turk['comment']

0          x
1          x
2          x
3          x
4          x
        ... 
18769    NaN
18770    NaN
18771      x
18772    NaN
18773      x
Name: comment, Length: 18774, dtype: object

In [7]:
df_turk.isnull().head()

Unnamed: 0,id,label_tr,comment,Given name,Middle Name,Family name,Maiden Name,Unnamed: 7
0,False,False,False,True,True,True,True,False
1,False,False,False,True,True,True,True,False
2,False,False,False,True,True,True,True,False
3,False,False,False,True,True,True,True,False
4,False,False,False,True,True,True,True,False


In [8]:
#checking how many null entries each columns have 
nan_count = np.sum(df_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18318
Given name       455
Middle Name    18772
Family name      459
Maiden Name    18763
Unnamed: 7         0
dtype: int64

In [9]:
# checking if the datasets have any duplicates(True == yes)
df_turk['label_tr'].duplicated().any()

True

In [10]:
#check if there is any null entries in label_tr(no)
np.any(df_turk['label_tr'].isnull())

False

In [11]:
#just checking the shape before dropping dupliates (18774,8)
df_turk.shape

(18774, 8)

In [12]:
#Dropping the duplicates(place in a new variable)
df2_turk = df_turk.drop_duplicates(subset=['label_tr'])
print(df2_turk)

                                            id                      label_tr  \
0       http://www.wikidata.org/entity/Q282782                          Ceza   
1       http://www.wikidata.org/entity/Q288019                        Gülşen   
2       http://www.wikidata.org/entity/Q236047                        Hadise   
3       http://www.wikidata.org/entity/Q236920                         Emrah   
4       http://www.wikidata.org/entity/Q182948                        Göksel   
...                                        ...                           ...   
18769  http://www.wikidata.org/entity/Q6053953     Elif Nur Bozkurt Tandoğan   
18770  http://www.wikidata.org/entity/Q6085044    Muhammed Ali Fatih Erbakan   
18771  http://www.wikidata.org/entity/Q6041731      İsmail Hamit Özer Derbil   
18772  http://www.wikidata.org/entity/Q6071196        Vedat Ali Özkan Kayacı   
18773     http://www.wikidata.org/entity/Q5031  Elder Paisios of Mount Athos   

      comment          Given name Middl

In [13]:
#checking if there are still duplicates(no duplicates)
df2_turk['label_tr'].duplicated().any()


False

In [14]:
#Just checking if th shape went down(which it did. Went form (18774,8) to (18491,8))
df2_turk.shape

(18491, 8)

In [15]:
# checking if the null values went down (it did) 
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18037
Given name       453
Middle Name    18489
Family name      457
Maiden Name    18480
Unnamed: 7         0
dtype: int64

In [16]:
#printing the col_names with null values
condition = nan_count != 0
col_names = nan_count[condition].index
nan_cols = list(col_names)
print(nan_cols)

['comment', 'Given name', 'Middle Name', 'Family name', 'Maiden Name']


In [17]:
print(df2_turk['comment'].unique())
print(df2_turk['Given name'].unique())
print(df2_turk['Middle Name'].unique())
print(df2_turk['Family name'].unique())
print(df2_turk['Maiden Name'].unique())

['x' nan]
[nan 'Melahat' 'Freddy' ... 'Elif Nur Bozkurt ' 'Muhammed Ali Fatih'
 'Vedat Ali Özkan']
[nan 'Sahin' 'Kemal']
[nan 'Abbasova' 'Scholl' ... 'Kapıcıoğlu' 'Ali Marandi' 'Bektur']
[nan 'Önder' 'Kafaoğlu' 'Türker' 'Doğan' 'Karahan' 'Sabancı' 'Ekşi' 'İnan'
 'Tosun' 'Sayan']


In [18]:
# drop comments row with x 
comments_drop_with_x =  df2_turk['comment'] == 'x'
df2_turk = df2_turk.drop(df2_turk[df2_turk['comment'] == 'x'].index)
print(df2_turk)
#print(comments_drop)
#df2_turk['comment']

                                             id                    label_tr  \
125     http://www.wikidata.org/entity/Q3508136            Melahat Abbasova   
126       http://www.wikidata.org/entity/Q96602         Freddy Sahin-Scholl   
127      http://www.wikidata.org/entity/Q217097             Richard Kingson   
129      http://www.wikidata.org/entity/Q110126                  Hrant Dink   
130       http://www.wikidata.org/entity/Q42079               Ricky Winslow   
...                                         ...                         ...   
18764  http://www.wikidata.org/entity/Q49703809   Nil İpek Hülagü Öztürkmen   
18765  http://www.wikidata.org/entity/Q24230049      Fatma Betül Sayan Kaya   
18769   http://www.wikidata.org/entity/Q6053953   Elif Nur Bozkurt Tandoğan   
18770   http://www.wikidata.org/entity/Q6085044  Muhammed Ali Fatih Erbakan   
18772   http://www.wikidata.org/entity/Q6071196      Vedat Ali Özkan Kayacı   

      comment          Given name Middle Name Famil

In [19]:
#check the shape again( went form (18491,8) to (18037, 8))
df2_turk.shape

(18037, 8)

In [20]:
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18037
Given name         0
Middle Name    18035
Family name        4
Maiden Name    18026
Unnamed: 7         0
dtype: int64

In [21]:
non_alnum_names_turk = [name for name in df2_turk['label_tr'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_turk))
non_alnum_names_turk

30


['Freddy Sahin-Scholl',
 'Perihan Önder-Ridder',
 'Erhan-Can Kartal',
 'Ali-Özgür Özdil',
 'Mehmet Aga-Oglu',
 'Ahmed Agha-Oghlu',
 'Hatice Aksoy-Woinek',
 'Bugha al-Sharabi',
 'Iffat Al-Thunayan',
 "Nev'îzâde Atâyî",
 'Üstün Bilgen-Reinart',
 'Mehpare Bozyigit-Kirchmann',
 'Elif Çağlar-Muslu',
 "Cem'i Demiroğlu",
 "Neş'e Erdok",
 'Asuman Kafaoğlu-Büke',
 'Elçin Kürşat-Ahlers',
 'Gönül Sen-Menzel',
 'Tülay Sözbir-Seidel',
 "Gaybi Sun'ullah",
 'Ayshe Talay-Ongan',
 'Sevgi Türker-Terlemez',
 "Mümtaz'er Türköne",
 'Sabiha Bânu Yalkut-Breddermann',
 "Temel Nücûm'i Göksel",
 'Molla Ahmed-i Cezirî',
 "Dipika O'Neill Joti",
 "Abdurrahman Necati Kara'a",
 'Kerimüddin Mahmud-i Aksarayî',
 'Salah al-Din Zarkub']

In [22]:
#drop extra column we don't need 
df2_turk = df2_turk.drop(columns = ['Unnamed: 7'])

In [23]:
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18037
Given name         0
Middle Name    18035
Family name        4
Maiden Name    18026
dtype: int64

In [24]:
#Drop columns that won't be used in df2
print(df2_turk.columns)
df2_turk = df2_turk.drop(columns= [ 'comment', 'Given name', 'Middle Name', 'Family name',
       'Maiden Name'])


Index(['id', 'label_tr', 'comment', 'Given name', 'Middle Name', 'Family name',
       'Maiden Name'],
      dtype='object')


In [25]:
#Create lowercase colun
df2_turk['original_fullname'] = df2_turk['label_tr']
df2_turk['fullname'] = df2_turk['label_tr'].apply(str.lower)
df2_turk

Unnamed: 0,id,label_tr,original_fullname,fullname
125,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,Melahat Abbasova,melahat abbasova
126,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,Freddy Sahin-Scholl,freddy sahin-scholl
127,http://www.wikidata.org/entity/Q217097,Richard Kingson,Richard Kingson,richard kingson
129,http://www.wikidata.org/entity/Q110126,Hrant Dink,Hrant Dink,hrant dink
130,http://www.wikidata.org/entity/Q42079,Ricky Winslow,Ricky Winslow,ricky winslow
...,...,...,...,...
18764,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen
18765,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,Fatma Betül Sayan Kaya,fatma betül sayan kaya
18769,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan
18770,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan


In [26]:
#just drop label_tr because original_fullname is the same
print(df2_turk.columns)
df2_turk = df2_turk.drop(columns= [ 'label_tr'])

Index(['id', 'label_tr', 'original_fullname', 'fullname'], dtype='object')


In [27]:
#Checking again for null values(none)
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                   0
original_fullname    0
fullname             0
dtype: int64

In [28]:
#now let's check if there is any duplicates
df2_turk['fullname'].duplicated().any()

False

In [29]:
df2_turk = df2_turk.reset_index(drop=True)
print(df2_turk)
df2_turk.shape


                                             id           original_fullname  \
0       http://www.wikidata.org/entity/Q3508136            Melahat Abbasova   
1         http://www.wikidata.org/entity/Q96602         Freddy Sahin-Scholl   
2        http://www.wikidata.org/entity/Q217097             Richard Kingson   
3        http://www.wikidata.org/entity/Q110126                  Hrant Dink   
4         http://www.wikidata.org/entity/Q42079               Ricky Winslow   
...                                         ...                         ...   
18032  http://www.wikidata.org/entity/Q49703809   Nil İpek Hülagü Öztürkmen   
18033  http://www.wikidata.org/entity/Q24230049      Fatma Betül Sayan Kaya   
18034   http://www.wikidata.org/entity/Q6053953   Elif Nur Bozkurt Tandoğan   
18035   http://www.wikidata.org/entity/Q6085044  Muhammed Ali Fatih Erbakan   
18036   http://www.wikidata.org/entity/Q6071196      Vedat Ali Özkan Kayacı   

                         fullname  
0              

(18037, 3)

Now For Feature Engineering 

In [30]:
# add transaliteration
df2_turk['transliteration'] = df2_turk['fullname'].apply(lambda name: unidecode(name))

In [31]:
def get_alphabet(name):
    name_as_alphabet_list = []
    for char in name:
        name_as_alphabet_list.append(unicodedata.name(char).split(' ')[0])
    return name_as_alphabet_list

df2_turk['alphabet'] = df2_turk['transliteration'].apply(get_alphabet)
df2_turk.head()
df2_turk.shape

(18037, 5)

In [32]:
#char grams
def get_ngrams(text, n):
    if isinstance(text,str):
        name = list(text)
    ngrams_list =  list(ngrams(list(text), n))
    return ngrams_list

df2_turk["unigrams"] = df2_turk['transliteration'].apply(lambda name: list(name) if isinstance(name, str) else [])
df2_turk["bigrams"] = df2_turk['transliteration'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])
df2_turk["trigrams"] = df2_turk['transliteration'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])

df2_turk['char_ngrams'] = df2_turk["unigrams"] + df2_turk["bigrams"] + df2_turk["trigrams"]
print(df2_turk)

                                             id           original_fullname  \
0       http://www.wikidata.org/entity/Q3508136            Melahat Abbasova   
1         http://www.wikidata.org/entity/Q96602         Freddy Sahin-Scholl   
2        http://www.wikidata.org/entity/Q217097             Richard Kingson   
3        http://www.wikidata.org/entity/Q110126                  Hrant Dink   
4         http://www.wikidata.org/entity/Q42079               Ricky Winslow   
...                                         ...                         ...   
18032  http://www.wikidata.org/entity/Q49703809   Nil İpek Hülagü Öztürkmen   
18033  http://www.wikidata.org/entity/Q24230049      Fatma Betül Sayan Kaya   
18034   http://www.wikidata.org/entity/Q6053953   Elif Nur Bozkurt Tandoğan   
18035   http://www.wikidata.org/entity/Q6085044  Muhammed Ali Fatih Erbakan   
18036   http://www.wikidata.org/entity/Q6071196      Vedat Ali Özkan Kayacı   

                         fullname             trans

In [33]:
# feature to check the name length
def name_length(name):
    return len(name.replace(' ', ''))

df2_turk['name_length'] = df2_turk['transliteration'].apply(name_length)

print(df2_turk['name_length'])

0        15
1        18
2        14
3         9
4        12
         ..
18032    22
18033    19
18034    22
18035    23
18036    19
Name: name_length, Length: 18037, dtype: int64


In [34]:
# feature to check the name length
def token_length(name):
    return len(name.split())

df2_turk['num_tokens'] = df2_turk['transliteration'].apply(token_length)

print(df2_turk['num_tokens'])

0        2
1        2
2        2
3        2
4        2
        ..
18032    4
18033    4
18034    4
18035    4
18036    4
Name: num_tokens, Length: 18037, dtype: int64


In [35]:
tokens = df2_turk['transliteration'].apply(lambda name: name.split(' '))
print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
print(token_lengths[-5:])
df2_turk['avg_token_length'] = token_lengths.apply(np.mean)
df2_turk.tail()

18032     [nil, ipek, hulagu, ozturkmen]
18033        [fatma, betul, sayan, kaya]
18034     [elif, nur, bozkurt, tandogan]
18035    [muhammed, ali, fatih, erbakan]
18036        [vedat, ali, ozkan, kayaci]
Name: transliteration, dtype: object 

18032    [3, 4, 6, 9]
18033    [5, 5, 5, 4]
18034    [4, 3, 7, 8]
18035    [8, 3, 5, 7]
18036    [5, 3, 5, 6]
Name: transliteration, dtype: object


Unnamed: 0,id,original_fullname,fullname,transliteration,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length
18032,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,nil ipek hulagu ozturkmen,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...","[(n, i), (i, l), (l, ), ( , i), (i, p), (p, e...","[(n, i, l), (i, l, ), (l, , i), ( , i, p), (...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...",22,4,5.5
18033,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,fatma betül sayan kaya,fatma betul sayan kaya,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...",19,4,4.75
18034,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,elif nur bozkurt tandogan,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,4,5.5
18035,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,muhammed ali fatih erbakan,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,4,5.75
18036,http://www.wikidata.org/entity/Q6071196,Vedat Ali Özkan Kayacı,vedat ali özkan kayacı,vedat ali ozkan kayaci,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[v, e, d, a, t, , a, l, i, , o, z, k, a, n, ...","[(v, e), (e, d), (d, a), (a, t), (t, ), ( , a...","[(v, e, d), (e, d, a), (d, a, t), (a, t, ), (...","[v, e, d, a, t, , a, l, i, , o, z, k, a, n, ...",19,4,4.75


In [36]:
df2_turk['period_freq'] = df2_turk['transliteration'].apply(lambda name: name.count('.'))
df2_turk['dash_freq'] = df2_turk['transliteration'].apply(lambda name: name.count('-'))
df2_turk['space_freq'] = df2_turk['transliteration'].apply(lambda name: name.count(' '))

In [37]:
# Have to do the transltion funtion 
def create_character_frequency_hashmap(df, names_col):
    char_freqs = {}
    for name in df[names_col]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
    return char_freqs


In [38]:
df2_turk


Unnamed: 0,id,original_fullname,fullname,transliteration,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length,period_freq,dash_freq,space_freq
0,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,melahat abbasova,melahat abbasova,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,2,7.50,0,0,1
1,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,freddy sahin-scholl,freddy sahin-scholl,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,2,9.00,0,1,1
2,http://www.wikidata.org/entity/Q217097,Richard Kingson,richard kingson,richard kingson,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,2,7.00,0,0,1
3,http://www.wikidata.org/entity/Q110126,Hrant Dink,hrant dink,hrant dink,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,2,4.50,0,0,1
4,http://www.wikidata.org/entity/Q42079,Ricky Winslow,ricky winslow,ricky winslow,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,2,6.00,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18032,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,nil ipek hulagu ozturkmen,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...","[(n, i), (i, l), (l, ), ( , i), (i, p), (p, e...","[(n, i, l), (i, l, ), (l, , i), ( , i, p), (...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...",22,4,5.50,0,0,3
18033,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,fatma betül sayan kaya,fatma betul sayan kaya,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...",19,4,4.75,0,0,3
18034,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,elif nur bozkurt tandogan,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,4,5.50,0,0,3
18035,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,muhammed ali fatih erbakan,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,4,5.75,0,0,3


In [39]:
'''
Function that returns the relative frequency distribution for characters, aka unigrams, across the entire language.
Returns a hashmap sorted by the ASCII values of the keys in ascending order.

df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

In [40]:
# Creating the unigrams frequency distribution for the entire Indonesian language
unigram_fdist = create_lang_char_distribution(df2_turk, 'transliteration')
print(len(unigram_fdist))
unigram_fdist

29


{' ': 0.08591498633546536,
 "'": 3.307602938805211e-05,
 '-': 9.095908081714331e-05,
 'a': 0.12794221617665907,
 'b': 0.01800576349812087,
 'c': 0.024844232574100643,
 'd': 0.022028635572442706,
 'e': 0.08375264091422145,
 'f': 0.011080469844997456,
 'g': 0.0251336478312461,
 'h': 0.026526975569217792,
 'i': 0.07448721818189336,
 'j': 0.0005044094481677947,
 'k': 0.043602475740799694,
 'l': 0.05286789847312779,
 'm': 0.04406140564855892,
 'n': 0.06277003477117589,
 'o': 0.03160414608028379,
 'p': 0.006879814112714839,
 'q': 1.2403511020519542e-05,
 'r': 0.061045946739323675,
 's': 0.042635001881199175,
 't': 0.04346603711957398,
 'u': 0.056245787974382615,
 'v': 0.00839717696089173,
 'w': 7.442106612311725e-05,
 'x': 2.89415257145456e-05,
 'y': 0.026464958014115195,
 'z': 0.01949831932425672}

In [41]:
'''
Function that returns all possible bigrams as a hashmap. Each possible bigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams
    

In [42]:
# Initializing all possible bigrams using all possible characters from unigrams frequency distribution
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', "'"): 0,
 (' ', '-'): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 ("'", ' '): 0,
 ("'", "'"): 0,
 ("'", '-'): 0,
 ("'", 'a'): 0,
 ("'", 'b'): 0,
 ("'", 'c'): 0,
 ("'", 'd'): 0,
 ("'", 'e'): 0,
 ("'", 'f'): 0,
 ("'", 'g'): 0,
 ("'", 'h'): 0,
 ("'", 'i'): 0,
 ("'", 'j'): 0,
 ("'", 'k'): 0,
 ("'", 'l'): 0,
 ("'", 'm'): 0,
 ("'", 'n'): 0,
 ("'", 'o'): 0,
 ("'", 'p'): 0,
 ("'", 'q'): 0,
 ("'", 'r'): 0,
 ("'", 's'): 0,
 ("'", 't'): 0,
 ("'", 'u'): 0,
 ("'", 'v'): 0,
 ("'", 'w'): 0,
 ("'", 'x'): 0,
 ("'", 'y'): 0,
 ("'", 'z'): 0,
 ('-', ' '): 0,
 ('-', "'"): 0,
 ('-', '-'): 0,
 ('-', 'a'): 0,
 ('-', '

In [43]:
'''
Function that returns the relative frequency distribution for -grams (bigrams, trigrams, etc.) across the entire language.
Returns a hashmap.

initialized_grams: a hashmap with all possible -grams as keys and all values initialized to 0. This parameter is copied in the function.
df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

In [44]:
# Creating the bigrams frequency distribution for the entire turkish language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df2_turk, 'bigrams')
bigram_fdist

{(' ', ' '): 0.0,
 (' ', "'"): 0.0,
 (' ', '-'): 0.0,
 (' ', 'a'): 0.011490863601840682,
 (' ', 'b'): 0.00671491757137113,
 (' ', 'c'): 0.0057007550373050974,
 (' ', 'd'): 0.004378322834293884,
 (' ', 'e'): 0.005566724746459367,
 (' ', 'f'): 0.0012732877630344459,
 (' ', 'g'): 0.005052941964884064,
 (' ', 'h'): 0.0024393512933923067,
 (' ', 'i'): 0.0022338381807621857,
 (' ', 'j'): 2.2338381807621857e-05,
 (' ', 'k'): 0.010239914220613858,
 (' ', 'l'): 0.00035294643256042534,
 (' ', 'm'): 0.0026180583478532817,
 (' ', 'n'): 0.0014653978465799938,
 (' ', 'o'): 0.0067953357458785685,
 (' ', 'p'): 0.001469865522941518,
 (' ', 'q'): 0.0,
 (' ', 'r'): 0.0010633069740428004,
 (' ', 's'): 0.008622615377742036,
 (' ', 't'): 0.006098378233480767,
 (' ', 'u'): 0.0028101684313988295,
 (' ', 'v'): 0.0005807979269981683,
 (' ', 'w'): 1.7870705446097486e-05,
 (' ', 'x'): 0.0,
 (' ', 'y'): 0.004552562212393334,
 (' ', 'z'): 0.0008399231559665818,
 ("'", ' '): 0.0,
 ("'", "'"): 0.0,
 ("'", '-'): 0.0,


In [45]:
'''
Function that returns all possible trigrams as a hashmap. Each possible trigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

In [46]:
# Finding all possible transliterated characters
all_possible_chars_translit = create_lang_char_distribution(df2_turk, 'transliteration').keys()
print(len(all_possible_chars_translit))

# Creating all possible trigrams from transliterated characters
initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
print(len(initialized_trigrams))
initialized_trigrams

29
24389


{(' ', ' ', ' '): 0,
 (' ', ' ', "'"): 0,
 (' ', ' ', '-'): 0,
 (' ', ' ', 'a'): 0,
 (' ', ' ', 'b'): 0,
 (' ', ' ', 'c'): 0,
 (' ', ' ', 'd'): 0,
 (' ', ' ', 'e'): 0,
 (' ', ' ', 'f'): 0,
 (' ', ' ', 'g'): 0,
 (' ', ' ', 'h'): 0,
 (' ', ' ', 'i'): 0,
 (' ', ' ', 'j'): 0,
 (' ', ' ', 'k'): 0,
 (' ', ' ', 'l'): 0,
 (' ', ' ', 'm'): 0,
 (' ', ' ', 'n'): 0,
 (' ', ' ', 'o'): 0,
 (' ', ' ', 'p'): 0,
 (' ', ' ', 'q'): 0,
 (' ', ' ', 'r'): 0,
 (' ', ' ', 's'): 0,
 (' ', ' ', 't'): 0,
 (' ', ' ', 'u'): 0,
 (' ', ' ', 'v'): 0,
 (' ', ' ', 'w'): 0,
 (' ', ' ', 'x'): 0,
 (' ', ' ', 'y'): 0,
 (' ', ' ', 'z'): 0,
 (' ', "'", ' '): 0,
 (' ', "'", "'"): 0,
 (' ', "'", '-'): 0,
 (' ', "'", 'a'): 0,
 (' ', "'", 'b'): 0,
 (' ', "'", 'c'): 0,
 (' ', "'", 'd'): 0,
 (' ', "'", 'e'): 0,
 (' ', "'", 'f'): 0,
 (' ', "'", 'g'): 0,
 (' ', "'", 'h'): 0,
 (' ', "'", 'i'): 0,
 (' ', "'", 'j'): 0,
 (' ', "'", 'k'): 0,
 (' ', "'", 'l'): 0,
 (' ', "'", 'm'): 0,
 (' ', "'", 'n'): 0,
 (' ', "'", 'o'): 0,
 (' ', "'", '

In [47]:
# Changing trigrams column to become transliterated
df2_turk['trigrams'] = df2_turk['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# Creating the trigrams frequency distribution for the entire Indonesian language
trigram_fdist = create_lang_gram_distribution(initialized_trigrams, df2_turk, 'trigrams')
trigram_fdist




{(' ', ' ', ' '): 0.0,
 (' ', ' ', "'"): 0.0,
 (' ', ' ', '-'): 0.0,
 (' ', ' ', 'a'): 0.0,
 (' ', ' ', 'b'): 0.0,
 (' ', ' ', 'c'): 0.0,
 (' ', ' ', 'd'): 0.0,
 (' ', ' ', 'e'): 0.0,
 (' ', ' ', 'f'): 0.0,
 (' ', ' ', 'g'): 0.0,
 (' ', ' ', 'h'): 0.0,
 (' ', ' ', 'i'): 0.0,
 (' ', ' ', 'j'): 0.0,
 (' ', ' ', 'k'): 0.0,
 (' ', ' ', 'l'): 0.0,
 (' ', ' ', 'm'): 0.0,
 (' ', ' ', 'n'): 0.0,
 (' ', ' ', 'o'): 0.0,
 (' ', ' ', 'p'): 0.0,
 (' ', ' ', 'q'): 0.0,
 (' ', ' ', 'r'): 0.0,
 (' ', ' ', 's'): 0.0,
 (' ', ' ', 't'): 0.0,
 (' ', ' ', 'u'): 0.0,
 (' ', ' ', 'v'): 0.0,
 (' ', ' ', 'w'): 0.0,
 (' ', ' ', 'x'): 0.0,
 (' ', ' ', 'y'): 0.0,
 (' ', ' ', 'z'): 0.0,
 (' ', "'", ' '): 0.0,
 (' ', "'", "'"): 0.0,
 (' ', "'", '-'): 0.0,
 (' ', "'", 'a'): 0.0,
 (' ', "'", 'b'): 0.0,
 (' ', "'", 'c'): 0.0,
 (' ', "'", 'd'): 0.0,
 (' ', "'", 'e'): 0.0,
 (' ', "'", 'f'): 0.0,
 (' ', "'", 'g'): 0.0,
 (' ', "'", 'h'): 0.0,
 (' ', "'", 'i'): 0.0,
 (' ', "'", 'j'): 0.0,
 (' ', "'", 'k'): 0.0,
 (' ', "'",

In [48]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
initialized_unigrams

{' ': 0,
 "'": 0,
 '-': 0,
 'a': 0,
 'b': 0,
 'c': 0,
 'd': 0,
 'e': 0,
 'f': 0,
 'g': 0,
 'h': 0,
 'i': 0,
 'j': 0,
 'k': 0,
 'l': 0,
 'm': 0,
 'n': 0,
 'o': 0,
 'p': 0,
 'q': 0,
 'r': 0,
 's': 0,
 't': 0,
 'u': 0,
 'v': 0,
 'w': 0,
 'x': 0,
 'y': 0,
 'z': 0}

In [49]:
'''
Function to be applied to an ngrams column. Returns a hashmap of the relative frequency distribution for the current example.

grams_list: the list of -grams for this current example.
initialized_grams: a hashmap of all possible unigrams, bigrams, or trigrams as the keys and all values set to 0. This parameter is copied in the function.
'''
def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative


In [50]:
# UNIGRAMS individual frequency distributions
df2_turk['indiv_unigrams_fdist'] = df2_turk['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# checking that the functin worked for our first example, 'supriyadi'
print(df2_turk.iloc[0]['indiv_unigrams_fdist'])

df2_turk.tail()

{' ': 0.0625, "'": 0, '-': 0, 'a': 0.3125, 'b': 0.125, 'c': 0, 'd': 0, 'e': 0.0625, 'f': 0, 'g': 0, 'h': 0.0625, 'i': 0, 'j': 0, 'k': 0, 'l': 0.0625, 'm': 0.0625, 'n': 0, 'o': 0.0625, 'p': 0, 'q': 0, 'r': 0, 's': 0.0625, 't': 0.0625, 'u': 0, 'v': 0.0625, 'w': 0, 'x': 0, 'y': 0, 'z': 0}


Unnamed: 0,id,original_fullname,fullname,transliteration,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length,period_freq,dash_freq,space_freq,indiv_unigrams_fdist
18032,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,nil ipek hulagu ozturkmen,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...","[(n, i), (i, l), (l, ), ( , i), (i, p), (p, e...","[(n, i, l), (i, l, ), (l, , i), ( , i, p), (...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...",22,4,5.5,0,0,3,"{' ': 0.12, ''': 0, '-': 0, 'a': 0.04, 'b': 0,..."
18033,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,fatma betül sayan kaya,fatma betul sayan kaya,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...",19,4,4.75,0,0,3,"{' ': 0.13636363636363635, ''': 0, '-': 0, 'a'..."
18034,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,elif nur bozkurt tandogan,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,4,5.5,0,0,3,"{' ': 0.12, ''': 0, '-': 0, 'a': 0.08, 'b': 0...."
18035,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,muhammed ali fatih erbakan,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,4,5.75,0,0,3,"{' ': 0.11538461538461539, ''': 0, '-': 0, 'a'..."
18036,http://www.wikidata.org/entity/Q6071196,Vedat Ali Özkan Kayacı,vedat ali özkan kayacı,vedat ali ozkan kayaci,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[v, e, d, a, t, , a, l, i, , o, z, k, a, n, ...","[(v, e), (e, d), (d, a), (a, t), (t, ), ( , a...","[(v, e, d), (e, d, a), (d, a, t), (a, t, ), (...","[v, e, d, a, t, , a, l, i, , o, z, k, a, n, ...",19,4,4.75,0,0,3,"{' ': 0.13636363636363635, ''': 0, '-': 0, 'a'..."


In [51]:
# BIGRAMS individual frequency distributions
df2_turk['indiv_bigrams_fdist'] = df2_turk['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

# checking that it works for 'supriyadi'
print(df2_turk.iloc[0]['indiv_bigrams_fdist'][('s', 'u')])
print(1 / len(df2_turk.iloc[0]['bigrams']))

df2_turk.head()

0
0.06666666666666667


Unnamed: 0,id,original_fullname,fullname,transliteration,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length,period_freq,dash_freq,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist
0,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,melahat abbasova,melahat abbasova,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,2,7.5,0,0,1,"{' ': 0.0625, ''': 0, '-': 0, 'a': 0.3125, 'b'...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
1,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,freddy sahin-scholl,freddy sahin-scholl,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,2,9.0,0,1,1,"{' ': 0.05263157894736842, ''': 0, '-': 0.0526...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
2,http://www.wikidata.org/entity/Q217097,Richard Kingson,richard kingson,richard kingson,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,2,7.0,0,0,1,"{' ': 0.06666666666666667, ''': 0, '-': 0, 'a'...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
3,http://www.wikidata.org/entity/Q110126,Hrant Dink,hrant dink,hrant dink,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,2,4.5,0,0,1,"{' ': 0.1, ''': 0, '-': 0, 'a': 0.1, 'b': 0, '...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
4,http://www.wikidata.org/entity/Q42079,Ricky Winslow,ricky winslow,ricky winslow,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,2,6.0,0,0,1,"{' ': 0.07692307692307693, ''': 0, '-': 0, 'a'...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."


In [52]:
df2_turk['indiv_trigrams_fdist'] = df2_turk['trigrams'].apply(lambda entry: initialized_trigrams.copy())

In [53]:
'''
Function to be applied to each row of a DataFrame. Sets and returns a hashmap of the relative trigrams frequency distribution for the current example.

trigrams_list: the list of trigrams for this current example.
init_trigrams: a hashmap of all possible trigrams as the keys and all values set to 0.
'''
def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    trigrams_fdist_relative = init_trigrams
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

# TRIGRAMS individual frequency distributions
df2_turk['indiv_trigrams_fdist'] = df2_turk.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

In [51]:
"""
# Checking 0th example""
print(df2_turk.loc[0, 'indiv_trigrams_fdist'][('m', 'e', 'l')])
print(1 / len(df2_turk.loc[0, 'trigrams'])) # manual calculation

# Checking 1st example
print(df2_turk.loc[1, 'fullname'])
print(df2_turk.loc[1, 'indiv_trigrams_fdist'][('e', 'l', 'a')])
print(1 / len(df2_turk.loc[1, 'trigrams'])) # manual calculation
"""

'\n# Checking 0th example""\nprint(df2_turk.loc[0, \'indiv_trigrams_fdist\'][(\'m\', \'e\', \'l\')])\nprint(1 / len(df2_turk.loc[0, \'trigrams\'])) # manual calculation\n\n# Checking 1st example\nprint(df2_turk.loc[1, \'fullname\'])\nprint(df2_turk.loc[1, \'indiv_trigrams_fdist\'][(\'e\', \'l\', \'a\')])\nprint(1 / len(df2_turk.loc[1, \'trigrams\'])) # manual calculation\n'

In [54]:
 #This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2_turk['indiv_unigrams_fdist'] = df2_turk['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [55]:
# Calculating cosine similarity
df2_turk['unigrams_cosine_sim'] = df2_turk['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])

In [56]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2_turk['indiv_bigrams_fdist'] = df2_turk['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

In [57]:
# Calculating cosine similarity
df2_turk['bigrams_cosine_sim'] = df2_turk['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])


In [58]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2_turk['indiv_trigrams_fdist'] = df2_turk['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
trigram_fdist = np.fromiter(trigram_fdist.values(), dtype = float).reshape(1, -1)

In [59]:
# Calculating cosine similarity
df2_turk['trigrams_cosine_sim'] = df2_turk['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
df2_turk.head()

Unnamed: 0,id,original_fullname,fullname,transliteration,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,...,avg_token_length,period_freq,dash_freq,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim
0,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,melahat abbasova,melahat abbasova,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,...,7.5,0,0,1,"[[0.0625, 0.0, 0.0, 0.3125, 0.125, 0.0, 0.0, 0...","[[0.0, 0.0, 0.0, 0.06666666666666667, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710095,0.320742,0.091389
1,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,freddy sahin-scholl,freddy sahin-scholl,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,...,9.0,0,1,1,"[[0.05263157894736842, 0.0, 0.0526315789473684...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.675591,0.162482,0.052841
2,http://www.wikidata.org/entity/Q217097,Richard Kingson,richard kingson,richard kingson,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,...,7.0,0,0,1,"[[0.06666666666666667, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.721316,0.226842,0.037285
3,http://www.wikidata.org/entity/Q110126,Hrant Dink,hrant dink,hrant dink,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,...,4.5,0,0,1,"[[0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.111111111111...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.704609,0.311383,0.056722
4,http://www.wikidata.org/entity/Q42079,Ricky Winslow,ricky winslow,ricky winslow,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,...,6.0,0,0,1,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.563211,0.092439,0.003844


In [60]:
df2_turk

Unnamed: 0,id,original_fullname,fullname,transliteration,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,...,avg_token_length,period_freq,dash_freq,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim
0,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,melahat abbasova,melahat abbasova,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,...,7.50,0,0,1,"[[0.0625, 0.0, 0.0, 0.3125, 0.125, 0.0, 0.0, 0...","[[0.0, 0.0, 0.0, 0.06666666666666667, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710095,0.320742,0.091389
1,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,freddy sahin-scholl,freddy sahin-scholl,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,...,9.00,0,1,1,"[[0.05263157894736842, 0.0, 0.0526315789473684...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.675591,0.162482,0.052841
2,http://www.wikidata.org/entity/Q217097,Richard Kingson,richard kingson,richard kingson,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,...,7.00,0,0,1,"[[0.06666666666666667, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.721316,0.226842,0.037285
3,http://www.wikidata.org/entity/Q110126,Hrant Dink,hrant dink,hrant dink,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,...,4.50,0,0,1,"[[0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.111111111111...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.704609,0.311383,0.056722
4,http://www.wikidata.org/entity/Q42079,Ricky Winslow,ricky winslow,ricky winslow,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,...,6.00,0,0,1,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.563211,0.092439,0.003844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18032,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,nil ipek hulagu ozturkmen,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...","[(n, i), (i, l), (l, ), ( , i), (i, p), (p, e...","[(n, i, l), (i, l, ), (l, , i), ( , i, p), (...","[n, i, l, , i, p, e, k, , h, u, l, a, g, u, ...",22,...,5.50,0,0,3,"[[0.12, 0.0, 0.0, 0.04, 0.0, 0.0, 0.0, 0.08, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.844140,0.293266,0.099437
18033,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,fatma betül sayan kaya,fatma betul sayan kaya,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, u, l, , s, a, y, ...",19,...,4.75,0,0,3,"[[0.13636363636363635, 0.0, 0.0, 0.27272727272...","[[0.0, 0.0, 0.0, 0.0, 0.047619047619047616, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.802330,0.451886,0.205497
18034,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,elif nur bozkurt tandogan,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,...,5.50,0,0,3,"[[0.12, 0.0, 0.0, 0.08, 0.04, 0.0, 0.04, 0.04,...","[[0.0, 0.0, 0.0, 0.0, 0.041666666666666664, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.838028,0.386151,0.120993
18035,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,muhammed ali fatih erbakan,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,...,5.75,0,0,3,"[[0.11538461538461539, 0.0, 0.0, 0.19230769230...","[[0.0, 0.0, 0.0, 0.04, 0.0, 0.0, 0.0, 0.04, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.884643,0.448618,0.152740


In [61]:
# Save with compression (gzip)
df2_turk.to_pickle('../pickled_dataframes/turkish_df.pkl.gz', compression='gzip')

In [62]:
df_loaded_gz = pd.read_pickle('../pickled_dataframes/turkish_df.pkl.gz', compression='gzip')