In [1]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.util import ngrams
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Reading in the Turkish nams 
df_turk  =  pd.read_excel('name_data/exigerData/EXGR_Turkish names2.xlsx')
print(df_turk)

                                            id                      label_tr  \
0       http://www.wikidata.org/entity/Q282782                          Ceza   
1       http://www.wikidata.org/entity/Q288019                        Gülşen   
2       http://www.wikidata.org/entity/Q236047                        Hadise   
3       http://www.wikidata.org/entity/Q236920                         Emrah   
4       http://www.wikidata.org/entity/Q182948                        Göksel   
...                                        ...                           ...   
18769  http://www.wikidata.org/entity/Q6053953     Elif Nur Bozkurt Tandoğan   
18770  http://www.wikidata.org/entity/Q6085044    Muhammed Ali Fatih Erbakan   
18771  http://www.wikidata.org/entity/Q6041731      İsmail Hamit Özer Derbil   
18772  http://www.wikidata.org/entity/Q6071196        Vedat Ali Özkan Kayacı   
18773     http://www.wikidata.org/entity/Q5031  Elder Paisios of Mount Athos   

      comment          Given name Middl

In [3]:
#printing out the shape 
df_turk.shape

(18774, 8)

In [4]:
#Printing out the head
df_turk.head()

Unnamed: 0,id,label_tr,comment,Given name,Middle Name,Family name,Maiden Name,Unnamed: 7
0,http://www.wikidata.org/entity/Q282782,Ceza,x,,,,,1
1,http://www.wikidata.org/entity/Q288019,Gülşen,x,,,,,1
2,http://www.wikidata.org/entity/Q236047,Hadise,x,,,,,1
3,http://www.wikidata.org/entity/Q236920,Emrah,x,,,,,1
4,http://www.wikidata.org/entity/Q182948,Göksel,x,,,,,1


In [5]:
#checking if there is any null entries in the datasets
df_turk.isnull().values.any()

True

In [6]:
# checking what's in the comment columns
df_turk['comment']

0          x
1          x
2          x
3          x
4          x
        ... 
18769    NaN
18770    NaN
18771      x
18772    NaN
18773      x
Name: comment, Length: 18774, dtype: object

In [7]:
df_turk.isnull().head()

Unnamed: 0,id,label_tr,comment,Given name,Middle Name,Family name,Maiden Name,Unnamed: 7
0,False,False,False,True,True,True,True,False
1,False,False,False,True,True,True,True,False
2,False,False,False,True,True,True,True,False
3,False,False,False,True,True,True,True,False
4,False,False,False,True,True,True,True,False


In [8]:
#checking how many null entries each columns have 
nan_count = np.sum(df_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18318
Given name       455
Middle Name    18772
Family name      459
Maiden Name    18763
Unnamed: 7         0
dtype: int64

In [9]:
# checking if the datasets have any duplicates 
df_turk['label_tr'].duplicated().any()

True

In [10]:
#check if there is any null entries 
np.any(df_turk['label_tr'].isnull())

False

In [11]:
#just checking the shape before dropping dupliates (18774,8)
df_turk.shape

(18774, 8)

In [12]:
#Dropping the duplicates(place in a new variable)
df2_turk = df_turk.drop_duplicates(subset=['label_tr'])
print(df2_turk)

                                            id                      label_tr  \
0       http://www.wikidata.org/entity/Q282782                          Ceza   
1       http://www.wikidata.org/entity/Q288019                        Gülşen   
2       http://www.wikidata.org/entity/Q236047                        Hadise   
3       http://www.wikidata.org/entity/Q236920                         Emrah   
4       http://www.wikidata.org/entity/Q182948                        Göksel   
...                                        ...                           ...   
18769  http://www.wikidata.org/entity/Q6053953     Elif Nur Bozkurt Tandoğan   
18770  http://www.wikidata.org/entity/Q6085044    Muhammed Ali Fatih Erbakan   
18771  http://www.wikidata.org/entity/Q6041731      İsmail Hamit Özer Derbil   
18772  http://www.wikidata.org/entity/Q6071196        Vedat Ali Özkan Kayacı   
18773     http://www.wikidata.org/entity/Q5031  Elder Paisios of Mount Athos   

      comment          Given name Middl

In [13]:
#checking if there are still duplicates
df2_turk['label_tr'].duplicated().any()


False

In [14]:
#Just checking if th shape went down(which it did (18774,8) to (18491,8))
df2_turk.shape

(18491, 8)

In [15]:
# checking if the null values went down (it did) 
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18037
Given name       453
Middle Name    18489
Family name      457
Maiden Name    18480
Unnamed: 7         0
dtype: int64

In [16]:
#printing the col_names with null values
condition = nan_count != 0
col_names = nan_count[condition].index
nan_cols = list(col_names)
print(nan_cols)

['comment', 'Given name', 'Middle Name', 'Family name', 'Maiden Name']


In [17]:
print(df2_turk['comment'].unique())
print(df2_turk['Given name'].unique())
print(df2_turk['Middle Name'].unique())
print(df2_turk['Family name'].unique())
print(df2_turk['Maiden Name'].unique())

['x' nan]
[nan 'Melahat' 'Freddy' ... 'Elif Nur Bozkurt ' 'Muhammed Ali Fatih'
 'Vedat Ali Özkan']
[nan 'Sahin' 'Kemal']
[nan 'Abbasova' 'Scholl' ... 'Kapıcıoğlu' 'Ali Marandi' 'Bektur']
[nan 'Önder' 'Kafaoğlu' 'Türker' 'Doğan' 'Karahan' 'Sabancı' 'Ekşi' 'İnan'
 'Tosun' 'Sayan']


In [18]:
# drop comments row with x 
comments_drop_with_x =  df2_turk['comment'] == 'x'
df2_turk = df2_turk.drop(df2_turk[df2_turk['comment'] == 'x'].index)
print(df2_turk)
#print(comments_drop)
#df2_turk['comment']

                                             id                    label_tr  \
125     http://www.wikidata.org/entity/Q3508136            Melahat Abbasova   
126       http://www.wikidata.org/entity/Q96602         Freddy Sahin-Scholl   
127      http://www.wikidata.org/entity/Q217097             Richard Kingson   
129      http://www.wikidata.org/entity/Q110126                  Hrant Dink   
130       http://www.wikidata.org/entity/Q42079               Ricky Winslow   
...                                         ...                         ...   
18764  http://www.wikidata.org/entity/Q49703809   Nil İpek Hülagü Öztürkmen   
18765  http://www.wikidata.org/entity/Q24230049      Fatma Betül Sayan Kaya   
18769   http://www.wikidata.org/entity/Q6053953   Elif Nur Bozkurt Tandoğan   
18770   http://www.wikidata.org/entity/Q6085044  Muhammed Ali Fatih Erbakan   
18772   http://www.wikidata.org/entity/Q6071196      Vedat Ali Özkan Kayacı   

      comment          Given name Middle Name Famil

In [19]:
#check the shape again( went form (18491,8) to (18037, 8))
df2_turk.shape

(18037, 8)

In [20]:
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18037
Given name         0
Middle Name    18035
Family name        4
Maiden Name    18026
Unnamed: 7         0
dtype: int64

In [21]:
non_alnum_names_turk = [name for name in df2_turk['label_tr'] if not str.isalnum(name.replace(' ', ''))]
print(len(non_alnum_names_turk))
non_alnum_names_turk

30


['Freddy Sahin-Scholl',
 'Perihan Önder-Ridder',
 'Erhan-Can Kartal',
 'Ali-Özgür Özdil',
 'Mehmet Aga-Oglu',
 'Ahmed Agha-Oghlu',
 'Hatice Aksoy-Woinek',
 'Bugha al-Sharabi',
 'Iffat Al-Thunayan',
 "Nev'îzâde Atâyî",
 'Üstün Bilgen-Reinart',
 'Mehpare Bozyigit-Kirchmann',
 'Elif Çağlar-Muslu',
 "Cem'i Demiroğlu",
 "Neş'e Erdok",
 'Asuman Kafaoğlu-Büke',
 'Elçin Kürşat-Ahlers',
 'Gönül Sen-Menzel',
 'Tülay Sözbir-Seidel',
 "Gaybi Sun'ullah",
 'Ayshe Talay-Ongan',
 'Sevgi Türker-Terlemez',
 "Mümtaz'er Türköne",
 'Sabiha Bânu Yalkut-Breddermann',
 "Temel Nücûm'i Göksel",
 'Molla Ahmed-i Cezirî',
 "Dipika O'Neill Joti",
 "Abdurrahman Necati Kara'a",
 'Kerimüddin Mahmud-i Aksarayî',
 'Salah al-Din Zarkub']

In [22]:
#drop extra column we don't need 
df2_turk = df2_turk.drop(columns = ['Unnamed: 7'])

In [23]:
nan_count = np.sum(df2_turk.isnull(), axis = 0)
nan_count

id                 0
label_tr           0
comment        18037
Given name         0
Middle Name    18035
Family name        4
Maiden Name    18026
dtype: int64

In [24]:
#Drop columns that won't be used in df2
print(df2_turk.columns)
df2_turk = df2_turk.drop(columns= [ 'comment', 'Given name', 'Middle Name', 'Family name',
       'Maiden Name'])


Index(['id', 'label_tr', 'comment', 'Given name', 'Middle Name', 'Family name',
       'Maiden Name'],
      dtype='object')


In [26]:
#Create lowercase colun
df2_turk['original_fullname'] = df2_turk['label_tr']
df2_turk['fullname'] = df2_turk['label_tr'].apply(str.lower)
df2_turk

Unnamed: 0,id,label_tr,original_fullname,fullname
125,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,Melahat Abbasova,melahat abbasova
126,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,Freddy Sahin-Scholl,freddy sahin-scholl
127,http://www.wikidata.org/entity/Q217097,Richard Kingson,Richard Kingson,richard kingson
129,http://www.wikidata.org/entity/Q110126,Hrant Dink,Hrant Dink,hrant dink
130,http://www.wikidata.org/entity/Q42079,Ricky Winslow,Ricky Winslow,ricky winslow
...,...,...,...,...
18764,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen
18765,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,Fatma Betül Sayan Kaya,fatma betül sayan kaya
18769,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan
18770,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan


Now For Feature Engineering 

In [28]:
#creating features to determine the alphabet
determine_alphabet = [unicodedata.name(name[0]).split(' ')[0] for name in df2_turk['fullname']]
#df2_turk['determine_alphabet'] = df2_turk['label_tr'].apply(lambda name: unicodedata.name(name[0]).split(' ')[0])
df2_turk['alphabet'] = determine_alphabet
print(df2_turk['alphabet'])


125      LATIN
126      LATIN
127      LATIN
129      LATIN
130      LATIN
         ...  
18764    LATIN
18765    LATIN
18769    LATIN
18770    LATIN
18772    LATIN
Name: alphabet, Length: 18037, dtype: object


In [29]:
#char grams
def get_ngrams(text, n):
    if isinstance(text,str):
        name = list(text)
    ngrams_list =  list(ngrams(list(text), n))
    return ngrams_list

df2_turk["unigrams"] = df2_turk['fullname'].apply(lambda name: list(name) if isinstance(name, str) else [])
df2_turk["bigrams"] = df2_turk['fullname'].apply(lambda name: get_ngrams(name,2) if isinstance(name, str) else [])
df2_turk["trigrams"] = df2_turk['fullname'].apply(lambda name: get_ngrams(name,3) if isinstance(name, str) else [])

df2_turk['char_ngrams'] = df2_turk["unigrams"] + df2_turk["bigrams"] + df2_turk["trigrams"]

In [30]:
print(df2_turk['char_ngrams'])

125      [m, e, l, a, h, a, t,  , a, b, b, a, s, o, v, ...
126      [f, r, e, d, d, y,  , s, a, h, i, n, -, s, c, ...
127      [r, i, c, h, a, r, d,  , k, i, n, g, s, o, n, ...
129      [h, r, a, n, t,  , d, i, n, k, (h, r), (r, a),...
130      [r, i, c, k, y,  , w, i, n, s, l, o, w, (r, i)...
                               ...                        
18764    [n, i, l,  , i, ̇, p, e, k,  , h, ü, l, a, g, ...
18765    [f, a, t, m, a,  , b, e, t, ü, l,  , s, a, y, ...
18769    [e, l, i, f,  , n, u, r,  , b, o, z, k, u, r, ...
18770    [m, u, h, a, m, m, e, d,  , a, l, i,  , f, a, ...
18772    [v, e, d, a, t,  , a, l, i,  , ö, z, k, a, n, ...
Name: char_ngrams, Length: 18037, dtype: object


In [31]:
# feature to check the name length
def name_length(name):
    return len(name.replace(' ', ''))

df2_turk['name_length'] = df2_turk['fullname'].apply(name_length)

print(df2_turk['name_length'])

125      15
126      18
127      14
129       9
130      12
         ..
18764    23
18765    19
18769    22
18770    23
18772    19
Name: name_length, Length: 18037, dtype: int64


In [32]:
# feature to check the name length
def token_length(name):
    return len(name.split())

df2_turk['num_tokens'] = df2_turk['fullname'].apply(token_length)

print(df2_turk['num_tokens'])

125      2
126      2
127      2
129      2
130      2
        ..
18764    4
18765    4
18769    4
18770    4
18772    4
Name: num_tokens, Length: 18037, dtype: int64


In [33]:
tokens = df2_turk['fullname'].apply(lambda name: name.split(' '))
print(tokens[-5:], '\n')
token_lengths = tokens.apply(lambda token_list: [len(token) for token in token_list])
print(token_lengths[-5:])
df2_turk['avg_token_length'] = token_lengths.apply(np.mean)
df2_turk.tail()

18764    [nil, i̇pek, hülagü, öztürkmen]
18765        [fatma, betül, sayan, kaya]
18769     [elif, nur, bozkurt, tandoğan]
18770    [muhammed, ali, fatih, erbakan]
18772        [vedat, ali, özkan, kayacı]
Name: fullname, dtype: object 

18764    [3, 5, 6, 9]
18765    [5, 5, 5, 4]
18769    [4, 3, 7, 8]
18770    [8, 3, 5, 7]
18772    [5, 3, 5, 6]
Name: fullname, dtype: object


Unnamed: 0,id,label_tr,original_fullname,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length
18764,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,LATIN,"[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...","[(n, i), (i, l), (l, ), ( , i), (i, ̇), (̇, p...","[(n, i, l), (i, l, ), (l, , i), ( , i, ̇), (...","[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...",23,4,5.75
18765,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,Fatma Betül Sayan Kaya,fatma betül sayan kaya,LATIN,"[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...",19,4,4.75
18769,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,LATIN,"[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,4,5.5
18770,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,LATIN,"[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,4,5.75
18772,http://www.wikidata.org/entity/Q6071196,Vedat Ali Özkan Kayacı,Vedat Ali Özkan Kayacı,vedat ali özkan kayacı,LATIN,"[v, e, d, a, t, , a, l, i, , ö, z, k, a, n, ...","[(v, e), (e, d), (d, a), (a, t), (t, ), ( , a...","[(v, e, d), (e, d, a), (d, a, t), (a, t, ), (...","[v, e, d, a, t, , a, l, i, , ö, z, k, a, n, ...",19,4,4.75


In [34]:
df2_turk['period_freq'] = df2_turk['fullname'].apply(lambda name: name.count('.'))
df2_turk['dash_freq'] = df2_turk['fullname'].apply(lambda name: name.count('-'))
df2_turk['space_freq'] = df2_turk['fullname'].apply(lambda name: name.count(' '))

In [35]:
# Have to do the transltion funtion 
def create_character_frequency_hashmap(df, names_col):
    char_freqs = {}
    for name in df[names_col]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
    return char_freqs


In [36]:
# add transaliteration
df2_turk['transliteration'] = df2_turk['fullname'].apply(lambda name: unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8'))
result = df2_turk[df2_turk['fullname'] != df2_turk['transliteration']][['fullname', 'transliteration']]

In [37]:
df2_turk


Unnamed: 0,id,label_tr,original_fullname,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length,period_freq,dash_freq,space_freq,transliteration
125,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,Melahat Abbasova,melahat abbasova,LATIN,"[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,2,7.50,0,0,1,melahat abbasova
126,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,Freddy Sahin-Scholl,freddy sahin-scholl,LATIN,"[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,2,9.00,0,1,1,freddy sahin-scholl
127,http://www.wikidata.org/entity/Q217097,Richard Kingson,Richard Kingson,richard kingson,LATIN,"[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,2,7.00,0,0,1,richard kingson
129,http://www.wikidata.org/entity/Q110126,Hrant Dink,Hrant Dink,hrant dink,LATIN,"[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,2,4.50,0,0,1,hrant dink
130,http://www.wikidata.org/entity/Q42079,Ricky Winslow,Ricky Winslow,ricky winslow,LATIN,"[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,2,6.00,0,0,1,ricky winslow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18764,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,LATIN,"[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...","[(n, i), (i, l), (l, ), ( , i), (i, ̇), (̇, p...","[(n, i, l), (i, l, ), (l, , i), ( , i, ̇), (...","[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...",23,4,5.75,0,0,3,nil ipek hulagu ozturkmen
18765,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,Fatma Betül Sayan Kaya,fatma betül sayan kaya,LATIN,"[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...",19,4,4.75,0,0,3,fatma betul sayan kaya
18769,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,LATIN,"[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,4,5.50,0,0,3,elif nur bozkurt tandogan
18770,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,LATIN,"[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,4,5.75,0,0,3,muhammed ali fatih erbakan


In [38]:
'''
Function that returns the relative frequency distribution for characters, aka unigrams, across the entire language.
Returns a hashmap sorted by the ASCII values of the keys in ascending order.

df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

In [39]:
# Creating the unigrams frequency distribution for the entire Indonesian language
unigram_fdist = create_lang_char_distribution(df2_turk, 'fullname')
print(len(unigram_fdist))
unigram_fdist

59


{' ': 0.08551088432574791,
 "'": 3.292045594831489e-05,
 '-': 9.053125385786593e-05,
 'a': 0.1269824287066376,
 'b': 0.017921073206863914,
 'c': 0.013933582980124275,
 'd': 0.021925023661577715,
 'e': 0.08330521377721081,
 'f': 0.011028352742685486,
 'g': 0.014205176741697873,
 'h': 0.02640220567054854,
 'i': 0.05496481626270524,
 'j': 0.000502036953211802,
 'k': 0.04339739105386609,
 'l': 0.0526192337763878,
 'm': 0.04385416238014896,
 'n': 0.06247479527591457,
 'o': 0.021443561993333606,
 'p': 0.006847454837249496,
 'q': 1.2345170980618081e-05,
 'r': 0.06075881650960866,
 's': 0.03056252829101683,
 't': 0.043261594173079294,
 'u': 0.03701082259989301,
 'v': 0.008357680753878442,
 'w': 7.40710258837085e-05,
 'x': 2.8805398954775523e-05,
 'y': 0.026336364758651906,
 'z': 0.019402493724538086,
 'à': 4.115056993539361e-06,
 'â': 0.00033743467347022757,
 'ä': 1.2345170980618081e-05,
 'ç': 0.010769104152092507,
 'è': 4.115056993539361e-06,
 'é': 4.11505699353936e-05,
 'ê': 8.23011398707872

In [40]:
'''
Function that returns all possible bigrams as a hashmap. Each possible bigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams
    

In [41]:
# Initializing all possible bigrams using all possible characters from unigrams frequency distribution
initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
initialized_bigrams

{(' ', ' '): 0,
 (' ', "'"): 0,
 (' ', '-'): 0,
 (' ', 'a'): 0,
 (' ', 'b'): 0,
 (' ', 'c'): 0,
 (' ', 'd'): 0,
 (' ', 'e'): 0,
 (' ', 'f'): 0,
 (' ', 'g'): 0,
 (' ', 'h'): 0,
 (' ', 'i'): 0,
 (' ', 'j'): 0,
 (' ', 'k'): 0,
 (' ', 'l'): 0,
 (' ', 'm'): 0,
 (' ', 'n'): 0,
 (' ', 'o'): 0,
 (' ', 'p'): 0,
 (' ', 'q'): 0,
 (' ', 'r'): 0,
 (' ', 's'): 0,
 (' ', 't'): 0,
 (' ', 'u'): 0,
 (' ', 'v'): 0,
 (' ', 'w'): 0,
 (' ', 'x'): 0,
 (' ', 'y'): 0,
 (' ', 'z'): 0,
 (' ', 'à'): 0,
 (' ', 'â'): 0,
 (' ', 'ä'): 0,
 (' ', 'ç'): 0,
 (' ', 'è'): 0,
 (' ', 'é'): 0,
 (' ', 'ê'): 0,
 (' ', 'ì'): 0,
 (' ', 'î'): 0,
 (' ', 'ï'): 0,
 (' ', 'ó'): 0,
 (' ', 'ö'): 0,
 (' ', 'ø'): 0,
 (' ', 'ù'): 0,
 (' ', 'û'): 0,
 (' ', 'ü'): 0,
 (' ', 'ý'): 0,
 (' ', 'ą'): 0,
 (' ', 'ć'): 0,
 (' ', 'č'): 0,
 (' ', 'ğ'): 0,
 (' ', 'ı'): 0,
 (' ', 'ş'): 0,
 (' ', 'š'): 0,
 (' ', 'ũ'): 0,
 (' ', 'ű'): 0,
 (' ', 'ž'): 0,
 (' ', 'ǧ'): 0,
 (' ', 'ș'): 0,
 (' ', '̇'): 0,
 ("'", ' '): 0,
 ("'", "'"): 0,
 ("'", '-'): 0,
 ("'", '

In [42]:
'''
Function that returns the relative frequency distribution for -grams (bigrams, trigrams, etc.) across the entire language.
Returns a hashmap.

initialized_grams: a hashmap with all possible -grams as keys and all values initialized to 0. This parameter is copied in the function.
df: a Pandas DataFrame with examples in only one language.
col_name: the name of the column where each entry is a list of -grams for the corresponding example.
'''
def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

In [43]:
# Creating the bigrams frequency distribution for the entire turkish language
bigram_fdist = create_lang_gram_distribution(initialized_bigrams, df2_turk, 'bigrams')
bigram_fdist

{(' ', ' '): 0.0,
 (' ', "'"): 0.0,
 (' ', '-'): 0.0,
 (' ', 'a'): 0.011405813142021487,
 (' ', 'b'): 0.006680801696203545,
 (' ', 'c'): 0.0022047090095255874,
 (' ', 'd'): 0.00435607828494975,
 (' ', 'e'): 0.005538442390864682,
 (' ', 'f'): 0.0012668186849088558,
 (' ', 'g'): 0.005027269939059354,
 (' ', 'h'): 0.0024269579016148604,
 (' ', 'i'): 0.002222488920892729,
 (' ', 'j'): 2.2224889208927295e-05,
 (' ', 'k'): 0.010187889213372272,
 (' ', 'l'): 0.0003511532495010512,
 (' ', 'm'): 0.002604757015286279,
 (' ', 'n'): 0.0014579527321056304,
 (' ', 'o'): 0.0019157854498095326,
 (' ', 'p'): 0.001462397709947416,
 (' ', 'q'): 0.0,
 (' ', 'r'): 0.0010579047263449392,
 (' ', 's'): 0.006262973779075712,
 (' ', 't'): 0.0060673947540371516,
 (' ', 'u'): 0.0017290963804545434,
 (' ', 'v'): 0.0005778471194321096,
 (' ', 'w'): 1.7779911367141836e-05,
 (' ', 'x'): 0.0,
 (' ', 'y'): 0.004529432420779383,
 (' ', 'z'): 0.0008356558342556662,
 (' ', 'à'): 0.0,
 (' ', 'â'): 2.6669867050712754e-05,
 

In [44]:
'''
Function that returns all possible trigrams as a hashmap. Each possible trigram is a key, and each value is set to 0.

all_possible_chars: a list of all possible characters sorted by ASCII value.
'''
def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

In [45]:
# Finding all possible transliterated characters
all_possible_chars_translit = create_lang_char_distribution(df2_turk, 'transliteration').keys()
print(len(all_possible_chars_translit))

# Creating all possible trigrams from transliterated characters
initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
print(len(initialized_trigrams))
initialized_trigrams

29
24389


{(' ', ' ', ' '): 0,
 (' ', ' ', "'"): 0,
 (' ', ' ', '-'): 0,
 (' ', ' ', 'a'): 0,
 (' ', ' ', 'b'): 0,
 (' ', ' ', 'c'): 0,
 (' ', ' ', 'd'): 0,
 (' ', ' ', 'e'): 0,
 (' ', ' ', 'f'): 0,
 (' ', ' ', 'g'): 0,
 (' ', ' ', 'h'): 0,
 (' ', ' ', 'i'): 0,
 (' ', ' ', 'j'): 0,
 (' ', ' ', 'k'): 0,
 (' ', ' ', 'l'): 0,
 (' ', ' ', 'm'): 0,
 (' ', ' ', 'n'): 0,
 (' ', ' ', 'o'): 0,
 (' ', ' ', 'p'): 0,
 (' ', ' ', 'q'): 0,
 (' ', ' ', 'r'): 0,
 (' ', ' ', 's'): 0,
 (' ', ' ', 't'): 0,
 (' ', ' ', 'u'): 0,
 (' ', ' ', 'v'): 0,
 (' ', ' ', 'w'): 0,
 (' ', ' ', 'x'): 0,
 (' ', ' ', 'y'): 0,
 (' ', ' ', 'z'): 0,
 (' ', "'", ' '): 0,
 (' ', "'", "'"): 0,
 (' ', "'", '-'): 0,
 (' ', "'", 'a'): 0,
 (' ', "'", 'b'): 0,
 (' ', "'", 'c'): 0,
 (' ', "'", 'd'): 0,
 (' ', "'", 'e'): 0,
 (' ', "'", 'f'): 0,
 (' ', "'", 'g'): 0,
 (' ', "'", 'h'): 0,
 (' ', "'", 'i'): 0,
 (' ', "'", 'j'): 0,
 (' ', "'", 'k'): 0,
 (' ', "'", 'l'): 0,
 (' ', "'", 'm'): 0,
 (' ', "'", 'n'): 0,
 (' ', "'", 'o'): 0,
 (' ', "'", '

In [46]:
# Changing trigrams column to become transliterated
df2_turk['trigrams'] = df2_turk['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))

# Creating the trigrams frequency distribution for the entire Indonesian language
trigram_fdist = create_lang_gram_distribution(initialized_trigrams, df2_turk, 'trigrams')
trigram_fdist




{(' ', ' ', ' '): 0.0,
 (' ', ' ', "'"): 0.0,
 (' ', ' ', '-'): 0.0,
 (' ', ' ', 'a'): 0.0,
 (' ', ' ', 'b'): 0.0,
 (' ', ' ', 'c'): 0.0,
 (' ', ' ', 'd'): 0.0,
 (' ', ' ', 'e'): 0.0,
 (' ', ' ', 'f'): 0.0,
 (' ', ' ', 'g'): 0.0,
 (' ', ' ', 'h'): 0.0,
 (' ', ' ', 'i'): 0.0,
 (' ', ' ', 'j'): 0.0,
 (' ', ' ', 'k'): 0.0,
 (' ', ' ', 'l'): 0.0,
 (' ', ' ', 'm'): 0.0,
 (' ', ' ', 'n'): 0.0,
 (' ', ' ', 'o'): 0.0,
 (' ', ' ', 'p'): 0.0,
 (' ', ' ', 'q'): 0.0,
 (' ', ' ', 'r'): 0.0,
 (' ', ' ', 's'): 0.0,
 (' ', ' ', 't'): 0.0,
 (' ', ' ', 'u'): 0.0,
 (' ', ' ', 'v'): 0.0,
 (' ', ' ', 'w'): 0.0,
 (' ', ' ', 'x'): 0.0,
 (' ', ' ', 'y'): 0.0,
 (' ', ' ', 'z'): 0.0,
 (' ', "'", ' '): 0.0,
 (' ', "'", "'"): 0.0,
 (' ', "'", '-'): 0.0,
 (' ', "'", 'a'): 0.0,
 (' ', "'", 'b'): 0.0,
 (' ', "'", 'c'): 0.0,
 (' ', "'", 'd'): 0.0,
 (' ', "'", 'e'): 0.0,
 (' ', "'", 'f'): 0.0,
 (' ', "'", 'g'): 0.0,
 (' ', "'", 'h'): 0.0,
 (' ', "'", 'i'): 0.0,
 (' ', "'", 'j'): 0.0,
 (' ', "'", 'k'): 0.0,
 (' ', "'",

In [47]:
initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
initialized_unigrams

{' ': 0,
 "'": 0,
 '-': 0,
 'a': 0,
 'b': 0,
 'c': 0,
 'd': 0,
 'e': 0,
 'f': 0,
 'g': 0,
 'h': 0,
 'i': 0,
 'j': 0,
 'k': 0,
 'l': 0,
 'm': 0,
 'n': 0,
 'o': 0,
 'p': 0,
 'q': 0,
 'r': 0,
 's': 0,
 't': 0,
 'u': 0,
 'v': 0,
 'w': 0,
 'x': 0,
 'y': 0,
 'z': 0,
 'à': 0,
 'â': 0,
 'ä': 0,
 'ç': 0,
 'è': 0,
 'é': 0,
 'ê': 0,
 'ì': 0,
 'î': 0,
 'ï': 0,
 'ó': 0,
 'ö': 0,
 'ø': 0,
 'ù': 0,
 'û': 0,
 'ü': 0,
 'ý': 0,
 'ą': 0,
 'ć': 0,
 'č': 0,
 'ğ': 0,
 'ı': 0,
 'ş': 0,
 'š': 0,
 'ũ': 0,
 'ű': 0,
 'ž': 0,
 'ǧ': 0,
 'ș': 0,
 '̇': 0}

In [48]:
'''
Function to be applied to an ngrams column. Returns a hashmap of the relative frequency distribution for the current example.

grams_list: the list of -grams for this current example.
initialized_grams: a hashmap of all possible unigrams, bigrams, or trigrams as the keys and all values set to 0. This parameter is copied in the function.
'''
def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative


In [49]:
# UNIGRAMS individual frequency distributions
df2_turk['indiv_unigrams_fdist'] = df2_turk['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))

# checking that the functin worked for our first example, 'supriyadi'
print(df2_turk.iloc[0]['indiv_unigrams_fdist'])

df2_turk.tail()

{' ': 0.0625, "'": 0, '-': 0, 'a': 0.3125, 'b': 0.125, 'c': 0, 'd': 0, 'e': 0.0625, 'f': 0, 'g': 0, 'h': 0.0625, 'i': 0, 'j': 0, 'k': 0, 'l': 0.0625, 'm': 0.0625, 'n': 0, 'o': 0.0625, 'p': 0, 'q': 0, 'r': 0, 's': 0.0625, 't': 0.0625, 'u': 0, 'v': 0.0625, 'w': 0, 'x': 0, 'y': 0, 'z': 0, 'à': 0, 'â': 0, 'ä': 0, 'ç': 0, 'è': 0, 'é': 0, 'ê': 0, 'ì': 0, 'î': 0, 'ï': 0, 'ó': 0, 'ö': 0, 'ø': 0, 'ù': 0, 'û': 0, 'ü': 0, 'ý': 0, 'ą': 0, 'ć': 0, 'č': 0, 'ğ': 0, 'ı': 0, 'ş': 0, 'š': 0, 'ũ': 0, 'ű': 0, 'ž': 0, 'ǧ': 0, 'ș': 0, '̇': 0}


Unnamed: 0,id,label_tr,original_fullname,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length,period_freq,dash_freq,space_freq,transliteration,indiv_unigrams_fdist
18764,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,LATIN,"[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...","[(n, i), (i, l), (l, ), ( , i), (i, ̇), (̇, p...","[(n, i, l), (i, l, ), (l, , i), ( , i, p), (...","[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...",23,4,5.75,0,0,3,nil ipek hulagu ozturkmen,"{' ': 0.11538461538461539, ''': 0, '-': 0, 'a'..."
18765,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,Fatma Betül Sayan Kaya,fatma betül sayan kaya,LATIN,"[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...",19,4,4.75,0,0,3,fatma betul sayan kaya,"{' ': 0.13636363636363635, ''': 0, '-': 0, 'a'..."
18769,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,LATIN,"[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,4,5.5,0,0,3,elif nur bozkurt tandogan,"{' ': 0.12, ''': 0, '-': 0, 'a': 0.08, 'b': 0...."
18770,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,LATIN,"[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,4,5.75,0,0,3,muhammed ali fatih erbakan,"{' ': 0.11538461538461539, ''': 0, '-': 0, 'a'..."
18772,http://www.wikidata.org/entity/Q6071196,Vedat Ali Özkan Kayacı,Vedat Ali Özkan Kayacı,vedat ali özkan kayacı,LATIN,"[v, e, d, a, t, , a, l, i, , ö, z, k, a, n, ...","[(v, e), (e, d), (d, a), (a, t), (t, ), ( , a...","[(v, e, d), (e, d, a), (d, a, t), (a, t, ), (...","[v, e, d, a, t, , a, l, i, , ö, z, k, a, n, ...",19,4,4.75,0,0,3,vedat ali ozkan kayac,"{' ': 0.13636363636363635, ''': 0, '-': 0, 'a'..."


In [50]:
# BIGRAMS individual frequency distributions
df2_turk['indiv_bigrams_fdist'] = df2_turk['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))

# checking that it works for 'supriyadi'
print(df2_turk.iloc[0]['indiv_bigrams_fdist'][('s', 'u')])
print(1 / len(df2_turk.iloc[0]['bigrams']))

df2_turk.head()

0
0.06666666666666667


Unnamed: 0,id,label_tr,original_fullname,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,num_tokens,avg_token_length,period_freq,dash_freq,space_freq,transliteration,indiv_unigrams_fdist,indiv_bigrams_fdist
125,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,Melahat Abbasova,melahat abbasova,LATIN,"[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,2,7.5,0,0,1,melahat abbasova,"{' ': 0.0625, ''': 0, '-': 0, 'a': 0.3125, 'b'...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
126,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,Freddy Sahin-Scholl,freddy sahin-scholl,LATIN,"[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,2,9.0,0,1,1,freddy sahin-scholl,"{' ': 0.05263157894736842, ''': 0, '-': 0.0526...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
127,http://www.wikidata.org/entity/Q217097,Richard Kingson,Richard Kingson,richard kingson,LATIN,"[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,2,7.0,0,0,1,richard kingson,"{' ': 0.06666666666666667, ''': 0, '-': 0, 'a'...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
129,http://www.wikidata.org/entity/Q110126,Hrant Dink,Hrant Dink,hrant dink,LATIN,"[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,2,4.5,0,0,1,hrant dink,"{' ': 0.1, ''': 0, '-': 0, 'a': 0.1, 'b': 0, '...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."
130,http://www.wikidata.org/entity/Q42079,Ricky Winslow,Ricky Winslow,ricky winslow,LATIN,"[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,2,6.0,0,0,1,ricky winslow,"{' ': 0.07692307692307693, ''': 0, '-': 0, 'a'...","{(' ', ' '): 0, (' ', '''): 0, (' ', '-'): 0, ..."


In [51]:
df2_turk['indiv_trigrams_fdist'] = df2_turk['trigrams'].apply(lambda entry: initialized_trigrams.copy())

In [52]:
'''
Function to be applied to each row of a DataFrame. Sets and returns a hashmap of the relative trigrams frequency distribution for the current example.

trigrams_list: the list of trigrams for this current example.
init_trigrams: a hashmap of all possible trigrams as the keys and all values set to 0.
'''
def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    trigrams_fdist_relative = init_trigrams
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

# TRIGRAMS individual frequency distributions
df2_turk['indiv_trigrams_fdist'] = df2_turk.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

In [53]:
"""
# Checking 0th example""
print(df2_turk.loc[0, 'indiv_trigrams_fdist'][('m', 'e', 'l')])
print(1 / len(df2_turk.loc[0, 'trigrams'])) # manual calculation

# Checking 1st example
print(df2_turk.loc[1, 'fullname'])
print(df2_turk.loc[1, 'indiv_trigrams_fdist'][('e', 'l', 'a')])
print(1 / len(df2_turk.loc[1, 'trigrams'])) # manual calculation
"""

'\n# Checking 0th example""\nprint(df2_turk.loc[0, \'indiv_trigrams_fdist\'][(\'m\', \'e\', \'l\')])\nprint(1 / len(df2_turk.loc[0, \'trigrams\'])) # manual calculation\n\n# Checking 1st example\nprint(df2_turk.loc[1, \'fullname\'])\nprint(df2_turk.loc[1, \'indiv_trigrams_fdist\'][(\'e\', \'l\', \'a\')])\nprint(1 / len(df2_turk.loc[1, \'trigrams\'])) # manual calculation\n'

In [54]:
 #This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2_turk['indiv_unigrams_fdist'] = df2_turk['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)

In [55]:
# Calculating cosine similarity
df2_turk['unigrams_cosine_sim'] = df2_turk['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])

In [56]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2_turk['indiv_bigrams_fdist'] = df2_turk['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)

In [57]:
# Calculating cosine similarity
df2_turk['bigrams_cosine_sim'] = df2_turk['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])


In [58]:
# This cell cannot be run more than once!
# Converting fdists to numpy arrays first so we can pass them into cosine_similarity
df2_turk['indiv_trigrams_fdist'] = df2_turk['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype = float).reshape(1, -1))
trigram_fdist = np.fromiter(trigram_fdist.values(), dtype = float).reshape(1, -1)

In [59]:
# Calculating cosine similarity
df2_turk['trigrams_cosine_sim'] = df2_turk['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
df2_turk.head()

Unnamed: 0,id,label_tr,original_fullname,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,...,period_freq,dash_freq,space_freq,transliteration,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim
125,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,Melahat Abbasova,melahat abbasova,LATIN,"[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,...,0,0,1,melahat abbasova,"[[0.0625, 0.0, 0.0, 0.3125, 0.125, 0.0, 0.0, 0...","[[0.0, 0.0, 0.0, 0.06666666666666667, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.726774,0.32765,0.09374
126,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,Freddy Sahin-Scholl,freddy sahin-scholl,LATIN,"[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,...,0,1,1,freddy sahin-scholl,"[[0.05263157894736842, 0.0, 0.0526315789473684...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.654977,0.146567,0.053994
127,http://www.wikidata.org/entity/Q217097,Richard Kingson,Richard Kingson,richard kingson,LATIN,"[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,...,0,0,1,richard kingson,"[[0.06666666666666667, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.679317,0.201639,0.024418
129,http://www.wikidata.org/entity/Q110126,Hrant Dink,Hrant Dink,hrant dink,LATIN,"[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,...,0,0,1,hrant dink,"[[0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.111111111111...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.713517,0.307555,0.048496
130,http://www.wikidata.org/entity/Q42079,Ricky Winslow,Ricky Winslow,ricky winslow,LATIN,"[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,...,0,0,1,ricky winslow,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.516877,0.069938,0.002807


In [60]:
df2_turk

Unnamed: 0,id,label_tr,original_fullname,fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,name_length,...,period_freq,dash_freq,space_freq,transliteration,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim
125,http://www.wikidata.org/entity/Q3508136,Melahat Abbasova,Melahat Abbasova,melahat abbasova,LATIN,"[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, a]","[(m, e), (e, l), (l, a), (a, h), (h, a), (a, t...","[(m, e, l), (e, l, a), (l, a, h), (a, h, a), (...","[m, e, l, a, h, a, t, , a, b, b, a, s, o, v, ...",15,...,0,0,1,melahat abbasova,"[[0.0625, 0.0, 0.0, 0.3125, 0.125, 0.0, 0.0, 0...","[[0.0, 0.0, 0.0, 0.06666666666666667, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.726774,0.327650,0.093740
126,http://www.wikidata.org/entity/Q96602,Freddy Sahin-Scholl,Freddy Sahin-Scholl,freddy sahin-scholl,LATIN,"[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...","[(f, r), (r, e), (e, d), (d, d), (d, y), (y, ...","[(f, r, e), (r, e, d), (e, d, d), (d, d, y), (...","[f, r, e, d, d, y, , s, a, h, i, n, -, s, c, ...",18,...,0,1,1,freddy sahin-scholl,"[[0.05263157894736842, 0.0, 0.0526315789473684...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.654977,0.146567,0.053994
127,http://www.wikidata.org/entity/Q217097,Richard Kingson,Richard Kingson,richard kingson,LATIN,"[r, i, c, h, a, r, d, , k, i, n, g, s, o, n]","[(r, i), (i, c), (c, h), (h, a), (a, r), (r, d...","[(r, i, c), (i, c, h), (c, h, a), (h, a, r), (...","[r, i, c, h, a, r, d, , k, i, n, g, s, o, n, ...",14,...,0,0,1,richard kingson,"[[0.06666666666666667, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.679317,0.201639,0.024418
129,http://www.wikidata.org/entity/Q110126,Hrant Dink,Hrant Dink,hrant dink,LATIN,"[h, r, a, n, t, , d, i, n, k]","[(h, r), (r, a), (a, n), (n, t), (t, ), ( , d...","[(h, r, a), (r, a, n), (a, n, t), (n, t, ), (...","[h, r, a, n, t, , d, i, n, k, (h, r), (r, a),...",9,...,0,0,1,hrant dink,"[[0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.111111111111...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.713517,0.307555,0.048496
130,http://www.wikidata.org/entity/Q42079,Ricky Winslow,Ricky Winslow,ricky winslow,LATIN,"[r, i, c, k, y, , w, i, n, s, l, o, w]","[(r, i), (i, c), (c, k), (k, y), (y, ), ( , w...","[(r, i, c), (i, c, k), (c, k, y), (k, y, ), (...","[r, i, c, k, y, , w, i, n, s, l, o, w, (r, i)...",12,...,0,0,1,ricky winslow,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.516877,0.069938,0.002807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18764,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen,Nil İpek Hülagü Öztürkmen,nil i̇pek hülagü öztürkmen,LATIN,"[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...","[(n, i), (i, l), (l, ), ( , i), (i, ̇), (̇, p...","[(n, i, l), (i, l, ), (l, , i), ( , i, p), (...","[n, i, l, , i, ̇, p, e, k, , h, ü, l, a, g, ...",23,...,0,0,3,nil ipek hulagu ozturkmen,"[[0.11538461538461539, 0.0, 0.0, 0.03846153846...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.766280,0.250772,0.100388
18765,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya,Fatma Betül Sayan Kaya,fatma betül sayan kaya,LATIN,"[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...","[(f, a), (a, t), (t, m), (m, a), (a, ), ( , b...","[(f, a, t), (a, t, m), (t, m, a), (m, a, ), (...","[f, a, t, m, a, , b, e, t, ü, l, , s, a, y, ...",19,...,0,0,3,fatma betul sayan kaya,"[[0.13636363636363635, 0.0, 0.0, 0.27272727272...","[[0.0, 0.0, 0.0, 0.0, 0.047619047619047616, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.812166,0.453787,0.210581
18769,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan,Elif Nur Bozkurt Tandoğan,elif nur bozkurt tandoğan,LATIN,"[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...","[(e, l), (l, i), (i, f), (f, ), ( , n), (n, u...","[(e, l, i), (l, i, f), (i, f, ), (f, , n), (...","[e, l, i, f, , n, u, r, , b, o, z, k, u, r, ...",22,...,0,0,3,elif nur bozkurt tandogan,"[[0.12, 0.0, 0.0, 0.08, 0.04, 0.0, 0.04, 0.04,...","[[0.0, 0.0, 0.0, 0.0, 0.041666666666666664, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.820321,0.366293,0.123383
18770,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan,Muhammed Ali Fatih Erbakan,muhammed ali fatih erbakan,LATIN,"[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...","[(m, u), (u, h), (h, a), (a, m), (m, m), (m, e...","[(m, u, h), (u, h, a), (h, a, m), (a, m, m), (...","[m, u, h, a, m, m, e, d, , a, l, i, , f, a, ...",23,...,0,0,3,muhammed ali fatih erbakan,"[[0.11538461538461539, 0.0, 0.0, 0.19230769230...","[[0.0, 0.0, 0.0, 0.04, 0.0, 0.0, 0.0, 0.04, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.894470,0.462341,0.148177


In [None]:
df2_turk.to_csv("turkish_df.csv", index=False)