# Company Person Dataset Cleaning

In [1]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.util import ngrams
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity
import re

df = pd.read_csv("../name_data/company_person_name_dataset.csv")
df

Unnamed: 0,name,class,lang
0,The Canal of the Angels,0,en
1,Rescue Renovation,0,en
2,Agatha Christie: The ABC Murders,0,en
3,Siti Akbari,0,ar
4,Stany,0,pl
...,...,...,...
199995,Robber's Bridge,0,en
199996,Johan Renck,0,en
199997,Lyle Stewart,1,en
199998,Thomas Colclough Watson,1,en


## functions


In [2]:
def generate_char_ngrams(text):
    if isinstance(text, str):
        words = text.split()
        unigrams = list(ngrams(text, 1))
        bigrams = list(ngrams(text, 2))
        trigrams = list(ngrams(text, 3))
        
        interpolated_ngrams = unigrams + bigrams + trigrams
        return interpolated_ngrams
    else:
        return []
    
def generate_char_unigrams(text):
    if isinstance(text, str):
        # Tokenize the text into characters
        characters = list(text)
        
        # Create unigrams
        unigrams = list(ngrams(characters, 1))
        
        return unigrams
    else:
        return []
    
def generate_char_bigrams(text):
    if isinstance(text, str):
        # Tokenize the text into characters
        characters = list(text)
        
        # Create unigrams
        bigrams = list(ngrams(characters, 2))
        
        return bigrams
    else:
        return []
    
def generate_char_trigrams(text):
    if isinstance(text, str):
        # Tokenize the text into characters
        characters = list(text)
        
        # Create unigrams
        trigrams = list(ngrams(characters, 3))
        
        return trigrams
    else:
        return []

## Frequency Distribution

In [3]:
def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams

def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams
    return gram_freqs_relative

'''
Function to be applied to each row of a DataFrame. Sets and returns a hashmap of the relative trigrams frequency distribution for the current example.

trigrams_list: the list of trigrams for this current example.
init_trigrams: a hashmap of all possible trigrams as the keys and all values set to 0.
'''
def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    trigrams_fdist_relative = init_trigrams
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

## Cleaning

In [4]:
df = df[df["class"]==1]
#look into smote for imbalance

In [6]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))

True
False
False


In [7]:
print(len(df.isnull()))

48190


In [8]:
df = df.dropna()

In [9]:
df.drop_duplicates()

Unnamed: 0,name,class,lang
11,Raha Etemadi,1,et
12,Leena Peltonen-Palotie,1,fi
16,Luma Grothe,1,en
20,Takuya Kakine,1,ja
21,Ōuyáng Zhènhuá,1,mi
...,...,...,...
199976,Clyde Donaldson,1,en
199980,Terry Alexander,1,en
199989,Neil Roebuck,1,en
199997,Lyle Stewart,1,en


In [10]:
language_counts = df["lang"].value_counts
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [11]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))
print(np.any(df.duplicated()))

False
False
False
True


In [12]:
df = df.drop_duplicates()

In [13]:
print(np.any(df.duplicated()))

False


In [14]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [15]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)

Language distribution:
lang
en    23352
es     2532
it     1224
fr     1181
ja     1155
      ...  
my        1
sd        1
kk        1
pa        1
si        1
Name: count, Length: 103, dtype: int64


## Additional Cleaning to Names with Special Characters that arent apostrophes, dashes, spaces, or accents.


Reset df index starting from 0

In [16]:
df.reset_index(drop=True, inplace=True)
print(df)

                          name  class lang
0                 Raha Etemadi      1   et
1       Leena Peltonen-Palotie      1   fi
2                  Luma Grothe      1   en
3                Takuya Kakine      1   ja
4               Ōuyáng Zhènhuá      1   mi
...                        ...    ...  ...
47754          Clyde Donaldson      1   en
47755          Terry Alexander      1   en
47756             Neil Roebuck      1   en
47757             Lyle Stewart      1   en
47758  Thomas Colclough Watson      1   en

[47759 rows x 3 columns]


Create langname to find the alphabet name for each character in a name instead of just the first letter (Anna said its better to have full list) and create column

In [18]:
langname = lambda x:[unicodedata.name(char).split(' ')[0] for char in x]
df['alphabet'] = df['name'].apply(langname)
print(df['alphabet'].head())

0    [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
1    [LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...
2    [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
3    [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...
4    [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...
Name: alphabet, dtype: object


In [19]:
allalphatype = []
#List of all alphabet types of every character in the dataset
for i in df['name']:
    for x in langname(i):
        allalphatype.append(x)

print(len(set(allalphatype)))

#Count 
total = 0
for i in set(allalphatype):
    count = 0
    total +=1
    for x,y in enumerate(df['name']):
        if i in langname(y):
            count += 1
    print(count, i)
print(total)

55


2489 FULL
1 LESS-THAN
5 NUMBER
5 COMBINING
5 MODIFIER
3 ASTERISK
339 APOSTROPHE
1 DOLLAR
1 WHITE
14 GREEK
1 FEMININE
23 HANGUL
1 ZERO
15 SOLIDUS
1 SINHALA
418 COMMA
1 EXCLAMATION
85 DIGIT
1 ETHIOPIC
1 GURMUKHI
13 HEBREW
1 MALAYALAM
47392 LATIN
184 CJK
1 TELUGU
2 IDEOGRAPHIC
17 DEVANAGARI
31 COLON
1 SYRIAC
312 RIGHT
2 THAI
1 LOW
46500 SPACE
77 ARABIC
1400 HYPHEN-MINUS
2 TAMIL
1 KATAKANA
4 VERTICAL
10 SEMICOLON
5 AMPERSAND
3 ARMENIAN
2 GEORGIAN
4 KANNADA
1 MYANMAR
4 EN
1 DAGGER
5 BENGALI
180 QUOTATION
1 ACUTE
301 LEFT
1 EQUALS
75 CYRILLIC
2 MIDDLE
2 HIRAGANA
1 FULLWIDTH
55


In [20]:
def findname(character, indexes):
    for x, y in enumerate(df['name']):
        if str(character) in (langname(y)):
            indexes.append(x)
            print(x, y, df.iloc[x, 2])

def replacechar(orig, repl, indexes):
    for y in indexes:
        df.iloc[y, 0]=df.iloc[y, 0].replace(orig, repl)
        print(df.iloc[y, 0])

### COMMA

In [21]:
com = []
findname('COMMA', com)

28 Vicente de Valverde y Alvarez de Toledo, O.P. es
161 St. Jutta of Kulmsee, T.O.S.F. en
195 Sir William Jardine, 7th Baronet en
439 Ric Manrique, Jr. ca
488 Saint Teresa of Calcutta,MC en
591 Y. Pierre Gobin, MD fr
637 Louis VII, Duke of Bavaria en
976 Samuel D. Pagdilao, Jr. en
1013 Blessed Agnes of Jesus, O.P. en
1025 St. Gerard of Csanád, O.S.B. hu
1193 Thomas Wharton, Jr. en
1332 Simon IV, Lord of Lippe en
1421 Walter D. Druen, Jr. da
1475 Saint Rose Venerini, M.P.V. la
1724 Hubert Duck, Deuteronomy Duck and Louis Duck (Quack Pack) en
2102 Sir Vincent Meredith, Bt en
2250 Eckhard I, Count of Scheyern de
2294 Sir Robert Rich, Bt en
2370 Edward Francis Mickolus, Jr. en
2376 Chen-Yuan Lee, en
2379 Charles Louis, Count of Nassau-Saarbrücken en
2429 Saint Jeanne de Lestonnac,O.D.N. fr
2659 Archie J. Old, Jr. en
2722 Sir Archibald Alison, Bt en
2829 Ellen Heber-Katz, PhD en
2965 Denmark Groover, Jr. en
3254 Henry IV, Count of Bar en
3334 Frederick Landis, Jr. en
3342 Mario Santos, Jr. 

### IDEOGRAPHIC

In [22]:
ide = []
findname('IDEOGRAPHIC', ide)

31381 漢人 陽子　(Kando Yōko) zh-CN
43099 飯島澄男　Sumio Iijima zh-CN


### COMBINING

In [23]:
comb = []
findname('COMBINING', comb)

13038 (Его́р  Лигачёв) tg
16669 Мико́ла Миха́йлович Люби́нський uk
20518 (Дми́трий Я́зов) ru
25538 (Никола́й Вознесе́нский) ru
27501 (Валенти́на Матвие́нко) bg


### ASTERISK

In [24]:
ast = []
findname('ASTERISK', ast)

6816 León Ávalos y Vez* es
12044 Biga*Ranx en
25458 *Tomko sk


### FULLWIDTH

In [25]:
fullw = []
findname('FULLWIDTH', fullw)

43794 Thomas Prence （or，Thomas Prince） en


### QUOTATION

In [26]:
quo = []
findname('QUOTATION', quo)

190 Francis "Frank" Forshew sv
199 Evelyn "Champagne" King en
248 Ernesto "Boy" F. Herrera en
440 Graeme "Shirley" Strachan en
557 Augustine "Willie" Dominguez en
775 Brian "Goldbelt" Maxine en
834 Joyce "Fenderella" Irby en
883 Farley "Jackmaster" Funk en
1686 Edgar "Ted" Codd en
1780 Lewis Burwell "Chesty" Puller cy
2551 Ivan Maček "Matija" hr
3020 Billy "Harp" Hamilton en
3094 George "Chappie" Johnson Jr. en
3285 "Professor" Irwin Corey en
3391 Alfredo "El Salsero" Escalera es
3477 Peter Ulysses "Sturgis" Turner en
3716 Mark Edmund "Duke" Bainum en
4987 Robert W. "Bob" Straub en
4997 James "J.T." Taylor en
5503 Arthur "Andy" B. VanGundy Jr. te
5743 David L. "Dave" Pearce en
6793 William "Bill" L. Mack en
6942 John Richard "Jack" Williams en
7024 Ronald Dean "Ron" Givens en
7480 H. K. "Kenneth" Cassidy en
7725 "Şeker" Ahmed Pasha tr
8116 H. L. "Matty" Matthews en
8296 Abigail "Abby" Sciuto it
8436 Hoffman Lee "Hop" Fuller en
8608 Christopher "Chris" A. Brown en
9339 Donald Howard "Do

### SEMICOLON

In [27]:
semi = []
findname('SEMICOLON', semi)


12404 Caesar Lucius Aurelius Commodus Antoninus Augustus (180); la
15848 Shinmen Takezō; Miyamoto Bennosuke; Niten Dōraku; Shinmen Musashi no Kami Fujiwara no Genshin ja
28415 Lele; Ezequiel haw
33844 Michael O;Shea en
35732 Steven K. D&#39;Arcy en
38925 Darna; Princess Vara; Olga sv
43280 Given name: Xuanyuan (Hsuan-yuan; 軒轅) zh-CN
44626 Marcus Aurelius Numerius Numerianus (from birth to elevation to Caesar); la
45209 Flavius Honorius (from birth to accession); en
46661 (198 to 211); en


### VERTICAL

In [28]:
ver = []
findname('VERTICAL', ver)

2816 thumb|right|Spitfire Mk IIa P7350 of the BBMF is the only existing airworthy Spitfire that fought in the Battle of Britain. en
4319 frameless|center en
14690 Veronika Sramatythumb|Veronika Sramaty making of The Top Ten en
22302 Timmy O'Dowdthumb|Timmy Dowd in Dowdies December 2011 en


### COLON 

In [29]:
col = []
findname('COLON', col)

762 Ancestral name: Jiang (姜) gu
3596 German:  Josef Franz Leopold en
4092 German: Henriette Maria Norberta en
5305 Ardeshir (or in higher Persian: Ardashir) Kamkar en
6336 Lily Munster/ OTS name: Phoebe Munster en
7433 File:Hollidaycamo.jpg en
15053 Farid Aslani (Persian : فرید اصلانی) fa
18358 Family name: Hóu (侯) en
19400 Dutch: Gabriël Boudewijn Karel Maria nl
20732 Dutch: Albert Leopold Clemens Maria Meinrad de
21742 Sajid AliUrdu:ساجد  علی fa
22198 Given name: Unknown en
24154 (Arabic:عبد العزيز بديع صالح السليمي) ar
25809 Chinese: Duàn Sīpíng (段思平) zh-CN
25879 Given name: Dao 導 ja
27432 Given name: Zhoupu (州蒲) en
27498 Makhdoom Khusro Bakhtyar (Seraiki:مخدوم  کسر  بختٻد) fa
27503 Laqab: Qutb ad-Din (shortly) uz
27582 Spanish: María Luisa de Borbón y Sajonia es
28808 Maganti Murali Mohan ( Born : Maganti Raja Babu ) te
31311 Traditional Chinese:韓世忠 zh-CN
34236 Given name: Dawud en
37179 Nu:Tone ro
40523 German: Wilhelm Friedrich Karl Ernst Joachim Albrecht en
40992 Family name:Si

### SOLIDUS

In [30]:
sol = []
findname('SOLIDUS', sol)

6336 Lily Munster/ OTS name: Phoebe Munster en
7610 Paula Baracho / Paula Ribeiro pt
8389 Rajendra Choudhary / राजेन्द्रा चौधरी hi
10940 Amanda Porter/Emily Thorne en
12453 Anne-Marie Baiynd/Presnell/Buttrick/Band en
15445 Líu Hóngdù (劉弘度) / Líu Bīn (劉玢) zh-CN
21785 Elavarasan a/l Elangowan ms
22295 Gay Stanhope Falcon/Gay Laurence/Tom Laurence/Michael "Mike" Waring en
23549 Carsten Bo Eriksen / MBD73 da
29724 Marcy Levy/Marcella Detroit en
29991 Mikhail Shervashidze/Chachba en
31777 Philip Hammond/Hamman, Sr. hi
43349 Nanthakumar a/l Kaliappan ta
45738 Maruja Isabella Sevilla (Spanish Era), Nina Concepcion/Cristy (Reincarnation) es
46497 Victor a/l Andrag ro


### EXCLAMATION

In [31]:
exl = []
findname('EXCLAMATION', exl)

27704 Dr. Kucho! hi


### DOLLAR

In [32]:
dol = []
findname('DOLLAR', dol)

35273 Ty Dolla $ign fi


### DAGGER

In [33]:
dag = []
findname('DAGGER', dag)

7182 Philipp zu Hohenfelden † de


### LESS-THAN

In [34]:
lessth = []
findname('LESS-THAN', lessth)

44921 <span en


### ACUTE

In [35]:
acu = []
findname('ACUTE', acu)

31033 Désiré M´Bonabucya fr


In [36]:
langname('M´Bonabucya')

['LATIN',
 'ACUTE',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN']

### EQUALS

In [37]:
eq = []
findname('EQUALS', eq)

24351 =Giovanni Agostino Marliani it


### ZERO (ZERO WIDTH SPACE)

In [38]:
zer= []
findname('ZERO', zer)

13387 صادق قطب‌زاده fa


### KEEP APOSTROPHE 

In [39]:
apo = []
findname('APOSTROPHE', apo)

147 Lawrence O'Brien en
263 William G. T'Vault en
553 John O'Donoghue en
600 Brendan O'Brien en
686 Breon O'Casey en
882 Detta O'Cathain gd
928 Junior Sa'u hr
1192 Ak'ak'i Meipariani ht
1248 Gary De'Roux fr
1419 Patrick O'Sullivan fi
1548 Sameh El-Saharty'' su
1949 Harriet Osborne O'Hagan en
2140 Cinders O'Brien en
2195 George 'Jocka' Todd en
2461 Rod O'Connor en
2570 Luke O'Connor en
2699 Mononc' Serge fr
2761 (Dato') it
2871 Aodh Ó'Raghallaigh ga
2972 Abdullah Sallum al-Samarra'i ar
2977 Abdillahi Suldaan Mohammed 'Timacade' so
3097 Graba' ro
3230 Steve O'Neill en
3339 Eleni Koiosi' fi
3839 Brittany O'Connell en
4006 Willem 's Gravesande en
4125 Paul O'Grady en
4184 Mark O'Brien en
4208 Anne d'Alençon fr
4297 Harvey O'Brien en
4342 Summer O'Brien en
4343 Martin Patrick O'Connell en
4367 F'Murr en
4405 Molly O'Day en
4420 Bryant O'Dare Hammett, Jr. en
4724 Ann O'Connell en
4746 सुर्नेद्र झा 'सुमन ' hi
4751 Conor Cruise O'Brien en
4782 Joseph Leonard O'Brien en
4887 John O'Neill en
492

### LOW

In [40]:
low = []
findname('LOW', low)

8271 Panpayak_Jitmuangnon jw


### FEMININE

In [41]:
fem = []
findname('FEMININE', fem)

31035 José Mª. Cabral Bermúdez es


### MODIFIER

In [42]:
mod = []
findname('MODIFIER', mod)

19200 al-Ḥasan ibn ʻAlī al-Barbahārī ar
35526 Mulielealiʻi haw
35885 Israel "Iz" Kamakawiwoʻole haw
37882 Wikolia Kamehamalu Keawenui Kaahumanu-a-Kekūanaōʻa, Victoria Kamāmalu Kaahumanu IV haw
41238 Abū al-Ḥassan ʿAlī ibn ʿUthmān al-Marīni ar


In [43]:
langname('ʻAlī al-Barbahārī')

['MODIFIER',
 'LATIN',
 'LATIN',
 'LATIN',
 'SPACE',
 'LATIN',
 'LATIN',
 'HYPHEN-MINUS',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN']

In [44]:
langname('ʿAlī ibn ʿUthmān al-Marīni')

['MODIFIER',
 'LATIN',
 'LATIN',
 'LATIN',
 'SPACE',
 'LATIN',
 'LATIN',
 'LATIN',
 'SPACE',
 'MODIFIER',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'SPACE',
 'LATIN',
 'LATIN',
 'HYPHEN-MINUS',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN']

### RIGHT (312) and LEFT (301)

In [45]:
langname('’')


['RIGHT']

In [46]:
rig = []
lef = []
findname('RIGHT', rig)
findname('LEFT', lef)

46 David John (Dave) Shannon en
101 (Armen Lubin) de
197 C.W. (Bill) Hutchins en
355 Koshinaka Makoto (越中睦士) ja
555 Josip Seissel (Jo Klek) sl
762 Ancestral name: Jiang (姜) gu
802 Frank J. O’Connell en
915 (최영일) ko
1357 (Henry Roger) Justin Lewis en
1724 Hubert Duck, Deuteronomy Duck and Louis Duck (Quack Pack) en
1757 Charles D. “Chuck” McAtee en
1794 (GOSEComCComCS) en
1997 Junsei Terasawa (寺沢潤世) ja
2335 () en
2346 Pavel (Pavlo) Gintov sk
2411 (Τίνα Μπιρμπίλη) el
2446 Blessed John Finch (martyr) en
2479 David Hirst (arachnologist) en
2502 Hormisdas Etienne Djibri (Estefan Jabri) fr
2505 Baldwin (II) Rátót hu
2605 Kim Tae-Jin (김태진) ko
2608 (Serdar-ı Ekrem Ömer Paşa) tr
2761 (Dato') it
2942 Claire Kirby (Claire Debrois) en
2980 Saint Honoratus (Honorius) of Amiens la
3187 Milo Gunderson (body) en
3654 Alexander (Oleksandr) Khvostenko-Khvostov ru
4084 Kanysh Satpayev (Kanysh Satbaev) ru
4216 Mohammad Boroujerdi (محمد بروجردی) ar
4301 Örüg Temür Khan (Gulichi) tr
4308 Samuel Robles (Pana

30145 (Charles) David White en
30557 Rana Ali Imran (علی عمران) fa
30723 Devan (R. Mahadevan) hi
30726 Yang Mulia Raja Ahmad Nazim Azlan Shah Ibni Raja Dato'seri Seri Ashman Shah (as birth) ms
30907 Julien Dupré (Dupre) fr
31290 (Варвара Зеленская) ru
31381 漢人 陽子　(Kando Yōko) zh-CN
31390 Jacques Miller (ACFRSFAA) en
31465 Mary J. P. (Maura) Scannell en
31600 Ludwik Konarzewski (junior) pl
31902 (Saint Damien of Molokai) en
32096 Jonathan (Dana) Wilson en
32104 Charles P. (Chuck) Thacker en
32301 Justin Sane (In professional wrestling) en
32375 (ლევან ყენია) ka
32463 (Cardinalof theCatholic Church) en
32500 Wilfred Austin “Wilf” Curtis en
32512 Kazuo “Shin Sen” Hamasaki ja
32934 Tony Rand (Anthony Eden "Tony" Rand) en
33069 (Юмжаагийн Цэдэнбал) mn
33521 (Ευστάθιος Ταυλαρίδης) el
33704 Joseph (Osip Ivanovich) Bové sk
33732 1315 (1899)P.-2 en
33741 Shekhar Dutt (SM) en
33750 Paladine H. Roye (Pon-Cee-Cee) gu
33988 (CBE) en
34072 Brig Gen. (Ret.) Gary Pendleton en
34370 Marcario (Macario) 

### AMPERSAND

In [47]:
amp = []
findname('AMPERSAND', amp)

4309 Isaac & Miria en
22717 Tom & Olly en
24908 Saints Hor, Besoy, & Daydara bn
35732 Steven K. D&#39;Arcy en
38598 Lanfranchi & Farina it


### WHITE

In [48]:
whi = []
findname('WHITE', whi)

19465 Beauty☆Takaco en


### MIDDLE

In [49]:
mid = []
findname('MIDDLE', mid)

20020 (額爾德特·文繡) zh-CN
47690 (愛新覺羅·胤禟) zh-CN


### DIGIT

In [50]:
dig = []
findname('DIGIT', dig)

195 Sir William Jardine, 7th Baronet en
1024 Auriane Mallo in 2013 la
2519 Coi4 Si1 Bui4 en
2766 Lok6 Dai3 ar
2816 thumb|right|Spitfire Mk IIa P7350 of the BBMF is the only existing airworthy Spitfire that fought in the Battle of Britain. en
2940 Ckay1 vi
3625 26th Clan Chief en
4112 Sir Edward Malet, 4th Bt en
4500 D-Ray 3000 en
4740 Robert de Holland, 1st Baron Holand es
6796 James Younger, 5th Viscount Younger of Leckie en
7745 Kyoko Inoue#5 ja
9952 Mateusz Rutkowski20px pl
12404 Caesar Lucius Aurelius Commodus Antoninus Augustus (180); la
12412 Can4 Jin4-hei1 en
12480 The Chain Gang of 1974 en
12817 William Pakenham, 4th Earl of Longford en
13902 George Douglas, 1st Earl of Dumbarton en
14063 9ice ar
14222 21 Savage en
14966 György Berencsi 3rd hu
15526 J57 ar
15874 1316-Sv.4 en
15914 Sir Charles Hastings, 1st Baronet en
16286 Gu2 Geoi6-gei1 bg
16428 Dik6 Lung4 en
17228 A1 ig
17400 Sir George White, 1st Baronet en
17700 wai6 laan4 ar
17701 1978 en
17727 Alexander Hood, 1st Viscount

In [51]:
for i, x in enumerate(dig):
    if langname(df['name'][x][0]) == ['DIGIT']:
        print(x, df['name'][x], df['lang'][x])

3625 26th Clan Chief en
14063 9ice ar
14222 21 Savage en
15874 1316-Sv.4 en
17701 1978 en
26310 1317-P.80 en
29759 1314-P.39 en
32216 2Mex en
33174 12th Planet en
33732 1315 (1899)P.-2 en
35044 1312-Sv.6 en
39681 2Tap ar
46347 1321-P.44 en
47230 1318-Ağ. Top1 az


### EN

In [52]:
enh = []
findname('EN', enh)

14408 Stevan Nedić–Ćela hr
18503 Jerzy Waldorff–Preyss en
19030 Meinhard II of Gorizia–Tyrol en
38917 James Stirling (1800–1876) en


### NUMBER

In [53]:
num = []
findname('NUMBER', num)

7745 Kyoko Inoue#5 ja
23111 Alacrana Plata #2 es
35599 Masked Burning #2 en
35732 Steven K. D&#39;Arcy en
38372 James Gunnar Nixon #1 en


In [54]:
alphatype1 = []
for i in df['name']:
    for x in langname(i):
        alphatype1.append(x)

print(len(set(alphatype1)))

total1 = 0
for i in set(alphatype1):
    count = 0
    total1 +=1
    for x,y in enumerate(df['name']):
        if i in langname(y):
            count += 1
    print(count, i)
print(total1)

55
2489 FULL
1 LESS-THAN
5 NUMBER
5 COMBINING
5 MODIFIER
3 ASTERISK
339 APOSTROPHE
1 DOLLAR
1 WHITE
14 GREEK
1 FEMININE
23 HANGUL
1 ZERO
15 SOLIDUS
1 SINHALA
418 COMMA
1 EXCLAMATION
85 DIGIT
1 ETHIOPIC
1 GURMUKHI
13 HEBREW
1 MALAYALAM
47392 LATIN
184 CJK
1 TELUGU
2 IDEOGRAPHIC
17 DEVANAGARI
31 COLON
1 SYRIAC
312 RIGHT
2 THAI
1 LOW
46500 SPACE
77 ARABIC
1400 HYPHEN-MINUS
2 TAMIL
1 KATAKANA
4 VERTICAL
10 SEMICOLON
5 AMPERSAND
3 ARMENIAN
2 GEORGIAN
4 KANNADA
1 MYANMAR
4 EN
1 DAGGER
5 BENGALI
180 QUOTATION
1 ACUTE
301 LEFT
1 EQUALS
75 CYRILLIC
2 MIDDLE
2 HIRAGANA
1 FULLWIDTH
55


### Time to Drop

In [55]:
drop = []
def dropfn(list):
    for i in list:
        drop.append(i)

listofdrops= [com, ide, comb, ast, fullw, quo, semi, ver, col, sol, exl, dol, dag, lessth, acu, eq, zer, low, fem, mod, rig, lef, amp, whi, mid, dig, enh, num]
for y in listofdrops:
    dropfn(y)
print(len(drop))
print(len(set(drop)))


1398
996


In [56]:
df = df.drop(drop)

In [57]:
len(df)

46763

In [58]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,name,class,lang,alphabet
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT..."
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA..."
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA..."
...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT..."
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT..."
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."


### Doing basic cleaning again

In [59]:
print(np.any(df["name"].isna()))
print(np.any(df["name"] == ''))
print(np.any(df["name"] == ' '))
print(np.any(df["name"] == '  '))

#Need to drop alphabets because df.duplicated() doesn't work with lists
df = df.drop(columns = 'alphabet')
print(np.any(df.duplicated()))

False
False
False
False
False


In [60]:
df['alphabet'] = df['name'].apply(langname)

Drop certain rows

In [61]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,name,class,lang,alphabet
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT..."
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA..."
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA..."
...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT..."
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT..."
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT..."


In [62]:
allalphatype1 = []
#List of all alphabet types of every character in the dataset
for i in df['name']:
    for x in langname(i):
        allalphatype1.append(x)

print(len(set(allalphatype1)))

#Count 
total1 = 0
for i in set(allalphatype1):
    count = 0
    total1 +=1
    for x,y in enumerate(df['name']):
        if i in langname(y):
            count += 1
    print(count, i)
print(total1)

26
2210 FULL
332 APOSTROPHE
11 GREEK
3 HANGUL
1 SINHALA
1 ETHIOPIC
1 GURMUKHI
10 HEBREW
1 MALAYALAM
46468 LATIN
150 CJK
1 TELUGU
15 DEVANAGARI
1 SYRIAC
1 THAI
45568 SPACE
59 ARABIC
1319 HYPHEN-MINUS
2 TAMIL
1 KATAKANA
2 ARMENIAN
3 KANNADA
1 MYANMAR
5 BENGALI
45 CYRILLIC
2 HIRAGANA
26


## Feature Creation

### alphabet

Use langname to find the alphabet name for each character in a name instead of just the first letter (Anna said its better to have full list) and create column

In [63]:
langname = lambda x:[unicodedata.name(char).split(' ')[0] for char in x]
df['alphabet'] = df['name'].apply(langname)
print(df['alphabet'].head())

0    [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
1    [LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...
2    [LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...
3    [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...
4    [LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...
Name: alphabet, dtype: object


In [64]:
allalphatype = []
#List of all alphabet types of every character in the dataset
for i in df['name']:
    for x in langname(i):
        allalphatype.append(x)

total = 0
for i in set(allalphatype):
    count = 0
    total +=1
    for x,y in enumerate(df['name']):
        if i in langname(y):
            count += 1
    print(count, i)
print(total)

2210 FULL
332 APOSTROPHE
11 GREEK
3 HANGUL
1 SINHALA
1 ETHIOPIC
1 GURMUKHI
10 HEBREW
1 MALAYALAM
46468 LATIN
150 CJK
1 TELUGU
15 DEVANAGARI
1 SYRIAC
1 THAI
45568 SPACE
59 ARABIC
1319 HYPHEN-MINUS
2 TAMIL
1 KATAKANA
2 ARMENIAN
3 KANNADA
1 MYANMAR
5 BENGALI
45 CYRILLIC
2 HIRAGANA
26


Alphabets: (HEBREW, TELUGU, SINHALA, CJK, HIRAGANA, ARMENIAN, DEVANAGARI, KATAKANA, CYRILLIC, KANNADA,BENGALI, GURMUKHI, LATIN, TAMIL, GREEK, ARABIC, GEORGIAN, MALAYALAM, SYRIAC, HANGUL, MYANMAR, THAI, ETHIOPIC) (24)


Non alphabets: (ASTERISK, 418 COMMA, 2489 FULL, 1400 HYPHEN-MINUS, 5 COMBINING, 1 FULLWIDTH, 180 QUOTATION, 10 SEMICOLON, 4 VERTICAl, 31 COLON, 46500 SPACE, 15 SOLIDUS, 1 EXCLAMATION, 1 DOLLAR, 1 DAGGER, 1 LESS-THAN, 1 ACUTE, 1 EQUALS, 1 ZERO, 339 APOSTROPHE, 1 LOW, 2 IDEOGRAPHIC, 1 FEMININE, 5 MODIFIER, 312 RIGHT, 5 AMPERSAND, 1 WHITE, 301 LEFT, 2 MIDDLE, 85 DIGIT, 4 EN, 5 NUMBER) (32)

Check for double spaces

Cleaned: ASTERISK (cleaned), FULLWIDTH (cleaned), QUOTATION, SEMICOLON, VERTICAL (disappeared), 

Keep:FULL (period), COMBINING (accent), 

Skipped: COMMA, QUOTATION, COLON, EN (not sure if should remove), HYPHEN-MINUS (not sure if should remove)

In [65]:
char = []
for x, y in enumerate(df['name']):
    if 'ASTERISK' in (langname(y)):
        print(x, y, df.iloc[x, 2])
        char.append(x)
char

[]

Clean ASTERISK

In [66]:
tmp = [6816, 25458]
for y in tmp:
    df.iloc[y, 0]=df.iloc[y, 0].replace('*', '')
    print(df.iloc[y, 0])

df.iloc[12044, 0]=df.iloc[12044, 0].replace('*', ' ')
print({df.iloc[i, 0] for i in char})


Nonoy Baclao
Eldon Thiele
set()


Clean FULLWIDTH

In [67]:
fullw = []
for x, y in enumerate(df['name']):
    if 'FULLWIDTH' in (langname(y)):
        print(x, y, df.iloc[x, 2])
        fullw.append(x)
fullw

[]

In [68]:
df.iloc[43794, 0]= "Thomas Prence or Thomas Prince"
print(df.iloc[43794, 0])

Thomas Prence or Thomas Prince


Clean QUOTATION

In [69]:
quo = []
for x, y in enumerate(df['name']):
    if 'QUOTATION' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        quo.append(x)

In [70]:
for y in quo:
    df.iloc[y, 0]=df.iloc[y, 0].replace('"', '')
    print(df.iloc[y, 0])

Clean SEMICOLON

In [71]:
semi = []
for x, y in enumerate(df['name']):
    if 'SEMICOLON' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        semi.append(x)


In [72]:
for y in semi:
    df.iloc[y, 0]=df.iloc[y, 0].replace(';', '')
    print(df.iloc[y, 0])

Clean VERTICAL (dropped irrelevant)

In [73]:
ver = []
for x, y in enumerate(df['name']):
    if 'VERTICAL' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        ver.append(x)

In [74]:
drop = [2816, 4319]
df.iloc[14690, 0] = 'Veronika Sramaty'
df.iloc[22302, 0] = "Timmy O'Dowd"

df['name'][ver]

Series([], Name: name, dtype: object)

Clean COLON (incomp) (considering dropping all names)

In [75]:
col = []
for x, y in enumerate(df['name']):
    if 'COLON' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        col.append(x)

Clean SOLIDUS

In [76]:
sol = []
for x, y in enumerate(df['name']):
    if 'SOLIDUS' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        sol.append(x)


In [77]:
for y in sol:
    df.iloc[y, 0]=df.iloc[y, 0].replace('/', ' ')
    print(df.iloc[y, 0])

Clean EXCLAMATION

In [78]:
exl = []
for x, y in enumerate(df['name']):
    if 'EXCLAMATION' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        exl.append(x)

In [79]:
for y in exl:
    df.iloc[y, 0]=df.iloc[y, 0].replace('!', '')
    print(df.iloc[y, 0])

Clean DOLLAR

In [80]:
dol = []

for x, y in enumerate(df['name']):
    if 'DOLLAR' in (langname(y)):
        print(x,y, df.iloc[x, 2])
        dol.append(x)

In [81]:
for y in dol:
    df.iloc[y, 0]=df.iloc[y, 0].replace('$', '')
    print(df.iloc[y, 0])

Clean DAGGER

In [82]:
def findname(character, indexes):
    for x, y in enumerate(df['name']):
        if str(character) in (langname(y)):
            indexes.append(x)
            print(x, y, df.iloc[x, 2])

In [83]:
dag = []
findname('DAGGER', dag)

Drop certain rows all at once

In [84]:
#df = df.drop()
#df.reset_index(drop=True, inplace=True)

### avg_token_length (complete)

In [85]:
def average_token_length(name):
    total_length = 0
    total_tokens = 0
    tokens = name.split()

    # Calculate the total length of tokens in the name
    token_length = sum(len(token) for token in tokens)

    # Total length (characters in each word) and total number of tokens (words in name)
    total_length += token_length
    total_tokens += len(tokens)
    # Calculate the average token length
    if total_tokens > 0:
        average_length = total_length / total_tokens
        return average_length
    else:
        return 0 
    
avg_token_length = []
for i in df['name']:
    avg_token_length.append(average_token_length(i))
    
df['avg_token_length'] = avg_token_length

### num_tokens (complete)

In [86]:
def token_len(name):
    total_tokens = 0
    tokens = name.split()
    # Total number of tokens (words in name)
    total_tokens += len(tokens)
    return total_tokens

In [87]:
token_length = []
for i in df['name']:
    token_length.append(token_len(i))
df['num_tokens'] = token_length
df['name_lower'] = df['name'].str.lower()

### transliteration

In [88]:
transliteration = []
for i in df['name_lower']:
    transliteration.append(unidecode(i))
df['transliteration'] = transliteration

### char ngrams

In [89]:
#new column with names in all lower case

df['char_ngrams'] = df['transliteration'].apply(generate_char_ngrams)
df['unigrams'] = df['transliteration'].apply(lambda name: list(name))
df['bigrams'] = df['transliteration'].apply(generate_char_bigrams)
df['trigrams'] = df['transliteration'].apply(generate_char_trigrams)
df

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,raha etemadi,raha etemadi,"[(r,), (a,), (h,), (a,), ( ,), (e,), (t,), (e,...","[r, a, h, a, , e, t, e, m, a, d, i]","[(r, a), (a, h), (h, a), (a, ), ( , e), (e, t...","[(r, a, h), (a, h, a), (h, a, ), (a, , e), (..."
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (..."
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (..."
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,takuya kakine,takuya kakine,"[(t,), (a,), (k,), (u,), (y,), (a,), ( ,), (k,...","[t, a, k, u, y, a, , k, a, k, i, n, e]","[(t, a), (a, k), (k, u), (u, y), (y, a), (a, ...","[(t, a, k), (a, k, u), (k, u, y), (u, y, a), (..."
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,ōuyáng zhènhuá,ouyang zhenhua,"[(o,), (u,), (y,), (a,), (n,), (g,), ( ,), (z,...","[o, u, y, a, n, g, , z, h, e, n, h, u, a]","[(o, u), (u, y), (y, a), (a, n), (n, g), (g, ...","[(o, u, y), (u, y, a), (y, a, n), (a, n, g), (..."
...,...,...,...,...,...,...,...,...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,clyde donaldson,clyde donaldson,"[(c,), (l,), (y,), (d,), (e,), ( ,), (d,), (o,...","[c, l, y, d, e, , d, o, n, a, l, d, s, o, n]","[(c, l), (l, y), (y, d), (d, e), (e, ), ( , d...","[(c, l, y), (l, y, d), (y, d, e), (d, e, ), (..."
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,terry alexander,terry alexander,"[(t,), (e,), (r,), (r,), (y,), ( ,), (a,), (l,...","[t, e, r, r, y, , a, l, e, x, a, n, d, e, r]","[(t, e), (e, r), (r, r), (r, y), (y, ), ( , a...","[(t, e, r), (e, r, r), (r, r, y), (r, y, ), (..."
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,neil roebuck,neil roebuck,"[(n,), (e,), (i,), (l,), ( ,), (r,), (o,), (e,...","[n, e, i, l, , r, o, e, b, u, c, k]","[(n, e), (e, i), (i, l), (l, ), ( , r), (r, o...","[(n, e, i), (e, i, l), (i, l, ), (l, , r), (..."
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,lyle stewart,lyle stewart,"[(l,), (y,), (l,), (e,), ( ,), (s,), (t,), (e,...","[l, y, l, e, , s, t, e, w, a, r, t]","[(l, y), (y, l), (l, e), (e, ), ( , s), (s, t...","[(l, y, l), (y, l, e), (l, e, ), (e, , s), (..."


In [90]:
df

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,raha etemadi,raha etemadi,"[(r,), (a,), (h,), (a,), ( ,), (e,), (t,), (e,...","[r, a, h, a, , e, t, e, m, a, d, i]","[(r, a), (a, h), (h, a), (a, ), ( , e), (e, t...","[(r, a, h), (a, h, a), (h, a, ), (a, , e), (..."
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (..."
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (..."
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,takuya kakine,takuya kakine,"[(t,), (a,), (k,), (u,), (y,), (a,), ( ,), (k,...","[t, a, k, u, y, a, , k, a, k, i, n, e]","[(t, a), (a, k), (k, u), (u, y), (y, a), (a, ...","[(t, a, k), (a, k, u), (k, u, y), (u, y, a), (..."
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,ōuyáng zhènhuá,ouyang zhenhua,"[(o,), (u,), (y,), (a,), (n,), (g,), ( ,), (z,...","[o, u, y, a, n, g, , z, h, e, n, h, u, a]","[(o, u), (u, y), (y, a), (a, n), (n, g), (g, ...","[(o, u, y), (u, y, a), (y, a, n), (a, n, g), (..."
...,...,...,...,...,...,...,...,...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,clyde donaldson,clyde donaldson,"[(c,), (l,), (y,), (d,), (e,), ( ,), (d,), (o,...","[c, l, y, d, e, , d, o, n, a, l, d, s, o, n]","[(c, l), (l, y), (y, d), (d, e), (e, ), ( , d...","[(c, l, y), (l, y, d), (y, d, e), (d, e, ), (..."
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,terry alexander,terry alexander,"[(t,), (e,), (r,), (r,), (y,), ( ,), (a,), (l,...","[t, e, r, r, y, , a, l, e, x, a, n, d, e, r]","[(t, e), (e, r), (r, r), (r, y), (y, ), ( , a...","[(t, e, r), (e, r, r), (r, r, y), (r, y, ), (..."
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,neil roebuck,neil roebuck,"[(n,), (e,), (i,), (l,), ( ,), (r,), (o,), (e,...","[n, e, i, l, , r, o, e, b, u, c, k]","[(n, e), (e, i), (i, l), (l, ), ( , r), (r, o...","[(n, e, i), (e, i, l), (i, l, ), (l, , r), (..."
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,lyle stewart,lyle stewart,"[(l,), (y,), (l,), (e,), ( ,), (s,), (t,), (e,...","[l, y, l, e, , s, t, e, w, a, r, t]","[(l, y), (y, l), (l, e), (e, ), ( , s), (s, t...","[(l, y, l), (y, l, e), (l, e, ), (e, , s), (..."


### period_freq, dash_freq, apostrophe_freq, space_freq

In [91]:
df['period_freq'] = df['name'].apply(lambda name: name.count('.'))
df['dash_freq'] = df['name'].apply(lambda name: name.count('-'))
df['apostrophe_freq'] = df['name'].apply(lambda name: name.count('\''))
df['space_freq'] = df['name'].apply(lambda name: name.count(' '))

In [92]:
df.head()

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,apostrophe_freq,space_freq
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,raha etemadi,raha etemadi,"[(r,), (a,), (h,), (a,), ( ,), (e,), (t,), (e,...","[r, a, h, a, , e, t, e, m, a, d, i]","[(r, a), (a, h), (h, a), (a, ), ( , e), (e, t...","[(r, a, h), (a, h, a), (h, a, ), (a, , e), (...",0,0,0,1
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (...",0,1,0,1
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (...",0,0,0,1
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,takuya kakine,takuya kakine,"[(t,), (a,), (k,), (u,), (y,), (a,), ( ,), (k,...","[t, a, k, u, y, a, , k, a, k, i, n, e]","[(t, a), (a, k), (k, u), (u, y), (y, a), (a, ...","[(t, a, k), (a, k, u), (k, u, y), (u, y, a), (...",0,0,0,1
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,ōuyáng zhènhuá,ouyang zhenhua,"[(o,), (u,), (y,), (a,), (n,), (g,), ( ,), (z,...","[o, u, y, a, n, g, , z, h, e, n, h, u, a]","[(o, u), (u, y), (y, a), (a, n), (n, g), (g, ...","[(o, u, y), (u, y, a), (y, a, n), (a, n, g), (...",0,0,0,1


### Dropping all names that are the only sample of their respective language

In [93]:
lang_counts = df['lang'].value_counts()
languages_with_one_sample = lang_counts[lang_counts == 1].index.tolist()
df = df[~df['lang'].isin(languages_with_one_sample)]

### word n_grams

In [94]:
df['word_ngrams'] = df['name'].apply(lambda name: name.split())
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['word_ngrams'] = df['name'].apply(lambda name: name.split())


Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,apostrophe_freq,space_freq,word_ngrams
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,raha etemadi,raha etemadi,"[(r,), (a,), (h,), (a,), ( ,), (e,), (t,), (e,...","[r, a, h, a, , e, t, e, m, a, d, i]","[(r, a), (a, h), (h, a), (a, ), ( , e), (e, t...","[(r, a, h), (a, h, a), (h, a, ), (a, , e), (...",0,0,0,1,"[Raha, Etemadi]"
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (...",0,1,0,1,"[Leena, Peltonen-Palotie]"
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (...",0,0,0,1,"[Luma, Grothe]"
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,takuya kakine,takuya kakine,"[(t,), (a,), (k,), (u,), (y,), (a,), ( ,), (k,...","[t, a, k, u, y, a, , k, a, k, i, n, e]","[(t, a), (a, k), (k, u), (u, y), (y, a), (a, ...","[(t, a, k), (a, k, u), (k, u, y), (u, y, a), (...",0,0,0,1,"[Takuya, Kakine]"
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,ōuyáng zhènhuá,ouyang zhenhua,"[(o,), (u,), (y,), (a,), (n,), (g,), ( ,), (z,...","[o, u, y, a, n, g, , z, h, e, n, h, u, a]","[(o, u), (u, y), (y, a), (a, n), (n, g), (g, ...","[(o, u, y), (u, y, a), (y, a, n), (a, n, g), (...",0,0,0,1,"[Ōuyáng, Zhènhuá]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,clyde donaldson,clyde donaldson,"[(c,), (l,), (y,), (d,), (e,), ( ,), (d,), (o,...","[c, l, y, d, e, , d, o, n, a, l, d, s, o, n]","[(c, l), (l, y), (y, d), (d, e), (e, ), ( , d...","[(c, l, y), (l, y, d), (y, d, e), (d, e, ), (...",0,0,0,1,"[Clyde, Donaldson]"
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,terry alexander,terry alexander,"[(t,), (e,), (r,), (r,), (y,), ( ,), (a,), (l,...","[t, e, r, r, y, , a, l, e, x, a, n, d, e, r]","[(t, e), (e, r), (r, r), (r, y), (y, ), ( , a...","[(t, e, r), (e, r, r), (r, r, y), (r, y, ), (...",0,0,0,1,"[Terry, Alexander]"
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,neil roebuck,neil roebuck,"[(n,), (e,), (i,), (l,), ( ,), (r,), (o,), (e,...","[n, e, i, l, , r, o, e, b, u, c, k]","[(n, e), (e, i), (i, l), (l, ), ( , r), (r, o...","[(n, e, i), (e, i, l), (i, l, ), (l, , r), (...",0,0,0,1,"[Neil, Roebuck]"
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,lyle stewart,lyle stewart,"[(l,), (y,), (l,), (e,), ( ,), (s,), (t,), (e,...","[l, y, l, e, , s, t, e, w, a, r, t]","[(l, y), (y, l), (l, e), (e, ), ( , s), (s, t...","[(l, y, l), (y, l, e), (l, e, ), (e, , s), (...",0,0,0,1,"[Lyle, Stewart]"


### Accent feature

In [95]:
text_with_accents = "Létérs wïth âccénts"
normalized_text = unidecode(text_with_accents)
print(normalized_text)

Leters with accents


In [96]:
langname = lambda x : unicodedata.name(x[0]).split(' ')[0]

In [97]:
def identify_accents(text):
    accents = []
    for char in text:
        if unicodedata.normalize('NFD', char) != char:
            accents.append(char)
    return accents
allaccents= set()
for i in df['name_lower']:
    allaccents.update(identify_accents(i))
print(allaccents)
print(len(allaccents))

{'ě', 'â', 'ờ', 'ÿ', 'ạ', 'é', 'ę', 'á', 'ç', 'ǒ', 'ì', 'ơ', 'ĩ', 'ề', 'č', 'è', 'ż', 'ī', 'ệ', 'ả', 'ê', 'ş', 'ồ', 'ķ', '노', 'ř', 'ô', 'ő', 'ä', 'ǐ', 'ủ', 'ῑ', 'أ', '대', 'ţ', 'έ', 'ǚ', 'ؤ', 'ύ', 'ū', 'ọ', 'ị', 'ó', 'ű', 'ľ', 'ấ', 'ペ', 'ũ', 'ņ', 'ế', '원', 'ữ', 'í', 'ễ', 'ǎ', 'ầ', 'ằ', 'ü', 'ú', 'à', 'ợ', 'ೀ', 'ώ', 'ắ', 'å', 'ṣ', 'ñ', 'š', '이', 'î', 'ą', '동', '랑', 'ļ', 'ș', 'ï', 'ž', 'ț', 'ć', 'ṭ', 'й', 'ò', 'ń', 'ã', 'إ', 'ď', 'ǔ', 'ό', 'ῖ', 'ō', 'ů', 'ớ', 'ś', 'ù', 'ά', 'ğ', 'ί', 'û', 'ǫ', 'ö', 'ė', 'ư', 'ứ', 'ố', 'ụ', 'õ', '녕', 'ā', 'ň', 'ź', 'ă', 'ừ', 'ḫ', 'ë', 'ģ', 'ē', 'ো', 'ỹ', 'ド', 'ặ', 'ৌ', 'ŭ', 'ý', 'ї'}
124


In [98]:
allaccents1 = []
for i in allaccents:
    if langname(str(i)) != 'HANGUL':
        allaccents1.append(i)
print(allaccents1)
print(len(allaccents1))

['ě', 'â', 'ờ', 'ÿ', 'ạ', 'é', 'ę', 'á', 'ç', 'ǒ', 'ì', 'ơ', 'ĩ', 'ề', 'č', 'è', 'ż', 'ī', 'ệ', 'ả', 'ê', 'ş', 'ồ', 'ķ', 'ř', 'ô', 'ő', 'ä', 'ǐ', 'ủ', 'ῑ', 'أ', 'ţ', 'έ', 'ǚ', 'ؤ', 'ύ', 'ū', 'ọ', 'ị', 'ó', 'ű', 'ľ', 'ấ', 'ペ', 'ũ', 'ņ', 'ế', 'ữ', 'í', 'ễ', 'ǎ', 'ầ', 'ằ', 'ü', 'ú', 'à', 'ợ', 'ೀ', 'ώ', 'ắ', 'å', 'ṣ', 'ñ', 'š', 'î', 'ą', 'ļ', 'ș', 'ï', 'ž', 'ț', 'ć', 'ṭ', 'й', 'ò', 'ń', 'ã', 'إ', 'ď', 'ǔ', 'ό', 'ῖ', 'ō', 'ů', 'ớ', 'ś', 'ù', 'ά', 'ğ', 'ί', 'û', 'ǫ', 'ö', 'ė', 'ư', 'ứ', 'ố', 'ụ', 'õ', 'ā', 'ň', 'ź', 'ă', 'ừ', 'ḫ', 'ë', 'ģ', 'ē', 'ো', 'ỹ', 'ド', 'ặ', 'ৌ', 'ŭ', 'ý', 'ї']
117


In [99]:
unicodedata.normalize('NFD', '안녕하세요') == '안녕하세요'

False

In [100]:
'안녕하세요' == '안녕하세요'

True

In [101]:
#trying to figure out accent distribution
print(allaccents1)
print(len(allaccents1))

['ě', 'â', 'ờ', 'ÿ', 'ạ', 'é', 'ę', 'á', 'ç', 'ǒ', 'ì', 'ơ', 'ĩ', 'ề', 'č', 'è', 'ż', 'ī', 'ệ', 'ả', 'ê', 'ş', 'ồ', 'ķ', 'ř', 'ô', 'ő', 'ä', 'ǐ', 'ủ', 'ῑ', 'أ', 'ţ', 'έ', 'ǚ', 'ؤ', 'ύ', 'ū', 'ọ', 'ị', 'ó', 'ű', 'ľ', 'ấ', 'ペ', 'ũ', 'ņ', 'ế', 'ữ', 'í', 'ễ', 'ǎ', 'ầ', 'ằ', 'ü', 'ú', 'à', 'ợ', 'ೀ', 'ώ', 'ắ', 'å', 'ṣ', 'ñ', 'š', 'î', 'ą', 'ļ', 'ș', 'ï', 'ž', 'ț', 'ć', 'ṭ', 'й', 'ò', 'ń', 'ã', 'إ', 'ď', 'ǔ', 'ό', 'ῖ', 'ō', 'ů', 'ớ', 'ś', 'ù', 'ά', 'ğ', 'ί', 'û', 'ǫ', 'ö', 'ė', 'ư', 'ứ', 'ố', 'ụ', 'õ', 'ā', 'ň', 'ź', 'ă', 'ừ', 'ḫ', 'ë', 'ģ', 'ē', 'ো', 'ỹ', 'ド', 'ặ', 'ৌ', 'ŭ', 'ý', 'ї']
117


In [102]:
df_with_accent_counts = df.copy()
df['accent_count'] = 0
df['detected_accents'] = ""

# Count accents for each name and update the corresponding columns using regular expressions
for idx, row in df.iterrows():
    name = row['name']
    detected_accents = []
    total_accent_count = 0
    for accent in allaccents1:
        # Use regular expression to find all occurrences of the accent in the name
        accent_count = len(re.findall(re.escape(accent), name))
        total_accent_count += accent_count
        # If the accent appears in the name, add it to the detected accents list
        if accent_count > 0:
            detected_accents.append(accent)
    # Update the accent count and detected accent columns for the current row
    df.at[idx, 'accent_count'] = total_accent_count
    df.at[idx, 'detected_accents'] = ", ".join(detected_accents)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['accent_count'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['detected_accents'] = ""


In [103]:
df

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,apostrophe_freq,space_freq,word_ngrams,accent_count,detected_accents
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,raha etemadi,raha etemadi,"[(r,), (a,), (h,), (a,), ( ,), (e,), (t,), (e,...","[r, a, h, a, , e, t, e, m, a, d, i]","[(r, a), (a, h), (h, a), (a, ), ( , e), (e, t...","[(r, a, h), (a, h, a), (h, a, ), (a, , e), (...",0,0,0,1,"[Raha, Etemadi]",0,
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (...",0,1,0,1,"[Leena, Peltonen-Palotie]",0,
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (...",0,0,0,1,"[Luma, Grothe]",0,
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,takuya kakine,takuya kakine,"[(t,), (a,), (k,), (u,), (y,), (a,), ( ,), (k,...","[t, a, k, u, y, a, , k, a, k, i, n, e]","[(t, a), (a, k), (k, u), (u, y), (y, a), (a, ...","[(t, a, k), (a, k, u), (k, u, y), (u, y, a), (...",0,0,0,1,"[Takuya, Kakine]",0,
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,ōuyáng zhènhuá,ouyang zhenhua,"[(o,), (u,), (y,), (a,), (n,), (g,), ( ,), (z,...","[o, u, y, a, n, g, , z, h, e, n, h, u, a]","[(o, u), (u, y), (y, a), (a, n), (n, g), (g, ...","[(o, u, y), (u, y, a), (y, a, n), (a, n, g), (...",0,0,0,1,"[Ōuyáng, Zhènhuá]",3,"á, è"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,clyde donaldson,clyde donaldson,"[(c,), (l,), (y,), (d,), (e,), ( ,), (d,), (o,...","[c, l, y, d, e, , d, o, n, a, l, d, s, o, n]","[(c, l), (l, y), (y, d), (d, e), (e, ), ( , d...","[(c, l, y), (l, y, d), (y, d, e), (d, e, ), (...",0,0,0,1,"[Clyde, Donaldson]",0,
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,terry alexander,terry alexander,"[(t,), (e,), (r,), (r,), (y,), ( ,), (a,), (l,...","[t, e, r, r, y, , a, l, e, x, a, n, d, e, r]","[(t, e), (e, r), (r, r), (r, y), (y, ), ( , a...","[(t, e, r), (e, r, r), (r, r, y), (r, y, ), (...",0,0,0,1,"[Terry, Alexander]",0,
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,neil roebuck,neil roebuck,"[(n,), (e,), (i,), (l,), ( ,), (r,), (o,), (e,...","[n, e, i, l, , r, o, e, b, u, c, k]","[(n, e), (e, i), (i, l), (l, ), ( , r), (r, o...","[(n, e, i), (e, i, l), (i, l, ), (l, , r), (...",0,0,0,1,"[Neil, Roebuck]",0,
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,lyle stewart,lyle stewart,"[(l,), (y,), (l,), (e,), ( ,), (s,), (t,), (e,...","[l, y, l, e, , s, t, e, w, a, r, t]","[(l, y), (y, l), (l, e), (e, ), ( , s), (s, t...","[(l, y, l), (y, l, e), (l, e, ), (e, , s), (...",0,0,0,1,"[Lyle, Stewart]",0,


In [104]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)

Language distribution:
lang
en    22779
es     2502
it     1207
fr     1164
ja     1135
      ...  
sr        3
mk        3
mn        3
ky        2
hy        2
Name: count, Length: 90, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lang"] = df["lang"].str.strip()


In [105]:
print("Language distribution:")
print(language_counts.to_string())

Language distribution:
lang
en       22779
es        2502
it        1207
fr        1164
ja        1135
ar        1086
pt        1068
de        1064
ru         983
hi         793
nl         695
hr         582
pl         561
zh-CN      522
sv         480
bg         476
fi         451
hu         434
da         408
no         403
el         397
tr         376
ro         365
ms         345
sk         334
id         332
lb         298
ca         271
la         267
bs         211
sl         210
bn         195
ga         192
cs         192
co         177
ha         170
tl         168
gu         159
mr         154
sq         149
af         148
fy         139
vi         130
cy         126
eu         123
sw         115
uz         114
et         113
is         104
lt         100
jw          94
lv          89
kn          87
te          87
gd          86
zu          79
haw         76
mt          70
mi          68
sn          68
az          67
mg          67
so          67
ny          66
rw          

In [106]:
df.head()

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,apostrophe_freq,space_freq,word_ngrams,accent_count,detected_accents
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,raha etemadi,raha etemadi,"[(r,), (a,), (h,), (a,), ( ,), (e,), (t,), (e,...","[r, a, h, a, , e, t, e, m, a, d, i]","[(r, a), (a, h), (h, a), (a, ), ( , e), (e, t...","[(r, a, h), (a, h, a), (h, a, ), (a, , e), (...",0,0,0,1,"[Raha, Etemadi]",0,
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (...",0,1,0,1,"[Leena, Peltonen-Palotie]",0,
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (...",0,0,0,1,"[Luma, Grothe]",0,
3,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,takuya kakine,takuya kakine,"[(t,), (a,), (k,), (u,), (y,), (a,), ( ,), (k,...","[t, a, k, u, y, a, , k, a, k, i, n, e]","[(t, a), (a, k), (k, u), (u, y), (y, a), (a, ...","[(t, a, k), (a, k, u), (k, u, y), (u, y, a), (...",0,0,0,1,"[Takuya, Kakine]",0,
4,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,ōuyáng zhènhuá,ouyang zhenhua,"[(o,), (u,), (y,), (a,), (n,), (g,), ( ,), (z,...","[o, u, y, a, n, g, , z, h, e, n, h, u, a]","[(o, u), (u, y), (y, a), (a, n), (n, g), (g, ...","[(o, u, y), (u, y, a), (y, a, n), (a, n, g), (...",0,0,0,1,"[Ōuyáng, Zhènhuá]",3,"á, è"


### Dropping languages that have a small amount of samples ( < 100)

In [107]:
lang_counts = df['lang'].value_counts()
languages_with_few_samples = lang_counts[lang_counts < 100].index.tolist()
df = df[~df['lang'].isin(languages_with_few_samples)]

In [108]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)
print(len(language_counts))

Language distribution:
lang
en       22779
es        2502
it        1207
fr        1164
ja        1135
ar        1086
pt        1068
de        1064
ru         983
hi         793
nl         695
hr         582
pl         561
zh-CN      522
sv         480
bg         476
fi         451
hu         434
da         408
no         403
el         397
tr         376
ro         365
ms         345
sk         334
id         332
lb         298
ca         271
la         267
bs         211
sl         210
bn         195
ga         192
cs         192
co         177
ha         170
tl         168
gu         159
mr         154
sq         149
af         148
fy         139
vi         130
cy         126
eu         123
sw         115
uz         114
et         113
is         104
lt         100
Name: count, dtype: int64
50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lang"] = df["lang"].str.strip()


### Keeping top 20 most frequent languages and dropping everything else

In [109]:
lang_counts = df['lang'].value_counts()
languages_with_fewsamples = lang_counts[lang_counts < 400].index.tolist()
df = df[~df['lang'].isin(languages_with_fewsamples)]

In [110]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)
print(len(language_counts))


Language distribution:
lang
en       22779
es        2502
it        1207
fr        1164
ja        1135
ar        1086
pt        1068
de        1064
ru         983
hi         793
nl         695
hr         582
pl         561
zh-CN      522
sv         480
bg         476
fi         451
hu         434
da         408
no         403
Name: count, dtype: int64
20


## dropping CJK languages


In [111]:
df = df.loc[df['lang'] != 'ja'].copy()

In [112]:
df = df.loc[df['lang'] != 'zh-CN'].copy()

In [113]:
df

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,apostrophe_freq,space_freq,word_ngrams,accent_count,detected_accents
1,Leena Peltonen-Palotie,1,fi,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",10.5,2,leena peltonen-palotie,leena peltonen-palotie,"[(l,), (e,), (e,), (n,), (a,), ( ,), (p,), (e,...","[l, e, e, n, a, , p, e, l, t, o, n, e, n, -, ...","[(l, e), (e, e), (e, n), (n, a), (a, ), ( , p...","[(l, e, e), (e, e, n), (e, n, a), (n, a, ), (...",0,1,0,1,"[Leena, Peltonen-Palotie]",0,
2,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2,luma grothe,luma grothe,"[(l,), (u,), (m,), (a,), ( ,), (g,), (r,), (o,...","[l, u, m, a, , g, r, o, t, h, e]","[(l, u), (u, m), (m, a), (a, ), ( , g), (g, r...","[(l, u, m), (u, m, a), (m, a, ), (a, , g), (...",0,0,0,1,"[Luma, Grothe]",0,
5,Jordan Gideon Archer,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,3,jordan gideon archer,jordan gideon archer,"[(j,), (o,), (r,), (d,), (a,), (n,), ( ,), (g,...","[j, o, r, d, a, n, , g, i, d, e, o, n, , a, ...","[(j, o), (o, r), (r, d), (d, a), (a, n), (n, ...","[(j, o, r), (o, r, d), (r, d, a), (d, a, n), (...",0,0,0,2,"[Jordan, Gideon, Archer]",0,
7,Yannis Becker,1,fr,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2,yannis becker,yannis becker,"[(y,), (a,), (n,), (n,), (i,), (s,), ( ,), (b,...","[y, a, n, n, i, s, , b, e, c, k, e, r]","[(y, a), (a, n), (n, n), (n, i), (i, s), (s, ...","[(y, a, n), (a, n, n), (n, n, i), (n, i, s), (...",0,0,0,1,"[Yannis, Becker]",0,
8,Tom Clyde,1,en,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",4.0,2,tom clyde,tom clyde,"[(t,), (o,), (m,), ( ,), (c,), (l,), (y,), (d,...","[t, o, m, , c, l, y, d, e]","[(t, o), (o, m), (m, ), ( , c), (c, l), (l, y...","[(t, o, m), (o, m, ), (m, , c), ( , c, l), (...",0,0,0,1,"[Tom, Clyde]",0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46758,Clyde Donaldson,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,clyde donaldson,clyde donaldson,"[(c,), (l,), (y,), (d,), (e,), ( ,), (d,), (o,...","[c, l, y, d, e, , d, o, n, a, l, d, s, o, n]","[(c, l), (l, y), (y, d), (d, e), (e, ), ( , d...","[(c, l, y), (l, y, d), (y, d, e), (d, e, ), (...",0,0,0,1,"[Clyde, Donaldson]",0,
46759,Terry Alexander,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",7.0,2,terry alexander,terry alexander,"[(t,), (e,), (r,), (r,), (y,), ( ,), (a,), (l,...","[t, e, r, r, y, , a, l, e, x, a, n, d, e, r]","[(t, e), (e, r), (r, r), (r, y), (y, ), ( , a...","[(t, e, r), (e, r, r), (r, r, y), (r, y, ), (...",0,0,0,1,"[Terry, Alexander]",0,
46760,Neil Roebuck,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,neil roebuck,neil roebuck,"[(n,), (e,), (i,), (l,), ( ,), (r,), (o,), (e,...","[n, e, i, l, , r, o, e, b, u, c, k]","[(n, e), (e, i), (i, l), (l, ), ( , r), (r, o...","[(n, e, i), (e, i, l), (i, l, ), (l, , r), (...",0,0,0,1,"[Neil, Roebuck]",0,
46761,Lyle Stewart,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2,lyle stewart,lyle stewart,"[(l,), (y,), (l,), (e,), ( ,), (s,), (t,), (e,...","[l, y, l, e, , s, t, e, w, a, r, t]","[(l, y), (y, l), (l, e), (e, ), ( , s), (s, t...","[(l, y, l), (y, l, e), (l, e, ), (e, , s), (...",0,0,0,1,"[Lyle, Stewart]",0,


### Frequency Distribution

In [114]:
def process_language(df, lang_code):
    #language relative frequency distributions
    lang_df = df[df['lang']==lang_code].copy() #might not need to copy
    unigram_fdist = create_lang_char_distribution(lang_df, 'transliteration')
    initialized_bigrams = initialize_all_possible_bigrams(unigram_fdist.keys())
    bigram_fdist = create_lang_gram_distribution(initialized_bigrams, lang_df, 'bigrams')
    all_possible_chars_translit = create_lang_char_distribution(lang_df, 'transliteration').keys()
    #initialized_trigrams = initialize_all_possible_trigrams(all_possible_chars_translit)
    #lang_df['trigrams'] = lang_df['transliteration'].apply(lambda name: list(ngrams(list(name), 3)))
    #trigram_fdist = create_lang_gram_distribution(initialized_trigrams, lang_df, 'trigrams')
    #individual relative frequency distributions
    initialized_unigrams = {char: 0 for char in unigram_fdist.keys()}
    lang_df['indiv_unigrams_fdist'] = lang_df['unigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_unigrams))
    lang_df['indiv_bigrams_fdist'] = lang_df['bigrams'].apply(lambda grams_list: create_indiv_gram_distribution(grams_list, initialized_bigrams))
    #lang_df['indiv_trigrams_fdist'] = lang_df['trigrams'].apply(lambda entry: initialized_trigrams.copy())
    #lang_df['indiv_trigrams_fdist'] = lang_df.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis=1)
    #comparing distributions
    lang_df['indiv_unigrams_fdist'] = lang_df['indiv_unigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype=float).reshape(1,-1))
    unigram_fdist = np.fromiter(unigram_fdist.values(), dtype = float).reshape(1, -1)
    #calculating cosine similarity
    lang_df['unigrams_cosine_sim'] = lang_df['indiv_unigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, unigram_fdist)[0][0])
    #bigrams cosine similarity
    lang_df['indiv_bigrams_fdist'] = lang_df['indiv_bigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype=float).reshape(1,-1))
    bigram_fdist = np.fromiter(bigram_fdist.values(), dtype = float).reshape(1, -1)
    lang_df['bigrams_cosine_sim'] = lang_df['indiv_bigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, bigram_fdist)[0][0])
    #trigrams cosine similarity
    #lang_df['indiv_trigrams_fdist'] = lang_df['indiv_trigrams_fdist'].apply(lambda fdist: np.fromiter(fdist.values(), dtype=float).reshape(1,-1))
    #trigram_fdist = np.fromiter(trigram_fdist.values(), dtype=float).reshape(1,-1)
    #lang_df['trigrams_cosine_sim'] = lang_df['indiv_trigrams_fdist'].apply(lambda fdist: cosine_similarity(fdist, trigram_fdist)[0][0])
    return lang_df

In [115]:
#Every language separated into their own individual data frame
df_en = df[df['lang'] == 'en'].copy() #latin alphabet
df_es = df[df['lang'] == 'es'].copy() #latin alphabet
df_it = df[df['lang'] == 'it'].copy() #latin alphabet
df_fr = df[df['lang'] == 'fr'].copy() #latin alphabet
df_ar = df[df['lang'] == 'ar'].copy() #non-latin alphabet, split later
df_pt = df[df['lang'] == 'pt'].copy() #latin alphabet
df_de = df[df['lang'] == 'de'].copy() #latin alphabet
df_ru = df[df['lang'] == 'ru'].copy() #non-latin alphabet, split
df_hi = df[df['lang'] == 'hi'].copy() #non-latin alphabet, split
df_nl = df[df['lang'] == 'nl'].copy() #latin alphabet
df_hr = df[df['lang'] == 'hr'].copy() #latin alphabet
df_pl = df[df['lang'] == 'pl'].copy() #latin alphabet
df_bg = df[df['lang'] == 'bg'].copy() #non latin alphabet, split
df_fi = df[df['lang'] == 'fi'].copy() #latin alphabet
df_hu = df[df['lang'] == 'hu'].copy() #latin alphabet
df_da = df[df['lang'] == 'da'].copy() #latin alphabet
df_no = df[df['lang'] == 'no'].copy() #latin alphabet

In [116]:
#Split non latin languages
ar_latin_mask = df_ar['name'].str.contains(r'[a-zA-Z]')
ar_latin = df_ar[ar_latin_mask].copy()
ar_non_latin = df_ar[~ar_latin_mask].copy() #2
ru_latin_mask = df_ru['name'].str.contains(r'[a-zA-Z]')
ru_latin = df_ru[ru_latin_mask].copy()
ru_non_latin = df_ru[~ru_latin_mask].copy() #3
hi_latin_mask = df_hi['name'].str.contains(r'[a-zA-Z]')
hi_latin = df_hi[hi_latin_mask].copy()
hi_non_latin = df_hi[~hi_latin_mask].copy() #4
bg_latin_mask = df_bg['name'].str.contains(r'[a-zA-Z]')
bg_latin = df_bg[bg_latin_mask].copy()
bg_non_latin = df_bg[~bg_latin_mask].copy() #6

### frequency distribution for names that don't use the latin alphabet

In [117]:
ar_non_latin = process_language(ar_non_latin, 'ar')

In [118]:
ar_non_latin

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,...,dash_freq,apostrophe_freq,space_freq,word_ngrams,accent_count,detected_accents,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
825,محمد حواضلي مذر,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",4.333333,3,محمد حواضلي مذر,mHmd HwDly mdhr,"[(m,), (H,), (m,), (d,), ( ,), (H,), (w,), (D,...","[m, H, m, d, , H, w, D, l, y, , m, d, h, r]",...,0,0,2,"[محمد, حواضلي, مذر]",0,,"[[0.13333333333333333, 0.0, 0.0, 0.06666666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.07142857142857142...",0.825318,0.527612
841,أبو العباس محمد بن جعفر المقتدر,1,ar,"[ARABIC, ARABIC, ARABIC, SPACE, ARABIC, ARABIC...",4.333333,6,أبو العباس محمد بن جعفر المقتدر,'bw l`bs mHmd bn j`fr lmqtdr,"[(',), (b,), (w,), ( ,), (l,), (`,), (b,), (s,...","[', b, w, , l, `, b, s, , m, H, m, d, , b, ...",...,0,0,5,"[أبو, العباس, محمد, بن, جعفر, المقتدر]",1,أ,"[[0.17857142857142855, 0.03571428571428571, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.886418,0.626447
2243,عبدي إسماعيل سمتر,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",5.0,3,عبدي إسماعيل سمتر,`bdy sm`yl smtr,"[(`,), (b,), (d,), (y,), ( ,), (s,), (m,), (`,...","[`, b, d, y, , s, m, `, y, l, , s, m, t, r]",...,0,0,2,"[عبدي, إسماعيل, سمتر]",1,إ,"[[0.13333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.835773,0.229876
3075,أحمد بن علي,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",3.0,3,أحمد بن علي,'Hmd bn `ly,"[(',), (H,), (m,), (d,), ( ,), (b,), (n,), ( ,...","[', H, m, d, , b, n, , `, l, y]",...,0,0,2,"[أحمد, بن, علي]",1,أ,"[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.894907,0.631991
3200,نور الدين الأتاسي,1,ar,"[ARABIC, ARABIC, ARABIC, SPACE, ARABIC, ARABIC...",5.0,3,نور الدين الأتاسي,nwr ldyn l'tsy,"[(n,), (w,), (r,), ( ,), (l,), (d,), (y,), (n,...","[n, w, r, , l, d, y, n, , l, ', t, s, y]",...,0,0,2,"[نور, الدين, الأتاسي]",1,أ,"[[0.14285714285714285, 0.07142857142857142, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.782856,0.338508
8680,الأشرف موسى,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, ARABIC, ARABI...",5.0,2,الأشرف موسى,l'shrf mws~,"[(l,), (',), (s,), (h,), (r,), (f,), ( ,), (m,...","[l, ', s, h, r, f, , m, w, s, ~]",...,0,0,1,"[الأشرف, موسى]",1,أ,"[[0.09090909090909091, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.589005,0.141566
9161,يزيد بن عبد الملك,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",3.5,4,يزيد بن عبد الملك,yzyd bn `bd lmlk,"[(y,), (z,), (y,), (d,), ( ,), (b,), (n,), ( ,...","[y, z, y, d, , b, n, , `, b, d, , l, m, l, k]",...,0,0,3,"[يزيد, بن, عبد, الملك]",0,,"[[0.1875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.874269,0.620435
13957,محمد محمود ابراهيم,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",5.333333,3,محمد محمود ابراهيم,mHmd mHmwd brhym,"[(m,), (H,), (m,), (d,), ( ,), (m,), (H,), (m,...","[m, H, m, d, , m, H, m, w, d, , b, r, h, y, m]",...,0,0,2,"[محمد, محمود, ابراهيم]",0,,"[[0.125, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.751577,0.624517
17048,طوني فرنجية,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",5.0,2,طوني فرنجية,Twny frnjy@,"[(T,), (w,), (n,), (y,), ( ,), (f,), (r,), (n,...","[T, w, n, y, , f, r, n, j, y, @]",...,0,0,1,"[طوني, فرنجية]",0,,"[[0.09090909090909091, 0.0, 0.0909090909090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.546565,0.171901
17296,محمد نبيل الخطيب,1,ar,"[ARABIC, ARABIC, ARABIC, ARABIC, SPACE, ARABIC...",4.666667,3,محمد نبيل الخطيب,mHmd nbyl lkhTyb,"[(m,), (H,), (m,), (d,), ( ,), (n,), (b,), (y,...","[m, H, m, d, , n, b, y, l, , l, k, h, T, y, b]",...,0,0,2,"[محمد, نبيل, الخطيب]",0,,"[[0.125, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.06...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.869253,0.520146


In [119]:
# ru_non_latin = process_language(ru_non_latin, 'ru')
# hi_non_latin = process_language(hi_non_latin, 'hi')
# bg_non_latin = process_language(bg_non_latin, 'bg')

In [127]:
hi_latin['lang']

19       hi
69       hi
250      hi
334      hi
361      hi
         ..
46397    hi
46440    hi
46669    hi
46671    hi
46689    hi
Name: lang, Length: 781, dtype: object

In [129]:
ru_non_latin = process_language(ru_non_latin, 'ru')
ru_latin = process_language(ru_latin, 'ru')
hi_non_latin = process_language(hi_non_latin, 'hi')
hi_latin = process_language(hi_latin, 'hi')
bg_non_latin = process_language(bg_non_latin, 'bg')
bg_latin = process_language(bg_latin, 'bg')
ar_non_latin = process_language(ar_non_latin, 'ar')
ar_latin = process_language(ar_latin, 'ar')

In [130]:
df_en = process_language(df_en, 'en')

In [131]:
df_es = process_language(df_es, 'es')
df_it = process_language(df_it, 'it')
df_fr = process_language(df_fr, 'fr')
df_pt = process_language(df_pt, 'pt')
df_de = process_language(df_de, 'de')
df_nl = process_language(df_nl, 'nl')
df_hr = process_language(df_hr, 'hr')
df_pl = process_language(df_pl, 'pl')
df_fi = process_language(df_fi, 'fi')
df_hu = process_language(df_hu, 'hu')
df_da = process_language(df_da, 'da')
df_no = process_language(df_no, 'no')

In [134]:
df_pt.head()

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens,name_lower,transliteration,char_ngrams,unigrams,...,dash_freq,apostrophe_freq,space_freq,word_ngrams,accent_count,detected_accents,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
86,Mario Longo,1,pt,"[LATIN, LATIN, LATIN, LATIN, LATIN, SPACE, LAT...",5.0,2,mario longo,mario longo,"[(m,), (a,), (r,), (i,), (o,), ( ,), (l,), (o,...","[m, a, r, i, o, , l, o, n, g, o]",...,0,0,1,"[Mario, Longo]",0,,"[[0.09090909090909091, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.743305,0.401369
161,Serafim Baptista,1,pt,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",7.5,2,serafim baptista,serafim baptista,"[(s,), (e,), (r,), (a,), (f,), (i,), (m,), ( ,...","[s, e, r, a, f, i, m, , b, a, p, t, i, s, t, a]",...,0,0,1,"[Serafim, Baptista]",0,,"[[0.0625, 0.0, 0.0, 0.0, 0.1875, 0.0625, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.06666666666666667...",0.746885,0.206832
168,Susana Brandao,1,pt,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2,susana brandao,susana brandao,"[(s,), (u,), (s,), (a,), (n,), (a,), ( ,), (b,...","[s, u, s, a, n, a, , b, r, a, n, d, a, o]",...,0,0,1,"[Susana, Brandao]",0,,"[[0.07142857142857142, 0.0, 0.0, 0.0, 0.285714...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.07692307692307693...",0.747134,0.336682
188,Roberto Sebastián Brum,1,pt,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",6.666667,3,roberto sebastián brum,roberto sebastian brum,"[(r,), (o,), (b,), (e,), (r,), (t,), (o,), ( ,...","[r, o, b, e, r, t, o, , s, e, b, a, s, t, i, ...",...,0,0,2,"[Roberto, Sebastián, Brum]",1,á,"[[0.09090909090909091, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.04761904761904761...",0.816559,0.378614
191,Oswaldo de Barros Velloso,1,pt,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...",5.5,4,oswaldo de barros velloso,oswaldo de barros velloso,"[(o,), (s,), (w,), (a,), (l,), (d,), (o,), ( ,...","[o, s, w, a, l, d, o, , d, e, , b, a, r, r, ...",...,0,0,3,"[Oswaldo, de, Barros, Velloso]",0,,"[[0.12, 0.0, 0.0, 0.0, 0.08, 0.04, 0.0, 0.08, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.04166666666666666...",0.812051,0.468489


## pickle datasets

In [132]:
ru_non_latin.to_pickle('../pickled_dataframes/russian_df.pkl.gz', compression='gzip')

In [135]:
ru_latin.to_pickle('../pickled_dataframes/russianLatin_df.pkl.gz', compression='gzip')
hi_non_latin.to_pickle('../pickled_dataframes/hindi_df.pkl.gz', compression='gzip')
hi_latin.to_pickle('../pickled_dataframes/hindiLatin_df.pkl.gz', compression='gzip')
bg_non_latin.to_pickle('../pickled_dataframes/bulgarian_df.pkl.gz', compression='gzip')
bg_latin.to_pickle('../pickled_dataframes/bulgarianLatin_df.pkl.gz', compression='gzip')
ar_non_latin.to_pickle('../pickled_dataframes/arabic_df.pkl.gz', compression='gzip')
ar_latin.to_pickle('../pickled_dataframes/arabicLatin_df.pkl.gz', compression='gzip')
df_en.to_pickle('../pickled_dataframes/english_df.pkl.gz', compression='gzip')
df_es.to_pickle('../pickled_dataframes/spanish_df.pkl.gz', compression='gzip')
df_it.to_pickle('../pickled_dataframes/italian_df.pkl.gz', compression='gzip')
df_fr.to_pickle('../pickled_dataframes/french_df.pkl.gz', compression='gzip')
df_pt.to_pickle('../pickled_dataframes/portuguese_df.pkl.gz', compression='gzip')
df_de.to_pickle('../pickled_dataframes/german_df.pkl.gz', compression='gzip')
df_nl.to_pickle('../pickled_dataframes/dutch_df.pkl.gz', compression='gzip')
df_hr.to_pickle('../pickled_dataframes/croatian_df.pkl.gz', compression='gzip')
df_pl.to_pickle('../pickled_dataframes/polish_df.pkl.gz', compression='gzip')
df_fi.to_pickle('../pickled_dataframes/finnish_df.pkl.gz', compression='gzip')
df_hu.to_pickle('../pickled_dataframes/hungarian_df.pkl.gz', compression='gzip')
df_da.to_pickle('../pickled_dataframes/danish_df.pkl.gz', compression='gzip')
df_no.to_pickle('../pickled_dataframes/norwegian_df.pkl.gz', compression='gzip')
