## Cleaning

In [95]:
import pandas as pd
df = pd.read_csv("company_person_name_dataset.csv")
df

Unnamed: 0,name,class,lang
0,The Canal of the Angels,0,en
1,Rescue Renovation,0,en
2,Agatha Christie: The ABC Murders,0,en
3,Siti Akbari,0,ar
4,Stany,0,pl
...,...,...,...
199995,Robber's Bridge,0,en
199996,Johan Renck,0,en
199997,Lyle Stewart,1,en
199998,Thomas Colclough Watson,1,en


In [96]:
df = df[df["class"]==1]
#look into smote for imbalance

In [97]:
import numpy as np

In [98]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))

True
False
False


In [99]:
print(len(df.isnull()))

48190


In [100]:
df = df.dropna()

In [101]:
df.drop_duplicates()

Unnamed: 0,name,class,lang
11,Raha Etemadi,1,et
12,Leena Peltonen-Palotie,1,fi
16,Luma Grothe,1,en
20,Takuya Kakine,1,ja
21,Ōuyáng Zhènhuá,1,mi
...,...,...,...
199976,Clyde Donaldson,1,en
199980,Terry Alexander,1,en
199989,Neil Roebuck,1,en
199997,Lyle Stewart,1,en


In [102]:
language_counts = df["lang"].value_counts

In [103]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [104]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))
print(np.any(df.duplicated()))

False
False
False
True


In [105]:
df = df.drop_duplicates()

In [106]:
print(np.any(df.duplicated()))

False


In [107]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [108]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)

Language distribution:
lang
en    23352
es     2532
it     1224
fr     1181
ja     1155
      ...  
my        1
sd        1
kk        1
pa        1
si        1
Name: count, Length: 103, dtype: int64


## Additional Cleaning to Drop Names with Special Characters

Reset df index starting from 0

In [109]:
df.reset_index(drop=True, inplace=True)
print(df)

                          name  class lang
0                 Raha Etemadi      1   et
1       Leena Peltonen-Palotie      1   fi
2                  Luma Grothe      1   en
3                Takuya Kakine      1   ja
4               Ōuyáng Zhènhuá      1   mi
...                        ...    ...  ...
47754          Clyde Donaldson      1   en
47755          Terry Alexander      1   en
47756             Neil Roebuck      1   en
47757             Lyle Stewart      1   en
47758  Thomas Colclough Watson      1   en

[47759 rows x 3 columns]


Create langname to find the alphabet name of the first character in the first word of a given name

In [110]:
import unicodedata
langname = lambda x : unicodedata.name(x[0]).split(' ')[0]
langname('Raha Etemadi')

'LATIN'

Using langname to find the alphabet of the first character in the first word of each name in df

In [111]:
alphabet = []
for i in df['name']:
    alphabet.append(langname(i))

print(set(alphabet))
print(len(set(alphabet)))

df['alphabet'] = alphabet

{'EQUALS', 'DIGIT', 'HANGUL', 'SINHALA', 'HEBREW', 'CYRILLIC', 'CJK', 'SYRIAC', 'HYPHEN-MINUS', 'KANNADA', 'THAI', 'TELUGU', 'TAMIL', 'MYANMAR', 'GURMUKHI', 'RIGHT', 'APOSTROPHE', 'ETHIOPIC', 'LATIN', 'ARABIC', 'LESS-THAN', 'GREEK', 'ASTERISK', 'QUOTATION', 'LEFT', 'ARMENIAN', 'BENGALI', 'KATAKANA', 'DEVANAGARI'}
29


Getting a count of names that use each alphabet in df

In [112]:
for i in set(alphabet):
    count = 0
    for x,y in enumerate(df['alphabet']):
        if str(i) in y:
            count += 1
            #print(df['name'][x])
    print(count, i)

1 EQUALS
14 DIGIT
3 HANGUL
1 SINHALA
10 HEBREW
43 CYRILLIC
146 CJK
1 SYRIAC
3 HYPHEN-MINUS
2 KANNADA
1 THAI
1 TELUGU
2 TAMIL
1 MYANMAR
1 GURMUKHI
1 RIGHT
2 APOSTROPHE
1 ETHIOPIC
47317 LATIN
55 ARABIC
1 LESS-THAN
10 GREEK
1 ASTERISK
8 QUOTATION
109 LEFT
3 ARMENIAN
5 BENGALI
1 KATAKANA
15 DEVANAGARI


Finding names that have special characters and preparing to drop them

In [113]:
alphabet1 = []
notalpha = []
for i in df['name']:
    tempword = i.split()
    if (all(word.isalpha() == True for word in tempword)):
        alphabet1.append(langname(i))
    else: 
        notalpha.append(i)
    
print(notalpha)
print(len(notalpha))
print(set(alphabet1))
print(len(set(alphabet1)))

['Leena Peltonen-Palotie', 'R. M. Dharmadasa Banda', 'Vicente de Valverde y Alvarez de Toledo, O.P.', 'Ute Kircheis-Wessel', 'David John (Dave) Shannon', 'Dr. Joan W. Miller', 'Donald M. Davis', 'Akseli Gallen-Kallela', 'Gerald S. McGowan', 'Jung Chul-Woon', '(Armen Lubin)', 'Peter R. Orszag', 'William D. Port', 'Harry H. Peterson', 'Lim You-Hwan', "Lawrence O'Brien", 'St. Jutta of Kulmsee, T.O.S.F.', 'José M. Hernández', 'Im Jae-Sun', 'Charles A. Cummings', 'Francis "Frank" Forshew', 'Sir William Jardine, 7th Baronet', 'C.W. (Bill) Hutchins', 'Evelyn "Champagne" King', 'Feng Ming-chu', 'Isaac Campbell Kidd Jr.', 'Ernesto "Boy" F. Herrera', "William G. T'Vault", 'Laura J. Burns', 'Gordon S. Holder', 'David E. Rutledge', 'Richard Tsoi Yiu-cheong', 'William A. Cugno', 'Fahad Barakah Al-Marwani Al-Johani', 'Jan O. Karlsson', 'P.B. Premachandra', 'Anne-Pia Nygård', 'Wu Chi-wai', 'Yoon Jun-sung', 'Koshinaka Makoto (越中睦士)', 'Lize-Mari Retief', 'V. C. Andrews', 'Giancarlo Cornaggia-Medici', '

Created a new dataframe 'df2' with dropped names

In [114]:
droprows = []
for i, name in enumerate(df['name']):
    if name in notalpha:
        droprows.append(i)
#df2 = df.drop(droprows) df2 is created

Reset df2 index starting from 0

In [115]:
df2.reset_index(drop=True, inplace=True)
print(df2)

                          name  class lang alphabet
0                 Raha Etemadi      1   et    LATIN
1                  Luma Grothe      1   en    LATIN
2                Takuya Kakine      1   ja    LATIN
3               Ōuyáng Zhènhuá      1   mi    LATIN
4         Jordan Gideon Archer      1   en    LATIN
...                        ...    ...  ...      ...
42913          Clyde Donaldson      1   en    LATIN
42914          Terry Alexander      1   en    LATIN
42915             Neil Roebuck      1   en    LATIN
42916             Lyle Stewart      1   en    LATIN
42917  Thomas Colclough Watson      1   en    LATIN

[42918 rows x 4 columns]


Counting number of names that use each alphabet

In [116]:
alphabet2 = []
for i in df2['name']:
    alphabet2.append(langname(i))

print(set(alphabet2))
print(len(set(alphabet2)))

df2['alphabet'] = alphabet2

for i in set(alphabet2):
    count = 0
    for x,y in enumerate(df2['alphabet']):
        if str(i) in y:
            count += 1
    print(count, i)

{'GREEK', 'HEBREW', 'CYRILLIC', 'HANGUL', 'CJK', 'SYRIAC', 'ARMENIAN', 'LATIN', 'KATAKANA', 'ARABIC'}
10
9 GREEK
8 HEBREW
42 CYRILLIC
3 HANGUL
144 CJK
1 SYRIAC
2 ARMENIAN
42655 LATIN
1 KATAKANA
53 ARABIC


Checking original names in DEVANAGARI

In [117]:
df['alphabet'][0]

'LATIN'

In [128]:
for i, x in enumerate(df['name']):
    if df['alphabet'][i] == 'DEVANAGARI':
        print(df['name'][i])
        print(df['name'][i].isalpha())

राणा उदय सिहं
False
करतार सिंह भड़ाना
False
शंखलाल माझी
False
सुर्नेद्र झा 'सुमन '
False
आचार्य सारंगधर
False
पवन कुमार शर्मा
False
अटल बिहारी वाजपेयी
False
गेंदा लाल  चौधरी
False
विजय सिंह
False
परेश रावल
False
पुरुशोत्तम काशीनाथ केळकर
False
फ़ौज़िया  तहसीन ख़ान
False
रवीन्द्र प्रभात
False
नाहिद हसन
False
ओम प्रकाश वर्मा
False


## Feature Creation

### accents (incomplete)

In [119]:
from unidecode import unidecode
import unicodedata

text_with_accents = "Létérs wïth âccénts"
normalized_text = unidecode(text_with_accents)
print(normalized_text)

Leters with accents


In [120]:
import unicodedata
langname = lambda x : unicodedata.name(x[0]).split(' ')[0]

In [121]:
def identify_accents(text):
    accents = []
    for char in text:
        if unicodedata.normalize('NFD', char) != char:
            accents.append(char)
    return accents
allaccents= set()
for i in df['name']:
    allaccents.update(identify_accents(i))
print(allaccents)
print(len(allaccents))

{'춘', '수', 'ć', 'ế', 'ೇ', '신', 'Ș', 'ķ', 'ṣ', 'ো', 'ô', 'ừ', 'ọ', 'ì', 'ă', 'ń', 'ò', '동', '일', 'ί', 'ώ', 'Ü', '태', 'أ', 'å', 'č', 'ț', 'ű', 'ễ', 'Ž', 'ņ', 'ż', 'ặ', 'ё', '랑', 'ū', '정', 'ļ', 'Ż', 'Ö', 'ú', '헌', 'ą', 'ǔ', 'Ä', 'Å', 'ž', 'ạ', '현', 'ň', 'Â', 'ṭ', '김', 'ũ', 'ÿ', '겸', 'ù', 'ố', 'ţ', 'Ľ', '보', 'ǎ', 'ģ', 'š', 'ǫ', 'ό', 'ê', '민', 'ź', 'Ş', 'ά', 'ụ', 'ؤ', 'í', '규', 'Ō', 'Í', 'ầ', 'ǒ', 'ů', 'ē', 'ś', 'ơ', 'ῑ', '순', 'İ', 'й', 'ắ', 'ǚ', '송', 'Ḥ', '건', 'έ', '원', 'ę', 'ě', 'Ī', 'Ț', 'ứ', '허', '기', 'ç', 'ý', 'î', 'â', '식', 'è', '권', 'ৌ', 'ã', '진', 'إ', 'Ř', 'õ', 'ĩ', 'ớ', 'Š', 'ő', 'ằ', '윤', 'ペ', '섭', 'ữ', '녕', 'ờ', 'ồ', 'ï', 'Ś', 'ệ', 'ī', '환', '상', 'ó', 'Ó', 'Ć', 'É', 'ä', 'ủ', '조', '이', '철', '박', 'à', 'Ú', 'ೀ', '최', 'ύ', 'û', 'ḫ', 'ė', '재', '명', 'ǐ', 'Ṭ', '대', 'ř', 'ỹ', 'é', 'ş', 'Ç', 'Ѓ', '연', 'ë', 'ợ', 'Ḫ', '홍', '긍', 'ō', 'á', '형', 'ș', 'ị', '광', 'ề', '강', 'ď', '석', '하', 'ド', 'ü', 'Č', 'ľ', 'ї', 'ả', '노', 'ö', 'Á', 'ā', 'ấ', 'ñ', 'ῖ', '영', 'ư', 'ŭ', 'ğ'}
195


In [122]:
allaccents1 = []
for i in allaccents:
    if langname(str(i)) != 'HANGUL':
        print(i)
        allaccents1.append(i)
print(allaccents1)
print(len(allaccents1))


ć
ế
ೇ
Ș
ķ
ṣ
ো
ô
ừ
ọ
ì
ă
ń
ò
ί
ώ
Ü
أ
å
č
ț
ű
ễ
Ž
ņ
ż
ặ
ё
ū
ļ
Ż
Ö
ú
ą
ǔ
Ä
Å
ž
ạ
ň
Â
ṭ
ũ
ÿ
ù
ố
ţ
Ľ
ǎ
ģ
š
ǫ
ό
ê
ź
Ş
ά
ụ
ؤ
í
Ō
Í
ầ
ǒ
ů
ē
ś
ơ
ῑ
İ
й
ắ
ǚ
Ḥ
έ
ę
ě
Ī
Ț
ứ
ç
ý
î
â
è
ৌ
ã
إ
Ř
õ
ĩ
ớ
Š
ő
ằ
ペ
ữ
ờ
ồ
ï
Ś
ệ
ī
ó
Ó
Ć
É
ä
ủ
à
Ú
ೀ
ύ
û
ḫ
ė
ǐ
Ṭ
ř
ỹ
é
ş
Ç
Ѓ
ë
ợ
Ḫ
ō
á
ș
ị
ề
ď
ド
ü
Č
ľ
ї
ả
ö
Á
ā
ấ
ñ
ῖ
ư
ŭ
ğ
['ć', 'ế', 'ೇ', 'Ș', 'ķ', 'ṣ', 'ো', 'ô', 'ừ', 'ọ', 'ì', 'ă', 'ń', 'ò', 'ί', 'ώ', 'Ü', 'أ', 'å', 'č', 'ț', 'ű', 'ễ', 'Ž', 'ņ', 'ż', 'ặ', 'ё', 'ū', 'ļ', 'Ż', 'Ö', 'ú', 'ą', 'ǔ', 'Ä', 'Å', 'ž', 'ạ', 'ň', 'Â', 'ṭ', 'ũ', 'ÿ', 'ù', 'ố', 'ţ', 'Ľ', 'ǎ', 'ģ', 'š', 'ǫ', 'ό', 'ê', 'ź', 'Ş', 'ά', 'ụ', 'ؤ', 'í', 'Ō', 'Í', 'ầ', 'ǒ', 'ů', 'ē', 'ś', 'ơ', 'ῑ', 'İ', 'й', 'ắ', 'ǚ', 'Ḥ', 'έ', 'ę', 'ě', 'Ī', 'Ț', 'ứ', 'ç', 'ý', 'î', 'â', 'è', 'ৌ', 'ã', 'إ', 'Ř', 'õ', 'ĩ', 'ớ', 'Š', 'ő', 'ằ', 'ペ', 'ữ', 'ờ', 'ồ', 'ï', 'Ś', 'ệ', 'ī', 'ó', 'Ó', 'Ć', 'É', 'ä', 'ủ', 'à', 'Ú', 'ೀ', 'ύ', 'û', 'ḫ', 'ė', 'ǐ', 'Ṭ', 'ř', 'ỹ', 'é', 'ş', 'Ç', 'Ѓ', 'ë', 'ợ', 'Ḫ', 'ō', 'á', 'ș', 'ị', 'ề', 'ď', 'ド', 'ü', 'Č', 'ľ', 'ї', 'ả', 'ö', 'Á'

In [123]:
unicodedata.normalize('NFD', '안녕하세요') == '안녕하세요'

False

In [124]:
'안녕하세요' == '안녕하세요'

True

### alphabet

Creating langname2 as another option to find the alphabet name for each character in a name instead of just the first letter

In [125]:
langname2 = lambda x: [unicodedata.name(char).split(' ') for char in x]
langname2('Raha Etemadi')

[['LATIN', 'CAPITAL', 'LETTER', 'R'],
 ['LATIN', 'SMALL', 'LETTER', 'A'],
 ['LATIN', 'SMALL', 'LETTER', 'H'],
 ['LATIN', 'SMALL', 'LETTER', 'A'],
 ['SPACE'],
 ['LATIN', 'CAPITAL', 'LETTER', 'E'],
 ['LATIN', 'SMALL', 'LETTER', 'T'],
 ['LATIN', 'SMALL', 'LETTER', 'E'],
 ['LATIN', 'SMALL', 'LETTER', 'M'],
 ['LATIN', 'SMALL', 'LETTER', 'A'],
 ['LATIN', 'SMALL', 'LETTER', 'D'],
 ['LATIN', 'SMALL', 'LETTER', 'I']]

Checking for non alphabet characters in each names

In [126]:
alphabet1 = []
notalpha = []
for i in df['name']:
    tempword = i.split()
    if (all(word.isalpha() == True for word in tempword)):
        alphabet1.append(langname(i))
    else: 
        notalpha.append(i)
    
print(notalpha)
print(len(notalpha))
print(set(alphabet1))
print(len(set(alphabet1)))


['Leena Peltonen-Palotie', 'R. M. Dharmadasa Banda', 'Vicente de Valverde y Alvarez de Toledo, O.P.', 'Ute Kircheis-Wessel', 'David John (Dave) Shannon', 'Dr. Joan W. Miller', 'Donald M. Davis', 'Akseli Gallen-Kallela', 'Gerald S. McGowan', 'Jung Chul-Woon', '(Armen Lubin)', 'Peter R. Orszag', 'William D. Port', 'Harry H. Peterson', 'Lim You-Hwan', "Lawrence O'Brien", 'St. Jutta of Kulmsee, T.O.S.F.', 'José M. Hernández', 'Im Jae-Sun', 'Charles A. Cummings', 'Francis "Frank" Forshew', 'Sir William Jardine, 7th Baronet', 'C.W. (Bill) Hutchins', 'Evelyn "Champagne" King', 'Feng Ming-chu', 'Isaac Campbell Kidd Jr.', 'Ernesto "Boy" F. Herrera', "William G. T'Vault", 'Laura J. Burns', 'Gordon S. Holder', 'David E. Rutledge', 'Richard Tsoi Yiu-cheong', 'William A. Cugno', 'Fahad Barakah Al-Marwani Al-Johani', 'Jan O. Karlsson', 'P.B. Premachandra', 'Anne-Pia Nygård', 'Wu Chi-wai', 'Yoon Jun-sung', 'Koshinaka Makoto (越中睦士)', 'Lize-Mari Retief', 'V. C. Andrews', 'Giancarlo Cornaggia-Medici', '

### avg_token_length (complete)

In [133]:
print(df2['name'][0])

Raha Etemadi


In [134]:
def average_token_length(name):
    total_length = 0
    total_tokens = 0
    # Tokenize the name (split by spaces)
    tokens = name.split()

    # Calculate the total length of tokens in the name
    token_length = sum(len(token) for token in tokens)

    # Total length (characters in each word) and total number of tokens (words in name)
    total_length += token_length
    total_tokens += len(tokens)
    # Calculate the average token length
    if total_tokens > 0:
        average_length = total_length / total_tokens
        return average_length
    else:
        return 0 
    
avg_token_length = []
for i in df2['name']:
    avg_token_length.append(average_token_length(i))

In [135]:
df2['avg_token_length'] = avg_token_length

In [143]:
df2

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens
0,Raha Etemadi,1,et,LATIN,5.5,2
1,Luma Grothe,1,en,LATIN,5.0,2
2,Takuya Kakine,1,ja,LATIN,6.0,2
3,Ōuyáng Zhènhuá,1,mi,LATIN,6.5,2
4,Jordan Gideon Archer,1,en,LATIN,6.0,3
...,...,...,...,...,...,...
42913,Clyde Donaldson,1,en,LATIN,7.0,2
42914,Terry Alexander,1,en,LATIN,7.0,2
42915,Neil Roebuck,1,en,LATIN,5.5,2
42916,Lyle Stewart,1,en,LATIN,5.5,2


### num_tokens

In [137]:
def token_len(name):
    total_tokens = 0
    tokens = name.split()
    # Total number of tokens (words in name)
    total_tokens += len(tokens)
    return total_tokens


In [142]:
token_length = []
for i in df2['name']:
    token_length.append(token_len(i))
df2['num_tokens'] = token_length

In [145]:
df2

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens
0,Raha Etemadi,1,et,LATIN,5.5,2
1,Luma Grothe,1,en,LATIN,5.0,2
2,Takuya Kakine,1,ja,LATIN,6.0,2
3,Ōuyáng Zhènhuá,1,mi,LATIN,6.5,2
4,Jordan Gideon Archer,1,en,LATIN,6.0,3
...,...,...,...,...,...,...
42913,Clyde Donaldson,1,en,LATIN,7.0,2
42914,Terry Alexander,1,en,LATIN,7.0,2
42915,Neil Roebuck,1,en,LATIN,5.5,2
42916,Lyle Stewart,1,en,LATIN,5.5,2


### transliteration

### iso

### space_frequency