## Cleaning

In [103]:
import pandas as pd
df = pd.read_csv("company_person_name_dataset.csv")
df

Unnamed: 0,name,class,lang
0,The Canal of the Angels,0,en
1,Rescue Renovation,0,en
2,Agatha Christie: The ABC Murders,0,en
3,Siti Akbari,0,ar
4,Stany,0,pl
...,...,...,...
199995,Robber's Bridge,0,en
199996,Johan Renck,0,en
199997,Lyle Stewart,1,en
199998,Thomas Colclough Watson,1,en


In [104]:
df = df[df["class"]==1]
#look into smote for imbalance

In [105]:
import numpy as np

In [106]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))

True
False
False


In [107]:
print(len(df.isnull()))

48190


In [108]:
df = df.dropna()

In [109]:
df.drop_duplicates()

Unnamed: 0,name,class,lang
11,Raha Etemadi,1,et
12,Leena Peltonen-Palotie,1,fi
16,Luma Grothe,1,en
20,Takuya Kakine,1,ja
21,Ōuyáng Zhènhuá,1,mi
...,...,...,...
199976,Clyde Donaldson,1,en
199980,Terry Alexander,1,en
199989,Neil Roebuck,1,en
199997,Lyle Stewart,1,en


In [110]:
language_counts = df["lang"].value_counts

In [111]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [112]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))
print(np.any(df.duplicated()))

False
False
False
True


In [113]:
df = df.drop_duplicates()

In [114]:
print(np.any(df.duplicated()))

False


In [115]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [116]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)

Language distribution:
lang
en    23352
es     2532
it     1224
fr     1181
ja     1155
      ...  
my        1
sd        1
kk        1
pa        1
si        1
Name: count, Length: 103, dtype: int64


## Additional Cleaning to Drop Names with Special Characters

Reset df index starting from 0

In [117]:
df.reset_index(drop=True, inplace=True)
print(df)

                          name  class lang
0                 Raha Etemadi      1   et
1       Leena Peltonen-Palotie      1   fi
2                  Luma Grothe      1   en
3                Takuya Kakine      1   ja
4               Ōuyáng Zhènhuá      1   mi
...                        ...    ...  ...
47754          Clyde Donaldson      1   en
47755          Terry Alexander      1   en
47756             Neil Roebuck      1   en
47757             Lyle Stewart      1   en
47758  Thomas Colclough Watson      1   en

[47759 rows x 3 columns]


Create langname to find the alphabet name of the first character in the first word of a given name

In [118]:
import unicodedata
langname = lambda x : unicodedata.name(x[0]).split(' ')[0]
langname('Raha Etemadi')

'LATIN'

Using langname to find the alphabet of the first character in the first word of each name in df

In [119]:
alphabet = []
for i in df['name']:
    alphabet.append(langname(i))

print(set(alphabet))
print(len(set(alphabet)))

df['alphabet'] = alphabet

{'EQUALS', 'DIGIT', 'APOSTROPHE', 'TELUGU', 'MYANMAR', 'HANGUL', 'LEFT', 'HEBREW', 'CJK', 'QUOTATION', 'SYRIAC', 'ARABIC', 'ARMENIAN', 'ETHIOPIC', 'RIGHT', 'LESS-THAN', 'SINHALA', 'KATAKANA', 'ASTERISK', 'DEVANAGARI', 'BENGALI', 'TAMIL', 'GURMUKHI', 'GREEK', 'KANNADA', 'CYRILLIC', 'THAI', 'HYPHEN-MINUS', 'LATIN'}
29


Getting a count of names that use each alphabet in df

In [120]:
for i in set(alphabet):
    count = 0
    for x,y in enumerate(df['alphabet']):
        if str(i) in y:
            count += 1
            #print(df['name'][x])
    print(count, i)

1 EQUALS
14 DIGIT
2 APOSTROPHE
1 TELUGU
1 MYANMAR
3 HANGUL
109 LEFT
10 HEBREW
146 CJK
8 QUOTATION
1 SYRIAC


55 ARABIC
3 ARMENIAN
1 ETHIOPIC
1 RIGHT
1 LESS-THAN
1 SINHALA
1 KATAKANA
1 ASTERISK
15 DEVANAGARI
5 BENGALI
2 TAMIL
1 GURMUKHI
10 GREEK
2 KANNADA
43 CYRILLIC
1 THAI
3 HYPHEN-MINUS
47317 LATIN


Checking every character in each name using is.alpha() and preparing to drop them

In [121]:
alphabet1 = []
notalpha = []
for i in df['name']:
    tempword = i.split()
    if (all(word.isalpha() == True for word in tempword)):
        alphabet1.append(langname(i))
    else: 
        notalpha.append(i)
    
print(notalpha[0:10])
print(len(notalpha))
print(set(alphabet1))
print(len(set(alphabet1)))

['Leena Peltonen-Palotie', 'R. M. Dharmadasa Banda', 'Vicente de Valverde y Alvarez de Toledo, O.P.', 'Ute Kircheis-Wessel', 'David John (Dave) Shannon', 'Dr. Joan W. Miller', 'Donald M. Davis', 'Akseli Gallen-Kallela', 'Gerald S. McGowan', 'Jung Chul-Woon']
4841
{'GREEK', 'HEBREW', 'CJK', 'CYRILLIC', 'SYRIAC', 'KATAKANA', 'ARABIC', 'ARMENIAN', 'HANGUL', 'LATIN'}
10


Created a new dataframe 'df2' with dropped names

In [122]:
droprows = []
for i, name in enumerate(df['name']):
    if name in notalpha:
        droprows.append(i)
df2 = df.drop(droprows) #df2 is created

Reset df2 index starting from 0

In [123]:
df2.reset_index(drop=True, inplace=True)
print(df2)

                          name  class lang alphabet
0                 Raha Etemadi      1   et    LATIN
1                  Luma Grothe      1   en    LATIN
2                Takuya Kakine      1   ja    LATIN
3               Ōuyáng Zhènhuá      1   mi    LATIN
4         Jordan Gideon Archer      1   en    LATIN
...                        ...    ...  ...      ...
42913          Clyde Donaldson      1   en    LATIN
42914          Terry Alexander      1   en    LATIN
42915             Neil Roebuck      1   en    LATIN
42916             Lyle Stewart      1   en    LATIN
42917  Thomas Colclough Watson      1   en    LATIN

[42918 rows x 4 columns]


Counting number of names that use each alphabet

In [124]:
alphabet2 = []
for i in df2['name']:
    alphabet2.append(langname(i))

print(set(alphabet2))
print(len(set(alphabet2)))

df2['alphabet'] = alphabet2

for i in set(alphabet2):
    count = 0
    for x,y in enumerate(df2['alphabet']):
        if str(i) in y:
            count += 1
    print(count, i)

{'GREEK', 'HEBREW', 'CJK', 'CYRILLIC', 'SYRIAC', 'KATAKANA', 'ARABIC', 'ARMENIAN', 'HANGUL', 'LATIN'}
10
9 GREEK
8 HEBREW
144 CJK
42 CYRILLIC
1 SYRIAC
1 KATAKANA
53 ARABIC
2 ARMENIAN
3 HANGUL
42655 LATIN


Checking original names in DEVANAGARI (isalpha() does not recognize DEVANAGARI as an alphabet)(may need to add it back)

In [125]:
for i, x in enumerate(df['name']):
    if df['alphabet'][i] == 'DEVANAGARI':
        print(df['name'][i])
        print(df['name'][i].isalpha())

राणा उदय सिहं
False
करतार सिंह भड़ाना
False
शंखलाल माझी
False
सुर्नेद्र झा 'सुमन '
False
आचार्य सारंगधर
False
पवन कुमार शर्मा
False
अटल बिहारी वाजपेयी
False
गेंदा लाल  चौधरी
False
विजय सिंह
False
परेश रावल
False
पुरुशोत्तम काशीनाथ केळकर
False
फ़ौज़िया  तहसीन ख़ान
False
रवीन्द्र प्रभात
False
नाहिद हसन
False
ओम प्रकाश वर्मा
False


In [126]:
df2['lang'] = df2["lang"].str.strip()
language_counts2 = df2["lang"].value_counts()
print("Language distribution:")
print(language_counts)


Language distribution:
lang
en    23352
es     2532
it     1224
fr     1181
ja     1155
      ...  
my        1
sd        1
kk        1
pa        1
si        1
Name: count, Length: 103, dtype: int64


## Feature Creation

### accents (incomplete)

In [127]:
from unidecode import unidecode
import unicodedata

text_with_accents = "Létérs wïth âccénts"
normalized_text = unidecode(text_with_accents)
print(normalized_text)

Leters with accents


In [128]:
import unicodedata
langname = lambda x : unicodedata.name(x[0]).split(' ')[0]

In [129]:
def identify_accents(text):
    accents = []
    for char in text:
        if unicodedata.normalize('NFD', char) != char:
            accents.append(char)
    return accents
allaccents= set()
for i in df['name']:
    allaccents.update(identify_accents(i))
print(allaccents)
print(len(allaccents))

{'ỹ', 'Ś', '이', 'ģ', 'Ú', 'إ', '석', '강', '건', '겸', 'ž', 'ņ', 'ữ', 'ụ', 'û', 'ş', 'Č', 'ň', '형', '연', 'ứ', 'Ü', 'ţ', 'ί', 'Ó', 'Ț', '원', 'ؤ', '랑', '허', 'ś', 'ῑ', '노', 'ř', '민', '권', '홍', 'Ö', 'Ḥ', 'Ř', 'ặ', 'ā', 'ș', 'Ç', 'ừ', 'ơ', 'ü', '윤', '철', 'ῖ', 'ż', 'Ī', 'é', 'í', 'ọ', 'ę', 'ă', 'ó', 'ē', 'ớ', 'Â', 'ç', 'ò', '상', '헌', 'å', 'ế', 'è', 'ầ', 'ÿ', '섭', 'Š', 'ű', 'ö', 'č', 'ė', '영', 'Ș', 'ṭ', 'أ', 'ễ', 'ũ', 'ý', '재', 'Ѓ', 'à', 'ằ', '최', 'ù', 'ĩ', 'ủ', 'ό', '수', 'Å', 'ắ', 'ύ', 'ё', 'ī', 'ì', 'ǐ', 'ệ', 'î', 'ồ', 'ೇ', '기', 'ư', 'ë', 'ị', 'Ṭ', 'ề', 'ô', 'ṣ', 'ǎ', '신', 'ḫ', 'Á', '박', 'ú', '현', 'ї', 'ů', 'ő', 'ō', 'Ō', 'Ć', 'ố', '규', '긍', '명', 'õ', 'ೀ', '하', 'ą', 'â', '광', 'ৌ', 'ń', 'ờ', 'á', 'É', 'Ä', '태', 'Í', 'ấ', 'ợ', 'ľ', 'έ', 'Ż', 'ț', 'Ḫ', 'ě', 'ū', 'ŭ', 'ǔ', 'ো', '대', 'ć', '보', 'š', 'ド', 'ώ', 'ď', 'ñ', 'ğ', '순', 'Ľ', '식', 'ạ', 'ά', 'İ', 'ǚ', '조', 'ļ', '정', '춘', '송', '녕', 'ǫ', '진', 'ã', '김', 'ä', 'ペ', 'ź', '일', 'Ş', 'й', 'ǒ', 'ķ', '동', '환', 'ê', 'ï', 'Ž', 'ả'}
195


In [130]:
allaccents1 = []
for i in allaccents:
    if langname(str(i)) != 'HANGUL':
        allaccents1.append(i)
print(allaccents1)
print(len(allaccents1))


['ỹ', 'Ś', 'ģ', 'Ú', 'إ', 'ž', 'ņ', 'ữ', 'ụ', 'û', 'ş', 'Č', 'ň', 'ứ', 'Ü', 'ţ', 'ί', 'Ó', 'Ț', 'ؤ', 'ś', 'ῑ', 'ř', 'Ö', 'Ḥ', 'Ř', 'ặ', 'ā', 'ș', 'Ç', 'ừ', 'ơ', 'ü', 'ῖ', 'ż', 'Ī', 'é', 'í', 'ọ', 'ę', 'ă', 'ó', 'ē', 'ớ', 'Â', 'ç', 'ò', 'å', 'ế', 'è', 'ầ', 'ÿ', 'Š', 'ű', 'ö', 'č', 'ė', 'Ș', 'ṭ', 'أ', 'ễ', 'ũ', 'ý', 'Ѓ', 'à', 'ằ', 'ù', 'ĩ', 'ủ', 'ό', 'Å', 'ắ', 'ύ', 'ё', 'ī', 'ì', 'ǐ', 'ệ', 'î', 'ồ', 'ೇ', 'ư', 'ë', 'ị', 'Ṭ', 'ề', 'ô', 'ṣ', 'ǎ', 'ḫ', 'Á', 'ú', 'ї', 'ů', 'ő', 'ō', 'Ō', 'Ć', 'ố', 'õ', 'ೀ', 'ą', 'â', 'ৌ', 'ń', 'ờ', 'á', 'É', 'Ä', 'Í', 'ấ', 'ợ', 'ľ', 'έ', 'Ż', 'ț', 'Ḫ', 'ě', 'ū', 'ŭ', 'ǔ', 'ো', 'ć', 'š', 'ド', 'ώ', 'ď', 'ñ', 'ğ', 'Ľ', 'ạ', 'ά', 'İ', 'ǚ', 'ļ', 'ǫ', 'ã', 'ä', 'ペ', 'ź', 'Ş', 'й', 'ǒ', 'ķ', 'ê', 'ï', 'Ž', 'ả']
148


In [131]:
unicodedata.normalize('NFD', '안녕하세요') == '안녕하세요'

False

In [132]:
'안녕하세요' == '안녕하세요'

True

### alphabet

Creating langname2 as another option to find the alphabet name for each character in a name instead of just the first letter (use langname2 because Anna said its better to have full list)

In [133]:
langname2 = lambda x:[unicodedata.name(char).split(' ')[0] for char in x]
langname2('Raha Etemadi')

['LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'SPACE',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN',
 'LATIN']

Checking for non alphabet characters in each names

In [134]:
alphabettemp = []
notalphatemp = []
for i in df2['name']:
    tempword = i.split()
    if (all(word.isalpha() == True for word in tempword)):
        alphabettemp.append(langname(i))
    else: 
        notalpha.append(i)
print(notalpha[0:10])
print(len(notalpha))
print(alphabettemp[0:10])

['Leena Peltonen-Palotie', 'R. M. Dharmadasa Banda', 'Vicente de Valverde y Alvarez de Toledo, O.P.', 'Ute Kircheis-Wessel', 'David John (Dave) Shannon', 'Dr. Joan W. Miller', 'Donald M. Davis', 'Akseli Gallen-Kallela', 'Gerald S. McGowan', 'Jung Chul-Woon']
4841
['LATIN', 'LATIN', 'LATIN', 'LATIN', 'LATIN', 'LATIN', 'LATIN', 'LATIN', 'LATIN', 'LATIN']


In [144]:
df2['alphabet'] = df2['name'].apply(langname2)
df2.head()

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens
0,Raha Etemadi,1,et,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.5,2
1,Luma Grothe,1,en,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",5.0,2
2,Takuya Kakine,1,ja,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,2
3,Ōuyáng Zhènhuá,1,mi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.5,2
4,Jordan Gideon Archer,1,en,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",6.0,3


### avg_token_length (complete)

In [135]:
def average_token_length(name):
    total_length = 0
    total_tokens = 0
    tokens = name.split()

    # Calculate the total length of tokens in the name
    token_length = sum(len(token) for token in tokens)

    # Total length (characters in each word) and total number of tokens (words in name)
    total_length += token_length
    total_tokens += len(tokens)
    # Calculate the average token length
    if total_tokens > 0:
        average_length = total_length / total_tokens
        return average_length
    else:
        return 0 
    
avg_token_length = []
for i in df2['name']:
    avg_token_length.append(average_token_length(i))

In [136]:
df2['avg_token_length'] = avg_token_length

In [137]:
df2

Unnamed: 0,name,class,lang,alphabet,avg_token_length
0,Raha Etemadi,1,et,LATIN,5.5
1,Luma Grothe,1,en,LATIN,5.0
2,Takuya Kakine,1,ja,LATIN,6.0
3,Ōuyáng Zhènhuá,1,mi,LATIN,6.5
4,Jordan Gideon Archer,1,en,LATIN,6.0
...,...,...,...,...,...
42913,Clyde Donaldson,1,en,LATIN,7.0
42914,Terry Alexander,1,en,LATIN,7.0
42915,Neil Roebuck,1,en,LATIN,5.5
42916,Lyle Stewart,1,en,LATIN,5.5


### num_tokens

In [138]:
def token_len(name):
    total_tokens = 0
    tokens = name.split()
    # Total number of tokens (words in name)
    total_tokens += len(tokens)
    return total_tokens


In [139]:
token_length = []
for i in df2['name']:
    token_length.append(token_len(i))
df2['num_tokens'] = token_length

In [140]:
df2

Unnamed: 0,name,class,lang,alphabet,avg_token_length,num_tokens
0,Raha Etemadi,1,et,LATIN,5.5,2
1,Luma Grothe,1,en,LATIN,5.0,2
2,Takuya Kakine,1,ja,LATIN,6.0,2
3,Ōuyáng Zhènhuá,1,mi,LATIN,6.5,2
4,Jordan Gideon Archer,1,en,LATIN,6.0,3
...,...,...,...,...,...,...
42913,Clyde Donaldson,1,en,LATIN,7.0,2
42914,Terry Alexander,1,en,LATIN,7.0,2
42915,Neil Roebuck,1,en,LATIN,5.5,2
42916,Lyle Stewart,1,en,LATIN,5.5,2


### transliteration

### iso

### space_frequency