## Cleaning

In [57]:
import pandas as pd
df = pd.read_csv("company_person_name_dataset.csv")
df

Unnamed: 0,name,class,lang
0,The Canal of the Angels,0,en
1,Rescue Renovation,0,en
2,Agatha Christie: The ABC Murders,0,en
3,Siti Akbari,0,ar
4,Stany,0,pl
...,...,...,...
199995,Robber's Bridge,0,en
199996,Johan Renck,0,en
199997,Lyle Stewart,1,en
199998,Thomas Colclough Watson,1,en


In [58]:
df = df[df["class"]==1]
#look into smote for imbalance

In [59]:
import numpy as np

In [60]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))

True
False
False


In [61]:
print(len(df.isnull()))

48190


In [62]:
df = df.dropna()

In [63]:
df.drop_duplicates()

Unnamed: 0,name,class,lang
11,Raha Etemadi,1,et
12,Leena Peltonen-Palotie,1,fi
16,Luma Grothe,1,en
20,Takuya Kakine,1,ja
21,Ōuyáng Zhènhuá,1,mi
...,...,...,...
199976,Clyde Donaldson,1,en
199980,Terry Alexander,1,en
199989,Neil Roebuck,1,en
199997,Lyle Stewart,1,en


In [64]:
language_counts = df["lang"].value_counts

In [65]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [66]:
print(np.any(df["name"].isnull()))
print(np.any(df["class"].isnull()))
print(np.any(df["lang"].isnull()))
print(np.any(df.duplicated()))

False
False
False
True


In [67]:
df = df.drop_duplicates()

In [68]:
print(np.any(df.duplicated()))

False


In [69]:
print("Language distribution:")
print(language_counts)

Language distribution:
<bound method IndexOpsMixin.value_counts of 11        et
12        fi
16        en
20        ja
21        mi
          ..
199976    en
199980    en
199989    en
199997    en
199998    en
Name: lang, Length: 48188, dtype: object>


In [70]:
df["lang"] = df["lang"].str.strip()
language_counts = df["lang"].value_counts()
print("Language distribution:")
print(language_counts)

Language distribution:
lang
en    23352
es     2532
it     1224
fr     1181
ja     1155
      ...  
my        1
sd        1
kk        1
pa        1
si        1
Name: count, Length: 103, dtype: int64


## Feature Creation

In [71]:
df.reset_index(drop=True, inplace=True)
print(df)

                          name  class lang
0                 Raha Etemadi      1   et
1       Leena Peltonen-Palotie      1   fi
2                  Luma Grothe      1   en
3                Takuya Kakine      1   ja
4               Ōuyáng Zhènhuá      1   mi
...                        ...    ...  ...
47754          Clyde Donaldson      1   en
47755          Terry Alexander      1   en
47756             Neil Roebuck      1   en
47757             Lyle Stewart      1   en
47758  Thomas Colclough Watson      1   en

[47759 rows x 3 columns]


### accents

In [72]:
from unidecode import unidecode
import unicodedata

text_with_accents = "Létérs wïth âccénts"
normalized_text = unidecode(text_with_accents)
print(normalized_text)

Leters with accents


In [74]:
df['name'][11]

'Rick Savage'

In [75]:
def identify_accents(text):
    accents = []
    for char in text:
        if unicodedata.normalize('NFD', char) != char:
            accents.append(char)
    return accents
allaccents= set()
for i in df['name']:
    allaccents.update(identify_accents(i))
print(allaccents)
print(len(allaccents))


{'ô', 'ń', '영', 'ề', 'ồ', 'Ö', '홍', 'ļ', '긍', 'Š', 'ೇ', 'ǎ', 'إ', 'ŭ', 'ợ', '헌', 'ź', '녕', '민', 'š', 'ș', 'ớ', '광', '상', '연', 'ṭ', 'Ç', 'ī', 'ύ', 'ǫ', 'ệ', 'ơ', '허', 'ę', 'ė', 'ǐ', 'ế', 'ĩ', 'ý', '권', 'Ř', 'ď', '재', 'ώ', 'É', 'ņ', '겸', 'ё', 'ド', '춘', 'ř', 'ū', 'ằ', 'ì', 'ắ', 'Ľ', 'Å', '이', 'ü', '규', '동', 'Í', 'ù', 'Á', 'ğ', 'ố', 'ģ', '송', 'û', '명', '순', 'é', '김', 'ż', 'ţ', 'ペ', 'ṣ', '섭', 'ǒ', 'ć', 'ḫ', 'ķ', '일', 'й', 'ě', '수', 'έ', 'ụ', '대', 'ư', 'ạ', 'ї', '형', 'ê', 'ů', 'Ѓ', 'ľ', '진', 'ç', 'Ō', 'ί', 'Ó', 'ủ', 'ো', 'ț', 'ë', 'İ', 'ā', 'ữ', 'ừ', 'Ş', '하', 'Ä', '조', '원', 'ē', 'ờ', '석', 'Ḥ', 'ǔ', 'ñ', 'ň', 'ǚ', 'ặ', 'Ú', 'ò', 'ÿ', 'ã', 'ả', 'Â', 'ž', 'ö', 'Ș', 'î', '박', 'ï', '철', 'ọ', '최', 'Ć', 'Ś', 'å', '식', 'ō', 'أ', '태', 'č', 'Ü', 'Ț', 'ấ', 'ῖ', 'ễ', 'ő', '현', '건', 'ῑ', 'Ḫ', 'Č', 'è', 'ú', '노', '랑', 'ứ', 'Ż', '강', '정', 'õ', 'ị', 'ó', 'í', 'á', 'ű', '윤', '신', 'ό', 'ś', 'à', 'ೀ', 'ỹ', 'ä', 'Ṭ', 'â', 'Ž', '보', 'ৌ', 'ũ', 'ş', 'ă', 'ؤ', 'ầ', 'ά', '기', 'ą', '환', 'Ī'}
195


In [76]:
unicodedata.normalize('NFD', '안녕하세요') == '안녕하세요'

False

In [77]:
'안녕하세요' == '안녕하세요'

True

### alphabet

In [78]:
import unicodedata
langname = lambda x : unicodedata.name(x[0]).split(' ')[0]
langname('Raha Etemadi')

'LATIN'

In [79]:
alphabet = []
for i in df['name']:
    alphabet.append(langname(i))

print(set(alphabet))
print(len(set(alphabet)))

print(alphabet.index('CJK'))
df['alphabet'] = alphabet



{'EQUALS', 'HEBREW', 'HANGUL', 'QUOTATION', 'CYRILLIC', 'DIGIT', 'RIGHT', 'HYPHEN-MINUS', 'SYRIAC', 'ASTERISK', 'GREEK', 'LEFT', 'LESS-THAN', 'MYANMAR', 'ARABIC', 'GURMUKHI', 'TELUGU', 'ARMENIAN', 'SINHALA', 'LATIN', 'THAI', 'KANNADA', 'KATAKANA', 'CJK', 'ETHIOPIC', 'DEVANAGARI', 'APOSTROPHE', 'TAMIL', 'BENGALI'}
29
271


In [80]:
for i in set(alphabet):
    count = 0
    for x,y in enumerate(df['alphabet']):
        if str(i) in y:
            count += 1
            #print(df['name'][x])
    print(count)

        

1
10
3
8
43
14
1
3
1
1
10
109
1
1
55
1
1
3
1
47317
1
2
1
146
1
15
2
2
5


In [81]:
print(df)

                          name  class lang alphabet
0                 Raha Etemadi      1   et    LATIN
1       Leena Peltonen-Palotie      1   fi    LATIN
2                  Luma Grothe      1   en    LATIN
3                Takuya Kakine      1   ja    LATIN
4               Ōuyáng Zhènhuá      1   mi    LATIN
...                        ...    ...  ...      ...
47754          Clyde Donaldson      1   en    LATIN
47755          Terry Alexander      1   en    LATIN
47756             Neil Roebuck      1   en    LATIN
47757             Lyle Stewart      1   en    LATIN
47758  Thomas Colclough Watson      1   en    LATIN

[47759 rows x 4 columns]


In [82]:
print(df['name'][271])

桑一非


In [83]:
df['alphabet'][271]

'CJK'

### avg_token_length 

In [93]:
print(df['name'][0])

Raha Etemadi


In [102]:
def average_token_length(name):
    total_length = 0
    total_tokens = 0
    # Tokenize the name (split by spaces)
    tokens = name.split()

    # Calculate the total length of tokens in the name
    token_length = sum(len(token) for token in tokens)

    # Total length (characters in each word) and total number of tokens (words in name)
    total_length += token_length
    total_tokens += len(tokens)
    # Calculate the average token length
    if total_tokens > 0:
        average_length = total_length / total_tokens
        return average_length
    else:
        return 0 
    
avg_token_length = []
for i in df['name']:
    avg_token_length.append(average_token_length(i))

In [103]:
df['avg_token_length'] = avg_token_length

In [104]:
print(df)

                          name  class lang alphabet  avg_token_length
0                 Raha Etemadi      1   et    LATIN               5.5
1       Leena Peltonen-Palotie      1   fi    LATIN              10.5
2                  Luma Grothe      1   en    LATIN               5.0
3                Takuya Kakine      1   ja    LATIN               6.0
4               Ōuyáng Zhènhuá      1   mi    LATIN               6.5
...                        ...    ...  ...      ...               ...
47754          Clyde Donaldson      1   en    LATIN               7.0
47755          Terry Alexander      1   en    LATIN               7.0
47756             Neil Roebuck      1   en    LATIN               5.5
47757             Lyle Stewart      1   en    LATIN               5.5
47758  Thomas Colclough Watson      1   en    LATIN               7.0

[47759 rows x 5 columns]


In [None]:
for i, x in enumerate(df['alphabet']):
    if x == 'LEFT':
        print(df.iloc[i,])

### token_length

In [127]:
def token_len(name):
    total_tokens = 0
    tokens = name.split()
    # Total number of tokens (words in name)
    total_tokens += len(tokens)
    return total_tokens


In [131]:
token_length = []
for i in df['name']:
    token_length.append(token_len(i))
df['token_length'] = token_length

In [133]:
df.head()

Unnamed: 0,name,class,lang,alphabet,avg_token_length,token_length
0,Raha Etemadi,1,et,LATIN,5.5,2
1,Leena Peltonen-Palotie,1,fi,LATIN,10.5,2
2,Luma Grothe,1,en,LATIN,5.0,2
3,Takuya Kakine,1,ja,LATIN,6.0,2
4,Ōuyáng Zhènhuá,1,mi,LATIN,6.5,2


### transliteration

### iso

### period_freq, dash_freq, apostrophe_freq