In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.utils import shuffle

# Model training
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

This notebook was originally written to compute frequency distributions for Latin-script names,
but it also concatenates the other per-language DataFrames and, further down, contains model-training code in the same file.

In [2]:
# Load the per-language feature tables. They are stored as gzip-compressed
# pickles so that list-valued columns (e.g. `alphabet`, `unigrams`) keep
# their Python types.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by this project.
PICKLE_KWARGS = {'compression': 'gzip'}

df_indo = pd.read_pickle('df_indo.pkl.gz', **PICKLE_KWARGS)
df_malay = pd.read_pickle('df_malay.pkl.gz', **PICKLE_KWARGS)
df_viet = pd.read_pickle('viet_df.pkl.gz', **PICKLE_KWARGS)
df_cnrom = pd.read_pickle('cnrom_df.pkl.gz', **PICKLE_KWARGS)
df_cnchar = pd.read_pickle('cnchar_df.pkl.gz', **PICKLE_KWARGS)
df_turk = pd.read_pickle('turkish_df.pkl.gz', **PICKLE_KWARGS)
df_korean = pd.read_pickle('korean_df.pkl.gz', **PICKLE_KWARGS)
# TODO: import company csv

# Order matters: later cells pair this list positionally with display names.
all_dfs = [df_indo, df_malay, df_viet, df_cnrom, df_cnchar, df_turk, df_korean]

In [3]:
# Row count of every dataset (printed) and the combined total (total_size).
df_names = ['Indonesian', 'Malay', 'Vietnamese', 'Chinese (Romanized)', 'Chinese (Characters)', 'Turkish', 'Korean (Romanized & Characters)']
total_size = 0

# df_names is listed in the same order as all_dfs, so the two can be zipped.
for label, df in zip(df_names, all_dfs):
    n_rows = len(df)
    total_size += n_rows
    print(label, ':', n_rows)

# To print each dataset's share of the total instead of its raw count:
# for label, df in zip(df_names, all_dfs):
#     print(label, ':', len(df) / total_size)

Indonesian : 11246
Malay : 2908
Vietnamese : 2290
Chinese (Romanized) : 10478
Chinese (Characters) : 11055
Turkish : 18037
Korean (Romanized & Characters) : 19118


### 1. Cleaning up column names

column names to KEEP: (10 so far)

* name_length
* avg_token_length
* num_tokens
* period_freq
* dash_freq
* apostrophe_freq
* space_freq
* unigrams_cosine_sim
* bigrams_cosine_sim
* language

In [35]:
# Make column names match across datasets: the Vietnamese and both Chinese
# tables call the length feature `word_length`; rename it to `name_length`.
# The rename is deliberately in place - `all_dfs` holds references to these
# same objects, so rebinding the names to new frames would leave it stale.
for frame in (df_viet, df_cnrom, df_cnchar):
    frame.rename(columns={'word_length': 'name_length'}, inplace=True)
df_viet.head()

Unnamed: 0,fullname,alphabet,name_length,num_tokens,char_ngrams,period_freq,dash_freq,space_freq,apostrophe_freq,transliteration,unigrams,bigrams,trigrams,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,language
0,từ hoàng thông,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, LAT...",14,3,"[(T,), (ừ,), ( ,), (H,), (o,), (à,), (n,), (g,...",0,0,2,0,tu hoang thong,"[t, ừ, , h, o, à, n, g, , t, h, ô, n, g]","[(t, ừ), (ừ, ), ( , h), (h, o), (o, à), (à, n...","[(t, ừ, ), (ừ, , h), ( , h, o), (h, o, à), (...",4.0,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07...",0.805625,0.508198,Vietnamese
1,nguyễn thị phương thảo,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, SPA...",22,4,"[(N,), (g,), (u,), (y,), (ễ,), (n,), ( ,), (T,...",0,0,3,0,nguyen thi phuong thao,"[n, g, u, y, ễ, n, , t, h, ị, , p, h, ư, ơ, ...","[(n, g), (g, u), (u, y), (y, ễ), (ễ, n), (n, ...","[(n, g, u), (g, u, y), (u, y, ễ), (y, ễ, n), (...",4.75,"[[0.13636363636363635, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.884792,0.667716,Vietnamese
2,nick út,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",7,2,"[(N,), (i,), (c,), (k,), ( ,), (Ú,), (t,), (N,...",0,0,1,0,nick ut,"[n, i, c, k, , ú, t]","[(n, i), (i, c), (c, k), (k, ), ( , ú), (ú, t)]","[(n, i, c), (i, c, k), (c, k, ), (k, , ú), (...",3.0,"[[0.14285714285714285, 0.0, 0.0, 0.14285714285...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.59269,0.0056,Vietnamese
3,cao văn lầu,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",11,3,"[(C,), (a,), (o,), ( ,), (V,), (ă,), (n,), ( ,...",0,0,2,0,cao van lau,"[c, a, o, , v, ă, n, , l, ầ, u]","[(c, a), (a, o), (o, ), ( , v), (v, ă), (ă, n...","[(c, a, o), (a, o, ), (o, , v), ( , v, ă), (...",3.0,"[[0.18181818181818182, 0.09090909090909091, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665965,0.243176,Vietnamese
4,tạ thu thâu,"[LATIN, LATIN, SPACE, LATIN, LATIN, LATIN, SPA...",11,3,"[(T,), (ạ,), ( ,), (T,), (h,), (u,), ( ,), (T,...",0,0,2,0,ta thu thau,"[t, ạ, , t, h, u, , t, h, â, u]","[(t, ạ), (ạ, ), ( , t), (t, h), (h, u), (u, ...","[(t, ạ, ), (ạ, , t), ( , t, h), (t, h, u), (...",3.0,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.596114,0.288942,Vietnamese


### 2. Redoing frequency distributions across all Latin names

#### 2a. Frequency Distribution Functions

In [5]:
# Functions from IndoMalay.ipynb

def create_lang_char_distribution(df, col_name):
    """Relative frequency of every item found in the sequences of one column.

    Each cell of ``df[col_name]`` is iterated element by element (the
    characters of a string, or the items of a list) and occurrences are
    tallied over the whole column.

    Returns a dict mapping each distinct item to ``count / total``, with the
    keys in sorted order. A column with no items yields an empty dict.
    """
    counts = {}
    for sequence in df[col_name]:
        for item in sequence:
            counts[item] = counts.get(item, 0) + 1

    # Every item was counted exactly once, so the tallies sum to the total.
    grand_total = sum(counts.values())
    relative = {item: tally / grand_total for item, tally in counts.items()}
    return {item: relative[item] for item in sorted(relative)}

def initialize_all_possible_bigrams(all_possible_chars):
    """Build a zeroed counter for every ordered pair of the given characters.

    Returns a dict keyed by ``(first_char, second_char)`` tuples, each mapped
    to 0. Keys are generated with the first character varying slowest.
    """
    return {
        (first_char, second_char): 0
        for first_char in all_possible_chars
        for second_char in all_possible_chars
    }

def create_lang_gram_distribution(initialized_grams, df, col_name):
    """Relative frequency of each n-gram across a whole dataset column.

    Parameters
    ----------
    initialized_grams : dict
        Maps every n-gram that may occur to its starting count (the
        ``initialize_all_possible_*`` helpers build these with zeros).
        It is copied, never modified.
    df : mapping of column name -> iterable of n-gram lists
        In this notebook, a DataFrame.
    col_name : str
        Column whose cells are lists of n-grams.

    Returns
    -------
    dict
        n-gram -> count / total number of n-grams seen in the column.
        If the column contains no n-grams at all, every n-gram maps to 0.0
        (instead of raising ZeroDivisionError).

    Raises
    ------
    KeyError
        If the data contains an n-gram that is not a key of
        ``initialized_grams``.
    """
    gram_freqs = initialized_grams.copy()  # work on a copy so the caller's template stays untouched
    total_num_grams = 0  # across the entire language/dataset

    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1

    if total_num_grams == 0:
        # Nothing to normalise by (empty dataset or only empty names).
        return {gram: 0.0 for gram in gram_freqs}

    return {gram: count / total_num_grams for gram, count in gram_freqs.items()}

def initialize_all_possible_trigrams(all_possible_chars):
    """Build a zeroed counter for every ordered triple of the given characters.

    Returns a dict keyed by ``(first_char, second_char, third_char)`` tuples,
    each mapped to 0. The result has ``len(all_possible_chars) ** 3`` entries,
    so it grows quickly with the alphabet size.
    """
    return {
        (first_char, second_char, third_char): 0
        for first_char in all_possible_chars
        for second_char in all_possible_chars
        for third_char in all_possible_chars
    }

def create_indiv_gram_distribution(grams_list, initialized_grams):
    """Relative n-gram frequencies for a single name.

    ``initialized_grams`` (n-gram -> starting value) is copied; each n-gram in
    ``grams_list`` then adds ``1 / len(grams_list)`` to its entry. The copy is
    returned, so the template passed in is left unchanged. An empty
    ``grams_list`` returns the unmodified copy. An n-gram missing from the
    template raises KeyError.
    """
    distribution = initialized_grams.copy()
    total = len(grams_list)  # number of n-grams in this one example
    if total == 0:
        return distribution

    share = 1 / total
    for gram in grams_list:
        distribution[gram] = distribution[gram] + share
    return distribution

def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    """Relative trigram frequencies for a single name.

    Parameters
    ----------
    trigrams_list : sequence of trigrams for one name.
    init_trigrams : dict mapping every possible trigram to its starting value.

    Returns
    -------
    dict
        A copy of ``init_trigrams`` in which each trigram of the name has been
        increased by ``1 / len(trigrams_list)`` per occurrence.

    ``init_trigrams`` itself is not modified. Previously it was updated in
    place, so a template dict shared between rows accumulated the counts of
    every name processed so far; copying matches
    ``create_indiv_gram_distribution``. A trigram missing from the template
    raises KeyError.
    """
    trigrams_fdist_relative = init_trigrams.copy()
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

# TRIGRAMS individual frequency distributions
#df_indo['indiv_trigrams_fdist'] = df_indo.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

#### 2b. Determining which languages use Latin

For these lines of code to work, the datasets must have been pickled so that data types are preserved. Saving to CSV and reloading with `pd.read_csv` turns list-valued cells into strings; for example, a list `[LATIN, LATIN, LATIN, ...]` becomes the string `'[LATIN, LATIN, LATIN, ...]'`, so iterating over it yields single characters (starting with `'['`) instead of script tags.

In [6]:
def latin_share(df):
    """Fraction (0-1) of the script tags in ``df['alphabet']`` that are 'LATIN'.

    Returns 0.0 when the dataset contains no 'LATIN' tag at all, rather than
    raising KeyError.
    """
    return create_lang_char_distribution(df, 'alphabet').get('LATIN', 0.0)


# Despite the `_percent` suffix these are fractions in [0, 1].
indo_latin_percent = latin_share(df_indo)
malay_latin_percent = latin_share(df_malay)
viet_latin_percent = latin_share(df_viet)
cnrom_latin_percent = latin_share(df_cnrom)
cnchar_latin_percent = latin_share(df_cnchar)  # no LATIN tags; direct ['LATIN'] lookup used to fail here
turk_latin_percent = latin_share(df_turk)
korean_latin_percent = latin_share(df_korean)
korean_latin_percent

0.6589739940220817

In [7]:
df_cnchar.head()

Unnamed: 0,original_fullname,transliteration,alphabet,name_length,num_tokens,char_ngrams,unigrams,bigrams,trigrams,period_freq,dash_freq,space_freq,apostrophe_freq,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,丁一平,ding yi ping,"[CJK, CJK, CJK]",12,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , p, i, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.333333,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.083333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.774279,0.548928
1,丁世雄,ding shi xiong,"[CJK, CJK, CJK]",14,3,"[(d,), (i,), (n,), (g,), ( ,), (s,), (h,), (i,...","[d, i, n, g, , s, h, i, , x, i, o, n, g]","[(d, i), (i, n), (n, g), (g, ), ( , s), (s, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , s), (...",0,0,2,0,4.0,"[[0.14285714285714285, 0.0, 0.0, 0.0, 0.071428...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.811762,0.560151
2,丁亦昕,ding yi xin,"[CJK, CJK, CJK]",11,3,"[(d,), (i,), (n,), (g,), ( ,), (y,), (i,), ( ,...","[d, i, n, g, , y, i, , x, i, n]","[(d, i), (i, n), (n, g), (g, ), ( , y), (y, i...","[(d, i, n), (i, n, g), (n, g, ), (g, , y), (...",0,0,2,0,3.0,"[[0.18181818181818182, 0.0, 0.0, 0.0, 0.090909...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.77639,0.510394
3,丁仲礼,ding zhong li,"[CJK, CJK, CJK]",13,3,"[(d,), (i,), (n,), (g,), ( ,), (z,), (h,), (o,...","[d, i, n, g, , z, h, o, n, g, , l, i]","[(d, i), (i, n), (n, g), (g, ), ( , z), (z, h...","[(d, i, n), (i, n, g), (n, g, ), (g, , z), (...",0,0,2,0,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.076923...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.841584,0.605839
4,丁伟,ding wei,"[CJK, CJK]",8,2,"[(d,), (i,), (n,), (g,), ( ,), (w,), (e,), (i,...","[d, i, n, g, , w, e, i]","[(d, i), (i, n), (n, g), (g, ), ( , w), (w, e...","[(d, i, n), (i, n, g), (n, g, ), (g, , w), (...",0,0,1,0,3.5,"[[0.125, 0.0, 0.0, 0.0, 0.125, 0.125, 0.0, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.710349,0.440812


In [8]:
df_korean.head()

Unnamed: 0,fullname,original_fullname,alphabet,transliteration,unigrams,bigrams,trigrams,char_ngrams,num_tokens,period_freq,dash_freq,space_freq,name_length,avg_token_length,indiv_unigrams_fdist,indiv_bigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim
0,park joo-bong,Park Joo-bong,"[LATIN, LATIN, LATIN, LATIN, SPACE, LATIN, LAT...",park joo-bong,"[p, a, r, k, , j, o, o, -, b, o, n, g]","[(p, a), (a, r), (r, k), (k, ), ( , j), (j, o...","[(p, a, r), (a, r, k), (r, k, ), (k, , j), (...","[p, a, r, k, , j, o, o, -, b, o, n, g, (p, a)...",2,0,1,1,13,6.0,"[[0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.68059,0.37766
1,kim jong hoon,KIM Jong hoon,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...",kim jong hoon,"[k, i, m, , j, o, n, g, , h, o, o, n]","[(k, i), (i, m), (m, ), ( , j), (j, o), (o, n...","[(k, i, m), (i, m, ), (m, , j), ( , j, o), (...","[k, i, m, , j, o, n, g, , h, o, o, n, (k, i)...",3,0,0,2,13,3.666667,"[[0.15384615384615385, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.762211,0.55209
2,이민혁,이민혁,"[HANGUL, HANGUL, HANGUL]",iminhyeog,"[i, m, i, n, h, y, e, o, g]","[(i, m), (m, i), (i, n), (n, h), (h, y), (y, e...","[(i, m, i), (m, i, n), (i, n, h), (n, h, y), (...","[i, m, i, n, h, y, e, o, g, (i, m), (m, i), (i...",1,0,0,0,9,9.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.757701,0.34441
3,lee ho,Lee Ho,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN]",lee ho,"[l, e, e, , h, o]","[(l, e), (e, e), (e, ), ( , h), (h, o)]","[(l, e, e), (e, e, ), (e, , h), ( , h, o)]","[l, e, e, , h, o, (l, e), (e, e), (e, ), ( ,...",2,0,0,1,6,2.5,"[[0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.537098,0.143205
4,최민호,최민호,"[HANGUL, HANGUL, HANGUL]",choeminho,"[c, h, o, e, m, i, n, h, o]","[(c, h), (h, o), (o, e), (e, m), (m, i), (i, n...","[(c, h, o), (h, o, e), (o, e, m), (e, m, i), (...","[c, h, o, e, m, i, n, h, o, (c, h), (h, o), (o...",1,0,0,0,9,9.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.665396,0.171394


#### 2c. Remaking Frequency Distributions

### 3. Add a_hat_freq and turn categorical columns into numerical

In [11]:
#TBD after Maker Day

In [12]:
# see 10/16 anna meeting notes for ideas on more features

### 4. Cleaning up other columns

In [13]:
df_korean.columns

Index(['fullname', 'original_fullname', 'alphabet', 'transliteration',
       'unigrams', 'bigrams', 'trigrams', 'char_ngrams', 'num_tokens',
       'period_freq', 'dash_freq', 'space_freq', 'name_length',
       'avg_token_length', 'indiv_unigrams_fdist', 'indiv_bigrams_fdist',
       'unigrams_cosine_sim', 'bigrams_cosine_sim'],
      dtype='object')

In [14]:
df_turk.columns

Index(['id', 'label_tr', 'original_fullname', 'fullname', 'alphabet',
       'unigrams', 'bigrams', 'trigrams', 'char_ngrams', 'name_length',
       'num_tokens', 'avg_token_length', 'period_freq', 'dash_freq',
       'space_freq', 'transliteration', 'indiv_unigrams_fdist',
       'indiv_bigrams_fdist', 'indiv_trigrams_fdist', 'unigrams_cosine_sim',
       'bigrams_cosine_sim', 'trigrams_cosine_sim'],
      dtype='object')

In [15]:
# The Turkish and Korean tables have no `apostrophe_freq` column yet, so
# derive it: the number of ASCII apostrophes in each `fullname`.
# NOTE(review): only "'" is counted; the unigram inventory also contains the
# typographic variants ’ and ʻ - confirm whether those should count too.
for frame in (df_turk, df_korean):
    frame['apostrophe_freq'] = frame['fullname'].apply(lambda name: name.count('\''))

### 5. Adding the language (label) to each dataset

In [16]:
# Tag every row of each dataset with its class label. The frames are modified
# in place so that `all_dfs` (which references the same objects) sees the new
# column when the datasets are concatenated.
for frame, label in (
    (df_indo, 'Indonesian'),
    (df_malay, 'Malay'),
    (df_viet, 'Vietnamese'),
    (df_cnrom, 'Chinese (Romanized)'),
    (df_cnchar, 'Chinese (Characters)'),
    (df_turk, 'Turkish'),
    (df_korean, 'Korean'),
):
    frame['language'] = label

### 6. Combining all names to make one big dataset

In [17]:
# Stack all per-language tables into one frame with a fresh 0..n-1 index.
# The outer join keeps the union of all columns, so a column that a dataset
# lacks is NaN for that dataset's rows. That is acceptable here because the
# non-numerical columns are dropped later anyway - but the feature columns
# must be named identically in every dataset for the rows to line up.
merged_df = pd.concat(objs=all_dfs, join='outer', ignore_index=True)
merged_df

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim,language,id,label_tr
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9,9.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.664809,0.250640,0.085949,Indonesian,,
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12,12.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.08333333333333333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.686625,0.353292,0.117226,Indonesian,,
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8,8.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0, 0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.688312,0.197139,0.090295,Indonesian,,
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9,9.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.581396,0.155386,0.060083,Indonesian,,
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8,8.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.463215,0.176917,0.052811,Indonesian,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75127,lee han-wi,Lee Han-wi,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...","[l, e, e, , h, a, n, -, w, i]","[(l, e), (e, e), (e, ), ( , h), (h, a), (a, n...","[(l, e, e), (e, e, ), (e, , h), ( , h, a), (...","[l, e, e, , h, a, n, -, w, i, (l, e), (e, e),...",,10,4.5,...,1,"[[0.1, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",,0.675313,0.212623,,Korean,,
75128,gil jung-woo,Gil Jung-woo,"[LATIN, LATIN, LATIN, SPACE, LATIN, LATIN, LAT...","[g, i, l, , j, u, n, g, -, w, o, o]","[(g, i), (i, l), (l, ), ( , j), (j, u), (u, n...","[(g, i, l), (i, l, ), (l, , j), ( , j, u), (...","[g, i, l, , j, u, n, g, -, w, o, o, (g, i), (...",,12,5.5,...,1,"[[0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",,0.769942,0.394563,,Korean,,
75129,이정희,이정희,"[HANGUL, HANGUL, HANGUL]","[i, j, e, o, n, g, h, u, i]","[(i, j), (j, e), (e, o), (o, n), (n, g), (g, h...","[(i, j, e), (j, e, o), (e, o, n), (o, n, g), (...","[i, j, e, o, n, g, h, u, i, (i, j), (j, e), (e...",,9,9.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",,0.777239,0.508705,,Korean,,
75130,금태섭,금태섭,"[HANGUL, HANGUL, HANGUL]","[g, e, u, m, t, a, e, s, e, o, b]","[(g, e), (e, u), (u, m), (m, t), (t, a), (a, e...","[(g, e, u), (e, u, m), (u, m, t), (m, t, a), (...","[g, e, u, m, t, a, e, s, e, o, b, (g, e), (e, ...",,11,11.0,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",,0.614584,0.229460,,Korean,,


#### How many unique characters?

In [19]:
merged_df.columns

Index(['fullname', 'original_fullname', 'alphabet', 'unigrams', 'bigrams',
       'trigrams', 'char_ngrams', 'word_ngrams', 'name_length',
       'avg_token_length', 'num_tokens', 'transliteration', 'period_freq',
       'dash_freq', 'apostrophe_freq', 'space_freq', 'indiv_unigrams_fdist',
       'indiv_bigrams_fdist', 'indiv_trigrams_fdist', 'unigrams_cosine_sim',
       'bigrams_cosine_sim', 'trigrams_cosine_sim', 'language', 'id',
       'label_tr'],
      dtype='object')

In [32]:
# How many distinct characters occur across all names, based on the unigrams?
# Needs the full merged frame: run this before the step that reduces
# `merged_df` to numerical columns, which removes `unigrams`.
languages_present = merged_df['language'].unique()
print(languages_present)
unigrams_fdist = create_lang_char_distribution(merged_df, 'unigrams')
len(unigrams_fdist)

['Indonesian' 'Malay' 'Vietnamese' 'Chinese (Romanized)'
 'Chinese (Characters)' 'Turkish' 'Korean']


152

In [33]:
unigrams_fdist.keys()

dict_keys([' ', '"', "'", '(', ')', ',', '-', '.', '/', '7', ':', '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', 'à', 'á', 'â', 'ã', 'ä', 'å', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ā', 'ă', 'ą', 'ć', 'č', 'đ', 'ē', 'ğ', 'ĩ', 'ī', 'ı', 'ō', 'ŏ', 'ś', 'ş', 'š', 'ũ', 'ū', 'ŭ', 'ű', 'ž', 'ơ', 'ư', 'ǎ', 'ǧ', 'ǹ', 'ș', 'ț', 'ʻ', '̇', 'ạ', 'ả', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ', 'ắ', 'ằ', 'ặ', 'ế', 'ề', 'ể', 'ễ', 'ệ', 'ỉ', 'ị', 'ọ', 'ỏ', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ớ', 'ờ', 'ở', 'ợ', 'ụ', 'ủ', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'ỳ', 'ỵ', 'ỷ', 'ỹ', '\u200b', '\u200e', '\u200f', '‑', '’', '人', '卓', '政', '治', '燮', '物', '賢', '趙', '郑', '镇', '高'])

In [34]:
unigrams_fdist

{' ': 0.09234122115983197,
 '"': 2.1503457218334276e-06,
 "'": 9.354003889975411e-05,
 '(': 0.0006343519879408612,
 ')': 0.0006343519879408612,
 ',': 1.612759291375071e-05,
 '-': 0.012799932909213479,
 '.': 0.0007622975583899502,
 '/': 2.1503457218334276e-05,
 '7': 1.0751728609167138e-06,
 ':': 1.0751728609167138e-06,
 '@': 7.5262100264169975e-06,
 'a': 0.10832689125594168,
 'b': 0.013768663656899438,
 'c': 0.014053584465042367,
 'd': 0.02025948221825364,
 'e': 0.06334488427376911,
 'f': 0.007453098271874661,
 'g': 0.05017401672753937,
 'h': 0.04609158537463861,
 'i': 0.07519328920107131,
 'j': 0.015486789888644346,
 'k': 0.023249537944463022,
 'l': 0.03357334775498531,
 'm': 0.03245301763391009,
 'n': 0.09933414544723428,
 'o': 0.050080476688639616,
 'p': 0.007413316876020743,
 'q': 0.002555685890399029,
 'r': 0.033337884898444546,
 's': 0.03247667143685026,
 't': 0.024273102508055734,
 'u': 0.050744933516686144,
 'v': 0.004068454105708845,
 'w': 0.010253923574562701,
 'x': 0.00450819

In [29]:
merged_df.shape

(75132, 10)

### 7. Keeping numerical columns only for each dataset (Same process as step 4, except we don't have to repeat lines of code...)

In [20]:
# Keep exactly the model inputs listed under "1. Cleaning up column names"
# plus the label. Selecting by name instead of by dtype + in-place drop:
#   * makes the cell safe to re-run (the old version raised KeyError on the
#     second run because 'trigrams_cosine_sim' was already gone),
#   * does not depend on how each column happened to be typed after concat,
#   * yields an independent copy, so later assignments cannot trigger
#     SettingWithCopyWarning.
FEATURE_COLUMNS = [
    'name_length', 'avg_token_length', 'num_tokens',
    'period_freq', 'dash_freq', 'apostrophe_freq', 'space_freq',
    'unigrams_cosine_sim', 'bigrams_cosine_sim',
]
label_col = merged_df['language']
merged_df = merged_df[FEATURE_COLUMNS + ['language']].copy()
merged_df

Unnamed: 0,name_length,avg_token_length,num_tokens,period_freq,dash_freq,apostrophe_freq,space_freq,unigrams_cosine_sim,bigrams_cosine_sim,language
0,9,9.0,1,0,0,0,0,0.664809,0.250640,Indonesian
1,12,12.0,1,0,0,0,0,0.686625,0.353292,Indonesian
2,8,8.0,1,0,0,0,0,0.688312,0.197139,Indonesian
3,9,9.0,1,0,0,0,0,0.581396,0.155386,Indonesian
4,8,8.0,1,0,0,0,0,0.463215,0.176917,Indonesian
...,...,...,...,...,...,...,...,...,...,...
75127,10,4.5,2,0,1,0,1,0.675313,0.212623,Korean
75128,12,5.5,2,0,1,0,1,0.769942,0.394563,Korean
75129,9,9.0,1,0,0,0,0,0.777239,0.508705,Korean
75130,11,11.0,1,0,0,0,0,0.614584,0.229460,Korean


In [21]:
# checking that there are no null values
np.any(pd.isnull(merged_df))

False

In [22]:
merged_df.to_pickle('merged_df.pkl.gz', compression='gzip')

## IGNORE everything after this: we will be training in individual files

### Steps

Next steps:
1. Clean up columns so we can combine dataframes into one (focus on making an all-Latin dataset first)
    - do not combine in this step
2. Frequency distributions for Latin names -> redo
3. Add a_hat_freq
4. Only keep numerical columns
    - turn some categorical features -> numerical so we have more things to feed into model
5. Add in label (language) for each dataset
6. Combine Latin and non-Latin names to make one big dataset
    - may need to repeat some of the above steps for non-Latin names
7. Train test split
8. MODEL TRAINING!
9. Model evaluation

Reminder:
- We decided to keep period_freq, dash_freq, apostrophe_freq for now. After our first run of model training, we can remove them to see if it improves the performance

**You can work on these steps out of order** (act as if the previous steps are already there), but in the end we ideally want all of these steps implemented in this order.

For example, you could write the code for model training and train the model on one or a few datasets. Later on, we'll just replace the variables you used with the ones containing all the languages/names.

### 7. train test split

In [23]:
#can replace file names later
filename = os.path.join(os.getcwd(), "company_person_name_dataset.csv")
#filename = os.path.join(os.getcwd(), "Name_Of_Origin_Project-", "company_person_name_dataset.csv")
df = pd.read_csv(filename, header=0)

print(df)


                                    name  class lang
0                The Canal of the Angels      0   en
1                      Rescue Renovation      0   en
2       Agatha Christie: The ABC Murders      0   en
3                            Siti Akbari      0   ar
4                                  Stany      0   pl
...                                  ...    ...  ...
199995                   Robber's Bridge      0   en
199996                       Johan Renck      0   en
199997                      Lyle Stewart      1   en
199998           Thomas Colclough Watson      1   en
199999                              Gavà      0   ca

[200000 rows x 3 columns]


In [24]:
print(merged_df.columns)

# Target: the language label. Features: every remaining (numerical) column.
y = merged_df['language']
X = merged_df.drop(columns='language')
X.head()

Index(['name_length', 'avg_token_length', 'num_tokens', 'period_freq',
       'dash_freq', 'apostrophe_freq', 'space_freq', 'unigrams_cosine_sim',
       'bigrams_cosine_sim', 'language'],
      dtype='object')


Unnamed: 0,name_length,avg_token_length,num_tokens,period_freq,dash_freq,apostrophe_freq,space_freq,unigrams_cosine_sim,bigrams_cosine_sim
0,9,9.0,1,0,0,0,0,0.664809,0.25064
1,12,12.0,1,0,0,0,0,0.686625,0.353292
2,8,8.0,1,0,0,0,0,0.688312,0.197139
3,9,9.0,1,0,0,0,0,0.581396,0.155386
4,8,8.0,1,0,0,0,0,0.463215,0.176917


In [25]:
# Step 3: Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [26]:
# Step 4: Model Training - Random Forest
rf_classifier = RandomForestClassifier(random_state=42)

In [27]:
# Hyperparameter tuning using GridSearchCV
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [28]:
# 3 x 3 x 3 x 3 = 81 parameter combinations x 5 folds = 405 fits.
# Run them on all available cores (as the Naive Bayes search further down
# already does); the serial version of this cell had to be interrupted.
grid_search_rf = GridSearchCV(rf_classifier, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Get the best model from the grid search
best_rf_model = grid_search_rf.best_estimator_

In [None]:
# Evaluate the Random Forest model on the test set
rf_predictions = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

In [None]:
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

Random Forest Accuracy: 0.8428922708845618
Random Forest Classification Report:
                      precision    recall  f1-score   support

Chinese (Characters)       0.79      0.88      0.83      2211
 Chinese (Romanized)       0.68      0.75      0.72      2096
          Indonesian       0.73      0.80      0.76      2249
              Korean       0.94      0.91      0.93      3844
               Malay       0.55      0.23      0.33       582
             Turkish       1.00      0.99      1.00      3607
          Vietnamese       0.56      0.33      0.42       458

            accuracy                           0.84     15047
           macro avg       0.75      0.70      0.71     15047
        weighted avg       0.84      0.84      0.84     15047



In [None]:
# Step 4: Model Training - SVM
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning using GridSearchCV.
# `gamma` only affects the rbf kernel, so crossing it with 'linear' refits the
# same linear model three times per C. One sub-grid per kernel searches the
# same distinct models with 12 instead of 18 candidates.
# NOTE(review): the features are on very different scales (e.g. name_length
# vs. the cosine similarities); SVMs are usually scale-sensitive - consider
# standardising inside a Pipeline.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]},
]

In [None]:
# Cross-validated search over the SVM grid, parallelised across all cores
# (consistent with the Naive Bayes search further down).
grid_search_svm = GridSearchCV(svm_classifier, param_grid=param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)


In [None]:
# Get the best model from the grid search
best_svm_model = grid_search_svm.best_estimator_

In [None]:
# Evaluate the SVM model on the test set
svm_predictions = best_svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)

In [None]:
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

In [None]:
# we will leave this commented for now
# randomizing data - idk if this is correct or necessary?
# X, y = shuffle(X, y)
# X

In [None]:
#can change test data size
# changed: 0.10 -> 0.30
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1234) 
# y_train

Naive Bayes

In [None]:
# Define a pipeline with a TfidfVectorizer and Multinomial Naive Bayes classifier
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()) # we want this probably
])

# Define the parameter grid to search
param_grid_nb = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': (1e-2, 1e-3, 1e-4)
}

In [None]:
# 5-fold grid search over the TF-IDF + MultinomialNB pipeline, using all cores.
# NOTE(review): X_train comes from the numerical feature table (merged_df
# without 'language'), but the pipeline's first step is a TfidfVectorizer,
# which expects raw text documents. This fit presumably needs the name strings
# (e.g. `fullname`) as input instead - confirm before relying on its results.
grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv = 5, n_jobs = -1)
grid_search_nb.fit(X_train, y_train)

In [None]:
# Get the best model from the grid search
best_nb_model = grid_search_nb.best_estimator_

In [None]:
# Print the best parameters and best score
# not in the other code parts but its ok
print("Best Parameters: ", grid_search_nb.best_params_)
print("Best Score: ", grid_search_nb.best_score_)

In [None]:
# Evaluate the tuned Naive Bayes pipeline on the held-out test set.
nb_predictions = grid_search_nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)

# Report the Naive Bayes score (this line previously printed svm_accuracy).
print("NB Accuracy:", nb_accuracy)
print("NB Classification Report:")
print(classification_report(y_test, nb_predictions))

#### 8. Model Training

Random forest, SVM, RNNs, Naive Bayes

use gridsearchcv

In [None]:
# Random forest
# rf = RandomForestClassifier(criterion = 'entropy', n_estimators = 20)
# rf.fit(X_train, y_train)
# rf_predictions = list(rf_20_model.predict_proba(X_test)[:,1])
# in ML foundations we used ROC and AUC to evaluate

In [None]:
# there r multiple types of support vector machines
# not sure if this is correct
# svc = svm.SVC()
# svc.fit(X_train, y_train)
# svc_predictions = svc.predict(X_test, y_test)

In [None]:
# Neural network - not sure if this is correct (note: MLPClassifier is a feed-forward multilayer perceptron, not an RNN)
# mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, ... hidden_layer_sizes=(5, 2), random_state=1)
# mlp.fit(X_train, y_train)
# mlp_predictions = mlp.predict(X_test, y_test)

In [None]:
# naive bayes - there r diff types
# this is multinomialNB, is said to be used for text classification
# mn_nb = MultinomialNB(force_alpha=True) # idk
# mn_nb.fit(X_train, y_train)
# mn_nb_predictions = mn_nb.predict(X_test, y_test)

evaluation: precision, recall, f1 score

In [None]:
# f1 score
# need multiple cells, one for each evaluation
# rf_f1 = f1_score(y_test, rf_predictions, average=None)