In [1]:
import pandas as pd
import numpy as np

# Model training
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

This file was originally for frequency distributions for Latin names!
but we can also concat the other dfs / do model training after in the same file

In [14]:
df_indo = pd.read_pickle('df_indo.pkl.gz', compression='gzip')
df_malay = pd.read_pickle('df_malay.pkl.gz', compression='gzip')
df_viet = pd.read_csv('viet_df.csv')
df_cnrom = pd.read_csv('cnrom_df.csv')
df_cnchar = pd.read_csv('cnchar_df.csv')
df_turk = pd.read_pickle('turkish_df.pkl.gz', compression='gzip')
df_korean = pd.read_pickle('korean_df.pkl.gz', compression='gzip') 
# also import other dfs

all_dfs = [df_indo, df_malay, df_viet, df_cnrom, df_cnchar, df_turk]

### 1. Cleaning up column names

column names to KEEP: (10 so far)

* name_length
* avg_token_length
* num_tokens
* period_freq
* dash_freq
* apostrophe_freq
* space_freq
* unigrams_cosine_sim
* bigrams_cosine_sim
* language

In [3]:
# # finding what columns r in viet that aren't in indo
# print(df_indo.columns)
# print(df_viet.columns)
# [col for col in df_viet.columns if col not in df_indo.columns]

In [4]:
# idk if i can generalize this lol
# def rename_columns(df_ref, df_to_change):
#     diff_cols = [col for col in df_to_change.columns if col not in df_ref.columns]
#     new_names = 
#     df_to_change.rename(columns = {oldName : newName})

In [15]:
#This is where you rename columns to all match
df_viet.rename(columns = {'word_length': 'name_length'}, inplace = True)
df_cnrom.rename(columns = {'word_length': 'name_length'}, inplace = True)
df_cnchar.rename(columns = {'word_length': 'name_length'}, inplace = True)

### 2. Redoing frequency distributions across all Latin names

#### 2a. Frequency Distribution Functions

In [16]:
# Functions from IndoMalay.ipynb

def create_lang_char_distribution(df, col_name):
    char_freqs = {}
    total_num_chars = 0  # across the entire language/dataset

    for name in df[col_name]:
        for char in name:
            if char not in char_freqs.keys():
                char_freqs[char] = 1
            else:
                char_freqs[char] += 1
            total_num_chars += 1

    char_freqs_relative = dict(sorted({char: count / total_num_chars for char, count in char_freqs.items()}.items()))
    return char_freqs_relative

def initialize_all_possible_bigrams(all_possible_chars):
    all_possible_bigrams = {}
    for first_char in all_possible_chars:  # first character of the current bigram
        for second_char in all_possible_chars:  # second character of the current bigram
            all_possible_bigrams[(first_char, second_char)] = 0
    return all_possible_bigrams

def create_lang_gram_distribution(initialized_grams, df, col_name):
    gram_freqs = initialized_grams.copy()  # need a copy otherwise initiailized_grams is changed
    total_num_grams = 0  # across the entire language/dataset
    
    for grams_list in df[col_name]:
        for gram in grams_list:
            gram_freqs[gram] += 1
            total_num_grams += 1
    
    gram_freqs_relative = {gram: count / total_num_grams for gram, count in gram_freqs.items()}
    return gram_freqs_relative

def initialize_all_possible_trigrams(all_possible_chars):
    all_possible_trigrams = {}
    for first_char in all_possible_chars:  # first character of the current trigram
        for second_char in all_possible_chars:  # second character of the current trigram
            for third_char in all_possible_chars:  # third character of the current trigram
                all_possible_trigrams[(first_char, second_char, third_char)] = 0
    return all_possible_trigrams

def create_indiv_gram_distribution(grams_list, initialized_grams):
    gram_freqs_relative = initialized_grams.copy()  
    num_grams = len(grams_list)  # for this current example
    
    for gram in grams_list:
        gram_freqs_relative[gram] += 1 / num_grams

    return gram_freqs_relative

def set_indiv_trigram_dist(trigrams_list, init_trigrams):
    trigrams_fdist_relative = init_trigrams
    num_grams = len(trigrams_list)

    for gram in trigrams_list:
        trigrams_fdist_relative[gram] += 1 / num_grams

    return trigrams_fdist_relative

# TRIGRAMS individual frequency distributions
#df_indo['indiv_trigrams_fdist'] = df_indo.apply(lambda row: set_indiv_trigram_dist(row['trigrams'], row['indiv_trigrams_fdist']), axis = 1)

#### 2b. Determining which languages use Latin

For these lines of code to work, the datasets must have been pickled to preserve data types! `pd.csv` turns everything into strings; for example, a list of `[LATIN, LATIN, LATIN, ...]` becomes `'[LATIN, LATIN, LATIN, ...]'` (i.e., `'['` becomes a character).

In [20]:
indo_latin_percent = create_lang_char_distribution(df_indo, 'alphabet')['LATIN']
malay_latin_percent = create_lang_char_distribution(df_malay, 'alphabet')['LATIN']
#malay_latin_percent
# viet_latin_percent = create_lang_char_distribution(df_viet, 'alphabet')['LATIN']
# cnrom_latin_percent = create_lang_char_distribution(df_cnrom, 'alphabet')['LATIN']
# cnchar_latin_percent = create_lang_char_distribution(df_cnchar, 'alphabet')['LATIN']
turk_latin_percent = create_lang_char_distribution(df_turk, 'alphabet')
turk_latin_percent

{'A': 0.2, 'I': 0.2, 'L': 0.2, 'N': 0.2, 'T': 0.2}

#### 2c. Remaking Frequency Distributions

### 3. Add a_hat_freq and turn categorical columns into numerical

### 4. Keeping numerical columns only for each dataset

In [8]:
# # DO FOR EACH DATASET
# # dropping non-numerical columns - this is why it would be good to have more numerical features since we don't have a lot
# df_indo = df_indo.select_dtypes(exclude = 'object')
# # df_indo

In [9]:
# # DROP FOR WHOLE DATASET
# # dropping trigrams for indo and malay
# df_indo.drop('trigrams_cosine_sim', inplace = True, axis = 1)
# df_malay.drop('trigrams_cosine_sim', inplace = True, axis = 1)

In [10]:
# # clean up viet / other dfs before combining
# df_viet.drop('trigrams', inplace = True, axis = 1)

### 5. Adding the language (label) to each dataset

In [11]:
df_indo['language'] = 'Indonesian'
df_malay['language'] = 'Malay'
df_viet['language'] = 'Vietnamese'
df_cnrom['language'] = 'Chinese (Romanized)'
df_cnchar['language'] = 'Chinese (Characters)'
df_turk['language'] = 'Turkish'
#df_korean['language'] = 'Korean (Romanized & Characters)'  Have to fix
# df_indo.head()

### 6. Combining all names to make one big dataset

In [12]:
# as you can see from output, we need the columns in the concatenated df (in this case, viet) to match
# it's okay if some values are NaN bc we'll drop all non-numerical columns anyway
merged_df = pd.concat(all_dfs, ignore_index = True, join = 'outer')
merged_df

Unnamed: 0,fullname,original_fullname,alphabet,unigrams,bigrams,trigrams,char_ngrams,word_ngrams,name_length,avg_token_length,...,space_freq,indiv_unigrams_fdist,indiv_bigrams_fdist,indiv_trigrams_fdist,unigrams_cosine_sim,bigrams_cosine_sim,trigrams_cosine_sim,language,id,label_tr
0,supriyadi,Supriyadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, u, p, r, i, y, a, d, i]","[(s, u), (u, p), (p, r), (r, i), (i, y), (y, a...","[(s, u, p), (u, p, r), (p, r, i), (r, i, y), (...","[s, u, p, r, i, y, a, d, i, (s, u), (u, p), (p...",[supriyadi],9,9.00,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.664809,0.250640,0.085949,Indonesian,,
1,triyaningsih,Triyaningsih,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[t, r, i, y, a, n, i, n, g, s, i, h]","[(t, r), (r, i), (i, y), (y, a), (a, n), (n, i...","[(t, r, i), (r, i, y), (i, y, a), (y, a, n), (...","[t, r, i, y, a, n, i, n, g, s, i, h, (t, r), (...",[triyaningsih],12,12.00,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.08333333333333333...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.686625,0.353292,0.117226,Indonesian,,
2,soerjadi,Soerjadi,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, j, a, d, i]","[(s, o), (o, e), (e, r), (r, j), (j, a), (a, d...","[(s, o, e), (o, e, r), (e, r, j), (r, j, a), (...","[s, o, e, r, j, a, d, i, (s, o), (o, e), (e, r...",[soerjadi],8,8.00,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0, 0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.688312,0.197139,0.090295,Indonesian,,
3,undunsyah,Undunsyah,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[u, n, d, u, n, s, y, a, h]","[(u, n), (n, d), (d, u), (u, n), (n, s), (s, y...","[(u, n, d), (n, d, u), (d, u, n), (u, n, s), (...","[u, n, d, u, n, s, y, a, h, (u, n), (n, d), (d...",[undunsyah],9,9.00,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.581396,0.155386,0.060083,Indonesian,,
4,soeripto,Soeripto,"[LATIN, LATIN, LATIN, LATIN, LATIN, LATIN, LAT...","[s, o, e, r, i, p, t, o]","[(s, o), (o, e), (e, r), (r, i), (i, p), (p, t...","[(s, o, e), (o, e, r), (e, r, i), (r, i, p), (...","[s, o, e, r, i, p, t, o, (s, o), (o, e), (e, r...",[soeripto],8,8.00,...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.463215,0.176917,0.052811,Indonesian,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56009,nil i̇pek hülagü öztürkmen,Nil İpek Hülagü Öztürkmen,LATIN,"['n', 'i', 'l', ' ', 'i', '̇', 'p', 'e', 'k', ...","[('n', 'i'), ('i', 'l'), ('l', ' '), (' ', 'i'...","[('n', 'i', 'l'), ('i', 'l', ' '), ('l', ' ', ...","['n', 'i', 'l', ' ', 'i', '̇', 'p', 'e', 'k', ...",,23,5.75,...,3,[[0.11538462 0. 0. 0.03846154 ...,[[0. 0. 0. ... 0. 0. 0.]],[[0. 0. 0. ... 0. 0. 0.]],0.766280,0.250772,0.100388,Turkish,http://www.wikidata.org/entity/Q49703809,Nil İpek Hülagü Öztürkmen
56010,fatma betül sayan kaya,Fatma Betül Sayan Kaya,LATIN,"['f', 'a', 't', 'm', 'a', ' ', 'b', 'e', 't', ...","[('f', 'a'), ('a', 't'), ('t', 'm'), ('m', 'a'...","[('f', 'a', 't'), ('a', 't', 'm'), ('t', 'm', ...","['f', 'a', 't', 'm', 'a', ' ', 'b', 'e', 't', ...",,19,4.75,...,3,[[0.13636364 0. 0. 0.27272727 ...,[[0. 0. 0. ... 0. 0. 0.]],[[0. 0. 0. ... 0. 0. 0.]],0.812166,0.453787,0.210581,Turkish,http://www.wikidata.org/entity/Q24230049,Fatma Betül Sayan Kaya
56011,elif nur bozkurt tandoğan,Elif Nur Bozkurt Tandoğan,LATIN,"['e', 'l', 'i', 'f', ' ', 'n', 'u', 'r', ' ', ...","[('e', 'l'), ('l', 'i'), ('i', 'f'), ('f', ' '...","[('e', 'l', 'i'), ('l', 'i', 'f'), ('i', 'f', ...","['e', 'l', 'i', 'f', ' ', 'n', 'u', 'r', ' ', ...",,22,5.50,...,3,[[0.12 0. 0. 0.08 0.04 0. 0.04 0.04 0.04...,[[0. 0. 0. ... 0. 0. 0.]],[[0. 0. 0. ... 0. 0. 0.]],0.820321,0.366293,0.123383,Turkish,http://www.wikidata.org/entity/Q6053953,Elif Nur Bozkurt Tandoğan
56012,muhammed ali fatih erbakan,Muhammed Ali Fatih Erbakan,LATIN,"['m', 'u', 'h', 'a', 'm', 'm', 'e', 'd', ' ', ...","[('m', 'u'), ('u', 'h'), ('h', 'a'), ('a', 'm'...","[('m', 'u', 'h'), ('u', 'h', 'a'), ('h', 'a', ...","['m', 'u', 'h', 'a', 'm', 'm', 'e', 'd', ' ', ...",,23,5.75,...,3,[[0.11538462 0. 0. 0.19230769 ...,[[0. 0. 0. ... 0. 0. 0.]],[[0. 0. 0. ... 0. 0. 0.]],0.894470,0.462341,0.148177,Turkish,http://www.wikidata.org/entity/Q6085044,Muhammed Ali Fatih Erbakan


### 7. Keeping numerical columns only for each dataset (Same process as step 4, except we don't have to repeat lines of code...)

In [13]:
merged_df = merged_df.select_dtypes(exclude = 'object')
merged_df.drop('trigrams_cosine_sim', inplace = True, axis = 1)
merged_df

Unnamed: 0,name_length,avg_token_length,num_tokens,period_freq,dash_freq,apostrophe_freq,space_freq,unigrams_cosine_sim,bigrams_cosine_sim
0,9,9.00,1,0,0,0.0,0,0.664809,0.250640
1,12,12.00,1,0,0,0.0,0,0.686625,0.353292
2,8,8.00,1,0,0,0.0,0,0.688312,0.197139
3,9,9.00,1,0,0,0.0,0,0.581396,0.155386
4,8,8.00,1,0,0,0.0,0,0.463215,0.176917
...,...,...,...,...,...,...,...,...,...
56009,23,5.75,4,0,0,,3,0.766280,0.250772
56010,19,4.75,4,0,0,,3,0.812166,0.453787
56011,22,5.50,4,0,0,,3,0.820321,0.366293
56012,23,5.75,4,0,0,,3,0.894470,0.462341


### Steps

Next steps:
1. Clean up columns so we can combine dataframes into one (focus on making an all-Latin dataset first)
    - do not combine in this step
2. Frequency distributions for Latin names -> redo
3. Add a_hat_freq
4. Only keep numerical columns
    - turn some categorical features -> numerical so we have more things to feed into model
5. Add in label (language) for each dataset
6. Combine Latin and non-Latin names to make one big dataset
    - may need to repeat some of the above steps for non-Latin names
7. Train test split
8. MODEL TRAINING!
9. Model evaluation

Reminder:
- We decided to keep period_freq, dash_freq, apostrophe_freq for now. After our first run of model training, we can remove them to see if it improves the performance

**You can work on these steps out of order** (act as if the previous steps r there), but in the end we ideally want all of these steps implemented in this order.

For example, you could write the code for model training and train the model on one or a few datasets. Later on, we'll just replace the variables you used with the ones containing all the languages/names.

### Start train test split

In [14]:
#can replace file names later
#filename = os.path.join(os.getcwd(), "Name_Of_Origin_Project-", "company.csv")
#df = pd.read_csv(filename, header=0)

In [15]:
#y = merged_df['language']
#X = merged_df.drop(columns = 'language', axis = 1)
#X.head()

In [16]:
#can change test data size
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1234)

#### 8. Model Training

Random forest, SVM, RNNs, Naive Bayes

use gridsearchcv

In [None]:
# Random forest
# rf = RandomForestClassifier(criterion = 'entropy', n_estimators = 20)
# rf.fit(X_train, y_train)
# rf_predictions = list(rf_20_model.predict_proba(X_test)[:,1])
# in ML foundations we used ROC and AUC to evaluate

In [None]:
# there r multiple types of support vector machines
# not sure if this is correct
# svc = svm.SVC()
# svc.fit(X_train, y_train)
# svc_predictions = svc.predict(X_test, y_test)

In [None]:
# RNNs - not sure if this is correct
# mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, ... hidden_layer_sizes=(5, 2), random_state=1)
# mlp.fit(X_train, y_train)
# mlp_predictions = mlp.predict(X_test, y_test)

In [None]:
# naive bayes - there r diff types
# this is multinomialNB, is said to be used for text classification
# mn_nb = MultinomialNB(force_alpha=True) # idk
# mn_nb.fit(X_train, y_train)
# mn_nb_predictions = mn_nb.predict(X_test, y_test)

evaluation: precision, recall, f1 score

In [None]:
# f1 score
# need multiple cells, one for each evaluation
# rf_f1 = f1_score(y_test, rf_predictions, average=None)