### Imports

In [11]:
import pandas as pd
import numpy as np
import math

### Load N-Gram Master Vocab CSV

###### This CSV contains the master vocabulary of n-grams (a unique set) and their raw frequencies in each language.

In [12]:
# Load the master vocabulary. pandas' default NA sentinels include strings such
# as "nan", "NaN" and "n/a", which are also valid character 3-grams; with the
# defaults such a label would be read as a missing value and the n-gram would
# lose its key. Only a truly empty field is treated as missing here.
df = pd.read_csv(
    "../data/vocab/master_vocab_char3_ngrams.csv",
    keep_default_na=False,
    na_values=[""],
)
# pandas labels a header-less first column "Unnamed: 0"; it holds the n-grams.
df = df.rename(columns={"Unnamed: 0": "ngram"})
ngrams = df["ngram"]

### Feature Engineering

###### Right now, each feature is only the **raw frequency** of an n-gram in a given language, i.e. how many times it appears. This can **skew language similarity**, since common n-grams that appear across all languages will dominate the comparison. For example, two languages might be considered similar because n-grams containing common vowels, such as "_ea" or "_ou", have high raw frequencies in both, even when the languages are not actually similar. To address this, we will apply **TF-IDF**: a weighting method that adjusts n-gram importance based on how **unique** and **informative** each one is to a specific language.


#### TF (Term Frequency)

###### This is the number of times the n-gram appears in each language's document. Since our documents vary in size, we normalize by dividing by the total number of terms in each document.

In [13]:
# Term frequency: divide every language column by its own total so that
# languages with differently sized corpora become comparable.
df_numeric = df.drop(columns="ngram")
column_totals = df_numeric.sum(axis=0)
tf_numeric = df_numeric / column_totals
# Re-attach the n-gram labels as the leading column.
tf = pd.concat([ngrams, tf_numeric], axis="columns")

In [14]:
# Preview the first rows of the normalized term-frequency table.
tf.head(n=5)

Unnamed: 0,ngram,adasen,bikolano,cebuano,chavacano,english,ilokano,ilonggo,kinaray-a,masbatenyo,paranan,romblomanon,spanish,tagalog,tausug,waray,yami
0,gol,2e-06,3.9e-05,4.2e-05,9e-06,2.2e-05,2.3e-05,3e-05,1.9e-05,2.1e-05,5e-06,5e-06,5.5e-05,9.3e-05,0.0,8.9e-05,3.2e-05
1,tig,2.2e-05,4.1e-05,0.000326,0.00024,0.0,1.4e-05,0.000175,4.7e-05,0.000212,5.4e-05,0.000123,0.000117,4e-05,3.7e-05,0.000267,0.0
2,sej,0.0,0.0,0.0,2.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7e-06,0.0,0.0,0.0,0.0
3,ubd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-06,3e-06,0.0
4,tka,1.8e-05,0.0,0.0,0.0,0.0,0.000138,0.0,0.0,0.0,0.000202,0.0,0.0,0.0,0.0,3e-06,8.3e-05


### DF (Document Frequency)

###### This is how many documents/languages the N-Gram appears in.

In [15]:
# Document frequency: for each n-gram (row), count the language columns in
# which its raw frequency is strictly positive.
df_numeric = df.drop(columns="ngram")
df_count = df_numeric.gt(0).sum(axis="columns")
df_full = df[["ngram"]].assign(df=df_count)

In [16]:
# Preview the per-n-gram document frequencies.
df_full.head(n=5)

Unnamed: 0,ngram,df
0,gol,15
1,tig,14
2,sej,2
3,ubd,2
4,tka,5


### IDF (Inverse Document Frequency)

###### This measures how unique an n-gram is across languages: **log(N/DF)**, where N = total number of documents / languages. An n-gram that appears in every language gets an IDF of 0.

In [17]:
# IDF = log(N / DF), where N is the number of language columns.
N = df_numeric.shape[1]
# Floor DF at 1 to rule out division by zero. Adding a small epsilon instead
# would shift every value and make the IDF of an n-gram present in all N
# languages slightly negative rather than exactly 0. A row with DF == 0 has
# TF == 0 in every language, so its TF-IDF is 0 either way.
idf = np.log(N / df_count.clip(lower=1))
idf_full = pd.DataFrame({"ngram": df["ngram"], "idf": idf})

### TF-IDF (Term Frequency - Inverse Document Frequency)

###### Now we multiply TF by IDF to get the final TF-IDF scores for each n-gram in each language.

In [18]:
# Weight each n-gram row of the TF table by that n-gram's IDF. The weights are
# passed as a plain array, so they are applied positionally, row by row.
idf_weights = idf.to_numpy()
tfidf_numeric = tf_numeric.mul(idf_weights, axis="index")
# Re-attach the n-gram labels as the leading column.
tfidf = pd.concat([ngrams, tfidf_numeric], axis="columns")

In [19]:
# Preview the final TF-IDF scores.
tfidf.head(n=5)

Unnamed: 0,ngram,adasen,bikolano,cebuano,chavacano,english,ilokano,ilonggo,kinaray-a,masbatenyo,paranan,romblomanon,spanish,tagalog,tausug,waray,yami
0,gol,1.297854e-07,2e-06,3e-06,5.534609e-07,1e-06,1e-06,2e-06,1e-06,1e-06,3.344294e-07,3.175897e-07,4e-06,6e-06,0.0,6e-06,2e-06
1,tig,2.953813e-06,6e-06,4.4e-05,3.206338e-05,0.0,2e-06,2.3e-05,6e-06,2.8e-05,7.265377e-06,1.642748e-05,1.6e-05,5e-06,5e-06,3.6e-05,0.0
2,sej,0.0,0.0,0.0,4.45815e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4e-05,0.0,0.0,0.0,0.0
3,ubd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,5e-06,0.0
4,tka,2.105161e-05,0.0,0.0,0.0,0.0,0.00016,0.0,0.0,0.0,0.000235064,0.0,0.0,0.0,0.0,3e-06,9.7e-05


In [20]:
# Persist the TF-IDF table; the positional index is dropped because the
# "ngram" column already identifies each row.
output_path = "../results/tfidf.csv"
tfidf.to_csv(output_path, index=False)