# Translite - A Low Resource Modular Multi-Lingual NLP Translator

## Section 1 - Configuration
This is where the datasets are prepared and configured. You can add as many languages as you like. The main requirement is that each dataset is hosted on Kaggle and stored in one of the following formats:
  - CSV
  - Excel
  - JSON
  - Parquet

This can be checked by looking at the **Data Explorer** on the Kaggle page for your dataset and seeing if the files have the extension ".json", ".csv", ".xlsx", or ".parquet".

#### 1.1 - Install Requirements

In [None]:
# Install the packages this notebook imports: ipywidgets (config form),
# pandas (dataset tables) and kaggle (Kaggle API client / CLI).
!pip install ipywidgets pandas kaggle

#### 1.2 - Load/Create Config

Here you will add the datasets for all of the languages you'd like to translate to English. For each one, enter the Kaggle dataset name (the `owner/dataset` part of its URL), the name of the language, the name of the column that holds the English text, and the name of the column that holds the non-English text. It is suggested that you use the export button to save your config when you're done so that you can quickly import it again when you return. A sample config with French and Spanish is available as well.

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
from google.colab import files

# Column headers of the config table; each row describes one language dataset.
config_columns = [
    "Kaggle Dataset Name",
    "Language Name",
    "English Column Name",
    "Other Language Column Name",
]

# Start with an empty table; rows are added through the form below.
df = pd.DataFrame(columns=config_columns)

def display_table(df):
    """Redraw the config form: text inputs, the button row, then the table.

    Args:
        df: The config table to render underneath the form.
    """
    # Wipe the previous render so the form is not duplicated on every update.
    clear_output(wait=True)

    display(ds_uri, lang, eng_col, natv_col)

    button_row = widgets.HBox((add_button, import_button, export_button, clear_button))
    display(button_row)

    display(df)

def add_row(ds_uri, lang, eng_col, natv_col):
    """Append one dataset entry to the global config table and redraw it.

    Args:
        ds_uri: Kaggle dataset name for the new entry.
        lang: Name of the language the dataset covers.
        eng_col: Name of the column holding the English text.
        natv_col: Name of the column holding the other language's text.
    """
    global df
    entry = pd.DataFrame(
        data=[[ds_uri, lang, eng_col, natv_col]],
        columns=list(df.columns),
    )
    # ignore_index renumbers the rows so the table index stays 0..n-1.
    df = pd.concat([df, entry], ignore_index=True)
    display_table(df)

def _config_text_field(description):
    """Build one 500px-wide text input for the config form.

    Args:
        description: Label shown next to the input.
    """
    return widgets.Text(
        description=description,
        layout=widgets.Layout(width="500px"),
        style=dict(description_width='initial'),
    )


# One text input per column of the config table.
ds_uri = _config_text_field("Kaggle Dataset Name:")
lang = _config_text_field("Language Name:")
eng_col = _config_text_field("English Column Name:")
natv_col = _config_text_field("Other Language Column Name:")

# Action buttons; their click handlers are attached further down.
add_button, import_button, export_button, clear_button = (
    widgets.Button(description=label)
    for label in ("Add Row", "Import Dataset", "Export Dataset", "Clear Table")
)

def on_import_button_click(b):
    """Replace the config table with one loaded from an uploaded JSON file.

    Args:
        b: The clicked button (unused; present because on_click passes it).
    """
    global df
    uploaded = files.upload()
    # Nothing uploaded (e.g. the dialog was cancelled): keep the current
    # table instead of failing with StopIteration on the empty result.
    if not uploaded:
        return
    # Only the first uploaded file is used as the config.
    filename = next(iter(uploaded))
    df = pd.read_json(filename)
    display_table(df)

def on_export_button_click(b):
    """Save the config table to JSON and offer it as a browser download.

    Args:
        b: The clicked button (unused; present because on_click passes it).
    """
    # Written with the default orient so the file round-trips through the
    # import button's pd.read_json. `index=False` is not passed: it is not
    # supported for this orient and older pandas releases raise ValueError.
    df.to_json("translite_datasets.json")
    files.download("translite_datasets.json")

def on_add_button_click(b):
    """Add the form's current values as a new row, then blank the form.

    Args:
        b: The clicked button (unused; present because on_click passes it).
    """
    form_fields = (ds_uri, lang, eng_col, natv_col)
    add_row(*(field.value for field in form_fields))
    # Reset every input so the next entry starts from an empty form.
    for field in form_fields:
        field.value = ""

def on_clear_button_click(b):
    """Discard every row of the config table and redraw the empty table.

    Args:
        b: The clicked button (unused; present because on_click passes it).
    """
    global df
    empty_columns = [
        "Kaggle Dataset Name",
        "Language Name",
        "English Column Name",
        "Other Language Column Name",
    ]
    df = pd.DataFrame(columns=empty_columns)
    display_table(df)

# Attach each button to its click handler, then draw the form for the first time.
for button, handler in (
    (add_button, on_add_button_click),
    (import_button, on_import_button_click),
    (export_button, on_export_button_click),
    (clear_button, on_clear_button_click),
):
    button.on_click(handler)

display_table(df)


##### Load Sample (Optional)
If you just want to test this script and don't want to curate a list of languages yourself, please use this feature to load a sample dataset.

In [None]:
import json
import pandas as pd
from google.colab import files

# Sample config (French and Spanish) in the same JSON layout the export
# button writes. Raw string: the JSON escapes "/" as "\/", and "\/" is not a
# valid escape sequence in a normal Python string literal.
raw_config = r'{"Kaggle Dataset Name":{"0":"devicharith\/language-translation-englishfrench","1":"lonnieqin\/englishspanish-translation-dataset"},"Language Name":{"0":"French","1":"Spanish"},"English Column Name":{"0":"English words\/sentences","1":"english"},"Other Language Column Name":{"0":"French words\/sentences","1":"spanish"}}'

# Loading the sample replaces whatever config table `df` currently holds.
df = pd.DataFrame(json.loads(raw_config))

# `index=False` is not passed: it is not supported for the default orient
# and older pandas releases raise ValueError.
df.to_json("translite_datasets.json")
files.download("translite_datasets.json")

#### 1.3 - Connect to Kaggle API
Datasets for this product must be provided through Kaggle. This means that if you need to use custom language translation datasets, you must first host them on https://kaggle.com. You also need a Kaggle API key. To get one, go to your account settings page on Kaggle and click "Create New API Token". This will create a file called `kaggle.json` that you will upload here.



In [None]:
from google.colab import files
# Prompt for the API token file; the copy below expects it to be named
# exactly `kaggle.json` in the working directory.
files.upload()
# Put the token under ~/.config/kaggle and make it readable by the owner only.
!mkdir -p ~/.config/kaggle
!cp kaggle.json ~/.config/kaggle/
!chmod 600 ~/.config/kaggle/kaggle.json

# Imported only after the token is in place.
# NOTE(review): presumably because the kaggle package looks for credentials
# when it is imported — confirm before reordering.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

#### 1.4 - Load Datasets from Config
This is where you'll test out your config and ensure that you can properly load datasets from it.

In [None]:
import os

allowed_types = {
    'csv': pd.read_csv,
    'json': pd.read_json,
    'xlsx': pd.read_excel,
    'parquet': pd.read_parquet
}

for _, dataset in df.iterrows():
    if os.path.isdir(dataset["Language Name"]):
        continue

    print(f"Loading {dataset['Language Name']}")
    dataset_name = dataset["Kaggle Dataset Name"]
    language_name = dataset["Language Name"]

    # Get and prep datasets from kaggle
    !kaggle datasets download {dataset_name} -d {language_name}
    !unzip {dataset_name.split('/')[-1]}.zip -d {language_name}

    files = [(f, allowed_types.get(f.split('.')[-1])) for f in os.listdir(language_name)]

    curr = pd.DataFrame(columns=[
        "class", "english", "native"
    ])

    # Look for all files in the dir that are readable as datasets and
    # reformat them
    for f, fn in files:
      if fn:
        tmp = fn(f"{language_name}/{f}")
        tmp = tmp.rename(columns={
            dataset["English Column Name"]: "english",
            dataset["Other Language Column Name"]: "native"
        })
        tmp["class"] = language_name
        tmp["class_int"] = int(df.apply(lambda row: row[row == 'Spanish'].index, axis=0).iloc[1].values[0])
        curr = pd.concat([curr, tmp], ignore_index=True)

    curr.to_csv(f"{language_name}.csv", index=False)