In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import seaborn as sns
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.plotting import figure, Figure, show
from bokeh.io import output_notebook, push_notebook, show, output_file, save
from bokeh.transform import factor_cmap
from bokeh.palettes import Colorblind
from bokeh.layouts import layout, gridplot, column, row
from bokeh.models import ColumnDataSource, Slider, CustomJS, Select, DateRangeSlider, Legend, LegendItem, HoverTool
import bokeh.io
from bokeh.resources import INLINE
import translators as ts

Using United States server backend.


# Import Reduced List to Translate and Plot from Pickle

In [8]:
reduced_list = pd.read_pickle("/data/amatoe/Jupyter Notebooks/Embedding Code/Swahili/reduced_list_Swahili_umap248_HDBSCAN_min_clust=9_min_samples=1_final.pickle")

In [9]:
lang_abbreviation = "sw" # 2-letter abbreviation of language to load/download the model e.g. "es" for Spanish

lang_name = "Swahili" #for Plot Titles e.g. "Spanish"

noun_class_dict = {"kiVi":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili ki-vi Nouns 1-18-22 (389).txt",
                   "mMi":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili m-mi Nouns 1-18-22 (348).txt",
                   "mWa":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili m-wa Nouns 1-18-22 (449).txt",
                   "ma":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili ma Nouns 1-18-22 (764).txt",
                   "n":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili n Nouns 1-18-22 (1667).txt",
                   "pa":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili pa Nouns 1-18-22 (4).txt",
                   "u":r"/data/amatoe/Jupyter Notebooks/Language_Data/Swahili/Nouns/Swahili u Nouns 1-18-22 (432).txt"
                  } #Names of Noun Classes should be put in 'keys' and file locations should be put in 'values' of dict




In [10]:
noun_class_names = list(noun_class_dict.keys())
index_name_dict = {i:noun_class_names[i] for i in range(len(noun_class_names))}
reduced_list['noun class'] = reduced_list['noun class'].replace(index_name_dict)
reduced_list= reduced_list.astype({'cluster labels': 'str'})

## Initialize Blank DF for Translated Terms

In [11]:
translated_list = pd.DataFrame({'label':reduced_list['label'], 'translated':None})

In [12]:
import time
import re 
from IPython.display import clear_output

def translateChunks(df, chunk_size = 10): #this implementation calls Google Translate API in chunks of words rather than just 1 at a time -should be nicer?
    counter = 0
    try:
        for i in range(0, len(df), chunk_size):
            chunk = str(df[['label']][i:i+chunk_size].astype(str))
            chunk = re.sub(("\d*[^\S\r\n]|label\s"), "", chunk)
            translated_chunk = ts.google(chunk, to_language='en', from_language=lang_abbreviation)
            translated_chunk_list = pd.Series(translated_chunk.splitlines())
            translated_list['translated'][i:i+chunk_size] = translated_chunk_list
            time.sleep(5)
            counter+=chunk_size
            clear_output(wait=True)
            print(f"Nouns Translated: {counter}. Percent: {round((counter/len(df)*100),2)}%", end = " ")
        print("Completed")
        translated_list.to_pickle(f'translated{lang_name}ReducedList.pickle')
    except Exception as e: #saves if API throws error
            translated_list.to_pickle(f'translated{lang_name}ReducedList.pickle')
            print(e)
            clear_output(wait=True)
            return counter
        

        


## Translate Terms

In [13]:
translateChunks(reduced_list, chunk_size=40)

Nouns Translated: 3880. Percent: 100.67% Completed


In [14]:
reduced_list['translated'] = translated_list['translated'] #append translated terms to reduced list

In [15]:
reduced_list

Unnamed: 0,label,x_coord,y_coord,noun class,cluster labels,translated
0,Kiajemi,6.640720,-2.441250,kiVi,-1,Persian.
1,kiambaza,9.970647,-4.367177,kiVi,-1,distribution.
2,kiambishi,5.417746,-2.731451,kiVi,82,prefix.
3,kiamsha kinywa,8.601791,-1.549509,kiVi,-1,breakfast
4,kiamshakinywa,8.585474,-1.317514,kiVi,-1,breakfast
...,...,...,...,...,...,...
427,wizu,6.650084,-1.747648,u,-1,Wizi.
428,woga,9.323900,-4.534373,u,-1,nervous
429,wokovu,9.128816,-4.013641,u,-1,salvation.
430,wororo,6.075280,-2.017240,u,-1,tender


In [2]:
#Stand in for direct Plotting from translated Dataframe
reduced_list = pd.read_pickle(r"/data/amatoe/Jupyter Notebooks/Translated DF/Spanish/Spanish_translated_reduced_list_HDBSCAN")

# Interactive Plot of Translated Dataframe

In [None]:
df = reduced_list

clusters = [str(i) for i in range(len(df['cluster labels'].unique()))]

cols1 = df#[['cluster labels','x_coord', 'y_coord']]
cols2 = cols1[cols1['cluster labels'] == '0']

Overall = ColumnDataSource(data=cols1)
Curr = ColumnDataSource(data=cols2)



#plot and the menu is linked with each other by this callback function
callback = CustomJS(args=dict(source=Overall, sc=Curr), code="""
var f = cb_obj.value
sc.data['x_coord']=[]
sc.data['y_coord']=[]
sc.data['noun class'] =[]
sc.data['label']=[]
sc.data['translated']=[]

for(var i = 0; i <= source.get_length(); i++){
    if (source.data['cluster labels'][i] == f){
        sc.data['x_coord'].push(source.data['x_coord'][i])
        sc.data['y_coord'].push(source.data['y_coord'][i])
        sc.data['noun class'].push(source.data['noun class'][i])
        sc.data['cluster labels'].push(source.data['cluster labels'][i])
        sc.data['label'].push(source.data['label'][i])
        sc.data['translated'].push(source.data['translated'][i])
    }
}

sc.change.emit();
bokeh_p.legend.label.field = sc.data['noun class'];

""")

menu = Select(options=clusters, value='0', title = 'Cluster #')  # create drop down menu

bokeh_p=figure(x_axis_label ='X Coord', y_axis_label = 'Y Coord', y_axis_type="linear",x_axis_type="linear") #creating figure object 

mapper = factor_cmap(field_name = "noun class", palette = Colorblind[6], factors = reduced_list['noun class'].unique()) #color mapper

bokeh_p.circle(x='x_coord', y='y_coord', color='gray', alpha = .05, source=Overall)

bokeh_p.circle(x = 'x_coord', y = 'y_coord', fill_color = mapper, line_color = mapper, source = Curr, legend_field = 'noun class')

tooltips = [
            ('Token Name', '@label'),
            ('Translation', '@translated')
           ]


bokeh_p.add_tools(HoverTool(tooltips=tooltips))


bokeh_p.legend.title = "Noun Classes"



menu.js_on_change('value', callback) # calling the function on change of selection
bokeh.io.output_notebook(INLINE)
final_layout = layout(menu, bokeh_p)
output_file("TranslatedSpanish.html")
show(final_layout, notebook_handle=True)


## Save Translated DataFrame to Pickle for Easy Plotting Later

In [17]:
reduced_list.to_pickle(f"{lang_name}_translated_reduced_list_HDBSCAN")

## Look up Clusters for Noun Membership

In [18]:
lookup_cluster = 39

reduced_list.loc[reduced_list['cluster labels'] == str(lookup_cluster)]

Unnamed: 0,label,x_coord,y_coord,noun class,cluster labels,translated
220,msafiri,8.586594,-2.895705,mWa,39,traveler.
274,jawabu,9.322942,-3.905842,ma,39,Jawabu.
333,kafiri,9.212831,-3.471657,ma,39,disbelief.
1571,thawabu,9.49698,-4.052142,n,39,Refund
276,usafiri,8.377518,-3.110513,u,39,transportation.


## Lookup English Concepts for Cluster Membership

In [19]:
lookup_term = 'triangle'


reduced_list[reduced_list['translated'].str.contains(lookup_term)]

Unnamed: 0,label,x_coord,y_coord,noun class,cluster labels,translated
1165,pembetatu,8.607054,-1.869733,n,-1,triangle.


In [12]:
from bokeh.embed import file_html, components
from bokeh.resources import CDN
 
script1, div1 = components(final_layout)
cdn_js = CDN.js_files
cdn_css = CDN.css_files
#html = file_html(final_layout, CDN, "Spanish")

In [4]:
output_file(final_layout, "SpanishTranslated.html")

TypeError: stat: path should be string, bytes, os.PathLike or integer, not Column

In [9]:
cdn_js[0]

'https://cdn.bokeh.org/bokeh/release/bokeh-2.2.3.min.js'

In [13]:
cdn_css

[]

In [14]:
from platform import python_version

print(python_version())

3.8.5
