In [None]:
!wget https://noto-website-2.storage.googleapis.com/pkgs/NotoSansDevanagari-hinted.zip
!mkdir fonts
!unzip /content/NotoSansDevanagari-hinted.zip -d fonts/
!mv fonts/* /usr/share/fonts/truetype/

In [None]:

import sys
import os
import json
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy as np

def list_system_font():
    """Print the sorted family names of the font files matplotlib finds on the system.

    ``font_manager.createFontList`` was deprecated in matplotlib 3.2 and removed
    in later releases, so every font file is opened with
    ``font_manager.get_font`` instead, which is available in old and new versions.
    """
    from matplotlib import font_manager

    font_names = []
    for font_path in font_manager.findSystemFonts():
        try:
            font_names.append(font_manager.get_font(font_path).family_name)
        except (RuntimeError, OSError, UnicodeError):
            # A corrupt or unsupported font file should not abort the listing.
            continue
    print(sorted(font_names))


list_system_font()


In [None]:
def find_best_reference(pred_list, truth_list, topk = 3):
    """Pick the (candidate, reference) pair that agrees best under an LCS score.

    Each of the first ``topk`` entries of ``pred_list`` is compared with every
    entry of ``truth_list``; the pair with the smallest
    ``len(ref) - 2 * LCS(cand, ref)`` wins. Ties keep the earliest pair, so
    ``(pred_list[0], truth_list[0])`` is returned unless another pair scores
    strictly better.

    Args:
        pred_list: Candidate sequences (e.g. strings), best guess first.
        truth_list: Acceptable reference sequences.
        topk: Number of leading candidates from ``pred_list`` to consider.

    Returns:
        Tuple ``(best_cand, best_ref)``.

    Raises:
        IndexError: If ``pred_list`` or ``truth_list`` is empty.
    """
    def LCS_length(s1, s2):
        """Length of the longest common subsequence of s1 and s2."""
        m = len(s1)
        n = len(s2)
        # An (m+1) times (n+1) matrix; C[i][j] is the LCS length of s1[:i], s2[:j]
        C = [[0] * (n+1) for _ in range(m+1)]
        for i in range(1, m+1):
            for j in range(1, n+1):
                if s1[i-1] == s2[j-1]:
                    C[i][j] = C[i-1][j-1] + 1
                else:
                    C[i][j] = max(C[i][j-1], C[i-1][j])
        return C[m][n]

    best_cand = pred_list[0]
    best_ref = truth_list[0]
    best_score = len(best_ref) - 2 * LCS_length(best_cand, best_ref)

    # Search the full cross product. The previous version sliced both lists
    # from index 1, so pred_list[0] was never matched against the later
    # references and truth_list[0] was never matched against later candidates.
    # NOTE(review): the score ignores len(cand), so among candidates of
    # different lengths it is not a symmetric edit distance -- confirm intent.
    for cand in pred_list[:topk]:
        for ref in truth_list:
            score = len(ref) - 2 * LCS_length(cand, ref)
            if score < best_score:
                best_cand = cand
                best_ref = ref
                best_score = score

    return best_cand, best_ref


In [None]:

def generate_confusion(pred_file, truth_file, vocab):
    '''
    Returns a pandas dataframe with confusion matrix values

    Both files are JSON objects keyed by the same ids. Each value is passed to
    find_best_reference, so it is presumably a list of strings (predictions in
    one file, references in the other) -- confirm against the data files.

    For every key the best (prediction, reference) pair is selected, the
    shorter string is right-padded with "_" to the length of the longer one,
    and the two are compared position by position.

    The returned frame is indexed by predicted character (rows) and true
    character (columns), both taken from vocab, which must contain "_".

    Raises KeyError if a key of pred_file is missing from truth_file, or if a
    compared character is not part of vocab.
    '''
    # The data is Devanagari text: read it as UTF-8 regardless of the locale.
    with open(pred_file, encoding="utf-8") as f:
        pred_data = json.load(f)
    with open(truth_file, encoding="utf-8") as f:
        truth_data = json.load(f)

    conf_df = pd.DataFrame(0, columns=vocab, index=vocab)
    for k in pred_data:
        best_pred, best_truth = find_best_reference(pred_data[k], truth_data[k])

        max_len_ = max(len(best_pred), len(best_truth))
        pred = best_pred.ljust(max_len_, "_")
        truth = best_truth.ljust(max_len_, "_")

        for p, t in zip(pred, truth):
            # Single-step indexing: the chained form conf_df.loc[p][t] += 1 may
            # update a temporary copy and leave conf_df unchanged.
            conf_df.at[p, t] += 1

    return conf_df

In [None]:

## -----------------------------------------------------------------------------

# Devanagari characters grouped into three lists. The plotting cell further
# down selects one group at a time as the heatmap's x-axis characters.
#   "vowel"    : independent vowel letters (plus the OM sign)
#   "cons"     : consonant letters, including nukta variants
#   "vow_symb" : dependent vowel signs (matras) and other marks/signs
dgri_seg = {
    "vowel": ['ऄ', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ','ऍ', 'ऎ', 'ए', 'ऐ',
              'ऑ', 'ऒ', 'ओ', 'औ','ऋ','ॠ','ऌ','ॡ','ॲ', 'ॐ', ],
    "cons" : ['क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण',
              'त', 'थ', 'द', 'ध', 'न', 'ऩ', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ऱ', 'ल',
              'ळ', 'ऴ', 'व', 'श', 'ष', 'स', 'ह', 'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़'],
    "vow_symb": [ '्', 'ा', 'ि', 'ी', 'ु', 'ू', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ',
                  'ृ', 'ॄ', 'ॢ', 'ॣ', 'ँ', 'ं', 'ः', '़', '॑', 'ऽ', ]
}

# Full vocabulary used as both index and columns of the confusion matrix:
# the three groups above followed by the two zero-width joiners and "_",
# the symbol generate_confusion uses to pad the shorter string.
dgri_unicodes =  dgri_seg["vowel"] + dgri_seg["cons"] + dgri_seg["vow_symb"]+ [
    chr(0x200c), # ZeroWidth-NonJoiner U+200c
    chr(0x200d), # ZeroWidthJoiner U+200d
    "_", # empty padding
]


In [None]:


def plot_confusion(conf_df, show_chars = None, remove_empty = False, save_prefix="", title = "Plot"):
    """Render a confusion matrix as a heatmap, display it and save it as a PNG.

    Args:
        conf_df: DataFrame of counts. Rows are drawn on the y-axis
            ("Predicted Character"), columns on the x-axis ("True Character").
        show_chars: Which characters to keep. ``None`` (or anything falsy)
            plots the frame unchanged. A list keeps those characters on both
            axes. A dict must provide ``'x'`` (columns to keep) and ``'y'``
            (rows to keep). When given, the "_" padding row/column is removed,
            rows outside ``'y'`` are summed into an extra "other" row, and
            columns outside ``'x'`` are dropped.
        remove_empty: When true, rows and columns of ``conf_df`` containing
            only zeros are dropped before any other filtering.
        save_prefix: Prepended to the output file name.
        title: Figure title; the image is written to
            ``save_prefix + title + "plot.png"``.

    The caller's ``conf_df`` is not modified.
    """
    plot_df = conf_df

    ## Drop rows/columns full of zeros
    if remove_empty:
        # Use the argument, not a notebook-global frame, to build the masks.
        nonzero = conf_df != 0
        plot_df = conf_df.loc[nonzero.any(axis=1), nonzero.any(axis=0)]

    ## Remove unnecessary char counts
    if show_chars:
        # A plain list means "keep these characters on both axes".
        if isinstance(show_chars, list):
            show_chars = {'x': show_chars, 'y': show_chars}

        # errors="ignore": the padding label may already be gone if
        # remove_empty dropped it as an all-zero row/column.
        plot_df = plot_df.drop("_", axis = 0, errors="ignore")
        plot_df = plot_df.drop("_", axis = 1, errors="ignore")

        dfrows = list(plot_df.index.values)
        dfcols = list(plot_df.columns.values)
        plot_df.loc["other",:] = 0
        plot_df.loc[:, "other"] = 0

        for r in dfrows: #prediction
            if r not in show_chars['y']:
                plot_df.loc["other",:] += plot_df.loc[r,:]
                plot_df = plot_df.drop(r, axis = 0)

        # NOTE(review): unwanted truth columns are dropped without being
        # folded in, so the "other" column stays all zeros -- confirm intent.
        for c in dfcols: #truth
            if c not in show_chars['x']:
                # plot_df.loc[:, "other"] += plot_df.loc[:, c]
                plot_df = plot_df.drop(c, axis = 1)

    ## Clip Values
    # plot_df = plot_df.clip(0, 100)

    ## Fonts and Layout ----------
    font_sz = 20; fig_sz = (20, 30)
    plt.figure(figsize = fig_sz)
    # Font installed by the first cell; needed to render Devanagari tick labels.
    font_path = '/usr/share/fonts/truetype/NotoSansDevanagariUI-Condensed.ttf'
    fontprop = fm.FontProperties(fname=font_path, size= font_sz)
    # ---

    conf_plot = sns.heatmap(plot_df, annot=False)

    conf_plot.yaxis.set_ticklabels(conf_plot.yaxis.get_ticklabels(),
                                    ha='right', rotation=0, fontproperties=fontprop)
    conf_plot.xaxis.set_ticklabels(conf_plot.xaxis.get_ticklabels(),
                                    ha='left', rotation=0, fontproperties=fontprop)

    # conf_plot.tick_params(axis='both', which='major', pad=10)
    plt.ylabel('Predicted Character', fontsize = font_sz)
    plt.xlabel('True Character', fontsize = font_sz)
    plt.title (title, fontsize = font_sz)
    plt.show()

    conf_plot.figure.savefig( save_prefix +title+"plot.png")


In [None]:

truth_file = "/content/Toggled-GomEn_ann1_test.json"
pred_file ="/content/pred_GomEn_ann1_test.json"

df = generate_confusion(pred_file, truth_file, dgri_unicodes)

# Zero the diagonal (correct predictions) so the heatmaps show only confusions.
# Single-step .loc indexing: the chained form df[c][c] = 0 may write to a
# temporary copy and leave df unchanged.
for c in dgri_unicodes:
    df.loc[c, c] = 0

# One heatmap per character group: (title, key into dgri_seg, remove_empty).
for plot_title, seg_key, drop_empty in [
    ("Vowels", "vowel", False),
    ("Consonants", "cons", True),
    ("Vowel_Matras", "vow_symb", False),
]:
    plot_confusion(df, {'x': dgri_seg[seg_key], 'y': dgri_unicodes},
                   remove_empty = drop_empty, save_prefix= "", title = plot_title);