In [None]:
#default double_tsek

In [None]:
#export
from collections import defaultdict
import cv2
import gzip
import json
import math
from pathlib import Path
import re

from deskew import determine_skew
import imutils
from MTM import matchTemplates, drawBoxesOnRGB
import numpy as np
from openpecha.serializers import Serialize
from xml.dom import minidom
import yaml

In [None]:
#exxport
class config:
    # Notebook-wide settings, read as plain class attributes (e.g. `config.img_size`).

    # data_path
    # NOTE(review): absolute, machine-specific path — must be adapted before running elsewhere.
    data_path = Path('/home/tenzin/ML/project/Esukhia/Google-OCR/archive')
    # Page images and OCR results of work W1PD95844, both located under `data_path`.
    images_path = data_path/'images'/'W1PD95844'
    res_path = data_path/'output'/'W1PD95844'
    # Directory holding the double-tsek template PNGs (relative to the working directory).
    template_path = Path('data/peydurma/templates')
    # annotation
    # Marker inserted into the text wherever a double tsek was detected.
    double_tsek_sym = '$'
    tsek = '་'
    # image
    # (height, width): index 0 is used as the height and index 1 as the width
    # wherever images are resized to this size.
    img_size = (3969, 2641)
    # dev
    # When True, get_double_tsek_text prefixes each page's annotated text with the image file name.
    debug = True

In [None]:
import matplotlib.pyplot as plt
import pprint

# Shared pretty-printer with a 4-space indent for inspecting nested structures.
pp = pprint.PrettyPrinter(indent=4)

def plot(img, cmap=None, sz=(10, 10), axis=False):
    """Display an OpenCV image with matplotlib.

    Args:
        img: image array as produced by OpenCV; multi-channel images are
            taken to be in BGR order and converted to RGB for display.
        cmap: matplotlib colormap, forwarded to `plt.imshow` (only meaningful
            for single-channel images, e.g. `cmap='gray'`).
        sz: figure size in inches, forwarded to `plt.figure(figsize=...)`.
        axis: when True keep the axes and draw a grid; otherwise hide both.
    """
    plt.figure(figsize=sz)
    if axis:
        plt.grid(True)
    else:
        plt.axis('off')
        plt.grid(False)
    # A 2-D array is a single-channel (grayscale/binary) image: there is no
    # channel order to swap, and BGR2RGB would raise on it.
    if img.ndim != 2:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img, cmap=cmap)
    plt.show()

# Create template

In [None]:
def to_box(coord):
    """Convert an `(x, y, w, h)` rectangle into corner form `(x1, y1, x2, y2)`."""
    left, top, width, height = coord
    return left, top, left + width, top + height


def create_template(img_path, coord, t_fn=None, template=False):
    """Crop a template region out of an image, preview it and optionally save it.

    Args:
        img_path: path of the image to read.
        coord: `(x, y, w, h)` of the region to crop.
        t_fn: when given, the cropped region is written to this path.
        template: when False the image is first resized to `config.img_size`;
            when True it is used at its original size.
    """
    page = cv2.imread(str(img_path))
    print(page.shape)
    if not template:
        # cv2.resize takes (width, height); config.img_size is (height, width).
        page = cv2.resize(page, (config.img_size[1], config.img_size[0]))

    left, top, width, height = coord
    crop = page[top:top+height, left:left+width]

    # Outline the cropped region on a copy so the crop itself stays clean.
    preview = page.copy()
    cv2.rectangle(preview, (left, top), (left+width, top+height), (255, 0, 0), 3)

    plot(crop)
    plot(preview, sz=(25, 25))

    if t_fn:
        cv2.imwrite(str(t_fn), crop)

In [None]:
# Template 02: crop an 18x135 region from the test mantra page and save it as a template.
coord = (2139, 1282, 18, 135) # (x, y, w, h)

create_template('data/test-mantra.jpg', coord, t_fn=config.template_path/'double_tsek_02.png')

In [None]:
# Template 03: crop a 13x135 region from image I1PD958460141.jpg and save it as a template.
img_path = config.images_path/'I1PD95846'/'I1PD958460141.jpg'
coord = (1162, 1910, 13, 135) # (x, y, w, h)
create_template(img_path, coord, t_fn=config.template_path/'double_tsek_03.png')

# Pure OpenCV2

### detect paragraphs

In [None]:
# #image = cv2.imread('data/test.jpeg')
# image = cv2.imread('data/test_diff_size.jpeg')
# image = imutils.resize(image, height=3969, width=2645)
# gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# blur = cv2.GaussianBlur(gray, (7,7), 0)
# thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# # Create rectangular structuring element and dilate
# kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
# dilate = cv2.dilate(thresh, kernel, iterations=7)
# plot(dilate, cmap='gray', sz=(25, 25))

# # Find contours and draw rectangle
# cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# cnts = cnts[0] if len(cnts) == 2 else cnts[1]
# for c in cnts:
#     x,y,w,h = cv2.boundingRect(c)
#     cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), 2)

# plot(image, sz=(25, 25))

### Text Skew Correction

In [None]:
# def image_deskew2(image, show_diff=False):
#     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#     gray = cv2.bitwise_not(gray)
#     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
#     coords = np.column_stack(np.where(thresh > 2))
#     angle = cv2.minAreaRect(coords)[-1]

#     if angle < -45:
#         angle = -(90 + angle)
#     else:
#         angle = -angle

#     # rotate the image to deskew it
#     (h, w) = image.shape[:2]
#     center = (w // 2, h // 2)
#     M = cv2.getRotationMatrix2D(center, angle, 1.0)
#     rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
#     print(f'[INFO] Image dskewed by {angle:.3} angles')
    
#     if show_diff:
#         plot(image, sz=(15, 15), axis=True)
#         plot(rotated, sz=(15, 15), axis=True)
        
#     return rotated

def image_deskew(image, show_diff=False):
    """Rotate an image so that its text lines become horizontal.

    Args:
        image: a BGR image array, or a `str`/`Path` to read with `cv2.imread`.
        show_diff: when True, plot the image before and after rotation.

    Returns:
        The rotated image. If no skew angle can be determined the input image
        is returned unchanged.
    """
    def rotate(image, angle, background):
        # NOTE: `image.shape[:2]` is (rows, cols), so `old_width` holds the
        # row count and `old_height` the column count. The swapped names are
        # used consistently below (the row offset goes to rot_mat[1, 2] and the
        # output size is passed as (cols, rows)), so the geometry is correct.
        old_width, old_height = image.shape[:2]
        angle_radian = math.radians(angle)
        width = abs(np.sin(angle_radian) * old_height) + abs(np.cos(angle_radian) * old_width)
        height = abs(np.sin(angle_radian) * old_width) + abs(np.cos(angle_radian) * old_height)

        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        # Shift the result so the enlarged canvas stays centred on the content.
        rot_mat[1, 2] += (width - old_width) / 2
        rot_mat[0, 2] += (height - old_height) / 2
        return cv2.warpAffine(image, rot_mat, (int(round(height)), int(round(width))), borderValue=background)

    if isinstance(image, (str, Path)):
        image = cv2.imread(str(image))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    angle = determine_skew(gray)
    if angle is None:
        # determine_skew yields None when it cannot estimate an angle;
        # math.radians(None) would raise, so leave the image untouched.
        print('[INFO] Skew angle could not be determined, image left unchanged')
        return image

    # The pixel at row 10, column 10 is used as the fill colour for the
    # corners exposed by the rotation.
    background = tuple(int(x) for x in image[10][10])
    rotated = rotate(image, angle, background)

    print(f'[INFO] Image dskewed by {angle:.4} angles')

    if show_diff:
        plot(image, sz=(15, 15), axis=True)
        plot(rotated, sz=(15, 15), axis=True)

    return rotated

In [None]:
%%time
# Time the deskew on a skewed test page and plot the image before and after rotation.
_ = image_deskew(cv2.imread('data/peydurma/test-set/white_skewed_01.jpg'), show_diff=True);

In [None]:
%%time
# Same check on a second skewed test page.
_ = image_deskew(cv2.imread('data/peydurma/test-set/white_skewed_dtsek_01.jpg'), show_diff=True);

In [None]:
def sorted_matches(matches):
    """Group `(x, y)` matches into rows and order each row left to right.

    A match joins the first existing row whose first element has a `y`
    within 5 of its own; otherwise it starts a new row. Rows keep their order
    of first appearance and each row is sorted by `x`.

    Args:
        matches: iterable of `(x, y)` pairs.

    Returns:
        A flat list of `(x, y)` tuples, row after row. Each input match
        appears exactly once.
    """
    rows = []
    for x, y in matches:
        for row in rows:
            if abs(y - row[0][1]) < 5:
                row.append((x, y))
                # Stop at the first matching row: without this a match lying
                # between two rows would be added to both and be duplicated.
                break
        else:
            rows.append([(x, y)])

    # Flatten in a single pass instead of repeatedly concatenating lists.
    return [point for row in rows for point in sorted(row, key=lambda p: p[0])]
    

def remove_dup_match(match_locations, th=5):
    """Collapse near-identical template-match locations.

    Args:
        match_locations: pair of index sequences `(ys, xs)`; element 1 is read
            as the x coordinates and element 0 as the y coordinates.
        th: a match is dropped when both its x and y differ from the
            previously kept match by less than this many pixels.

    Returns:
        List of `(x, y)` tuples in the order produced by `sorted_matches`.
    """
    cleaned_match = []
    # No sentinel coordinate: starting from (0, 0) would wrongly discard a
    # genuine first match located within `th` pixels of the origin.
    prev = None
    for x, y in sorted_matches(zip(match_locations[1], match_locations[0])):
        if prev is not None and abs(x - prev[0]) < th and abs(y - prev[1]) < th:
            continue
        cleaned_match.append((x, y))
        prev = (x, y)
    return cleaned_match


def template_match(img, templates, resize=True):
    """Locate templates in an image by matching against its Canny edge map.

    Args:
        img: BGR image array, or a `str`/`Path` to read with `cv2.imread`.
        templates: iterable of `(t_type, template, th, data)` tuples, where
            `template` is a 2-D array matched against the edge map, `th` is
            the fraction of the best score a location must reach, and `data`
            is not used here.
        resize: when True (default) the image is resized with
            `imutils.resize` using the dimensions in `config.img_size`.
            NOTE(review): imutils.resize is understood to keep the aspect
            ratio and to honour only `width` when both are given — confirm.

    Returns:
        defaultdict mapping each `t_type` to a list of `(x, y)` match positions.
        The matches are also drawn on a copy of the image and plotted.
    """
    if isinstance(img, (str, Path)):
        img = cv2.imread(str(img))
    # The original tested an undefined name `size` here, which raised
    # NameError on every call; it is now an explicit keyword argument.
    if resize:
        img = imutils.resize(img, height=config.img_size[0], width=config.img_size[1])
    print('Image size:', img.shape)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edged = cv2.Canny(gray, 100, 600)

    output = defaultdict(list)
    clone = img.copy()
    for t_type, template, th, _data in templates:
        # Keep every location scoring at least `th` times the best score.
        result = cv2.matchTemplate(edged, template, cv2.TM_CCOEFF)
        max_val = cv2.minMaxLoc(result)[1]
        match_locations = np.where(result >= max_val * th)
        cleaned_match_locations = remove_dup_match(match_locations)

        # Record each match and outline it on the preview image.
        w, h = template.shape[::-1]
        for (x, y) in cleaned_match_locations:
            output[t_type].append((x, y))
            cv2.rectangle(clone, (x, y), (x+w, y+h), [0,0,255], 2)

        print(f'No. {t_type} detected: {len(output[t_type])}')

    plot(clone, cmap='gray', sz=(25, 25))
    return output

In [None]:
# templates = [
#     ('rectangle', rect_template, 0.9, {}),
#     #('circle', cir_template, 0.7, {'radius': radius})
# ]

### Original Image

In [None]:
#output = template_match('data/test-mantra.jpg', templates)

# Multi-Template-Matching Library

In [None]:
#export
def get_templates(path):
    """Load every `.png` file directly inside `path` as a template.

    Returns:
        List of `(name, image)` tuples, where `name` is the file stem and
        `image` is the result of `cv2.imread`, in directory-iteration order.
    """
    return [
        (png.stem, cv2.imread(str(png)))
        for png in Path(path).iterdir()
        if png.name.endswith('.png')
    ]

def mtm(image, templates, show=False, th=0.9):
    """Detect double tseks with the Multi-Template-Matching library.

    Args:
        image: image array, or a `str`/`Path` to read with `cv2.imread`.
        templates: list of `(name, template_image)` tuples.
        show: when True, draw the detected boxes and plot the image.
        th: score threshold forwarded to `matchTemplates`.

    Returns:
        List of `[x1, y1, x2, y2]` boxes, empty when nothing was detected.
    """
    if isinstance(image, (str, Path)):
        image = cv2.imread(str(image))
    matches = []
    try:
        hits = matchTemplates(templates, image, score_threshold=th, method=cv2.TM_CCOEFF_NORMED, maxOverlap=0.3)
        for x, y, w, h in list(hits['BBox']):
            matches.append([x, y, x+w, y+h])
        if show: image = drawBoxesOnRGB(image, hits, boxThickness=5, boxColor=(255,0,0))
    except KeyError as ex:
        # A KeyError on 'Score' is treated as "no hit at all".
        if ex.args and ex.args[0] == 'Score':
            print('\t- double tsek not found !')
            return matches
        # Any other KeyError is a genuine problem; do not swallow it silently.
        raise

    print(f'\t- no. of double tsek detected: {len(matches)}')
    if show:
        plot(image, sz=(15, 15))

    return matches

In [None]:
# Load every PNG template from config.template_path; the trailing expression shows how many were loaded.
templates = get_templates(config.template_path); len(templates)

In [None]:
# Run the matcher on assorted sample pages; show=True plots the detected boxes.
test_output = mtm('data/test.jpeg', templates, show=True)

In [None]:
mtm('data/test-mantra.jpg', templates, show=True)

In [None]:
mtm('data/test-02.jpeg', templates, show=True)

In [None]:
mtm('data/test_diff_size.jpeg', templates, show=True)

In [None]:
mtm('data/test-03.jpg', templates, show=True)

## Test on actual Peydurma Image

In [None]:
# Run the matcher on Peydurma page images from the test set; show=True plots the detected boxes.
mtm('data/peydurma-05.jpg', templates, show=True)

In [None]:
mtm('data/peydurma/test-set/yellow_01.jpg', templates, show=True)

In [None]:
mtm('data/peydurma/test-set/yellow_02.jpg', templates, show=True)

In [None]:
# Skewed page, matched without deskewing first.
skewed_output = mtm('data/peydurma/test-set/white_skewed_dtsek_01.jpg', templates, show=True)

In [None]:
# deskewed_img = image_deskew(config.images_path/'I1PD95846'/'I1PD958460141.jpg')
# plot(deskewed_img)
# Match directly on the page image (the deskew variant above is disabled).
mtm(config.images_path/'I1PD95846'/'I1PD958460141.jpg', templates, show=True)

# Find reinsertion span
Find the line number and character index of each double tsek.
- input: ocr_boxes, match_loc
- output: line number and char index

In [None]:
from google.cloud import vision
from google.cloud.vision import types
from google.protobuf.json_format import MessageToJson

# Module-level Vision API client used by `ocr` below.
# NOTE(review): presumably needs Google Cloud credentials configured in the environment — confirm.
vision_client = vision.ImageAnnotatorClient()

In [None]:
def ocr(image, path):
    """Run Google Vision document text detection, caching the result on disk.

    The response is cached as `<stem>.json` next to `path`; when that file
    exists it is loaded and no request is made.

    Args:
        image: either the raw image bytes, or a `str`/`Path`. In the latter
            case the bytes are read from `path` (not from `image`).
        path: path of the image file; also determines the cache location.

    Returns:
        The response as a plain `dict`.
    """
    path = Path(path)
    res_fn = path.parent/f'{path.stem}.json'
    if res_fn.is_file():
        with res_fn.open() as cached:
            return json.load(cached)

    if isinstance(image, (str, Path)):
        with open(path, 'rb') as image_file:
            content = image_file.read()
    else:
        content = image
    response_pb = vision_client.document_text_detection(image=types.Image(content=content))
    # MessageToJson returns JSON text: parse it as JSON. eval() breaks on
    # true/false/null and would execute whatever the text contains.
    response = json.loads(MessageToJson(response_pb))
    with res_fn.open('w') as cache_file:
        json.dump(response, cache_file)
    return response

# convert image array to image bytes
# success, encoded_image = cv2.imencode('.jpg', image)
# image_bytes = encoded_image.tobytes()
# response = ocr(image_bytes, image_path)

### Get OCR output
- unzip ocr output and read the response json
- resize the box w.r.t config.img_size

In [None]:
#export
def get_ocr_output(path):
    """Load the stored OCR response that belongs to an image.

    The response is read from
    `config.res_path/<image group dir>/<image name up to the first dot>.json.gz`,
    where the image group directory is the parent directory name of `path`.

    Args:
        path: image path (`str` or `Path`); only its last two components are used.

    Returns:
        The decoded JSON response.
    """
    imagegroup, img_fn = Path(path).parts[-2:]
    res_fn = config.res_path/imagegroup/f'{img_fn.split(".")[0]}.json.gz'
    # Context manager so the gzip handle is closed after reading.
    with gzip.open(str(res_fn), 'rb') as res_file:
        return json.load(res_file)

def get_symbol(response):
    for page in response['fullTextAnnotation']['pages']:
        for block in page['blocks']:
            for paragraph in block['paragraphs']:
                for word in paragraph['words']:
                    for symbol in word['symbols']:
                        char = symbol['text']
                        v = symbol['boundingBox']['vertices']
                        box = [v[0]['x'], v[0]['y'], v[2]['x'], v[2]['y']]
                        yield char, box

def get_full_text_annotations(response):
    """Collect all symbols of an OCR response.

    Returns:
        `(boxes, text)`: the list of symbol boxes and the concatenation of
        the symbol characters, both in the order yielded by `get_symbol`.
    """
    chars, boxes = [], []
    for char, box in get_symbol(response):
        chars.append(char)
        boxes.append(box)
    return boxes, ''.join(chars)

def resize_boxes(boxes, old_size, new_size=None):
    """Rescale boxes from an image of `old_size` to one of `new_size`.

    Args:
        boxes: iterable of `[x1, y1, x2, y2]` boxes (two opposite corners).
        old_size: `(height, width, ...)` of the image the boxes refer to;
            only the first two entries are used.
        new_size: `(height, width)` of the target image; defaults to
            `config.img_size`.

    Returns:
        A new list of `[x1, y1, x2, y2]` lists with coordinates truncated to
        `int`. The input boxes are left untouched.
    """
    if new_size is None:
        new_size = config.img_size
    h, w = old_size[:2]
    h_scale = new_size[0]/h
    w_scale = new_size[1]/w
    # Build fresh lists instead of scaling in place, so the caller's boxes
    # are not silently turned into scaled floats.
    return [
        [int(x1 * w_scale), int(y1 * h_scale), int(x2 * w_scale), int(y2 * h_scale)]
        for x1, y1, x2, y2 in boxes
    ]

In [None]:
def plot_boxes(img, boxes, show=True, color=[0,0,255]):
    """Draw one or two groups of `(x1, y1, x2, y2)` boxes onto `img` in place.

    Args:
        img: image array to draw on (modified in place).
        boxes: sequence whose first item is the main box group, drawn with
            `color` and thickness 2; an optional second item is drawn with
            colour [255,0,0] and thickness 5.
        show: when True the image is plotted and nothing is returned;
            otherwise the image is returned.
        color: colour of the first box group.
    """
    layers = [(boxes[0], color, 2)]
    if len(boxes) > 1:
        layers.append((boxes[1], [255,0,0], 5))

    for group, colour, thickness in layers:
        for x1, y1, x2, y2 in group:
            cv2.rectangle(img, (x1, y1), (x2, y2), colour, thickness)

    if not show:
        return img
    plot(img, sz=(25, 25))

In [None]:
def test_resize_boxes(image_path):
    """Visual check: print the OCR text and plot the rescaled symbol boxes on the resized page."""
    original = cv2.imread(str(image_path))
    # cv2.resize takes (width, height); config.img_size is (height, width).
    resized = cv2.resize(original, (config.img_size[1], config.img_size[0]))
    boxes, text = get_full_text_annotations(get_ocr_output(image_path))
    print(text)
    scaled_boxes = resize_boxes(boxes, original.shape)
    plot_boxes(resized, [scaled_boxes, []])

In [None]:
 # Visual check of the rescaled OCR boxes on two pages of image group I1PD95846.
 test_resize_boxes(config.images_path/'I1PD95846'/'I1PD958460142.jpg')

In [None]:
test_resize_boxes(config.images_path/'I1PD95846'/'I1PD958460141.jpg')

# Get context of Double Tsek

In [None]:
#export
def cls_box_into_line(boxes, th=20):
    """Split a reading-order sequence of boxes into text lines.

    A new line starts whenever a box's top `y` (index 1) differs from the
    previous box's top `y` by `th` or more.

    Args:
        boxes: sequence of boxes whose index 1 is the top `y` coordinate.
        th: vertical jump (in pixels) that starts a new line.

    Returns:
        List of lines, each a list of boxes; `[]` for empty input.
    """
    if len(boxes) == 0:
        # Nothing to classify; `boxes[0]` below would raise IndexError.
        return []

    lines = []
    line = []
    prev_y1 = boxes[0][1]
    for box in boxes:
        if abs(box[1] - prev_y1) < th:
            line.append(box)
        else:
            lines.append(line)
            line = [box]
        prev_y1 = box[1]
    # Flush the line that was still being built when the boxes ran out.
    if line:
        lines.append(line)
    return lines

In [None]:
def test_box_into_line(image_path):
    """Visual check: plot the OCR boxes of a page, one random colour per detected line."""
    source = cv2.imread(str(image_path))
    canvas = cv2.resize(source, (config.img_size[1], config.img_size[0]))
    boxes, _ = get_full_text_annotations(get_ocr_output(image_path))
    line_groups = cls_box_into_line(resize_boxes(boxes, source.shape))

    for line_boxes in line_groups:
        colour = tuple(int(c) for c in np.random.choice(range(256), size=3))
        canvas = plot_boxes(canvas, [line_boxes], show=False, color=colour)
    plot(canvas, sz=(25, 25))
    
# Visual check of the line grouping on page I1PD958460048.jpg.
test_box_into_line(config.images_path/'I1PD95846'/'I1PD958460048.jpg')

In [None]:
#export
def find_double_tsek_bf(matched_box, boxes, th=20):
    """Brute-force lookup of the character index just before a matched box.

    The OCR boxes are grouped into lines with `cls_box_into_line`. In every
    line whose first box has a top `y` within `th` of the match's top `y`,
    the first box whose left `x` is greater than the match's left `x` is
    looked up; the result is the global index of the box preceding it.

    Args:
        matched_box: box of the match; index 0 is its left `x`, index 1 its top `y`.
        boxes: OCR symbol boxes in reading order.
        th: vertical tolerance for selecting candidate lines.

    Returns:
        The character index, or `None` when no suitable box is found (also
        when the match's left `x` is not greater than 0).
    """
    match_x, match_y = matched_box[0], matched_box[1]
    chars_before_line = 0
    for line_boxes in cls_box_into_line(boxes):
        if abs(match_y - line_boxes[0][1]) < th:
            for col, char_box in enumerate(line_boxes):
                if 0 < match_x < char_box[0]:
                    return chars_before_line + col - 1
        chars_before_line += len(line_boxes)
        

def compute_iou(box_arr1, box_arr2):
    """Pairwise intersection-over-union of two sets of boxes.

    Args:
        box_arr1: array of shape (n, 4) holding `[x1, y1, x2, y2]` rows.
        box_arr2: array of shape (m, 4) in the same format.

    Returns:
        Array of shape (n, m); entry `[i, j]` is the IoU of `box_arr1[i]` and
        `box_arr2[j]`. Extents are computed inclusively (`x2 - x1 + 1`).
    """
    left1, top1, right1, bottom1 = np.split(box_arr1, 4, axis=1)
    # Turn the second set into row vectors so everything broadcasts to (n, m).
    left2, top2, right2, bottom2 = (
        np.transpose(column) for column in np.split(box_arr2, 4, axis=1)
    )

    overlap_w = np.maximum(np.minimum(right1, right2) - np.maximum(left1, left2) + 1, 0)
    overlap_h = np.maximum(np.minimum(bottom1, bottom2) - np.maximum(top1, top2) + 1, 0)
    intersection = overlap_w * overlap_h

    area1 = (right1 - left1 + 1) * (bottom1 - top1 + 1)
    area2 = (right2 - left2 + 1) * (bottom2 - top2 + 1)
    return intersection / (area1 + area2 - intersection)


def get_double_tsek_idx(image_path, templates, deskew=False, show_boxes=False):
    """Find the character index of every double tsek detected on a page.

    Args:
        image_path: path of the page image; also used to look up its OCR output.
        templates: templates forwarded to `mtm`.
        deskew: when True the image is deskewed before matching.
        show_boxes: when True plot the OCR boxes together with the matches.

    Returns:
        `(idxs, text)`: `idxs` is a list of indices into `text` (the OCR text
        of the page), one per match that could be located; it is empty when
        nothing was matched or the OCR output has no symbols.
    """
    # load, deskew and resize the image
    image = cv2.imread(str(image_path))
    old_size = image.shape
    if deskew: image = image_deskew(image)
    image = cv2.resize(image, (config.img_size[1], config.img_size[0]))

    # find the double tsek boxes
    matches = mtm(image, templates)

    # Get ocr boxes
    response = get_ocr_output(image_path)
    boxes, text = get_full_text_annotations(response)
    # Without matches or without OCR boxes there is nothing to align.
    if not matches or not boxes: return [], text
    boxes = resize_boxes(boxes, old_size)

    # find double tsek char index
    iou_matrix = compute_iou(np.array(matches), np.array(boxes))
    if show_boxes: plot_boxes(image, [boxes, matches])
    best_idx = np.argmax(iou_matrix, axis=1)
    best_iou = np.max(iou_matrix, axis=1)

    idxs = []
    for match, idx, iou in zip(matches, best_idx, best_iou):
        if iou > 0:
            idxs.append(int(idx))
            continue
        # The match overlaps no OCR box. Testing for `0 in idxs` cannot tell
        # this apart from "best overlap is box 0" and only repaired the
        # first such match, so the overlap value itself is checked instead.
        fallback = find_double_tsek_bf(match, boxes)
        if fallback is not None:
            idxs.append(fallback)
    return idxs, text

In [None]:
def test_get_double_tsek_idx(image_path):
    """Print each detected double-tsek character with up to 10 characters of context on either side."""
    idxs, text = get_double_tsek_idx(image_path, templates, show_boxes=True)
    for cc in idxs:
        # Clamp the left edge: a negative slice start would count from the
        # end of the text and print the wrong (usually empty) context.
        print(text[max(cc-10, 0):cc], text[cc:cc+1], text[cc+1: cc+10])

In [None]:
# Inspect the detected double-tsek positions and their text context on three pages.
test_get_double_tsek_idx(config.images_path/'I1PD95846'/'I1PD958460048.jpg')

In [None]:
test_get_double_tsek_idx(config.images_path/'I1PD95846'/'I1PD958460043.jpg')

In [None]:
test_get_double_tsek_idx(config.images_path/'I1PD95846'/'I1PD958460047.jpg')

In [None]:
#export
def rm_running_head(text):
    """Drop the leading part of a page's text up to a marker character.

    If '༡' occurs within the first 500 characters, everything up to and
    including it is removed. Otherwise everything up to and including the
    first '།' is removed; the text is returned unchanged when that is absent too.
    """
    head_end = text.find('༡')
    if 0 <= head_end < 500:
        return text[head_end+1:]
    return text[text.find('།')+1:]

def rm_noise(text):
    """Remove digits and stray OCR symbols from `text`.

    All digit runs are deleted (for `str` patterns `\\d` matches any Unicode
    decimal digit, so Tibetan numerals are removed as well), a fixed set of
    noise characters is deleted, and a tsek followed by the vowel sign is
    reduced to the tsek alone.
    """
    # Raw string: '\d' in a normal (or f-) string is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\d+', '', text)
    # Plain strings are deleted; (old, new) tuples are substituted.
    for r in ['=', '|', '“', '”', ']', '）', '>', '©', '–', '-', '༸', ('་ི', '་')]:
        if isinstance(r, tuple):
            text = text.replace(r[0], r[1])
        else:
            text = text.replace(r, '')
    return text

def postprocess(text, annotated=True):
    """Clean a page's text: strip the running head, then remove noise.

    When `annotated` is True, every double-tsek symbol directly followed by a
    tsek is additionally swapped so that the tsek comes first.
    """
    cleaned = rm_noise(rm_running_head(text))
    if not annotated:
        return cleaned
    symbol_then_tsek = f'{config.double_tsek_sym}{config.tsek}'
    tsek_then_symbol = f'{config.tsek}{config.double_tsek_sym}'
    return cleaned.replace(symbol_then_tsek, tsek_then_symbol)

def str_insert(text, idx, char):
    """Return `text` with `char` inserted in front of position `idx`."""
    head, tail = text[:idx], text[idx:]
    return head + char + tail

def add_double_tsek(text, idxs, sym=None):
    """Insert the double-tsek symbol in front of each given character index.

    Args:
        text: the original text.
        idxs: character indices into the original `text`, in any order.
        sym: symbol to insert; defaults to `config.double_tsek_sym`.

    Returns:
        The text with one symbol inserted before each index.
    """
    if sym is None:
        sym = config.double_tsek_sym
    pieces = []
    prev = 0
    # Process the indices in ascending order: shifting the i-th index by i
    # is only correct for sorted input, and matches are not guaranteed to
    # arrive in text order.
    for idx in sorted(idxs):
        pieces.append(text[prev:idx])
        pieces.append(sym)
        prev = idx
    pieces.append(text[prev:])
    return ''.join(pieces)

def get_double_tsek_text(path, limit=10):
    """Build the base and the double-tsek-annotated text of an image directory.

    Images are processed in sorted order using the notebook-level `templates`.

    Args:
        path: directory containing the page images.
        limit: maximum number of images to process (default 10); pass `None`
            to process every image.

    Returns:
        `(base_text, ann_text)`: the concatenated post-processed page texts
        without and with double-tsek symbols. When `config.debug` is set,
        each page's annotated text is preceded by its image file name.
    """
    base_parts, ann_parts = [], []
    # A separate loop variable keeps the `path` argument from being shadowed.
    for i, img_path in enumerate(sorted(Path(path).iterdir())):
        if limit is not None and i >= limit: break
        print(f'[INFO] {i+1} - Processing {img_path.name} ...')
        idxs, text = get_double_tsek_idx(img_path, templates)
        base_parts.append(postprocess(text, annotated=False))
        if config.debug:
            ann_parts.append(f'\n\n{img_path.name}')
        ann_parts.append(postprocess(add_double_tsek(text, idxs)))
    return ''.join(base_parts), ''.join(ann_parts)

In [None]:
# Build the base and annotated text for image group I1PD95846.
base_text, ann_text = get_double_tsek_text(config.images_path/'I1PD95846')

In [None]:
# Show the annotated text, with the double-tsek symbol inserted at each detected position.
print(ann_text)

# Transfer Annotations

Steps:
1. Parse the peydurma-tengyur text index
1. Map peydurma-tengyur text-id to dergey-tengyur text-id
1. Extract corresponding dergey-tengyur text
1. Extract double-tsek from peydurma-tengyur text
1. Create dmp patch of double tsek
1. Apply the dmp patch to dergey-tengyur
1. Parse the double-tsek from dergey-tengyur

### 1. Parse peydurma-tengyur text index

In [None]:
# XML metadata file that get_text_index parses below (path relative to the working directory).
peydurma_meta_fn = Path('./data/peydurma/tanjura_matedata.xml')

In [None]:
import pdb

def get_vol_pages_num(loc):
    """Parse a location string into volume/page spans.

    `loc` consists of spans joined by ' + '. Accepted span forms:
    'vol.030 - 134-654', 'vol.025 - 587', 'vol.025 - ' and 'vol.025'.

    Returns:
        List of dicts `{'vol': str, 'start': int or None, 'end': int or None}`,
        one per span, where `vol` is the part after the dot in the volume
        field (e.g. '030').
    """
    text_span = []
    for span in loc.split(' + '):   # 'vol.030 - 134-654 + vol.030 - 655-1547 + vol.013 - 3-503'
        # partition() rather than split(): a bare 'vol.025' without the
        # ' - ' separator gives an empty page part instead of an unpack error.
        vol, _, pg = span.partition(' - ')
        vol_num = vol.split('.')[1]
        start, end = None, None
        if pg:
            if '-' in pg:
                start, end = pg.split('-')
            else: # vol.025 - 587
                start = pg.strip() or None
        if start: start = int(start.strip())
        if end: end = int(end.strip())
        text_span.append({'vol': vol_num, 'start': start, 'end': end})
    return text_span

def get_text_index(fn):
    """Build a mapping from text id to its volume/page spans from an XML file.

    Every `<item>` supplies a text id (`<ref>`) and a location (`<loc>`,
    parsed with `get_vol_pages_num`). An item whose first span has no start
    page is not stored immediately: when the next item is read, it is given
    a single span running from the page after the last known end page up to
    the page before that next item's first start page.

    Arithmetic on a missing page number (no earlier end page, or a following
    item that also lacks a start page) raises `TypeError`. A page-less item
    that is the last one in the file is not added at all.

    Returns:
        dict mapping text id to a list of `{'vol', 'start', 'end'}` dicts.
    """
    def first_text(item, tag):
        return item.getElementsByTagName(tag)[0].childNodes[0].data

    text_index = {}
    last_end_pg = None
    # (text_id, vol) of an item still waiting for its page range, if any.
    pending = None
    for item in minidom.parse(str(fn)).getElementsByTagName("item"):
        text_id = first_text(item, "ref")
        text_span = get_vol_pages_num(first_text(item, "loc"))
        first_span = text_span[0]

        if pending is not None:
            pending_id, pending_vol = pending
            text_index[pending_id] = [{
                'vol': pending_vol,
                'start': last_end_pg+1,
                'end': first_span['start']-1
            }]
            pending = None

        if not first_span['start']:
            pending = (text_id, first_span['vol'])
            continue

        text_index[text_id] = text_span
        last_end_pg = first_span['end']

    return text_index

In [None]:
# Location with three ' + '-separated spans.
get_vol_pages_num('vol.030 - 134-654 + vol.030 - 655-1547 + vol.013 - 3-503')

In [None]:
# Single page number, no end page.
get_vol_pages_num('vol.025 - 587')

In [None]:
# Volume only, no page information.
get_vol_pages_num('vol.025 - ')

In [None]:
# Parse the full peydurma metadata file into a text-id -> spans index.
p_text_index = get_text_index(peydurma_meta_fn)

In [None]:
p_text_index['AT0131']

In [None]:
p_text_index['AT0455']

### 2. Map peydurma-tengyur text-id to dergey-tengyur text-id

In [None]:
def map_text_id(text_id):
    """Map a peydurma text id (e.g. 'AT0131a') to its dergey text id (e.g. 'D1236a').

    The first two characters are dropped and a trailing 'a' or 'b' is carried
    over unchanged. Source numbers 72, 73 and 75 have no counterpart: a
    message is printed and `None` is returned. Number 74 is shifted down by
    2 and numbers above 74 by 3 before the offset between the two numbering
    schemes (source 1 corresponds to destination 1109) is added.
    """
    src_start, dst_start = 1, 1109

    number_part, suffix = text_id[2:], ''
    if number_part[-1] in 'ab':
        number_part, suffix = number_part[:-1], number_part[-1]
    src_number = int(number_part)

    if src_number in (72, 73, 75):
        print('[INFO] Text missing in dergey-tengyur')
        return None

    # Close the gap left by the missing texts.
    if src_number == 74:
        src_number -= 2
    elif src_number > 74:
        src_number -= 3

    return f'D{src_number + dst_start - src_start:04}{suffix}'

In [None]:
# Plain id and ids carrying an 'a'/'b' suffix.
map_text_id('AT0131'), map_text_id('AT0131a'), map_text_id('AT0131b')

In [None]:
# Ids that map_text_id reports as missing in dergey-tengyur (each returns None).
map_text_id('AT0072'), map_text_id('AT0073'), map_text_id('AT0075') 

### 3. Extract corresponding dergey-tengyur text

In [None]:
import openpecha
# Display the installed openpecha version.
openpecha.__version__

In [None]:
# NOTE(review): absolute, machine-specific path to the dergey-tengyur .opf directory.
dergey_tengyur_opf_path = Path('/home/tenzin/ML/project/Esukhia/openpecha-user/.openpecha/data/P000002/P000002.opf')

In [None]:
# Load index.yml from the .opf directory; passed to get_base_text as `index_layer`.
index_layer = yaml.safe_load((dergey_tengyur_opf_path/'index.yml').open())

In [None]:
def get_base_text(text_id, opf_path, index_layer):
    """Return the base text of `text_id`, joining all values of the serializer's base-layer mapping."""
    base_layers = Serialize(opf_path, text_id=text_id, index_layer=index_layer).get_text_base_layer()
    return ''.join(base_layers.values())

In [None]:
# Scratch arithmetic using the same offsets as map_text_id (1109, 1 and a shift of 3).
# NOTE(review): presumably the expected dergey number for source id 296 — confirm.
1109 + 296 - 1 - 3

In [None]:
# Map one peydurma id to its dergey id, then print that id, the dergey base text
# and the peydurma page spans of the text.
test_id = 'AT0076'
d_text_id = map_text_id(test_id)
if d_text_id:
    print(d_text_id)
    text_base_text = get_base_text(d_text_id, dergey_tengyur_opf_path, index_layer); print(text_base_text)
    print(p_text_index[test_id])

### Text Similarity

In [None]:
def get_jaccard_sim(str1, str2): 
    """Jaccard similarity of two strings, compared as sets of tsek-separated pieces."""
    pieces1, pieces2 = set(str1.split('་')), set(str2.split('་'))
    shared = len(pieces1 & pieces2)
    return float(shared) / (len(pieces1) + len(pieces2) - shared)

In [None]:
# Two sample openings that share most of their wording.
str1 = '༄༅༅། །རྒྱ་གར་སྐད་དུ། ཡོ་ག་རཏྣ་མཱ་ལ་ནཱ་མ་ཧེ་བཛྲ་པཉྩི་ཀཱ། བོད་སྐད་དུ། དགྱེས་པ་རྡོ་རྗེའི་དཀའ་འགྲེལ་རྣལ་འབྱོར་རིན་པོ་ཆེའི་ཕྲེང་བ་ཞེས་བྱ་བ། དཔལ་དགྱེས་པའི་རྡོ་རྗེ་ལ་ཕྱག་'
str2 = '༄༅༅། །རྒྱ་གར་སྐད་དུ། ཡོ་ག་རཏྣ་མཱ་ལ་ནཱ་མ་ཧེ་པཉྩི་ཀཱ། བོད་སྐད་དུ། དགྱེས་པ་རྡོ་རྗེའི་དཀའ་རྣལ་འབྱོར་རིན་ངག་གི་དབང་ཕྱུག་པོ་ཆེའི་ཕྲེང་བ་ཞེས་བྱ་བ། དཔལ་དགྱེས་པའི་རྡོ་རྗེ་ལ་ཕྱག་'

In [None]:
# A third sample opening with different wording.
str3 = '༄༅༅། །རྒྱ་གར་སྐད་དུ། ཧེ་བཛྲ་ཏནྟྲ་པཉྫི་ཀཱ་པདྨ་ནི་ནཱ་མ། བོད་སྐད་དུ། ཀྱེའི་རྡོ་རྗེའི་རྒྱུད་ཀྱི་དཀའ་འགྲེལ་པདྨ་ཅན་ཞེས་བྱ་བ། ངག་གི་དབང་ཕྱུག་འཇམ་དཔལ་གཞོན་ནུར་གྱུར་པ་ལ་ཕྱག་འཚལ་ལོ།'

In [None]:
# Similarity of the two closely related samples.
get_jaccard_sim(str1, str2)

In [None]:
# Similarity of the first sample and the differently worded one.
get_jaccard_sim(str1, str3)

In [None]:
#