<h1><u>Semantic Assignment and Alignment Method:</u></h1>

<h2><u>Dependencies:</u></h2>

In [1]:
# !pip install gensim numpy scipy nltk pandas spacy
# !python -m spacy download en_core_web_sm

In [2]:
# nltk.download()

In [3]:
import gensim
import numpy as np
from scipy.spatial.distance import cosine
import math
import copy
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
import spacy
import pandas as pd
pd.set_option("max_colwidth", None)

In [4]:
# Location of the binary word2vec file, relative to the notebook's working directory.
model_path = './pre-trained-models/word2vec/GoogleNews-vectors-negative300.bin'
# Word vectors used by calc_score() to measure the cosine distance between paired words.
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True) # pre-trained word2vec model, trained on google news dataset
# spaCy pipeline used by saam() to tokenize both sentences and attach a POS tag to each token.
pos_tagger = spacy.load("en_core_web_sm")

<h2><u>Core logic:</u></h2>

In [5]:
# Module-level search state shared by best_pairing() and saam().
# saam() resets every value before a search; best_pairing() overwrites them
# each time it finds a complete pairing with a strictly smaller distance.
finalDist = 1 # smallest combined distance found so far; 1 is the initial bound
finalCosDiff = -1 # cosine-difference component of the best pairing (-1 = not set yet)
finalPosDiff = -1 # position-difference component of the best pairing (-1 = not set yet)
finalInformationDisparity = -1 # length-disparity component of the best pairing (-1 = not set yet)
finalStatus = [] # finalStatus[i] = index in the longer sentence paired with token i of the shorter one

def calc_score(shorterStatus, longerStatus, shorter, longer):
    """Score one complete pairing of the shorter sentence's tokens with the longer's.

    Parameters
    ----------
    shorterStatus : list of int
        shorterStatus[i] is the index in ``longer`` paired with ``shorter[i]``.
    longerStatus : list of int
        Reverse mapping kept by the caller; not read here.
    shorter, longer : sequences of tokens exposing ``.text`` and ``.pos_``
        The two tokenized sentences, with ``len(shorter) <= len(longer)``.

    Returns
    -------
    tuple
        ``(totalCosDiff, totalPosDiff, totalInformationDisparity)``:
        the per-pair cosine difference and normalized position difference, each
        averaged over ``len(shorter)``, and the normalized length disparity
        ``(len(longer) - len(shorter)) / (len(longer) + len(shorter))``.

    When ``shorter`` is empty there is nothing to average, so both averages are
    reported as 0.0 instead of raising ZeroDivisionError; when both sentences
    are empty the disparity is 0.0 as well.
    """
    shorterLen = len(shorter)
    longerLen = len(longer)

    totalCosDiff = 0
    totalPosDiff = 0
    for i, j in enumerate(shorterStatus):
        # normalized position difference between the two paired tokens
        totalPosDiff = totalPosDiff + abs(i - j) / longerLen

        w1, w2 = shorter[i].text, longer[j].text
        if w1 == w2:
            # identical words contribute no cosine difference
            continue

        if w1 in model.key_to_index and w2 in model.key_to_index:
            # cosine distance between the two word vectors
            # (scipy returns 1 - cosine similarity, so it can exceed 1)
            cosDiff = cosine(model.get_vector(w1), model.get_vector(w2))
        elif shorter[i].pos_ == longer[j].pos_:
            # out of vocabulary but same part of speech
            cosDiff = 0.2 # determined through iterative experimentation
        else:
            # out of vocabulary and different part of speech: fixed penalty
            cosDiff = 1
        totalCosDiff = totalCosDiff + cosDiff

    if shorterLen:
        totalCosDiff = totalCosDiff / shorterLen # tight upper bound => min(|ref|,|cand|)
        totalPosDiff = totalPosDiff / shorterLen # average of normalized posDiffs
    else:
        # no pairs at all: define both averages as zero rather than dividing by zero
        totalCosDiff = 0.0
        totalPosDiff = 0.0

    combinedLen = longerLen + shorterLen
    if combinedLen:
        totalInformationDisparity = (longerLen - shorterLen) / combinedLen # normalized information disparity
    else:
        # two empty sentences carry the same (zero) amount of information
        totalInformationDisparity = 0.0

    return totalCosDiff, totalPosDiff, totalInformationDisparity

def best_pairing(idx, shorterStatus, longerStatus, shorter, longer):
    """Exhaustively search token pairings and record the best one in the globals.

    Backtracking search: token ``idx`` of ``shorter`` is tried against every
    still-unpaired token of ``longer``, then the function recurses on
    ``idx + 1``. Once no entry of ``shorterStatus`` is -1, the pairing is
    scored with calc_score() and, if the mean of the three components is
    strictly below ``finalDist``, the module-level ``final*`` variables are
    overwritten with this pairing.

    ``shorterStatus`` and ``longerStatus`` are mutated during the search and
    restored before returning. Every injective pairing is visited, so the cost
    grows factorially with sentence length.
    """
    global finalDist, finalCosDiff, finalPosDiff, finalInformationDisparity, finalStatus

    if -1 not in shorterStatus:
        # every token of the shorter sentence has a partner: score this pairing
        cosPart, posPart, infoPart = calc_score(shorterStatus, longerStatus, shorter, longer)
        candidateDist = (cosPart + posPart + infoPart) / 3
        if candidateDist < finalDist:
            finalDist = candidateDist
            finalCosDiff = cosPart
            finalPosDiff = posPart
            finalInformationDisparity = infoPart
            finalStatus = list(shorterStatus)  # snapshot; the live list keeps changing
        return

    for slot in range(len(longerStatus)):
        if longerStatus[slot] != -1:
            # already taken by an earlier token of the shorter sentence
            continue
        # tentatively pair shorter[idx] with longer[slot] ...
        shorterStatus[idx] = slot
        longerStatus[slot] = idx
        best_pairing(idx + 1, shorterStatus, longerStatus, shorter, longer)
        # ... and undo it before trying the next slot
        longerStatus[slot] = -1
        shorterStatus[idx] = -1

def saam(reference, translation):
    """Align two sentences and return the best pairing's distance breakdown.

    Both strings are lower-cased and run through ``pos_tagger``; the document
    with fewer tokens becomes ``shorter`` (the reference wins ties). The
    module-level ``final*`` state is reset, best_pairing() searches all
    pairings, and the winning state is read back.

    Returns
    -------
    tuple
        ``(finalDist, finalCosDiff, finalPosDiff, finalInformationDisparity,
        word_pairs)`` where ``word_pairs`` is a list of
        ``(shorter_token_text, longer_token_text)`` tuples.

    NOTE(review): the search starts from ``finalDist = 1`` and best_pairing()
    only accepts strictly smaller distances, so if no pairing scores below 1
    the -1 placeholders are returned and every pair points at the last token
    of the longer sentence.
    """
    global finalStatus, finalDist, finalCosDiff, finalPosDiff, finalInformationDisparity

    refDoc = pos_tagger(reference.lower())
    candDoc = pos_tagger(translation.lower())

    if len(refDoc) <= len(candDoc):
        shorter, longer = refDoc, candDoc
    else:
        shorter, longer = candDoc, refDoc

    # reset the shared search state before every run
    finalDist = 1
    finalCosDiff = -1
    finalPosDiff = -1
    finalInformationDisparity = -1
    finalStatus = [-1] * len(shorter)

    best_pairing(0, [-1] * len(shorter), [-1] * len(longer), shorter, longer)

    word_pairs = [
        (shorter[pos].text, longer[partner].text)
        for pos, partner in enumerate(finalStatus)
    ]

    return finalDist, finalCosDiff, finalPosDiff, finalInformationDisparity, word_pairs


In [6]:
def get_scores(reference, translation):
    """Run saam() on one sentence pair and build a result row.

    The SAAM score is ``(1 - distance) ** trunc(distance * 100)``: the exponent
    grows by one for every full 0.01 of distance.

    Returns a list:
    ``[reference, translation, score, distance, cos diff, pos diff,
    information disparity, word pairs]``.
    """
    referenceCopy = copy.deepcopy(reference)
    translationCopy = copy.deepcopy(translation)
    dist, cosDiff, posDiff, infoDisparity, pairs = saam(referenceCopy, translationCopy)

    exponent = math.trunc(dist * 100)
    score = (1 - dist) ** exponent

    return [reference, translation, score, dist, cosDiff, posDiff, infoDisparity, pairs]

<h2><u>Observations:</u></h2>

In [8]:
# Score a set of hand-written candidate translations against one reference
# sentence and show them ranked from best to worst SAAM score.
reference = 'I go to the school'
candidates = [
    'I go to the school',
    'I go to a school',
    'I go in the school',
    'I go to the schools',
    'I eat to the school',
    'I going to the school',
    'I go to the college',
    'I play the guitar in school',
    'I go to the hospital',
    'You go to the school',
    'I go the school',
    'I go to school',
    'I I go go to to the the school',
    'I go to the school I go to the school',
    'To I go school the',
    'I go',
    'I go to the school in a bus',
    'He comes from cinema',
    'I go to an institution for learning',
]

# one result row per candidate, in the order listed above
data = [get_scores(reference, candidate) for candidate in candidates]

# highest SAAM score (column 2 of each row) first
data.sort(key=lambda row: row[2], reverse=True)

pd.DataFrame(data, columns=["Reference", "Translation", "SAAM Score", "Net Difference Measure", "Final Cos Diff", "Final Pos Diff", "Final Information Disparity", "Final Word Pairings"])

Unnamed: 0,Reference,Translation,SAAM Score,Net Difference Measure,Final Cos Diff,Final Pos Diff,Final Information Disparity,Final Word Pairings
0,I go to the school,I go to the school,1.0,0.0,0.0,0.0,0.0,"[(i, i), (go, go), (to, to), (the, the), (school, school)]"
1,I go to the school,I go to a school,0.986667,0.013333,0.04,0.0,0.0,"[(i, i), (go, go), (to, to), (the, a), (school, school)]"
2,I go to the school,I go in the school,0.986667,0.013333,0.04,0.0,0.0,"[(i, i), (go, go), (to, in), (the, the), (school, school)]"
3,I go to the school,I go to the schools,0.982746,0.017254,0.051762,0.0,0.0,"[(i, i), (go, go), (to, to), (the, the), (school, schools)]"
4,I go to the school,I go to the college,0.948441,0.026121,0.078362,0.0,0.0,"[(i, i), (go, go), (to, to), (the, the), (school, college)]"
5,I go to the school,I going to the school,0.946804,0.026962,0.080885,0.0,0.0,"[(i, i), (go, going), (to, to), (the, the), (school, school)]"
6,I go to the school,I eat to the school,0.887466,0.039014,0.117041,0.0,0.0,"[(i, i), (go, eat), (to, to), (the, the), (school, school)]"
7,I go to the school,You go to the school,0.845039,0.04122,0.123659,0.0,0.0,"[(i, you), (go, go), (to, to), (the, the), (school, school)]"
8,I go to the school,I go to the hospital,0.826612,0.04649,0.139469,0.0,0.0,"[(i, i), (go, go), (to, to), (the, the), (school, hospital)]"
9,I go to the school,I go to school,0.758815,0.053704,0.0,0.05,0.111111,"[(i, i), (go, go), (to, to), (school, school)]"
