### Notebook written and maintained by Alex Jones (alexander.g.jones.23@dartmouth.edu, alexjones1925@gmail.com)

### For mining sentence pairs from comparable corpora for the purpose of training NMT systems (see https://github.com/AlexJonesNLP/alt-bitexts/tree/main/ComparableCorporaMaterials for data)

### May be used for replicating NMT results from paper "An Alternative to Thresholding for Margin-Based Bitext Mining" (Alex Jones and Derry Tanti Wijaya)

### Code for margin-based similarity search based on LASER implementation by Facebook AI: https://github.com/facebookresearch/LASER/blob/master/source/mine_bitexts.py

### See https://github.com/facebookresearch/LASER for copyright and licensing specifications

In [13]:
import torch
from torch import nn
from sentence_transformers import SentenceTransformer, util
import numpy as np
import faiss
import re
import time
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import tokenize
nltk.download('punkt')
from bs4 import BeautifulSoup
from ast import literal_eval

[nltk_data] Downloading package punkt to
[nltk_data]     /usr2/collab/agjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Load the LaBSE sentence encoder used by embed()
model_name = 'LaBSE'
sentence_model = SentenceTransformer(model_name)
# Move the model to the GPU only when one is available; an unconditional
# .cuda() raises on CPU-only machines even though the notebook checks for a GPU
sentence_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

SentenceTransformer(
  (0): Transformer(
    (auto_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(501153, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
      

In [24]:
# Thin wrapper around the sentence encoder that also reports progress
def embed(sentences:list):
    """Encode the sentences of one document and advance the progress counter.

    The counter is the module-level variable ``i``; the caller has to set it
    (e.g. ``i = 0``) before the first call. A message is printed every time
    the counter hits a multiple of 1000.

    sentences: list of sentences to encode
    Returns the result of ``sentence_model.encode(sentences)``.
    """
    global i
    i = i + 1
    if not i % 1000:
        print("Finished document {}".format(i))
    return sentence_model.encode(sentences)

In [7]:
# Setting up the GPU with FAISS: one shared resources handle, which knnSearch
# passes to faiss.index_cpu_to_gpu for every index it builds
GPU = faiss.StandardGpuResources()

In [8]:
def knnSearch(src_emb, tgt_emb, k=1, batch_size=1):
    '''
    Exact k-nearest-neighbor search by inner product, done in batches on the GPU.

    Both the source and the target embeddings are processed in chunks of
    batch_size rows. For each source chunk, every target chunk is indexed and
    searched separately; the per-chunk candidates are then merged and the k
    highest-scoring ones are kept.

    Params
    ******
    src_emb: array of size number_of_source_sentences X embedding_dimension
    tgt_emb: array of size number_of_target_sentences X embedding_dimension
    k: number of neighbors to return (callers are expected to pass k <= number_of_target_sentences)
    batch_size: batch size

    Returns
    *******
    cos_sims: similarity scores for each of k nearest neighbors for each source sentence,
              sorted from most to least similar
              (inner products; presumably cosine similarities because the embeddings
              are L2-normalized -- TODO confirm against the encoder)
    inds: target indices of k nearest neighbors for each source sentence

    Modeled off of LASER source code: https://github.com/facebookresearch/LASER/blob/master/source/mine_bitexts.py

    '''
    emb_dim = src_emb.shape[1] # Embedding dimension
    num_src_sents = src_emb.shape[0]
    num_tgt_sents = tgt_emb.shape[0]
    cos_sims = np.zeros((num_src_sents, k), dtype=np.float32)
    inds = np.zeros((num_src_sents, k), dtype=np.int64)
    for s_min in range(0, num_src_sents, batch_size):
        s_max = min(s_min + batch_size, num_src_sents)
        src_sims = []
        src_inds = []
        for t_min in range(0, num_tgt_sents, batch_size):
            t_max = min(t_min + batch_size, num_tgt_sents)
            # A fresh flat inner-product index per target chunk, moved to GPU 0
            idx = faiss.IndexFlatIP(emb_dim)
            idx = faiss.index_cpu_to_gpu(GPU, 0, idx)
            idx.add(tgt_emb[t_min : t_max])
            # A chunk cannot return more neighbors than it has rows
            src_sim, src_ind = idx.search(src_emb[s_min : s_max], min(k, t_max-t_min))
            src_sims.append(src_sim)
            # Chunk-local indices -> indices into the full target array
            src_inds.append(src_ind + t_min)
            del idx
        # Merge the candidates of all target chunks and keep the k best per row
        src_sims = np.concatenate(src_sims, axis=1)
        src_inds = np.concatenate(src_inds, axis=1)
        top_k = np.argsort(-src_sims, axis=1)[:, :k]
        cos_sims[s_min:s_max] = np.take_along_axis(src_sims, top_k, axis=1)
        inds[s_min:s_max] = np.take_along_axis(src_inds, top_k, axis=1)
    return cos_sims, inds

In [9]:
# Retrieves k-nearest neighbor indices and similarity means for margin scoring
# forward: finds nearest neighbors and indices for all source sentences
# backward: finds nearest neighbors and indices for all target sentences
# In the approach implemented in our paper, we perform both forward and backward search

def directedMeansAndInds(src_emb, tgt_emb, forward=False, backward=False, k=1, batch_size=1):
    """Run knnSearch in one direction and average the neighbor similarities.

    Exactly one of forward/backward must be set. Backward search is the same
    search with the roles of source and target swapped. k is capped at the
    number of candidate sentences.

    Returns (mean similarity over the neighbors of each query sentence,
    neighbor indices as returned by knnSearch).
    """
    assert forward != backward, "Please choose either forward or backward"
    if forward:
        queries, candidates = src_emb, tgt_emb
    elif backward:
        queries, candidates = tgt_emb, src_emb
    else:
        return None
    num_neighbors = min(candidates.shape[0], k)
    cos_sims, inds = knnSearch(queries, candidates, num_neighbors, batch_size)
    return cos_sims.mean(axis=1), inds

In [11]:
def _bestByMargin(query_embs, cand_embs, nn_inds, query_means, cand_means):
    # For every query sentence, score each retrieved neighbor with the ratio
    # margin and return (index of the best neighbor, its margin score).
    margins = np.zeros(nn_inds.shape)
    for q in range(nn_inds.shape[0]):
        for j in range(nn_inds.shape[1]):
            c = nn_inds[q, j]
            # Ratio margin: similarity of the pair divided by the average of the
            # two sentences' mean neighbor similarities
            margins[q, j] = (query_embs[q].dot(cand_embs[c])) / np.average((query_means[q], cand_means[c]))
    rows = np.arange(nn_inds.shape[0])
    best_cols = margins.argmax(axis=1)
    return nn_inds[rows, best_cols], margins[rows, best_cols]


def mineSentencePairs(src_embs, tgt_embs, batch_size=100, num_neighbors=4):
    '''
    Mine sentence pairs with ratio-margin scoring and forward/backward intersection.

    Each source sentence is matched to the neighbor that maximizes its margin
    score (forward search), each target sentence likewise (backward search), and
    only the pairs selected in both directions are kept.

    Params
    ******
    src_embs: array of size number_of_source_sentences X embedding_dimension
    tgt_embs: array of size number_of_target_sentences X embedding_dimension
    batch_size: batch size
    num_neighbors: number of neighbors

    Returns
    *******
    concat_pairs: list of mined sentence pairs as (source index, target index) tuples
                  of ints, 1-INDEXED, ordered by source index
    margin_scores: list of scores corresponding to mined pairs

    '''
    # Nothing can be paired when either side is empty (and a 0-neighbor search
    # is not meaningful), so return early
    if src_embs.shape[0] == 0 or tgt_embs.shape[0] == 0:
        return [], []

    # Retrieve means and indices in the forward direction . . .
    fwd_means, fwd_inds = directedMeansAndInds(src_embs, tgt_embs, forward=True, k=num_neighbors, batch_size=batch_size)
    # . . . and in the backward direction
    bwd_means, bwd_inds = directedMeansAndInds(src_embs, tgt_embs, backward=True, k=num_neighbors, batch_size=batch_size)

    # Best target for every source sentence, and best source for every target sentence
    best_tgt, fwd_scores = _bestByMargin(src_embs, tgt_embs, fwd_inds, fwd_means, bwd_means)
    best_src, _ = _bestByMargin(tgt_embs, src_embs, bwd_inds, bwd_means, fwd_means)

    # Take INTERSECTION of forward and backward searches. The pair's margin is
    # the same formula in both directions, so the comparison is done on the
    # indices only rather than on (index, index, float score) triples, which
    # would drop a valid pair if the two floats ever differed in the last bit.
    concat_pairs, margin_scores = [], []
    for src_ind in range(best_tgt.shape[0]):
        tgt_ind = int(best_tgt[src_ind])
        if int(best_src[tgt_ind]) == src_ind:
            # Assumption is that GROUND TRUTH VALUES ARE 1-INDEXED!!!
            concat_pairs.append((src_ind + 1, tgt_ind + 1))
            margin_scores.append(float(fwd_scores[src_ind]))

    return concat_pairs, margin_scores

## Reading in en-kk and en-gu comparable corpora

In [None]:
# Where you stored the comparable corpora files
PATH = '/project/statnlp/ajones'


def _read_lines(filename):
    """Return every line (trailing newlines kept) of PATH/filename."""
    # The context manager closes the file handle, which a bare open().readlines() never does.
    # NOTE(review): UTF-8 is assumed for the Kazakh/Gujarati text -- confirm against the data files
    with open('{}/{}'.format(PATH, filename), 'r', encoding='utf-8') as f:
        return f.readlines()


# Original English sentences
orig_en_kk = _read_lines('orig.en-kk')
# Translated English sentences
transl_en_kk = _read_lines('translation.en-kk')
# English doc IDs
en_doc_ids_1 = _read_lines('id.en-kk')

# Original Kazakh sentences
orig_kk_en = _read_lines('orig.kk-en')
# Translated Kazakh sentences
transl_kk_en = _read_lines('translation.kk-en')
# Kazakh doc IDs
kk_doc_ids = _read_lines('id.kk-en')



# Original English sentences
orig_en_gu = _read_lines('orig.en-gu')
# Translated English sentences
transl_en_gu = _read_lines('translation.en-gu')
# English doc IDs
en_doc_ids_2 = _read_lines('id.en-gu')

# Original Gujarati sentences
orig_gu_en = _read_lines('orig.gu-en')
# Translated Gujarati sentences
transl_gu_en = _read_lines('translation.gu-en')
# Gujarati doc IDs
gu_doc_ids = _read_lines('id.gu-en')

In [None]:
# Parsing the document IDs (read as text lines) into Python values, in place,
# after stripping the newline characters

for id_list in (en_doc_ids_1, kk_doc_ids, en_doc_ids_2, gu_doc_ids):
    for i, raw_id in enumerate(id_list):
        id_list[i] = literal_eval(raw_id.replace('\n', ''))

In [None]:
# Initializing empty lists of lists in which to group sentence by document
# in order to prepare for document-level mining.
# Each list has one slot per document ID. The size covers the largest ID on
# EITHER side of the language pair, so a Kazakh/Gujarati ID above the largest
# English ID cannot cause an IndexError when sentences are grouped.

num_docs_1 = max(max(en_doc_ids_1), max(kk_doc_ids, default=-1)) + 1
orig_en_sents_1 = [[] for _ in range(num_docs_1)]
orig_kk_sents = [[] for _ in range(num_docs_1)]
transl_en_sents_1 = [[] for _ in range(num_docs_1)]
transl_kk_sents = [[] for _ in range(num_docs_1)]

num_docs_2 = max(max(en_doc_ids_2), max(gu_doc_ids, default=-1)) + 1
orig_en_sents_2 = [[] for _ in range(num_docs_2)]
orig_gu_sents = [[] for _ in range(num_docs_2)]
transl_en_sents_2 = [[] for _ in range(num_docs_2)]
transl_gu_sents = [[] for _ in range(num_docs_2)]

In [None]:
# Distributing every sentence (and its translation) into the list of the
# document it belongs to, with newline characters removed.
# Each row: (original sentences, translated sentences, doc IDs,
#            per-document originals, per-document translations)

for sents, transls, doc_ids, orig_docs, transl_docs in (
    (orig_en_kk, transl_en_kk, en_doc_ids_1, orig_en_sents_1, transl_en_sents_1),
    (orig_kk_en, transl_kk_en, kk_doc_ids, orig_kk_sents, transl_kk_sents),
    (orig_en_gu, transl_en_gu, en_doc_ids_2, orig_en_sents_2, transl_en_sents_2),
    (orig_gu_en, transl_gu_en, gu_doc_ids, orig_gu_sents, transl_gu_sents),
):
    for i, sent in enumerate(sents):
        doc_idx = doc_ids[i]
        orig_docs[doc_idx].append(sent.replace('\n', ''))
        transl_docs[doc_idx].append(transls[i].replace('\n', ''))

In [None]:
# Collecting the indices of documents that fall below a per-corpus minimum
# number of sentences (en-kk: 30 / 8, en-gu: 21 / 5)

bad_en_idx_1 = [doc_id for doc_id, doc in enumerate(orig_en_sents_1) if len(doc) < 30]
bad_kk_idx = [doc_id for doc_id, doc in enumerate(orig_kk_sents) if len(doc) < 8]

bad_en_idx_2 = [doc_id for doc_id, doc in enumerate(orig_en_sents_2) if len(doc) < 21]
bad_gu_idx = [doc_id for doc_id, doc in enumerate(orig_gu_sents) if len(doc) < 5]

In [None]:
# Removing extremely short documents from all corpora: a document is dropped
# from all four lists when it is too short on either side of the language pair.
# The bad indices are put in a set once so each membership test is O(1)
# instead of a linear scan of a list per document.

short_docs_1 = set(bad_en_idx_1) | set(bad_kk_idx)
new_orig_en_sents_1, new_orig_kk_sents, new_transl_en_sents_1, new_transl_kk_sents = [],[],[],[]
for i in range(len(orig_en_sents_1)):
    if i in short_docs_1:
        continue
    new_orig_en_sents_1.append(orig_en_sents_1[i])
    new_orig_kk_sents.append(orig_kk_sents[i])
    new_transl_en_sents_1.append(transl_en_sents_1[i])
    new_transl_kk_sents.append(transl_kk_sents[i])

short_docs_2 = set(bad_en_idx_2) | set(bad_gu_idx)
new_orig_en_sents_2, new_orig_gu_sents, new_transl_en_sents_2, new_transl_gu_sents = [],[],[],[]
for i in range(len(orig_en_sents_2)):
    if i in short_docs_2:
        continue
    new_orig_en_sents_2.append(orig_en_sents_2[i])
    new_orig_gu_sents.append(orig_gu_sents[i])
    new_transl_en_sents_2.append(transl_en_sents_2[i])
    new_transl_gu_sents.append(transl_gu_sents[i])

In [None]:
# Rebinding the original names to the filtered document lists
orig_en_sents_1 = new_orig_en_sents_1
orig_kk_sents = new_orig_kk_sents
transl_en_sents_1 = new_transl_en_sents_1
transl_kk_sents = new_transl_kk_sents

orig_en_sents_2 = new_orig_en_sents_2
orig_gu_sents = new_orig_gu_sents
transl_en_sents_2 = new_transl_en_sents_2
transl_gu_sents = new_transl_gu_sents

In [None]:
# Embedding all sentences, one document at a time.
# embed() counts documents in the global `i`, so it is reset before every
# pass; otherwise the counter starts from whatever a previous loop left in `i`
# (or raises NameError if nothing did) and the progress messages are wrong.
# Each document's embeddings are wrapped in a one-element list here.

start = time.time()

i = 0
orig_en_embs_1 = [[embed(doc)] for doc in orig_en_sents_1]
print("Finished orig_en_embs")
i = 0
orig_kk_embs = [[embed(doc)] for doc in orig_kk_sents]
print("Finished orig_kk_embs")
i = 0
transl_en_embs_1 = [[embed(doc)] for doc in transl_en_sents_1]
print("Finished transl_en_embs")
i = 0
transl_kk_embs = [[embed(doc)] for doc in transl_kk_sents]
print("Finished transl_kk_embs")

i = 0
orig_en_embs_2 = [[embed(doc)] for doc in orig_en_sents_2]
print("Finished orig_en_embs")
i = 0
orig_gu_embs = [[embed(doc)] for doc in orig_gu_sents]
print("Finished orig_gu_embs")
i = 0
transl_en_embs_2 = [[embed(doc)] for doc in transl_en_sents_2]
print("Finished transl_en_embs")
i = 0
transl_gu_embs = [[embed(doc)] for doc in transl_gu_sents]
print("Finished transl_gu_embs")

end = time.time()
print("Total embedding time: {:} seconds".format(end-start))

In [None]:
# Turning every per-document entry (a one-element list holding the embedding
# matrix) into an array and dropping the leading axis, as needed for the
# similarity search

corpus_1_embs = (orig_en_embs_1, orig_kk_embs, transl_en_embs_1, transl_kk_embs)
for i in range(len(orig_en_embs_1)):
    for emb_list in corpus_1_embs:
        emb_list[i] = np.asarray(emb_list[i])[0, :, :]

corpus_2_embs = (orig_en_embs_2, orig_gu_embs, transl_en_embs_2, transl_gu_embs)
for i in range(len(orig_en_embs_2)):
    for emb_list in corpus_2_embs:
        emb_list[i] = np.asarray(emb_list[i])[0, :, :]

### en_kk_mined_old_method.csv

In [None]:
# Mining with the original English and original Kazakh sentences,
# one document pair at a time (document-level rather than global mining)

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_en_kk_orig, all_margin_scores_en_kk_orig = [], []
for i, (en_doc, kk_doc) in enumerate(zip(orig_en_embs_1, orig_kk_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_1)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(en_doc, kk_doc)
    all_sent_pairs_en_kk_orig.append(doc_sent_pairs)
    all_margin_scores_en_kk_orig.append(doc_margin_scores)

end = time.time()
print(end-start)

### en_kk_mined_old_method-1.06.csv

### en_kk_mined_old_method-1.20.csv

### en_kk_mined_old_method-1.35.csv

In [None]:
# Keep only the mined en-kk pairs whose margin score exceeds the threshold,
# and look up the sentences behind the (1-indexed) pair indices
MARGIN_THRESHOLD = 1.06 # Set this to whichever margin you want to use
orig_en_kk_pairs = []
for i, doc in enumerate(all_sent_pairs_en_kk_orig):
    for j, pair in enumerate(doc):
        if not all_margin_scores_en_kk_orig[i][j] > MARGIN_THRESHOLD:
            continue
        en_sent = orig_en_sents_1[i][pair[0]-1]
        kk_sent = orig_kk_sents[i][pair[1]-1]
        orig_en_kk_pairs.append((en_sent, kk_sent))

en_sents_1 = [en for en, _ in orig_en_kk_pairs]
kk_sents = [kk for _, kk in orig_en_kk_pairs]

### en_kk_mined_old_method_top20k.csv

In [None]:
# Attach the document index and margin score to every mined en-kk pair:
# (en sentence index, kk sentence index, document index, margin score),
# grouped per document
sent_pairs_with_docidx_ms = []
for i in range(len(all_sent_pairs_en_kk_orig)):
    doc_pairs = []
    for j in range(len(all_sent_pairs_en_kk_orig[i])):
        doc_pairs.append((all_sent_pairs_en_kk_orig[i][j][0], all_sent_pairs_en_kk_orig[i][j][1],
                         i, all_margin_scores_en_kk_orig[i][j]))
    sent_pairs_with_docidx_ms.append(doc_pairs)

# Flatten across documents and rank by margin score, best first
four_tuples = [tup for doc in sent_pairs_with_docidx_ms for tup in doc]
sorted_by_margins = sorted(four_tuples, key=lambda x: x[3], reverse=True)

# Keep the 20,000 highest-scoring pairs (this name was used below but never defined)
top20k = sorted_by_margins[:20000]

# Sentence indices are 1-indexed; the Kazakh sentence must be looked up with
# the Kazakh index (tup[1]), not the English one
top20k_en_sents = [orig_en_sents_1[tup[2]][tup[0]-1] for tup in top20k]
top20k_kk_sents = [orig_kk_sents[tup[2]][tup[1]-1] for tup in top20k]

### en_gu_orig_pairs_1.06.csv

### en_gu_orig_pairs_1.20.csv

### en_gu_orig_pairs_1.35.csv

In [None]:
# Mining with the original English and original Gujarati sentences,
# one document pair at a time (document-level rather than global mining)

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_en_gu_orig, all_margin_scores_en_gu_orig = [], []
for i, (en_doc, gu_doc) in enumerate(zip(orig_en_embs_2, orig_gu_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_2)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(en_doc, gu_doc)
    all_sent_pairs_en_gu_orig.append(doc_sent_pairs)
    all_margin_scores_en_gu_orig.append(doc_margin_scores)

end = time.time()
print(end-start)

In [None]:
# Keep only the mined en-gu pairs whose margin score exceeds the threshold,
# and look up the sentences behind the (1-indexed) pair indices
MARGIN_THRESHOLD = 1.06 # Set this to whichever margin you want to use
orig_en_gu_pairs = []
for i, doc in enumerate(all_sent_pairs_en_gu_orig):
    for j, pair in enumerate(doc):
        if not all_margin_scores_en_gu_orig[i][j] > MARGIN_THRESHOLD:
            continue
        en_sent = orig_en_sents_2[i][pair[0]-1]
        gu_sent = orig_gu_sents[i][pair[1]-1]
        orig_en_gu_pairs.append((en_sent, gu_sent))

en_sents_2 = [en for en, _ in orig_en_gu_pairs]
gu_sents = [gu for _, gu in orig_en_gu_pairs]

### en_kk_mined_kk_to_en_1.35.csv

In [None]:
# Mining with the original English and the translated Kazakh sentences,
# one document pair at a time

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_kk_to_en, all_margin_scores_kk_to_en = [], []
for i, (en_doc, transl_kk_doc) in enumerate(zip(orig_en_embs_1, transl_kk_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_1)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(en_doc, transl_kk_doc)
    all_sent_pairs_kk_to_en.append(doc_sent_pairs)
    all_margin_scores_kk_to_en.append(doc_margin_scores)

end = time.time()
print(end-start)

In [None]:
# Keep only the pairs mined from original en + translated kk whose margin score
# exceeds the threshold; the output uses the ORIGINAL Kazakh sentence at the
# mined (1-indexed) position
MARGIN_THRESHOLD = 1.35 # Set this to whichever margin you want to use
kk_to_en_pairs = []
for i, doc in enumerate(all_sent_pairs_kk_to_en):
    for j, pair in enumerate(doc):
        if not all_margin_scores_kk_to_en[i][j] > MARGIN_THRESHOLD:
            continue
        en_sent = orig_en_sents_1[i][pair[0]-1]
        kk_sent = orig_kk_sents[i][pair[1]-1]
        kk_to_en_pairs.append((en_sent, kk_sent))

en_sents_3 = [en for en, _ in kk_to_en_pairs]
kk_sents_2 = [kk for _, kk in kk_to_en_pairs]

### en_kk_mined_majority_vote_ALL.csv

### en_kk_mined_majority_vote_ALL_1.20.csv

### en_kk_mined_majority_vote_ALL_1.35.csv

In [None]:
# Mining with the translated English and the original Kazakh sentences,
# one document pair at a time

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_en_to_kk, all_margin_scores_en_to_kk = [], []
for i, (transl_en_doc, kk_doc) in enumerate(zip(transl_en_embs_1, orig_kk_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_1)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(transl_en_doc, kk_doc)
    all_sent_pairs_en_to_kk.append(doc_sent_pairs)
    all_margin_scores_en_to_kk.append(doc_margin_scores)

end = time.time()
print(end-start)

In [None]:
# Performing majority vote retrieval: a pair is kept when at least two of the
# three mining runs agree on it

mv_pairs_1, mv_margins_1 = [], []
for i in range(len(all_sent_pairs_en_kk_orig)):

    # Grabbing all three sets of sentence pairs:
    # 1. orig en + orig kk
    # 2. translated en + orig kk
    # 3. orig en + translated kk
    doc1, doc2, doc3 = all_sent_pairs_en_kk_orig[i], all_sent_pairs_en_to_kk[i], all_sent_pairs_kk_to_en[i]

    # pair -> margin score lookups (O(1) instead of list scans with `in` / .index())
    scores1 = dict(zip(doc1, all_margin_scores_en_kk_orig[i]))
    scores2 = dict(zip(doc2, all_margin_scores_en_to_kk[i]))

    # Taking pairwise intersections of these sets
    int1, int2, int3 = set(doc1)&set(doc2), set(doc1)&set(doc3), set(doc2)&set(doc3)
    # Taking union of pairwise intersections (voting step)
    pairwise_int = list(int1|int2|int3)

    for pair in pairwise_int:
        en_sent = orig_en_sents_1[i][pair[0]-1]
        kk_sent = orig_kk_sents[i][pair[1]-1]
        mv_pairs_1.append((en_sent, kk_sent))

        # Storing margin scores associated with sentence pairs: prefer the
        # score from run 1. A voted pair that is missing from run 1 must be in
        # both run 2 and run 3, so run 2 always has it and no third case exists.
        if pair in scores1:
            mv_margins_1.append(scores1[pair])
        else:
            mv_margins_1.append(scores2[pair])

In [None]:
# Optional threshold on the majority-vote pairs.
# mv_pairs_1 is a FLAT list of (en_sent, kk_sent) tuples and mv_margins_1 the
# flat list of their margin scores, so they are filtered in lockstep; indexing
# them as nested per-document lists would index into a float and fail.

MARGIN_THRESHOLD = 1.06 # Set this to whichever margin you want to use
new_mv_pairs = [
    pair
    for pair, margin in zip(mv_pairs_1, mv_margins_1)
    if margin > MARGIN_THRESHOLD
]

en_sents_4, kk_sents_3 = [pair[0] for pair in new_mv_pairs], [pair[1] for pair in new_mv_pairs]

### en_gu_pairs_majority_vote.csv

In [None]:
# Mining with the original English and the translated Gujarati sentences,
# one document pair at a time

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_gu_to_en, all_margin_scores_gu_to_en = [], []
for i, (en_doc, transl_gu_doc) in enumerate(zip(orig_en_embs_2, transl_gu_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_2)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(en_doc, transl_gu_doc)
    all_sent_pairs_gu_to_en.append(doc_sent_pairs)
    all_margin_scores_gu_to_en.append(doc_margin_scores)

end = time.time()
print(end-start)

In [None]:
# Mining with the translated English and the original Gujarati sentences,
# one document pair at a time

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_en_to_gu, all_margin_scores_en_to_gu = [], []
for i, (transl_en_doc, gu_doc) in enumerate(zip(transl_en_embs_2, orig_gu_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_2)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(transl_en_doc, gu_doc)
    all_sent_pairs_en_to_gu.append(doc_sent_pairs)
    all_margin_scores_en_to_gu.append(doc_margin_scores)

end = time.time()
print(end-start)

In [None]:
# Majority vote retrieval for en-gu: a pair is kept when at least two of the
# three mining runs agree on it
mv_pairs_2, mv_margins_2 = [], []
for i in range(len(all_sent_pairs_en_gu_orig)):
    # 1. orig en + orig gu, 2. translated en + orig gu, 3. orig en + translated gu
    doc1, doc2, doc3 = all_sent_pairs_en_gu_orig[i], all_sent_pairs_en_to_gu[i], all_sent_pairs_gu_to_en[i]

    # pair -> margin score lookups (O(1) instead of list scans with `in` / .index())
    scores1 = dict(zip(doc1, all_margin_scores_en_gu_orig[i]))
    scores2 = dict(zip(doc2, all_margin_scores_en_to_gu[i]))

    # Union of the pairwise intersections (voting step)
    int1, int2, int3 = set(doc1)&set(doc2), set(doc1)&set(doc3), set(doc2)&set(doc3)
    pairwise_int = list(int1|int2|int3)
    for pair in pairwise_int:
        en_sent = orig_en_sents_2[i][pair[0]-1]
        gu_sent = orig_gu_sents[i][pair[1]-1]
        mv_pairs_2.append((en_sent, gu_sent))

        # Prefer the score from run 1. A voted pair that is missing from run 1
        # must be in both run 2 and run 3, so run 2 always has it.
        if pair in scores1:
            mv_margins_2.append(scores1[pair])
        else:
            mv_margins_2.append(scores2[pair])

en_sents_5, gu_sents_2 = [pair[0] for pair in mv_pairs_2], [pair[1] for pair in mv_pairs_2]

## Iteration 2, majority vote

In [1]:
# Where you stored the comparable corpora files
PATH = '/project/statnlp/ajones'


def _read_lines(filename):
    """Return every line (trailing newlines kept) of PATH/filename."""
    # The context manager closes the file handle, which a bare open().readlines() never does.
    # NOTE(review): UTF-8 is assumed for the Kazakh text -- confirm against the data files
    with open('{}/{}'.format(PATH, filename), 'r', encoding='utf-8') as f:
        return f.readlines()


# Original English sentences
orig_en_kk_iter1 = _read_lines('orig-iter1.en-kk')
# Translated English sentences
transl_en_kk_iter1 = _read_lines('translation-iter1.en-kk')
# English doc IDs
en_doc_ids_1_iter1 = _read_lines('id-iter1.en-kk')

# Original Kazakh sentences
orig_kk_en_iter1 = _read_lines('orig-iter1.kk-en')
# Translated Kazakh sentences
transl_kk_en_iter1 = _read_lines('translation-iter1.kk-en')
# Kazakh doc IDs
kk_doc_ids_iter1 = _read_lines('id-iter1.kk-en')

In [14]:
# Parsing the document IDs (read as text lines) into Python values, in place,
# after stripping the newline characters
for id_list in (en_doc_ids_1_iter1, kk_doc_ids_iter1):
    for i, raw_id in enumerate(id_list):
        id_list[i] = literal_eval(raw_id.replace('\n', ''))

In [15]:
# One (initially empty) sentence list per document ID. The size covers the
# largest ID on EITHER side, so a Kazakh ID above the largest English ID
# cannot cause an IndexError when sentences are grouped.
num_docs_iter1 = max(max(en_doc_ids_1_iter1), max(kk_doc_ids_iter1, default=-1)) + 1
orig_en_sents_1_iter1 = [[] for _ in range(num_docs_iter1)]
orig_kk_sents_iter1 = [[] for _ in range(num_docs_iter1)]
transl_en_sents_1_iter1 = [[] for _ in range(num_docs_iter1)]
transl_kk_sents_iter1 = [[] for _ in range(num_docs_iter1)]

In [16]:
# Distributing every sentence (and its translation) into the list of the
# document it belongs to, with newline characters removed.
# Each row: (original sentences, translated sentences, doc IDs,
#            per-document originals, per-document translations)

for sents, transls, doc_ids, orig_docs, transl_docs in (
    (orig_en_kk_iter1, transl_en_kk_iter1, en_doc_ids_1_iter1, orig_en_sents_1_iter1, transl_en_sents_1_iter1),
    (orig_kk_en_iter1, transl_kk_en_iter1, kk_doc_ids_iter1, orig_kk_sents_iter1, transl_kk_sents_iter1),
):
    for i, sent in enumerate(sents):
        doc_idx = doc_ids[i]
        orig_docs[doc_idx].append(sent.replace('\n', ''))
        transl_docs[doc_idx].append(transls[i].replace('\n', ''))

In [18]:
# Indices of documents with fewer than 30 English / 8 Kazakh sentences
bad_en_idx_1_iter1 = [doc_id for doc_id, doc in enumerate(orig_en_sents_1_iter1) if len(doc) < 30]
bad_kk_idx_iter1 = [doc_id for doc_id, doc in enumerate(orig_kk_sents_iter1) if len(doc) < 8]

In [19]:
# Removing extremely short documents from all corpora: a document is dropped
# from all four lists when it is too short on either side of the language pair.
# The bad indices are put in a set once so each membership test is O(1)
# instead of a linear scan of a list per document.

short_docs_iter1 = set(bad_en_idx_1_iter1) | set(bad_kk_idx_iter1)
new_orig_en_sents_1_iter1, new_orig_kk_sents_iter1, new_transl_en_sents_1_iter1, new_transl_kk_sents_iter1 = [],[],[],[]
for i in range(len(orig_en_sents_1_iter1)):
    if i in short_docs_iter1:
        continue
    new_orig_en_sents_1_iter1.append(orig_en_sents_1_iter1[i])
    new_orig_kk_sents_iter1.append(orig_kk_sents_iter1[i])
    new_transl_en_sents_1_iter1.append(transl_en_sents_1_iter1[i])
    new_transl_kk_sents_iter1.append(transl_kk_sents_iter1[i])

In [20]:
# Rebinding the original names to the filtered document lists
orig_en_sents_1_iter1 = new_orig_en_sents_1_iter1
orig_kk_sents_iter1 = new_orig_kk_sents_iter1
transl_en_sents_1_iter1 = new_transl_en_sents_1_iter1
transl_kk_sents_iter1 = new_transl_kk_sents_iter1

In [None]:
# Embedding every document of the iteration-1 corpus.
# `i` is the progress counter used by embed(); it is reset before each pass.
# Each document's embeddings are wrapped in a one-element list here.

start = time.time()
print(len(orig_en_sents_1_iter1))

i = 0
orig_en_embs_1 = [[embed(sentences)] for sentences in orig_en_sents_1_iter1]
print("Finished orig_en_embs")

i = 0
orig_kk_embs = [[embed(sentences)] for sentences in orig_kk_sents_iter1]
print("Finished orig_kk_embs")

i = 0
transl_en_embs_1 = [[embed(sentences)] for sentences in transl_en_sents_1_iter1]
print("Finished transl_en_embs")

i = 0
transl_kk_embs = [[embed(sentences)] for sentences in transl_kk_sents_iter1]
print("Finished transl_kk_embs")

end = time.time()
print("Total embedding time: {:} seconds".format(end-start))

14662
Finished document 1000
Finished document 2000
Finished document 3000
Finished document 4000
Finished document 5000
Finished document 6000
Finished document 7000
Finished document 8000
Finished document 9000
Finished document 10000
Finished document 11000
Finished document 12000
Finished document 13000
Finished document 14000
Finished orig_en_embs
Finished document 1000
Finished document 2000
Finished document 3000
Finished document 4000


In [31]:
# Turning every per-document entry (a one-element list holding the embedding
# matrix) into an array and dropping the leading axis
iter1_embs = (orig_en_embs_1, orig_kk_embs, transl_en_embs_1, transl_kk_embs)
for i in range(len(orig_en_embs_1)):
    for emb_list in iter1_embs:
        emb_list[i] = np.asarray(emb_list[i])[0, :, :]

In [33]:
# Mining with the original English and original Kazakh sentences (iteration 1
# data), one document pair at a time (document-level rather than global mining)

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_en_kk_orig_iter1, all_margin_scores_en_kk_orig_iter1 = [], []
for i, (en_doc, kk_doc) in enumerate(zip(orig_en_embs_1, orig_kk_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_1_iter1)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(en_doc, kk_doc)
    all_sent_pairs_en_kk_orig_iter1.append(doc_sent_pairs)
    all_margin_scores_en_kk_orig_iter1.append(doc_margin_scores)

end = time.time()
print(end-start)

Completed document 1000 of 14662
Completed document 2000 of 14662
Completed document 3000 of 14662
Completed document 4000 of 14662
Completed document 5000 of 14662
Completed document 6000 of 14662
Completed document 7000 of 14662
Completed document 8000 of 14662
Completed document 9000 of 14662
Completed document 10000 of 14662
Completed document 11000 of 14662
Completed document 12000 of 14662
Completed document 13000 of 14662
Completed document 14000 of 14662
188.85500073432922


In [34]:
# Mining with the original English and the translated Kazakh sentences
# (iteration 1 data), one document pair at a time

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_kk_to_en_iter1, all_margin_scores_kk_to_en_iter1 = [], []
for i, (en_doc, transl_kk_doc) in enumerate(zip(orig_en_embs_1, transl_kk_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_1_iter1)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(en_doc, transl_kk_doc)
    all_sent_pairs_kk_to_en_iter1.append(doc_sent_pairs)
    all_margin_scores_kk_to_en_iter1.append(doc_margin_scores)

end = time.time()
print(end-start)

Completed document 1000 of 14662
Completed document 2000 of 14662
Completed document 3000 of 14662
Completed document 4000 of 14662
Completed document 5000 of 14662
Completed document 6000 of 14662
Completed document 7000 of 14662
Completed document 8000 of 14662
Completed document 9000 of 14662
Completed document 10000 of 14662
Completed document 11000 of 14662
Completed document 12000 of 14662
Completed document 13000 of 14662
Completed document 14000 of 14662
188.7188708782196


In [35]:
# Mining with the translated English and the original Kazakh sentences
# (iteration 1 data), one document pair at a time

start = time.time()

MOD = 1e3  # progress is reported every MOD documents
i = 0
all_sent_pairs_en_to_kk_iter1, all_margin_scores_en_to_kk_iter1 = [], []
for i, (transl_en_doc, kk_doc) in enumerate(zip(transl_en_embs_1, orig_kk_embs), start=1):
    if not i % MOD:
        print("Completed document {} of {}".format(i, len(orig_en_sents_1_iter1)), flush=True)
    doc_sent_pairs, doc_margin_scores = mineSentencePairs(transl_en_doc, kk_doc)
    all_sent_pairs_en_to_kk_iter1.append(doc_sent_pairs)
    all_margin_scores_en_to_kk_iter1.append(doc_margin_scores)

end = time.time()
print(end-start)

Completed document 1000 of 14662
Completed document 2000 of 14662
Completed document 3000 of 14662
Completed document 4000 of 14662
Completed document 5000 of 14662
Completed document 6000 of 14662
Completed document 7000 of 14662
Completed document 8000 of 14662
Completed document 9000 of 14662
Completed document 10000 of 14662
Completed document 11000 of 14662
Completed document 12000 of 14662
Completed document 13000 of 14662
Completed document 14000 of 14662
189.28361177444458


In [36]:
# Performing majority vote retrieval: a pair is kept when at least two of the
# three mining runs agree on it

mv_pairs_3, mv_margins_3 = [], []
for i in range(len(all_sent_pairs_en_kk_orig_iter1)):

    # Grabbing all three sets of sentence pairs:
    # 1. orig en + orig kk
    # 2. translated en + orig kk
    # 3. orig en + translated kk
    doc1, doc2, doc3 = all_sent_pairs_en_kk_orig_iter1[i], all_sent_pairs_en_to_kk_iter1[i], all_sent_pairs_kk_to_en_iter1[i]

    # pair -> margin score lookups (O(1) instead of list scans with `in` / .index())
    scores1 = dict(zip(doc1, all_margin_scores_en_kk_orig_iter1[i]))
    scores2 = dict(zip(doc2, all_margin_scores_en_to_kk_iter1[i]))

    # Taking pairwise intersections of these sets
    int1, int2, int3 = set(doc1)&set(doc2), set(doc1)&set(doc3), set(doc2)&set(doc3)
    # Taking union of pairwise intersections (voting step)
    pairwise_int = list(int1|int2|int3)

    for pair in pairwise_int:
        en_sent = orig_en_sents_1_iter1[i][pair[0]-1]
        kk_sent = orig_kk_sents_iter1[i][pair[1]-1]
        mv_pairs_3.append((en_sent, kk_sent))

        # Storing margin scores associated with sentence pairs: prefer the
        # score from run 1. A voted pair that is missing from run 1 must be in
        # both run 2 and run 3, so run 2 always has it and no third case exists.
        if pair in scores1:
            mv_margins_3.append(scores1[pair])
        else:
            mv_margins_3.append(scores2[pair])

In [39]:
# Split the (en_sent, kk_sent) tuples into one list per language
en_sents = [en_sent for en_sent, _ in mv_pairs_3]
kk_sents = [kk_sent for _, kk_sent in mv_pairs_3]

In [40]:
# Save the mined sentence pairs as a CSV with columns 'source' (English) and
# 'target' (Kazakh); the path is relative, under Data/
pd.DataFrame({'source': en_sents, 'target': kk_sents}).to_csv('Data/en_kk_mined_majority_vote_iter2.csv')