In [179]:
from camel_tools.calima_star.database import CalimaStarDB
from camel_tools.calima_star.analyzer import CalimaStarAnalyzer
from camel_tools.disambig.mle import MLEDisambiguator
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import random
import csv
import json
import copy
import os
import argparse
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import re

In [74]:
class InputExample:
    """One parallel example: source/target sentences and their gender labels."""

    def __init__(self, src, trg, src_g, trg_g):
        # Attribute order matters: it is the key order of the JSON repr.
        self.src, self.trg = src, trg
        self.src_g, self.trg_g = src_g, trg_g

    def to_dict(self):
        """Return an independent (deep-copied) dict of the instance attributes."""
        return copy.deepcopy(vars(self))

    def to_json_str(self):
        """Return the example as pretty-printed JSON, keeping non-ASCII text readable."""
        as_dict = self.to_dict()
        return json.dumps(as_dict, ensure_ascii=False, indent=2)

    def __repr__(self):
        return str(self.to_json_str())

In [75]:
class RawDataset:
    """Reads the train/dev/test splits of the corpus into lists of InputExample.

    Each split is made of four line-aligned files inside ``data_dir``:
    ``D-set-<split>.arin`` (source text), ``D-set-<split>.ar.M`` (target text)
    and the matching ``*.label`` files holding one gender label per line.
    """
    def __init__(self, data_dir):
        self.train_examples = self.get_train_examples(data_dir)
        self.dev_examples = self.get_dev_examples(data_dir)
        # The test split must come from the test files; loading the dev files
        # here would make every "test" evaluation a second dev evaluation.
        self.test_examples = self.get_test_examples(data_dir)

    def create_examples(self, src_path, trg_path):
        """Build one InputExample per line of the source/target files.

        Args:
            - src_path (str): path to the source text; labels are read from
              ``src_path + '.label'``
            - trg_path (str): path to the target text; labels are read from
              ``trg_path + '.label'``
        Returns:
            - list of InputExample with surrounding whitespace stripped
        Raises:
            - ValueError: if the four files do not have the same number of lines
        """
        src_txt = self.get_txt_examples(src_path)
        src_gender_labels = self.get_labels(src_path + '.label')
        trg_txt = self.get_txt_examples(trg_path)
        trg_gender_labels = self.get_labels(trg_path + '.label')

        # The files are aligned by line number, so a length mismatch means the
        # pairing is wrong; fail loudly instead of truncating or hitting an
        # IndexError halfway through.
        line_counts = (len(src_txt), len(src_gender_labels),
                       len(trg_txt), len(trg_gender_labels))
        if len(set(line_counts)) != 1:
            raise ValueError(
                'Misaligned parallel files for {} / {}: line counts '
                '(src, src labels, trg, trg labels) = {}'.format(
                    src_path, trg_path, line_counts))

        return [InputExample(src.strip(), trg.strip(), src_g.strip(), trg_g.strip())
                for src, trg, src_g, trg_g in zip(src_txt, trg_txt,
                                                  src_gender_labels,
                                                  trg_gender_labels)]

    def get_labels(self, data_dir):
        """Return the raw lines of a label file (``data_dir`` is a file path)."""
        # Explicit encoding so the result does not depend on the platform locale,
        # matching get_txt_examples.
        with open(data_dir, encoding='utf8') as f:
            return f.readlines()

    def get_txt_examples(self, data_dir):
        """Return the raw lines of a UTF-8 text file (``data_dir`` is a file path)."""
        with open(data_dir, encoding='utf8') as f:
            return f.readlines()

    def get_train_examples(self, data_dir):
        """Reads the train examples of the dataset"""
        return self.create_examples(os.path.join(data_dir, 'D-set-train.arin'),
                                    os.path.join(data_dir, 'D-set-train.ar.M'))

    def get_dev_examples(self, data_dir):
        """Reads the dev examples of the dataset"""
        return self.create_examples(os.path.join(data_dir, 'D-set-dev.arin'),
                                    os.path.join(data_dir, 'D-set-dev.ar.M'))

    def get_test_examples(self, data_dir):
        """Reads the test examples of the dataset"""
        return self.create_examples(os.path.join(data_dir, 'D-set-test.arin'),
                                    os.path.join(data_dir, 'D-set-test.ar.M'))

In [76]:
class Vocabulary:
    """Two-way mapping between tokens and contiguous integer indices."""

    def __init__(self, token_to_idx=None):
        # A fresh dict per instance unless the caller supplies a mapping.
        self.token_to_idx = dict() if token_to_idx is None else token_to_idx
        self.idx_to_token = dict((idx, token)
                                 for token, idx in self.token_to_idx.items())

    def add_token(self, token):
        """Register `token` if unseen and return its index."""
        if token not in self.token_to_idx:
            next_index = len(self.token_to_idx)
            self.token_to_idx[token] = next_index
            self.idx_to_token[next_index] = token
        return self.token_to_idx[token]

    def add_many(self, tokens):
        """Register every token in `tokens`; return their indices in order."""
        return list(map(self.add_token, tokens))

    def lookup_token(self, token):
        """Return the index of `token`; raises KeyError for unknown tokens."""
        return self.token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at `index`; raises KeyError if absent."""
        return self.idx_to_token[index]

    def to_serializable(self):
        """Return the constructor arguments as a JSON-friendly dict."""
        return dict(token_to_idx=self.token_to_idx)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vocabulary from the dict produced by to_serializable."""
        return cls(**contents)

    def __len__(self):
        return len(self.token_to_idx)
    
class SeqVocabulary(Vocabulary):
    """Vocabulary with reserved pad / unk / start / end tokens.

    The four special tokens are registered in the order pad, unk, sos, eos,
    so in a fresh vocabulary they take indices 0 to 3. Unknown tokens
    map to the unk index on lookup instead of raising.
    """

    def __init__(self, token_to_idx=None, unk_token='<unk>',
                 pad_token='<pad>', sos_token='<s>',
                 eos_token='</s>'):
        super().__init__(token_to_idx)

        self.pad_token = pad_token
        self.unk_token = unk_token
        self.sos_token = sos_token
        self.eos_token = eos_token

        # add_many registers tokens left to right, so the order here fixes
        # the indices of the special tokens.
        (self.pad_idx,
         self.unk_idx,
         self.sos_idx,
         self.eos_idx) = self.add_many([self.pad_token, self.unk_token,
                                        self.sos_token, self.eos_token])

    def to_serializable(self):
        """Return the constructor arguments (mapping + special tokens) as a dict."""
        contents = super().to_serializable()
        contents['unk_token'] = self.unk_token
        contents['pad_token'] = self.pad_token
        contents['sos_token'] = self.sos_token
        contents['eos_token'] = self.eos_token
        return contents

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vocabulary from the dict produced by to_serializable."""
        return cls(**contents)

    def lookup_token(self, token):
        """Return the index of `token`, or the unk index when it is unknown."""
        if token in self.token_to_idx:
            return self.token_to_idx[token]
        return self.unk_idx

In [376]:
class MorphFeaturizer:
    """Morphological Featurizer Class

    Maps each word to a 4-dimensional multi-hot vector
    ``[lex+m, lex+f, spvar+m, spvar+f]`` that records which
    (analysis source, gender) combinations occur among the word's analyses.
    Results are memoised in ``self.w_to_features``.
    """

    # Position of each (analysis['source'], analysis['gen']) pair in the vector.
    _FEATURE_INDEX = {('lex', 'm'): 0,
                      ('lex', 'f'): 1,
                      ('spvar', 'm'): 2,
                      ('spvar', 'f'): 3}

    def __init__(self, analyzer_db_path):
        self.db = CalimaStarDB(analyzer_db_path)
        # Use the instances created here; the bare names `db` / `analyzer`
        # only resolve if same-named globals happen to exist in the kernel.
        self.analyzer = CalimaStarAnalyzer(self.db, cache_size=46000)
        self.disambiguator = MLEDisambiguator(self.analyzer)
        self.w_to_features = {}

    def featurize(self, sentence):
        """Compute and cache the feature vector of every new word in `sentence`.

        Args:
            - sentence (str): a sentence in Arabic; it is split on single spaces
        Returns:
            - None. The word -> vector mapping is stored in self.w_to_features,
              each vector being a list of four 0/1 ints laid out as
              [lex+m lex+f spvar+m spvar+f]. Words without analyses get all zeros.
        """
        # using the MLEDisambiguator to get the analyses
        # NOTE(review): top=0 presumably asks for all analyses rather than the
        # single best one -- confirm against the camel_tools version in use.
        disambiguations = self.disambiguator.disambiguate(sentence.split(' '), top=0)
        # disambiguations is a list of DisambiguatedWord objects
        # each DisambiguatedWord object is a tuple of: (word, scored_analyses)
        # scored_analyses is a list of ScoredAnalysis objects
        # each ScoredAnalysis object is a tuple of: (score, analysis)

        for word, scored_analyses in disambiguations:
            if word in self.w_to_features:
                # already featurized by an earlier sentence
                continue

            features = [0, 0, 0, 0]
            for _score, analysis in scored_analyses or ():
                slot = self._FEATURE_INDEX.get((analysis['source'], analysis['gen']))
                # other source/gender combinations contribute nothing
                if slot is not None:
                    features[slot] = 1

            self.w_to_features[word] = features

    def featurize_sentences(self, sentences):
        """Run featurize over every sentence in `sentences`."""
        for sentence in sentences:
            self.featurize(sentence)

    def to_serializable(self):
        """Return the cached features wrapped in a JSON-friendly dict."""
        return {'morph_features': self.w_to_features}

    def from_serializable(self, contents):
        """Replace the cached features with those stored in `contents`."""
        self.w_to_features = contents['morph_features']

    def save_morph_features(self, path):
        """Write the cached features to `path` as UTF-8 JSON."""
        with open(path, mode='w', encoding='utf8') as f:
            return json.dump(self.to_serializable(), f, ensure_ascii=False)

    def load_morph_features(self, path):
        """Load cached features previously written by save_morph_features."""
        # The file is written as UTF-8 with raw (non-escaped) Arabic keys, so it
        # must be read as UTF-8 regardless of the platform's default encoding.
        with open(path, encoding='utf8') as f:
            return self.from_serializable(json.load(f))

In [362]:
class Vectorizer:
    """Turns source/target strings into index tensors using four vocabularies."""

    def __init__(self, src_vocab_char, trg_vocab_char, src_vocab_word, trg_vocab_word):
        """Store the character-level (``*_vocab_char``) and word-level
        (``*_vocab_word``) vocabularies for the source and target sides."""
        self.src_vocab_char = src_vocab_char
        self.trg_vocab_char = trg_vocab_char
        self.src_vocab_word = src_vocab_word
        self.trg_vocab_word = trg_vocab_word

    @classmethod
    def create_vectorizer(cls, data_examples):
        """Build the four vocabularies from `data_examples` and wrap them.

        Every example must expose `src` and `trg` strings. Whitespace runs are
        kept as tokens of their own so the text can be reconstructed.
        """
        src_chars, trg_chars = SeqVocabulary(), SeqVocabulary()
        src_words, trg_words = SeqVocabulary(), SeqVocabulary()

        for example in data_examples:
            sides = ((example.src, src_words, src_chars),
                     (example.trg, trg_words, trg_chars))
            for text, word_vocab, char_vocab in sides:
                # the capturing group keeps the separators in the result
                for piece in re.split(r'(\s+)', text):
                    word_vocab.add_token(piece)
                    char_vocab.add_many(list(piece))

        return cls(src_chars, trg_chars, src_words, trg_words)

    def get_src_indices(self, seq):
        """
        Args:
          - seq (str): The src sequence

        Returns:
          - char_level_indices (list): <s> + one index per character + </s>
          - word_level_indices (list): <s> + for every character, the index of
            the word (or whitespace run) containing it + </s>

        Both lists always have the same length.
        """
        char_ids = [self.src_vocab_char.sos_idx]
        word_ids = [self.src_vocab_word.sos_idx]

        for piece in re.split(r'(\s+)', seq):
            char_ids.extend(self.src_vocab_char.lookup_token(ch) for ch in piece)
            if piece:
                # repeat the word index once per character to stay aligned
                word_ids.extend([self.src_vocab_word.lookup_token(piece)] * len(piece))

        word_ids.append(self.src_vocab_word.eos_idx)
        char_ids.append(self.src_vocab_char.eos_idx)

        assert len(word_ids) == len(char_ids)
        return char_ids, word_ids

    def get_trg_indices(self, seq):
        """
        Args:
          - seq (str): The trg sequence

        Returns:
          - trg_x_indices (list): <s> + character indices (decoder input)
          - trg_y_indices (list): character indices + </s> (decoder target)
        """
        body = list(map(self.trg_vocab_char.lookup_token, seq))
        return ([self.trg_vocab_char.sos_idx, *body],
                [*body, self.trg_vocab_char.eos_idx])

    def vectorize(self, src, trg):
        """
        Args:
          - src (str): The src sequence
          - trg (str): The trg sequence
        Returns:
          - dict of torch.long tensors with keys
            'src_char', 'src_word', 'trg_x' and 'trg_y'
        """
        src_char_ids, src_word_ids = self.get_src_indices(src)
        trg_x_ids, trg_y_ids = self.get_trg_indices(trg)

        fields = (('src_char', src_char_ids),
                  ('src_word', src_word_ids),
                  ('trg_x', trg_x_ids),
                  ('trg_y', trg_y_ids))
        return {name: torch.tensor(ids, dtype=torch.long) for name, ids in fields}

    def to_serializable(self):
        """Return the four vocabularies in their serializable form."""
        names = ('src_vocab_char', 'trg_vocab_char',
                 'src_vocab_word', 'trg_vocab_word')
        return {name: getattr(self, name).to_serializable() for name in names}

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by to_serializable."""
        names = ('src_vocab_char', 'trg_vocab_char',
                 'src_vocab_word', 'trg_vocab_word')
        return cls(*(SeqVocabulary.from_serializable(contents[name])
                     for name in names))

In [363]:
class MT_Dataset(Dataset):
    """PyTorch dataset over the train/dev/test examples of a raw dataset.

    Only one split is active at a time (initially 'train'); indexing and
    len() operate on the active split.
    """

    def __init__(self, raw_dataset, vectorizer):
        self.vectorizer = vectorizer
        self.train_examples = raw_dataset.train_examples
        self.dev_examples = raw_dataset.dev_examples
        self.test_examples = raw_dataset.test_examples
        self.lookup_split = dict(train=self.train_examples,
                                 dev=self.dev_examples,
                                 test=self.test_examples)
        self.set_split('train')

    def get_vectorizer(self):
        """Return the vectorizer used to encode examples."""
        return self.vectorizer

    @classmethod
    def load_data_and_create_vectorizer(cls, data_dir):
        """Read the data in `data_dir` and fit a new vectorizer on it."""
        raw = RawDataset(data_dir)
        # The vocabularies are always built from the train split only.
        fitted = Vectorizer.create_vectorizer(raw.train_examples)
        return cls(raw, fitted)

    @classmethod
    def load_data_and_load_vectorizer(cls, data_dir, vec_path):
        """Read the data in `data_dir` and restore a vectorizer saved at `vec_path`."""
        raw = RawDataset(data_dir)
        restored = cls.load_vectorizer(vec_path)
        return cls(raw, restored)

    @staticmethod
    def load_vectorizer(vec_path):
        """Restore a vectorizer from the JSON file at `vec_path`."""
        with open(vec_path) as f:
            contents = json.load(f)
            return Vectorizer.from_serializable(contents)

    def save_vectorizer(self, vec_path):
        """Write the vectorizer to `vec_path` as JSON."""
        with open(vec_path, 'w') as f:
            serialized = self.vectorizer.to_serializable()
            return json.dump(serialized, f)

    def set_split(self, split):
        """Activate `split` ('train', 'dev' or 'test') and return its examples."""
        self.split = split
        self.split_examples = self.lookup_split[split]
        return self.split_examples

    def __getitem__(self, index):
        chosen = self.split_examples[index]
        return self.vectorizer.vectorize(chosen.src, chosen.trg)

    def __len__(self):
        return len(self.split_examples)
    
    
class Collator:
    """Collate function that pads a list of vectorized examples into a batch.

    Args:
        - src_pad_idx (int): padding index for the source character sequences
        - trg_pad_idx (int): padding index for the target sequences
        - src_word_pad_idx (int, optional): padding index for the source
          word-level sequences. Defaults to `src_pad_idx`, which is only
          correct when the char and word vocabularies share a pad index.
    """
    def __init__(self, src_pad_idx, trg_pad_idx, src_word_pad_idx=None):
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        # Word-level ids come from a different vocabulary than char-level ids,
        # so they get their own pad index when one is supplied.
        self.src_word_pad_idx = src_pad_idx if src_word_pad_idx is None else src_word_pad_idx

    def __call__(self, batch):
        """Pad `batch` (a list of dicts with 'src_char', 'src_word', 'trg_x'
        and 'trg_y' 1-D tensors) into batch-first tensors.

        Returns a dict with the four padded tensors plus 'src_lengths', the
        unpadded source lengths as a torch.long tensor.
        """
        # Sorting the batch by src seqs length in descending order, which is
        # what pack_padded_sequence expects by default (enforce_sorted=True).
        sorted_batch = sorted(batch, key=lambda x: x['src_char'].shape[0], reverse=True)

        src_char_seqs = [x['src_char'] for x in sorted_batch]
        src_word_seqs = [x['src_word'] for x in sorted_batch]
        trg_x_seqs = [x['trg_x'] for x in sorted_batch]
        trg_y_seqs = [x['trg_y'] for x in sorted_batch]
        lengths = torch.tensor([len(seq) for seq in src_char_seqs], dtype=torch.long)

        return {'src_char': pad_sequence(src_char_seqs, batch_first=True,
                                         padding_value=self.src_pad_idx),
                'src_word': pad_sequence(src_word_seqs, batch_first=True,
                                         padding_value=self.src_word_pad_idx),
                'trg_x': pad_sequence(trg_x_seqs, batch_first=True,
                                      padding_value=self.trg_pad_idx),
                'trg_y': pad_sequence(trg_y_seqs, batch_first=True,
                                      padding_value=self.trg_pad_idx),
                'src_lengths': lengths}

In [445]:
def make_morph_embeddings(morph_featurizer, morph_feature_path, word_vocab):
    """Build a (len(word_vocab), 4) matrix of morphological features.

    The features are (re)loaded from `morph_feature_path` into
    `morph_featurizer`. Row i of the result holds the feature vector of the
    word whose index is i; words without stored features keep an all-zero row.
    """
    morph_featurizer.load_morph_features(morph_feature_path)
    features_by_word = morph_featurizer.w_to_features

    # Special tokens and whitespace are not expected to have stored features,
    # so their rows simply stay at zero.
    embedding = torch.zeros((len(word_vocab), 4))
    for token in word_vocab.token_to_idx:
        if token not in features_by_word:
            continue
        row = word_vocab.lookup_token(token)
        embedding[row] = torch.tensor(features_by_word[token], dtype=torch.float64)
    return embedding

In [468]:
class Encoder(nn.Module):
    """Encoder bi-GRU over character embeddings concatenated with fixed
    per-word morphological feature vectors.

    Args:
        - input_dim (int): size of the source character vocabulary
        - embed_dim (int): size of the learned character embeddings
        - hidd_dim (int): GRU hidden size per direction
        - morph_embedding (FloatTensor): [word_vocab_size, morph_dim] matrix
          of pretrained morphological features; loaded as a frozen embedding
          (from_pretrained default)
        - char_padding_idx (int): padding index of the character vocabulary
        - word_padding_idx (int): padding index of the word vocabulary
    """
    def __init__(self, input_dim, embed_dim,
                 hidd_dim, morph_embedding,
                 char_padding_idx=0, word_padding_idx=0):

        super(Encoder, self).__init__()
        self.char_embedding_layer = nn.Embedding(input_dim, embed_dim, padding_idx=char_padding_idx)
        self.morph_embedding_layer = nn.Embedding.from_pretrained(morph_embedding, padding_idx=word_padding_idx)
        # Take the feature width from the matrix itself rather than assuming 4,
        # so the GRU input size always matches what forward() concatenates.
        morph_dim = morph_embedding.shape[1]
        self.rnn = nn.GRU(embed_dim + morph_dim, hidd_dim, batch_first=True, bidirectional=True)

    def forward(self, char_src_seqs, word_src_seqs, src_seqs_lengths):
        """
        Args:
            - char_src_seqs: [batch_size, max_src_seq_len] character indices
            - word_src_seqs: [batch_size, max_src_seq_len] word indices,
              aligned position-by-position with char_src_seqs
            - src_seqs_lengths: unpadded lengths, sorted in descending order
        Returns:
            - output: [batch_size, src_seqs_length, num_dirs * hidd_dim]
            - hidd: [batch_size, num_layers * num_dirs * hidd_dim]
        """
        embedded_char_seqs = self.char_embedding_layer(char_src_seqs)
        # embedded_char_seqs shape: [batch_size, max_src_seq_len, embed_dim]

        embedded_word_seqs = self.morph_embedding_layer(word_src_seqs)
        # embedded_word_seqs shape: [batch_size, max_src_seq_len, morph_dim]

        embedded_seqs = torch.cat((embedded_char_seqs, embedded_word_seqs), dim=2)
        # embedded_seqs shape: [batch_size, max_src_seq_len, embed_dim + morph_dim]

        # packing the embedded_seqs
        packed_embedded_seqs = pack_padded_sequence(embedded_seqs, src_seqs_lengths, batch_first=True)

        output, hidd = self.rnn(packed_embedded_seqs)
        # hidd shape: [num_layers * num_dirs, batch_size, hidd_dim]

        # changing hidd shape to: [batch_size, num_layers * num_dirs, hidd_dim]
        hidd = hidd.permute(1, 0, 2)

        # changing hidd shape to: [batch_size, num_layers * num_dirs * hidd_dim]
        hidd = hidd.contiguous().view(hidd.shape[0], -1)

        # unpacking the output
        output, lengths = pad_packed_sequence(output, batch_first=True)
        # output shape: [batch_size, src_seqs_length, num_dirs * hidd_dim]
        return output, hidd

In [458]:
# Build the featurizer from the analyzer database file.
# NOTE(review): absolute, user-specific path -- adjust for your environment.
morph_featurizer = MorphFeaturizer('/home/ba63/databases/calima-msa/calima-msa.0.2.2.utf8.db')

In [459]:
# Populate morph_featurizer.w_to_features from the saved JSON file.
morph_featurizer.load_morph_features('/home/ba63/databases/morph_features.json')

In [460]:
# Read all splits from disk and build the vocabularies from the train split.
dataset = MT_Dataset.load_data_and_create_vectorizer('/home/ba63/gender-bias/data/christine_2019/'\
                                                     'Arabic-parallel-gender-corpus')

In [461]:
# Keep a handle on the vectorizer to reach its vocabularies below.
vectorizer = dataset.get_vectorizer()

In [462]:
# One row of morphological features per source word index.
# make_morph_embeddings reloads the features from morph_feature_path itself,
# so it does not depend on the earlier load_morph_features call.
matrix = make_morph_embeddings(morph_featurizer, morph_feature_path='/home/ba63/databases/morph_features.json',
                             word_vocab=vectorizer.src_vocab_word)

In [463]:
# Peek at the first ten rows of the feature matrix.
matrix[0:10]

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 0.],
        [1., 0., 1., 0.],
        [1., 1., 0., 0.],
        [0., 0., 0., 0.],
        [1., 0., 0., 0.]])

In [469]:
# Encoder hyper-parameters; vocabulary-dependent values come from the vectorizer.
ENCODER_INPUT_DIM = len(vectorizer.src_vocab_char)  # size of the source character vocabulary
embed_dim = 10  # character embedding size
hidd_dim = 64  # GRU hidden size per direction
CHAR_PADDING_IDX = vectorizer.src_vocab_char.pad_idx
WORD_PADDING_IDX = vectorizer.src_vocab_word.pad_idx

In [470]:
# Use the padding indices read from the vocabularies (defined in the cell
# above) instead of hard-coded zeros, so the encoder stays correct if the
# vocabularies ever assign a different index to the pad token.
encoder = Encoder(input_dim=ENCODER_INPUT_DIM,
                  embed_dim=embed_dim,
                  hidd_dim=hidd_dim,
                  morph_embedding=matrix,
                  char_padding_idx=CHAR_PADDING_IDX,
                  word_padding_idx=WORD_PADDING_IDX)

In [471]:
# Pad each batch with the pad indices of the char-level vocabularies.
collator = Collator(src_pad_idx=vectorizer.src_vocab_char.pad_idx, 
                    trg_pad_idx=vectorizer.trg_vocab_char.pad_idx)
# The dataset's active split is 'train' unless set_split was called.
loader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=collator)

In [472]:
# Smoke test: run only the first batch through the encoder, then stop.
for batch in loader:
    src_char = batch['src_char']
    src_word = batch['src_word']
    lengths = batch['src_lengths']
    output, hidd = encoder(src_char, src_word, lengths)
    break

In [None]:
class Encoder(nn.Module):
    """Character-only bidirectional GRU encoder.

    NOTE: this class has the same name as the morphology-aware Encoder
    defined earlier in the notebook; running this cell rebinds `Encoder`.
    """

    def __init__(self, input_dim, embed_dim,
                 hidd_dim, padding_idx=0):
        super().__init__()
        self.embedding_layer = nn.Embedding(input_dim, embed_dim, padding_idx=padding_idx)
        self.rnn = nn.GRU(embed_dim, hidd_dim, batch_first=True, bidirectional=True)

    def forward(self, src_seqs, src_seqs_lengths):
        """Encode a padded batch of index sequences.

        Returns:
            - output: [batch_size, src_seqs_length, num_dirs * hidd_dim]
            - hidd: [batch_size, num_layers * num_dirs * hidd_dim]
        """
        # [batch_size, max_src_seq_len] -> [batch_size, max_src_seq_len, embed_dim],
        # then packed so the GRU skips the padded positions
        packed_input = pack_padded_sequence(self.embedding_layer(src_seqs),
                                            src_seqs_lengths,
                                            batch_first=True)

        packed_output, final_hidden = self.rnn(packed_input)
        # final_hidden: [num_layers * num_dirs, batch_size, hidd_dim]

        # move batch first, then merge the layer/direction axis into the features
        batch_size = final_hidden.shape[1]
        final_hidden = final_hidden.transpose(0, 1).reshape(batch_size, -1)

        # back to a padded tensor; the returned lengths are not needed
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        return output, final_hidden

In [95]:
# Reload the data and rebuild the vectorizer from the train split
# (same call as earlier in the notebook; rebinds `dataset`).
dataset = MT_Dataset.load_data_and_create_vectorizer('/home/ba63/gender-bias/data/christine_2019/'\
                                                     'Arabic-parallel-gender-corpus')

In [96]:
# Vectorized form of the first example of the active split.
# NOTE(review): the output recorded below has a single 'src' key, whereas
# Vectorizer.vectorize as defined in this notebook returns 'src_char' and
# 'src_word' -- the recorded output appears to come from an older run.
dataset[0]

{'src': tensor([ 2,  4,  5,  6,  7,  8,  9, 10,  4, 11, 10, 12,  5, 13, 14, 10, 15, 10,
         16, 14, 11, 12, 17, 14, 18, 10, 19, 11,  8,  9, 10, 15,  3]),
 'trg_x': tensor([ 2,  4,  5,  6,  7,  8,  9, 10,  4, 11, 10, 12,  5, 13, 14, 10, 15, 10,
         16, 14, 11, 12, 17, 14, 18, 10, 19, 11,  8,  9, 10, 15]),
 'trg_y': tensor([ 4,  5,  6,  7,  8,  9, 10,  4, 11, 10, 12,  5, 13, 14, 10, 15, 10, 16,
         14, 11, 12, 17, 14, 18, 10, 19, 11,  8,  9, 10, 15,  3])}

In [168]:
# Toy example: a plain sentence and two empty base vocabularies
# (no special tokens, no unk fallback).
sentence = 'my name is bashar'
char_vocab = Vocabulary()
word_vocab = Vocabulary()

In [169]:
# Fill both vocabularies from the space-separated words; the space character
# itself is therefore never added to char_vocab.
for w in sentence.split(' '):
    word_vocab.add_token(w)
    char_vocab.add_many(list(w))

In [171]:
# Inspect the character -> index mapping.
char_vocab.token_to_idx

{'m': 0,
 'y': 1,
 'n': 2,
 'a': 3,
 'e': 4,
 'i': 5,
 's': 6,
 'b': 7,
 'h': 8,
 'r': 9}

In [116]:
# char_vocab was built from whitespace-split words, so it has no entry for
# ' '. Register it first (add_token is a no-op if it already exists);
# otherwise the per-character lookup below raises KeyError on the first space.
char_vocab.add_token(' ')

# Shape (1, number of characters in the sentence, spaces included).
vectorized_char_seq = torch.tensor([[char_vocab.lookup_token(t) for t in sentence]])
# Shape (1, number of words).
vectorized_word_seq = torch.tensor([[word_vocab.lookup_token(t) for t in sentence.split(' ')]])

In [117]:
# Randomly initialised 10-dimensional embeddings for words and characters.
word_embedding = nn.Embedding(len(word_vocab), 10)
char_embedding = nn.Embedding(len(char_vocab), 10)

In [118]:
# Look up the embeddings; each result gains a trailing axis of size 10.
embedded_word = word_embedding(vectorized_word_seq)
embedded_char = char_embedding(vectorized_char_seq)

In [119]:
# (batch, words, embedding size)
embedded_word.shape

torch.Size([1, 4, 10])

In [128]:
# Flatten the word and embedding axes into one: (1, 4, 10) -> (1, 40).
embedded_word = embedded_word.view(embedded_word.shape[0], -1)

In [130]:
# Shape after flattening.
embedded_word.shape

torch.Size([1, 40])

In [120]:
# (batch, characters, embedding size)
embedded_char.shape

torch.Size([1, 17, 10])

In [124]:
# NOTE(review): torch.tensor() needs a data argument, so this cell raises a
# TypeError as written. It looks like an unfinished scratch cell -- complete
# it or remove it so the notebook survives Restart & Run All.
words = [torch.tensor()]

Running a check on the model before training.
Sentences:
everybody eat the food . I kept looking out the window , trying to find the one I was waiting for .


TypeError: 'function' object is not subscriptable