In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import math
import copy
import spacy
import pathlib
from pathlib import Path
import csv

from funnel_transformer_conan import *
from data_loader import *

from torch.utils.tensorboard import SummaryWriter
# TensorBoard's default `log_dir` is "runs"; write to an experiment-specific subdirectory instead.
writer = SummaryWriter(log_dir='runs/funnel_experiment2')
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

## Train/test data split

In [None]:
class EnFrDataset2(Dataset):
    """English-French dataset helper that can split the full CSV into train/test CSV files.

    Constructing an instance builds the two spaCy tokenizers and immediately
    calls :meth:`process`. Tokenized pairs are not built by this class, so
    ``data_pairs`` starts out empty and ``len(dataset)`` is 0.
    """

    def __init__(self, Generate_train_test_split: bool, train_fraction: float = 0.9):
        """Store the configuration, create the tokenizers and run :meth:`process`.

        Args:
            Generate_train_test_split (bool): When True, read the full CSV and
                write the training and testing CSV files.
            train_fraction (float, optional): Fraction of rows, taken from the
                top of the full CSV, written to the training file; the
                remaining rows go to the testing file. Defaults to 0.9.

        Raises:
            ValueError: If ``train_fraction`` is not strictly between 0 and 1.
            FileNotFoundError: If the full dataset CSV does not exist.
        """
        if not 0.0 < train_fraction < 1.0:
            raise ValueError(f"train_fraction must be strictly between 0 and 1, got {train_fraction!r}")

        self.Generate_train_test_split = Generate_train_test_split
        self.train_fraction = train_fraction
        self.full_dataset_path = Path("data/en-fr.csv")
        self.abridged_dataset_path = Path("data/en-fr-training.csv")
        self.testing_dataset_path = Path("data/en-fr-testing.csv")

        # __getitem__/__len__ read this attribute; initialise it so they do not
        # fail with AttributeError while no tokenized pairs have been loaded.
        self.data_pairs = []

        self.en_tokenizer = get_tokenizer(tokenizer='spacy',language='en_core_web_sm')
        self.fr_tokenizer = get_tokenizer(tokenizer='spacy',language='fr_core_news_sm')

        self.process()

    def process(self):
        """Check that the full dataset exists and, if requested, write the train/test split.

        The first ``int(len(full) * train_fraction)`` rows are written to
        ``abridged_dataset_path`` and the remaining rows to
        ``testing_dataset_path``. Rows are not shuffled.

        Raises:
            FileNotFoundError: If ``full_dataset_path`` does not exist.
        """
        self.full_dataset_path.parent.mkdir(parents=True, exist_ok=True) # make datafolder if it doesn't exist

        # Check if the full dataset exists
        if not self.full_dataset_path.exists():
            raise FileNotFoundError("The full dataset does not exist. Please download it from https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset/data and place in the /data folder.")

        if not self.Generate_train_test_split:
            return

        print("Creating abridged dataset...")
        print("reading full dataset...")
        # keep_default_na=False keeps empty cells and literal strings such as
        # "NA" or "null" as text; with the default they are parsed as NaN and
        # the original sentence is lost when the split is written back out.
        full_dataset = pd.read_csv(self.full_dataset_path, keep_default_na=False)
        total_len = len(full_dataset)
        print(total_len)
        training_len = int(total_len * self.train_fraction)
        print((training_len))
        testing_len = int(total_len - training_len)
        print((testing_len))

        print("Creating abridged back dataset train...")
        training_rows = full_dataset.iloc[:training_len]
        print("Creating abridged back dataset test...")
        testing_rows = full_dataset.iloc[training_len:]
        del full_dataset

        print("Creating abridged dataset...")
        for out_path, rows in (
            (self.abridged_dataset_path, training_rows),
            (self.testing_dataset_path, testing_rows),
        ):
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)
            rows.to_csv(out_path, index=False)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair from ``data_pairs`` as two tensors."""
        return torch.tensor(self.data_pairs[idx][0]), torch.tensor(self.data_pairs[idx][1])

    def __len__(self):
        """Return the number of entries in ``data_pairs``."""
        return len(self.data_pairs)


In [None]:
# Module-level spaCy tokenizers (English first, then French). string_to_token_list
# looks these two names up as globals.
en_tokenizer, fr_tokenizer = (
    get_tokenizer(tokenizer='spacy', language=spacy_model)
    for spacy_model in ('en_core_web_sm', 'fr_core_news_sm')
)

In [None]:
# Constructing the dataset runs process(); with Generate_train_test_split=True that
# writes the train/test CSV split to disk. The instance itself is not used afterwards.
_ = EnFrDataset2(Generate_train_test_split=True)

In [None]:
def read_lang(ds, eng_str="en", fr_str="fr"):
    """Collect the untokenized sentence pairs of a dataframe and create two empty language objects.

    Args:
        ds (pd.DataFrame): Frame holding one English and one French column.
        eng_str (str, optional): Name of the English column. Defaults to "en".
        fr_str (str, optional): Name of the French column. Defaults to "fr".

    Returns:
        tuple: ``(eng_lang, fr_lang, pairs)`` where the first two are freshly
        constructed ``Langs("en")`` / ``Langs("fr")`` objects and ``pairs`` is
        a list of ``(english, french)`` values in row order.

    Raises:
        TypeError: If ``ds`` is not a pandas DataFrame.
    """
    # isinstance also accepts DataFrame subclasses, which an exact type comparison rejects.
    if not isinstance(ds, pd.DataFrame):
        raise TypeError("Wrong dataframe format!")

    print("Reading the dataframe and storing untokenized pairs...")
    # Walk both columns positionally. Looking rows up as ds[col][i] is label-based,
    # so it raises KeyError on a frame whose index is not 0..n-1 (e.g. after
    # filtering) and is much slower on large frames.
    pairs = list(tqdm(zip(ds[eng_str], ds[fr_str]), total=len(ds)))

    # Create the Langs objects for English and French.
    eng_lang = Langs("en")
    fr_lang = Langs("fr")
    return eng_lang, fr_lang, pairs

In [None]:
max_seq_length = 1000


def string_to_token_list(sequence, lang):
    """Convert one sentence into a list of token ids wrapped in SOS/EOS markers.

    The sentence is lower-cased and split with the module-level ``en_tokenizer``
    when ``lang.lang == "en"``, otherwise with ``fr_tokenizer``. Words missing
    from ``lang.word2index`` map to ``CustomTokens.UNK``. The result is never
    longer than ``max_seq_length`` and is not padded.

    Args:
        sequence (str): Sentence to tokenize.
        lang: Language object exposing ``lang`` (language code) and
            ``word2index`` (word -> id lookup).

    Returns:
        list: ``[SOS, id_1, ..., id_n, EOS]``.
    """
    tokenize = en_tokenizer if lang.lang == "en" else fr_tokenizer

    # Keep two slots free for the SOS and EOS markers added around the words.
    kept_words = tokenize(sequence.lower())[:max_seq_length - 2]

    vocab = lang.word2index
    word_ids = [
        vocab[word] if word in vocab else CustomTokens.UNK.value
        for word in kept_words
    ]
    return [CustomTokens.SOS.value, *word_ids, CustomTokens.EOS.value]

def string_data_to_tokens(data, en_lang, fr_lang, filename):
    """Tokenize every sentence pair and write the token ids to two CSV files.

    One row is written per pair: the English ids go to ``filename + "_en.csv"``
    and the French ids to ``filename + "_fr.csv"``. Existing files are overwritten.

    Args:
        data: Indexable collection of ``(english, french)`` sentence pairs.
        en_lang: Language object passed to ``string_to_token_list`` for English.
        fr_lang: Language object passed to ``string_to_token_list`` for French.
        filename (str): Path prefix of the two output files.

    Returns:
        int: Always 1.
    """
    print("Creating tokenized pairs of english and french sentences...")
    en_path = filename + "_en.csv"
    fr_path = filename + "_fr.csv"

    # The csv module requires files to be opened with newline=''; otherwise the
    # writer's "\r\n" row terminator is translated again on Windows and every
    # row is followed by a spurious blank line.
    with open(en_path, 'w', newline='') as en_file, open(fr_path, 'w', newline='') as fr_file:
        en_writer = csv.writer(en_file)
        fr_writer = csv.writer(fr_file)
        for i in tqdm(range(len(data))):
            en_writer.writerow(string_to_token_list(data[i][0], en_lang))
            fr_writer.writerow(string_to_token_list(data[i][1], fr_lang))

    return 1


In [None]:
def data_preprocessing(ds, eng_str="en", fr_str="fr",
                       output_prefix="data/tokenized_test",
                       en_lang_path="data/en_lang_90.pickle",
                       fr_lang_path="data/fr_lang_90.pickle"):
    """Tokenize a dataframe of sentence pairs with pre-built vocabularies and write the ids to CSV.

    The sentence pairs are read from ``ds``, the English and French language
    objects are loaded from pickle files, and the token ids are written to
    ``output_prefix + "_en.csv"`` and ``output_prefix + "_fr.csv"``.

    Args:
        ds (pd.DataFrame): Frame holding the English and French columns.
        eng_str (str, optional): Name of the English column. Defaults to "en".
        fr_str (str, optional): Name of the French column. Defaults to "fr".
        output_prefix (str, optional): Path prefix of the two output CSV files.
            Defaults to "data/tokenized_test".
        en_lang_path (str, optional): Pickle file with the English language
            object. Defaults to "data/en_lang_90.pickle".
        fr_lang_path (str, optional): Pickle file with the French language
            object. Defaults to "data/fr_lang_90.pickle".

    Returns:
        None
    """
    # read_lang also returns two fresh Langs objects; they are not used because
    # the vocabularies are loaded from the pickle files below instead of being
    # rebuilt from the sentences.
    _, _, data_pairs = read_lang(eng_str=eng_str, fr_str=fr_str, ds=ds)

    print("Loading the pickled language dictionaries...")
    # NOTE(security): pickle.load can execute arbitrary code; only load pickle
    # files that were produced by a trusted run of this project.
    with open(en_lang_path, 'rb') as handle:
        en_lang = pickle.load(handle)
    with open(fr_lang_path, 'rb') as handle:
        fr_lang = pickle.load(handle)

    # Writes the token ids straight to disk; nothing is kept in memory.
    string_data_to_tokens(data_pairs, en_lang, fr_lang, output_prefix)
    print("Done Converting")

In [None]:
# keep_default_na=False keeps empty cells and literal strings such as "NA" or "null"
# as text. With the default they are parsed as float NaN, and tokenization then
# fails when string_to_token_list calls `.lower()` on the value.
ds = pd.read_csv("data/en-fr-training.csv", keep_default_na=False)
data_preprocessing(ds, eng_str="en", fr_str="fr")

In [None]:
from data_loader_full import *

# Build the dataset from the tokenized English/French CSV files and the two pickled
# language files.
# NOTE(review): the tokenizing cell above writes data/tokenized_test_*.csv, not
# data/tokenized2_*.csv - confirm these are the intended input files.
test_ds = Test_dataset(
    "data/tokenized2_en.csv",
    "data/tokenized2_fr.csv",
    "data/en_lang_90.pickle",
    "data/fr_lang_90.pickle",
    sequence_length=100,
)

# Batches of 8 in file order (no shuffling), loaded by 6 worker processes.
dataloader = DataLoader(dataset=test_ds, shuffle=False, batch_size=8, num_workers=6)

# f_transformer_test is the transformer

def inference(model, src_data, tgt_data):
    """Greedily decode a batch with ``model`` and return the token tensors.

    Decoding starts from a column of SOS tokens and appends, step by step, the
    highest-scoring token at the last generated position until the output is as
    long as ``tgt_data``.

    Args:
        model: Transformer exposing ``decoder_embedding``, ``positional_encoding``,
            ``generate_mask``, ``pass_through_decoder`` and ``fc``, and whose
            forward call returns a 2-tuple with the encoder output second.
        src_data (torch.Tensor): Source token ids; the first dimension is the batch.
        tgt_data (torch.Tensor): Target token ids; its second dimension sets the
            number of decoded positions.

    Returns:
        tuple: ``(src_data, generated, tgt_data)`` - the source ids, the
        generated ids (shape ``(batch, tgt_len)``, first column is SOS) and
        the ground-truth target ids. Converting ids back to words is left to
        the caller.
    """
    model.eval()
    batch_size = src_data.shape[0]
    out_seq_len = tgt_data.shape[1]

    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        # Start every sequence with SOS, on the same device as the inputs
        # rather than unconditionally on CUDA.
        generated = torch.full(
            (batch_size, 1), CustomTokens.SOS.value, dtype=torch.long, device=src_data.device
        )

        # Full-length decoder input buffer.
        # NOTE(review): if positional_encoding returns embedding + encoding, this
        # buffer already contains the ground-truth target embeddings - confirm
        # that is intended for inference.
        base_encoding = model.positional_encoding(model.decoder_embedding(tgt_data))

        # Decoder mask and encoder output, computed once for the whole batch.
        _, tgt_mask = model.generate_mask(src_data, tgt_data)
        _, encoder_output = model(src_data, tgt_data)

        for _ in range(out_seq_len - 1):
            cur_len = generated.shape[1]
            step_embedding = model.decoder_embedding(generated)

            # Work on a copy: updating the shared buffer in place would add the
            # embeddings of earlier tokens again on every iteration.
            decoder_input = base_encoding.clone()
            decoder_input[:, :cur_len, :] = decoder_input[:, :cur_len, :] + step_embedding

            decoder_output = model.pass_through_decoder(decoder_input, encoder_output, tgt_mask)
            decoder_output = model.fc(decoder_output)

            # Highest-scoring token at the last generated position, shape (batch, 1).
            next_word = decoder_output[:, cur_len - 1, :].argmax(dim=-1, keepdim=True)
            generated = torch.cat([generated, next_word], dim=1)

    return src_data, generated, tgt_data