### IMPORT LIBRARIES

In [1]:
import pandas as pd
import gensim
import nltk
import numpy as np
import scipy
import sklearn
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from torch.utils.data import Dataset
from os import getcwd
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
from matplotlib import pyplot as plt
from numpy.linalg import norm

The Natural Language Toolkit, or NLTK, is a library in Python that provides tools for working with human language data (text).
It provides easy-to-use interfaces to over 50 corpora and lexical resources.

In [2]:
# The 'stopwords' corpus is a list of common words that are often considered irrelevant when processing natural language data.
# These include words like 'is', 'at', 'which', and 'on'. By default, these words are filtered out during the preprocessing step.

nltk.download('stopwords')

# The 'twitter_samples' corpus contains a set of tweet texts that are often used for training and testing in sentiment analysis.
# This dataset is useful for building and evaluating sentiment analysis models.

nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

Loading Embeddings for the English and French Language

## The data

Download
* English embeddings from Google code archive word2vec
[look for GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/)
    * You'll need to unzip the file first.
* and the French embeddings from
[cross_lingual_text_classification](https://github.com/vjstark/crosslingual_text_classification).
    * in the terminal, type (in one line)
    `curl -o ./wiki.multi.fr.vec https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec`

The two files will be named as 
* `GoogleNews-vectors-negative300.bin`
* `wiki.multi.fr.vec`

These files have been used in the code below.

In [3]:
from gensim.models import KeyedVectors

def load_embeddings():
    """
    Loads English and French word embeddings from pre-trained models.
    
    Returns:
    en_embeddings (gensim.models.keyedvectors.Word2VecKeyedVectors): English word embeddings.
    fr_embeddings (gensim.models.keyedvectors.Word2VecKeyedVectors): French word embeddings.
    """
    en_embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
    fr_embeddings = KeyedVectors.load_word2vec_format('./wiki.multi.fr.vec')
    
    return en_embeddings, fr_embeddings

en_embeddings, fr_embeddings = load_embeddings()


Loading English to French Dictionary

In [4]:
def get_dict(file_name:str):
    """
    This function returns the english to french dictionary given a file where the each column corresponds to a word.
    Check out the files this function takes in your workspace.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    etof = {}  # the english to french dictionary to be returned
    for i in range(len(my_file)):
        # indexing into the rows.
        en = my_file.loc[i][0]
        fr = my_file.loc[i][1]
        etof[en] = fr

    return etof

In [5]:
train_dict = get_dict('./en-fr.train.txt')
test_dict = get_dict('./en-fr.test.txt')

  en = my_file.loc[i][0]
  fr = my_file.loc[i][1]
  en = my_file.loc[i][0]
  fr = my_file.loc[i][1]


In [6]:
def get_matrices(en_fr:dict,en_embeddings,fr_embeddings):
    """
    Get matrices X and Y for training data from English to French translations.

    Parameters:
    en_fr (dict): A dictionary containing English-French translation pairs.
    en_embeddings: The embeddings object for English words.
    fr_embeddings: The embeddings object for French words.

    Returns:
    X (numpy.ndarray): The matrix containing English word vectors.
    Y (numpy.ndarray): The matrix containing French word vectors.
    """
    X = []
    Y = []
    for en,fr in en_fr.items():
        if en_embeddings.has_index_for(en) and fr_embeddings.has_index_for(fr):
            X.append(en_embeddings.get_vector(en))
            Y.append(fr_embeddings.get_vector(fr))
    return np.vstack(X),np.vstack(Y)


In [7]:
X_train,Y_train = get_matrices(train_dict,en_embeddings,fr_embeddings)

In [8]:
class language_dataset(Dataset):
    def __init__(self, X: np.array, Y: np.array):
        """
        A dataset class for language data.

        Args:
            X (np.array): The input data.
            Y (np.array): The target labels.
        """
        self.X = torch.tensor(X)
        self.Y = torch.tensor(Y)
    
    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, index) -> torch.Tensor:
        return self.X[index], self.Y[index]


In [9]:
class TranslatorModel(nn.Module):
    """
    A class representing a translator model.
    """

    def __init__(self):
        super().__init__()
        self.hidden_layer = nn.Sequential(
            nn.Linear(300, 300, bias=False),
        )

    def forward(self, x):
        """
        Performs a forward pass through the translator model.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor.

        """
        return self.hidden_layer(x)

In [10]:
torch.manual_seed(42)
translator = TranslatorModel()

### Hyperparameters

In [11]:
learning_rate = 0.8
epochs = 100

In [12]:
optimizer = torch.optim.SGD(params=translator.parameters(),lr=learning_rate)

In [13]:
loss_function = nn.MSELoss()

In [14]:
train_dataset = language_dataset(X_train,Y_train)
train_dataloader = DataLoader(train_dataset,shuffle=False)

### Train Loop

In [15]:
def train_loop(dataloader, model, loss_function, optimizer):
    """
    Trains the model using the given dataloader, loss function, and optimizer.

    Args:
        dataloader (torch.utils.data.DataLoader): The dataloader containing the training data.
        model (torch.nn.Module): The model to be trained.
        loss_function (torch.nn.Module): The loss function used to compute the training loss.
        optimizer (torch.optim.Optimizer): The optimizer used to update the model's parameters.

    Returns:
        None
    """
    size = len(dataloader.dataset)
    num_batches = len(train_dataloader)
    model.train()
    train_loss = 0
    for batch, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_function(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()            
    train_loss /= num_batches
    print(f"Train loss: {loss.item():>7f}")
    


In [16]:

def k_nearest_neighbours(v,candidates:list,k=1):
    """
    This function returns the k closest neighbours to a vector v from a list of candidates.
    Args:
    v (numpy.ndarray): The input vector.
    candidates (list): A list of vectors.
    k (int): The number of closest neighbours to return.
    Returns:
    list: The indices of the k closest neighbours in the candidates list.
    """
    similarity_score = []
    for c in candidates:
        similarity_score.append(np.dot(v,c)/(norm(v)*norm(c)))
    sorted_ids = np.argsort(similarity_score)
    return sorted_ids[-k:]

### Test Data

In [17]:
X_test,y_test = get_matrices(test_dict,en_embeddings,fr_embeddings)
test_dataset = language_dataset(X_test,y_test)
test_dataloader = DataLoader(test_dataset)

### Test Loop

In [18]:
def test_loop(test_dataloader, model, loss_fn):
    """
    Function to evaluate the performance of a model on a test dataset.

    Parameters:
    - test_dataloader (torch.utils.data.DataLoader): DataLoader for the test dataset.
    - model: The trained model to be evaluated.
    - loss_fn: The loss function used for evaluation.

    Returns:
    - None
    """
    model.eval()
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    test_loss, correct = 0, 0
    with torch.no_grad():
        for index, (X, y) in enumerate(test_dataloader):
            pred = model(X)
            loss = loss_fn(pred, y).item()
            test_loss += loss      
            X_num = pred.numpy()
            for w in X_num:
                if index == k_nearest_neighbours(w, y_test)[0]:
                    correct += 1
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [19]:
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------\n")
    train_loop(train_dataloader,translator,loss_function,optimizer)


Epoch 1
-------------------------------

Train loss: 0.004742
Epoch 2
-------------------------------

Train loss: 0.003021
Epoch 3
-------------------------------

Train loss: 0.002431
Epoch 4
-------------------------------

Train loss: 0.002175
Epoch 5
-------------------------------

Train loss: 0.002048
Epoch 6
-------------------------------

Train loss: 0.001980
Epoch 7
-------------------------------

Train loss: 0.001940
Epoch 8
-------------------------------

Train loss: 0.001916
Epoch 9
-------------------------------

Train loss: 0.001900
Epoch 10
-------------------------------

Train loss: 0.001889
Epoch 11
-------------------------------

Train loss: 0.001881
Epoch 12
-------------------------------

Train loss: 0.001876
Epoch 13
-------------------------------

Train loss: 0.001872
Epoch 14
-------------------------------

Train loss: 0.001868
Epoch 15
-------------------------------

Train loss: 0.001866
Epoch 16
-------------------------------

Train loss: 0.001864
E

In [20]:
test_loop(test_dataloader,translator,loss_function)

Test Error: 
 Accuracy: 55.8%, Avg loss: 0.002189 

