<a href="https://colab.research.google.com/github/Alinehbg/CRP/blob/Aline_Trial/Siamese_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import everything
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Read the prepared names table into a DataFrame.
# NOTE(review): the path looks like a mounted Google Drive folder in Colab —
# confirm the drive is mounted before running this cell.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/CRP/Datasets/FINAL_DF.csv",
)

In [None]:
class NamesDataset(Dataset):
    """Word-level dataset of names for training a siamese similarity model.

    Each name is split on whitespace and every word is mapped to an integer
    index. Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``; the
    words found in the data start at index 2.
    """

    def __init__(self, data):
        """Store the names and build the vocabulary and word -> index map.

        Args:
            data: Iterable of name strings (for example a list or a pandas
                Series).
        """
        # Materialise as a list so __getitem__ indexes by position. A sliced
        # pandas Series keeps its original labels, so ``series[0]`` raises
        # KeyError for any slice that does not start at row 0.
        self.data = list(data)
        self.vocab = set()
        for name in self.data:
            self.vocab.update(name.split())

        # Sort before enumerating: set iteration order for strings can change
        # between interpreter runs, which would make the mapping
        # irreproducible.
        self.word2idx = {
            word: i for i, word in enumerate(sorted(self.vocab), start=2)
        }
        self.word2idx['<PAD>'] = 0
        self.word2idx['<UNK>'] = 1

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded name at position ``idx``.

        Args:
            idx: Zero-based position of the name.

        Returns:
            A tuple ``(indices, label)`` where ``indices`` is a LongTensor
            holding one word index per whitespace-separated word (words
            missing from ``word2idx`` map to 1, the ``<UNK>`` index) and
            ``label`` is a FloatTensor containing the constant 1.0.
        """
        name = self.data[idx]
        name_words = name.split()
        name_idx = [self.word2idx.get(word, 1) for word in name_words]
        return torch.LongTensor(name_idx), torch.FloatTensor([1.0])

In [None]:
# Define the LSTM-based siamese model
class SiameseLSTM(nn.Module):
    """Siamese encoder: embedding -> stacked LSTM -> linear projection.

    Both inputs go through the same weights and are compared with cosine
    similarity.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 num_layers=2, padding_idx=None):
        """Build the shared encoder layers.

        Args:
            vocab_size: Number of rows in the embedding table; must be larger
                than the highest word index fed to the model.
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            output_dim: Size of the final projected representation.
            num_layers: Number of stacked LSTM layers (default 2, the value
                that was previously hard-coded).
            padding_idx: Optional index whose embedding is kept at zero and
                not updated; ``None`` (default) disables this.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward_once(self, x):
        """Encode one batch of index sequences.

        Args:
            x: Integer tensor of word indices; the LSTM is batch-first, so the
                layout is (batch, sequence length).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        x_embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(x_embedded)
        # h_n stacks the final hidden state of every layer; keep the top one.
        h_n = h_n[-1]
        x_out = self.fc(h_n)
        return x_out

    def forward(self, x1, x2):
        """Return the cosine similarity between the encodings of x1 and x2.

        Args:
            x1: First batch of index sequences.
            x2: Second batch of index sequences, same batch size as ``x1``.

        Returns:
            Tensor with one similarity value per batch element.
        """
        x1_out = self.forward_once(x1)
        x2_out = self.forward_once(x2)
        cosine_sim = nn.functional.cosine_similarity(x1_out, x2_out, dim=1)
        return cosine_sim

In [None]:
# Hyperparameters and other training settings
BATCH_SIZE = 64       # names per mini-batch
EMBEDDING_DIM = 100   # size of each word embedding vector
HIDDEN_DIM = 128      # size of the LSTM hidden state
# Must be greater than 1: the cosine similarity of two 1-dimensional vectors
# is always +1 or -1 (only the signs matter), so a 1-d output gives the model
# no usable similarity signal or gradient.
OUTPUT_DIM = 64
LR = 0.001            # Adam learning rate
EPOCHS = 1            # number of passes over the training data

In [None]:
# First 80% of the rows are used for training, the remainder for validation.
train_size = int(0.8 * len(df))
# Re-index both slices from 0: the dataset looks names up with data[idx], and
# a sliced Series otherwise keeps its original labels (KeyError at idx 0 for
# the validation slice).
train_data = df['Name'].iloc[:train_size].reset_index(drop=True)
val_data = df['Name'].iloc[train_size:].reset_index(drop=True)

train_dataset = NamesDataset(train_data)
val_dataset = NamesDataset(val_data)
# Validation must be encoded with the training vocabulary so word indices
# match the rows of the model's embedding table; words unseen in training
# then fall back to the <UNK> index.
val_dataset.vocab = train_dataset.vocab
val_dataset.word2idx = train_dataset.word2idx


def pad_collate(batch):
    """Collate (indices, label) samples into batch tensors.

    Names contain different numbers of words, so the default collate cannot
    stack them; sequences are right-padded with 0 (the <PAD> index) to the
    longest sequence in the batch.
    """
    sequences, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(
        list(sequences), batch_first=True, padding_value=0
    )
    return padded, torch.stack(labels)


train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate
)
# Shuffling has no benefit for evaluation; keep the validation order fixed.
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate
)


In [None]:
# Word indices start at 2 (0 = <PAD>, 1 = <UNK>), so the largest index is
# len(vocab) + 1 and the embedding table needs len(vocab) + 2 rows; using
# len(vocab) alone causes an index-out-of-range error in nn.Embedding.
model = SiameseLSTM(len(train_dataset.vocab) + 2, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CosineEmbeddingLoss()

In [None]:
# Train
for epoch in range(EPOCHS):
    train_loss = 0.0
    val_loss = 0.0

    model.train()
    for x1, x2 in train_loader:
        optimizer.zero_grad()

        output = model(x1, x2)
        labels = x2.view(-1, 1)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * x1.size(0)

    model