<a href="https://colab.research.google.com/github/AdamMohsen4/FreightFixer/blob/ml/freightFixer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
from google.colab import files
import io
import torch
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Ask for the dataset through Colab's upload widget; the result is keyed by filename.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError when nothing was uploaded.
    raise ValueError("No file was uploaded. Please upload a CSV or Excel file.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))

# Compare the extension case-insensitively so names such as "DATA.CSV" are accepted.
filename_lower = filename.lower()

if filename_lower.endswith('.csv'):
    df = pd.read_csv(filename)
    print("Loaded CSV file successfully!")

elif filename_lower.endswith(('.xlsx', '.xls')):
    df = pd.read_excel(filename)
    print("Loaded Excel file successfully!")

else:
    raise ValueError("Unsupported file type. Please upload a CSV or Excel file.")

# Last expression of the cell: show the first rows as a sanity check.
df.head()


Using device: cuda


Saving synthetic_shipment_data.csv to synthetic_shipment_data.csv
Loaded CSV file successfully!


Unnamed: 0,clean_name,clean_company,clean_street,clean_postal_code,clean_city,noisy_name,noisy_company,noisy_street,noisy_postal_code,noisy_city
0,Tapio Kukkonen,Lahtinen,Tuulilasintie 41,20100,Turku,TapioKutkkonen,20100.0,Tuulilsintie 41,,Tukru
1,Kai Savolainen,Savolainen Tiainen Oyj,Degermosankuja 109,4300,Tuusula,aKi Savolainen,4300.0,Degecrmosankuaj 109,4300.0,Tuuusla
2,Hilkka Tuominen,Eskola Oyj,Jyväbulevardi 149,4200,Kerava,,4200.0,Jyväbulevardi 149,,eKrava
3,Juhani Huhtala,Mäkinen Oyj,Nils Westermarckin kuja 29,33100,Tampere,Juhani Huhala,33100.0,Nils Wesetrmarckn kuja 29,33100.0,Tampree
4,Pirjo Laaksonen,Partanen Rantala Osk,Runokylänkuja 116,5810,Hyvinkää,Pirjo Laaksonne,,Runoklänkuja 1o16,5810.0,Hyviknää


In [7]:
# Clean and filter data (just city for now)
df = df[['noisy_city', 'clean_city']]
# Rows with missing values are kept (no dropna). Missing entries become empty
# strings; a bare astype(str) would turn them into the literal text "nan".
df = df.fillna('').astype(str)

# Build the character vocabulary from lowercased text: encode() lowercases its
# input, so the vocabulary must cover exactly the characters encode() looks up.
all_text = (''.join(df['noisy_city']) + ''.join(df['clean_city'])).lower()
chars = sorted(set(all_text))
# Indices start at 1 because 0 is reserved for the padding token.
char2idx = {c: i+1 for i, c in enumerate(chars)}
idx2char = {i: c for c, i in char2idx.items()}
vocab_size = len(char2idx) + 1  # +1 for the padding index 0





Convert a string into a list of integers using the character-to-index mapping.
The string is lowercased first for consistency, and any character missing from the mapping is encoded as 0.
If the resulting list is shorter than `max_len`, it is padded with 0s (the padding token).


In [8]:
# Convert text to indexed sequences

def encode(text, char2idx, max_len):
  """Turn a string into a fixed-length list of character indices.

  Args:
    text: String to encode; it is lowercased before lookup.
    char2idx: Mapping from character to integer index. Characters that are
      not in the mapping are encoded as 0.
    max_len: Exact length of the returned list.

  Returns:
    A list of exactly ``max_len`` integers: the encoded characters, cut off
    at ``max_len`` if the text is longer and right-padded with 0 (the padding
    token) if it is shorter.
  """
  # Truncate so every row has the same width; lowercasing can lengthen some
  # strings, so the text may exceed a max_len measured on the raw data.
  encoded = [char2idx.get(c, 0) for c in text.lower()][:max_len]
  return encoded + [0] * (max_len - len(encoded))

# The longest string in each column fixes the padded width of that side.
max_input_len = df['noisy_city'].str.len().max()
max_target_len = df['clean_city'].str.len().max()

# Encode every city name into a padded list of character indices.
inputs = df['noisy_city'].map(
    lambda noisy: encode(noisy, char2idx, max_input_len)
).tolist()
targets = df['clean_city'].map(
    lambda clean: encode(clean, char2idx, max_target_len)
).tolist()

A custom PyTorch `Dataset` class.
It converts the input and target lists into tensors and provides indexed access to each input-target pair.


In [9]:
class CorrectionDataset(Dataset):
  """Dataset of (noisy input, clean target) index sequences held as long tensors."""

  def __init__(self, inputs, targets):
    """Convert both collections of index lists into ``torch.long`` tensors."""
    as_long = dict(dtype=torch.long)
    self.inputs = torch.tensor(inputs, **as_long)
    self.targets = torch.tensor(targets, **as_long)

  def __len__(self):
    """Return the number of stored input rows."""
    return len(self.inputs)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` tensors stored at position ``idx``."""
    noisy, clean = self.inputs[idx], self.targets[idx]
    return noisy, clean

# Wrap the encoded pairs and serve them in shuffled mini-batches of 32.
dataset = CorrectionDataset(inputs=inputs, targets=targets)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)


The Encoder reads the input sequence and encodes it into a context: the final hidden and cell states of its LSTM.
It uses an Embedding layer to convert indices to vector representations,
followed by an LSTM layer to process the sequence.


In [10]:
class Encoder(nn.Module):
  """Embeds a batch of index sequences and runs them through an LSTM.

  Only the LSTM's final hidden and cell states are returned; the per-step
  outputs are discarded.
  """

  def __init__(self, vocab_size, embed_dim, hidden_dim):
    super().__init__()
    # Index 0 is registered as the padding index of the embedding table.
    self.embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embed_dim,
        padding_idx=0,
    )
    # batch_first=True: the batch dimension comes first in the LSTM input.
    self.lstm = nn.LSTM(
        input_size=embed_dim,
        hidden_size=hidden_dim,
        batch_first=True,
    )

  def forward(self, x):
    """Return the ``(hidden, cell)`` state of the LSTM after reading ``x``."""
    token_vectors = self.embedding(x)
    _step_outputs, final_state = self.lstm(token_vectors)
    final_hidden, final_cell = final_state
    return final_hidden, final_cell

The Decoder generates the output sequence from the encoded context produced by the Encoder.
It uses an Embedding layer for its input tokens, an LSTM to process the sequence,
and a fully connected layer to score every vocabulary entry as the next token.
    


In [11]:
class Decoder(nn.Module):
    """Embeds decoder input tokens, advances an LSTM, and scores the vocabulary.

    The LSTM is created with ``batch_first=True`` and the linear layer maps
    each LSTM output vector to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        # Index 0 is registered as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first = True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        """Run the decoder over ``x`` starting from the given LSTM state.

        Args:
            x: Tensor of token indices; the batch dimension comes first.
            hidden: LSTM hidden state to start from.
            cell: LSTM cell state to start from.

        Returns:
            A tuple ``(prediction, hidden, cell)``: the vocabulary scores for
            every position of ``x`` and the updated LSTM state.
        """
        # The first parameter was misspelled "slef", so every reference to
        # ``self`` in this method raised NameError.
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output)
        return prediction, hidden, cell

The Seq2Seq model integrates both the Encoder and the Decoder.
It loops over each time step to generate the output sequence.
During training, teacher forcing can be used: the ground-truth token, rather than the model's own prediction, is fed as the next decoder input.


In [13]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes one token per time step."""

  def __init__(self, encoder, decoder):
    """Store the encoder and decoder sub-modules."""
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  # NOTE: forward must be a method of the class. It was previously nested
  # inside __init__, which left the module without a forward method.
  def forward(self, source, target = None, teacher_forcing_ratio = 0.5, max_len = None):
    """Decode an output sequence for ``source``.

    Args:
      source: Tensor of input token indices with the batch dimension first.
      target: Optional tensor of ground-truth token indices, batch first.
        When given, its second dimension sets the number of decoding steps
        and its tokens may be fed back through teacher forcing.
      teacher_forcing_ratio: Probability, drawn independently at every step,
        of feeding the ground-truth token instead of the model's own
        prediction as the next decoder input. Ignored when ``target`` is None.
      max_len: Number of decoding steps when ``target`` is None. If it is
        also None, the notebook-level ``max_target_len`` is used.

    Returns:
      Tensor of shape ``(batch, steps, vocab)`` holding the decoder's scores
      for every step, on the same device as ``source``.
    """
    batch_size = source.shape[0]
    if target is not None:
      target_len = target.shape[1]
    elif max_len is not None:
      target_len = max_len
    else:
      # Fallback to the global computed from the training data.
      target_len = max_target_len
    target_len = int(target_len)

    # Take the device and output width from the inputs and the decoder
    # rather than from notebook globals.
    run_device = source.device
    num_classes = self.decoder.fc.out_features
    outputs = torch.zeros(batch_size, target_len, num_classes, device = run_device)

    hidden, cell = self.encoder(source)

    # The first decoder input is index 0 for every sequence in the batch.
    x = torch.zeros(batch_size, dtype = torch.long, device = run_device)

    for t in range(target_len):
      # The decoder consumes one token per sequence: shape (batch, 1).
      output, hidden, cell = self.decoder(x.unsqueeze(1), hidden, cell)
      outputs[:, t, :] = output.squeeze(1)

      # Teacher forcing needs a ground-truth target; without one the model
      # always continues from its own greedy prediction.
      use_teacher = target is not None and torch.rand(1).item() < teacher_forcing_ratio
      if use_teacher:
        x = target[:, t]
      else:
        x = output.argmax(dim = 2).squeeze(1)

    return outputs

