<a href="https://colab.research.google.com/github/AdamMohsen4/FreightFixer/blob/ml/freightFixer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
from google.colab import files
import io
import torch
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import random

In [7]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Ask the user for the shipment data file through the Colab upload widget.
uploaded = files.upload()
if not uploaded:
    # An empty result (e.g. a cancelled dialog) would otherwise surface as a
    # bare IndexError on the filename lookup; fail with an actionable message.
    raise ValueError("No file was uploaded. Please upload a CSV or Excel file.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))

# Compare the extension case-insensitively so names such as "DATA.CSV" or
# "Report.XLSX" are accepted; the file itself is still opened by its real name.
filename_lower = filename.lower()

if filename_lower.endswith('.csv'):
    df = pd.read_csv(filename)
    print("Loaded CSV file successfully!")

elif filename_lower.endswith('.xlsx') or filename_lower.endswith('.xls'):
    df = pd.read_excel(filename)
    print("Loaded Excel file successfully!")

else:
    raise ValueError("Unsupported file type. Please upload a CSV or Excel file.")

# Preview the first rows as the cell's rich output.
df.head()


Using device: cuda


Saving synthetic_shipment_data.csv to synthetic_shipment_data.csv
Loaded CSV file successfully!


Unnamed: 0,clean_name,clean_company,clean_street,clean_postal_code,clean_city,noisy_name,noisy_company,noisy_street,noisy_postal_code,noisy_city
0,Elisa Ruotsalainen,Saari Oyj,Orisaarenbulevardi 189,4300,Tuusula,,4300.0,Oirsaarenybulevardi 189,1300.0,Tuuslua
1,Juha Ojanen,Laitinen Kivinen Oyj,Agricolanbulevardi 45,80100,Joensuu,Juha Ojanen,80100.0,Agricolanbuleavrdi 45,,Joensuu
2,Aino Penttinen,Eskelinen Perälä Oyj,Nikonkuja 109,15110,Lahti,Aino Peenttinen,15110.0,Nikonkua 109,,alhti
3,Antero Nieminen,Lampinen Turunen Oy,Ahvenkoskenbulevardi 41,20100,Turku,Antero Nieminen,20100.0,Ahvnekoskenbulevardi 41,20100.0,Tukru
4,Susanna Koskinen,Pöllänen Räsänen Oyj,Maanmittarintie 125,80100,Joensuu,Susanna KosinenX,80100.0,Maanmittarintie 125,80140.0,joensuu


In [8]:
# Clean and filter data (just city for now)
df = df[['noisy_city', 'clean_city']]
# df = df.dropna()
# NOTE(review): with dropna() disabled, missing values become the literal
# string "nan" after astype(str) and are trained on like any other city —
# confirm this is intended.
df = df.astype(str)

# Build character vocabulary
# encode() lowercases every string before looking characters up, so the
# vocabulary is built from lowercased text as well. Building it from the raw
# text left unused uppercase entries in the vocabulary and, worse, silently
# mapped a character to the padding index 0 whenever it occurred in the data
# only in uppercase. Each string is lowercased on its own (not the joined
# text) so the result matches what encode() sees exactly.
all_text = (
    ''.join(city.lower() for city in df['noisy_city'])
    + ''.join(city.lower() for city in df['clean_city'])
)
chars = sorted(set(all_text))

# Index 0 is reserved for padding, so real characters start at 1.
char2idx = {c: i for i, c in enumerate(chars, start=1)}
idx2char = {i: c for c, i in char2idx.items()}
vocab_size = len(char2idx) + 1  # +1 for the padding index 0





`encode` converts a string into a list of integers using the character-to-index mapping.
The string is lowercased first for consistency, and any character missing from the mapping is encoded as 0.
If the resulting list is shorter than `max_len`, it is padded with 0s (the padding token).


In [9]:
# Convert text to indexed sequences

def encode(text, char2idx, max_len):
    """Encode ``text`` as a list of exactly ``max_len`` character indices.

    The text is lowercased, each character is looked up in ``char2idx``
    (characters missing from the mapping become 0), and the result is
    right-padded with 0 up to ``max_len``.

    Text whose lowercased form is longer than ``max_len`` is truncated.
    Without truncation such input produced an over-length row, which breaks
    the fixed-width tensors built from these lists. This can happen for
    unseen input, and also because lowercasing may lengthen a string
    (e.g. ``"\u0130".lower()`` has two code points).

    Args:
        text: String to encode.
        char2idx: Mapping from character to integer index.
        max_len: Required (non-negative) length of the returned list.

    Returns:
        A list of ``max_len`` integers.
    """
    # Truncate after lowercasing so the length check sees the final characters.
    encoded = [char2idx.get(c, 0) for c in text.lower()[:max_len]]
    return encoded + [0] * (max_len - len(encoded))

# Measure lengths on the lowercased strings, because that is the text encode()
# actually indexes; lowercasing can change a string's length, and a row longer
# than the computed maximum would not be padded to a uniform width.
# Using the built-in max() also yields plain Python ints (instead of numpy
# scalars) and a well-defined 0 for an empty frame.
max_input_len = max((len(city.lower()) for city in df['noisy_city']), default=0)
max_target_len = max((len(city.lower()) for city in df['clean_city']), default=0)

# Fixed-width index sequences for the noisy inputs and their clean targets.
inputs = [encode(city, char2idx, max_input_len) for city in df['noisy_city']]
targets = [encode(city, char2idx, max_target_len) for city in df['clean_city']]

A custom PyTorch Dataset class.
    It provides easy access to each input-target pair and converts the lists into tensors.


In [10]:
class CorrectionDataset(Dataset):
    """Dataset of paired input/target index sequences held as long tensors."""

    def __init__(self, inputs, targets):
        """Convert both collections to ``torch.long`` tensors once, up front."""
        long_kwargs = {"dtype": torch.long}
        self.inputs = torch.tensor(inputs, **long_kwargs)
        self.targets = torch.tensor(targets, **long_kwargs)

    def __len__(self):
        """Number of samples, taken from the input tensor's first dimension."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors stored at position ``idx``."""
        sample_input = self.inputs[idx]
        sample_target = self.targets[idx]
        return sample_input, sample_target

# Wrap the encoded pairs and serve them as shuffled mini-batches of 32.
dataset = CorrectionDataset(inputs=inputs, targets=targets)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)


 The Encoder reads the input sequence and encodes it into a context vector.
    It uses an Embedding layer to convert indices to vector representations,
    followed by an LSTM layer to process the sequence.


In [11]:
class Encoder(nn.Module):
    """Embed an index sequence and run it through an LSTM.

    Only the LSTM's final hidden and cell states are returned; the
    per-step outputs are discarded.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        # Index 0 is registered as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )

    def forward(self, x):
        """Return ``(hidden, cell)`` produced by the LSTM for index tensor ``x``."""
        _step_outputs, final_state = self.lstm(self.embedding(x))
        hidden, cell = final_state
        return hidden, cell

 The Decoder generates the output sequence based on the encoded context from the Encoder.
    It also uses an Embedding layer for input tokens, an LSTM to process the sequence,
    and a Fully Connected layer to predict the next token.
    


In [12]:
class Decoder(nn.Module):
    """Embed input tokens, advance an LSTM, and project to vocabulary scores."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        # Index 0 is registered as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        # Maps every LSTM output vector to one score per vocabulary index.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Run the LSTM from state ``(hidden, cell)`` over the embedded ``x``.

        Returns:
            ``(prediction, hidden, cell)``: the projected LSTM outputs and
            the updated LSTM state.
        """
        previous_state = (hidden, cell)
        lstm_out, next_state = self.lstm(self.embedding(x), previous_state)
        next_hidden, next_cell = next_state
        return self.fc(lstm_out), next_hidden, next_cell

 The Seq2Seq model integrates both the Encoder and Decoder.
    It loops over each time step to generate the output sequence.
    Teacher forcing is optionally used during training.


In [13]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one time step at a time.

    The encoder's final LSTM state seeds the decoder, which is then stepped
    ``target_len`` times. During training the next decoder input is, with
    probability ``teacher_forcing_ratio``, the ground-truth token instead of
    the model's own prediction.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target=None, teacher_forcing_ratio=0.5):
        """Decode a batch and return the per-step vocabulary scores.

        Args:
            source: Long tensor of input indices; the first dimension is the batch.
            target: Optional long tensor of target indices. Its second
                dimension sets the number of decoding steps and it supplies
                the teacher-forcing tokens. When ``None``, the module-level
                ``max_target_len`` steps are decoded and only the model's own
                predictions are fed back.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token as the next decoder input.

        Returns:
            Float tensor of shape ``(batch, target_len, vocab)`` on the same
            device as ``source``.
        """
        batch_size = source.shape[0]
        target_len = target.shape[1] if target is not None else max_target_len  # Use max_target_len when target is None

        # Take the device from the input and the output width from the
        # decoder's projection layer instead of the notebook globals `device`
        # and `vocab_size`. A model that lives on a different device than the
        # global one (e.g. a freshly constructed model left on the CPU while
        # the global device is CUDA) previously failed with a device mismatch.
        run_device = source.device
        num_classes = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, target_len, num_classes, device=run_device)

        hidden, cell = self.encoder(source)

        # Start token: the vocabulary in this notebook holds single characters
        # only, so the "<SOS>" lookup falls back to 0 (the padding index)
        # unless such an entry is added to char2idx.
        start_idx = char2idx.get("<SOS>", 0)
        decoder_input = torch.tensor([start_idx] * batch_size, dtype=torch.long, device=run_device)

        for t in range(target_len):
            # The decoder expects (batch, seq) input; feed a length-1 sequence.
            step_scores, hidden, cell = self.decoder(decoder_input.unsqueeze(1), hidden, cell)
            outputs[:, t, :] = step_scores.squeeze(1)

            # Teacher forcing: random.random() is only drawn when a target exists.
            if target is not None and random.random() < teacher_forcing_ratio:
                decoder_input = target[:, t]  # Use ground truth
            else:
                decoder_input = torch.argmax(step_scores, dim=2).squeeze(1)  # Use model prediction

        return outputs

Initialize model and set up training

In [14]:
# Model hyperparameters.
embedding_dim = 128
hidden_dim = 256

# Encoder is built before the decoder (same construction order as before),
# then both are joined and moved to the selected device.
encoder = Encoder(vocab_size=vocab_size, embed_dim=embedding_dim, hidden_dim=hidden_dim)
decoder = Decoder(vocab_size=vocab_size, embed_dim=embedding_dim, hidden_dim=hidden_dim)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

# Targets equal to 0 (the padding index) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 1



for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for input_batch, target_batch in dataloader:
        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)

        optimizer.zero_grad()

        # Targets are passed in so the model can apply teacher forcing
        # (with its default ratio).
        output = model(input_batch, target_batch)

        # Collapse batch and time so every position is scored as one sample.
        flat_scores = output.view(-1, vocab_size)
        flat_targets = target_batch.view(-1)
        loss = criterion(flat_scores, flat_targets)

        loss.backward()   # backpropagate
        optimizer.step()  # apply the parameter update
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

Epoch 1/1, Average Loss: 1.1984


In [20]:
import pickle
import os

In [23]:
# Create the output directory. exist_ok=True makes the call a no-op when the
# directory is already there, replacing the separate exists() check and the
# window between that check and the creation.
os.makedirs('saved_model', exist_ok=True)

# Save the model state together with the hyperparameters needed to rebuild
# the Encoder/Decoder before loading the weights.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'vocab_size': vocab_size,
    'embedding_dim': embedding_dim,
    'hidden_dim': hidden_dim
}, 'saved_model/city_correction_model.pth')

# Save the vocabulary and mappings.
# The lengths are cast to plain ints: they come from a pandas `.max()` and are
# numpy scalars, which would make the pickle depend on numpy at load time.
model_data = {
    'char2idx': char2idx,
    'idx2char': idx2char,
    'max_input_len': int(max_input_len),
    'max_target_len': int(max_target_len)
}

with open('saved_model/model_vocab.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Model and vocabulary saved successfully in 'saved_model' directory!")

Model and vocabulary saved successfully in 'saved_model' directory!


In [22]:
# Test loading the saved model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
checkpoint = torch.load('saved_model/city_correction_model.pth', map_location=device)

# Create new model instance from the hyperparameters stored in the checkpoint
test_encoder = Encoder(checkpoint['vocab_size'], checkpoint['embedding_dim'], checkpoint['hidden_dim'])
test_decoder = Decoder(checkpoint['vocab_size'], checkpoint['embedding_dim'], checkpoint['hidden_dim'])

# Move the model to the same device as the trained one. Seq2Seq.forward
# allocates its tensors on the global `device`, so a model left on the CPU
# would hit a device mismatch on its first forward pass on a GPU runtime.
test_model = Seq2Seq(test_encoder, test_decoder).to(device)

# Load the saved state and switch to inference mode
test_model.load_state_dict(checkpoint['model_state_dict'])
test_model.eval()

print("Model loaded successfully!")

Model loaded successfully!


In [29]:
from google.colab import files

# Trigger a browser download for each saved artifact: the model checkpoint
# first, then the vocabulary pickle.
for artifact_path in (
    'saved_model/city_correction_model.pth',
    'saved_model/model_vocab.pkl',
):
    files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>