In [5]:
import pandas as pd
import torch
import json
import re
import numpy
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [6]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [90]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [88]:
# Number of samples per mini-batch; read by Dataloader.trainTestDataloader.
BATCH_SIZE=64

#Data Preparation



##Tokenizer

In [64]:
class TokinizerUtil():
  """Vocabulary bookkeeping shared by the tokenizer classes.

  Holds a word -> token-id dictionary that always starts with the reserved
  entries "PAD" (id 0) and "UNK" (id 1), plus helpers to grow the dictionary,
  look entries up in either direction and bring a token list to a fixed length.
  """

  def __init__(self, maxSquanceLength=25, maxVocabalaryLength=10000):
    """
    Args:
      maxSquanceLength: length every list returned by `pad` has.
      maxVocabalaryLength: upper bound on the dictionary size, counting the
        reserved "PAD" and "UNK" entries.
    """
    self.dictonary={"PAD": 0, "UNK": 1}
    self.maxVocabalaryLength=maxVocabalaryLength
    self.maxSquanceLength=maxSquanceLength

  def addWord(self, word:str) -> None:
    """Register `word` under the next free id.

    Does nothing when the word is already known or the dictionary has reached
    `maxVocabalaryLength`.
    """
    if self.maxVocabalaryLength>len(self.dictonary):
      if word not in self.dictonary:
        self.dictonary[word]=len(self.dictonary)

  def findWordsToken(self, vocabalary:dict, word:str) -> int:
    """Return the id stored for `word` in `vocabalary`, or the "UNK" id if absent."""
    if word in vocabalary:
      return vocabalary[word]
    return vocabalary["UNK"]

  def pad(self, message:list[int]) -> list[int]:
    """Return a new list of exactly `maxSquanceLength` token ids.

    Short messages are right-padded with the "PAD" id. Over-long messages are
    truncated; previously they were returned unchanged because a negative
    repeat count silently produced no padding, which broke the fixed-length
    guarantee needed to stack the rows into one tensor.
    """
    fixedLength=message[:self.maxSquanceLength]
    paddingRequired=self.maxSquanceLength-len(fixedLength)

    return fixedLength+[self.dictonary["PAD"]]*paddingRequired

  def findTokensWord(self, vocabalary:dict, token:int) -> str:
    """Return the word stored for `token` in the id -> word mapping `vocabalary`.

    Unknown ids resolve to whatever the mapping holds for the "UNK" id.
    """
    if token in vocabalary:
      return vocabalary[token]
    # Look the fallback up through the dictionary instead of hard-coding id 1.
    return vocabalary[self.dictonary["UNK"]]

In [68]:
class Tokinizer(TokinizerUtil):
  """Space-separated tokenizer that grows its vocabulary while encoding."""

  def encode(self, message:str) -> list[int]:
    """Convert `message` into a fixed-length list of token ids.

    The message is split on single spaces. Each piece is first offered to the
    vocabulary (`addWord`) and then looked up, so a piece that could not be
    added because the vocabulary is full maps to the "UNK" id. At most
    `maxSquanceLength` pieces are consumed; the result is padded by `pad`.
    """
    encodedMessage=[]

    # Split on the literal space only, so the number of pieces equals the
    # number of space-separated fields (empty fields included).
    for word in message.split(" "):
      if len(encodedMessage)>=self.maxSquanceLength:
        break

      self.addWord(word)
      encodedMessage.append(self.findWordsToken(self.dictonary, word))

    return self.pad(encodedMessage)

  def decode(self, encodedMessage:list[int]) -> str:
    """Map token ids back to words.

    Every word, including the last one, is followed by a single space. Ids that
    are not in the vocabulary decode to "UNK".
    """
    decodeHash={v: k for k, v in self.dictonary.items()}

    return "".join(self.findTokensWord(decodeHash, token)+" " for token in encodedMessage)

  # This used to be nested inside `decode` after its `return`, so it was never
  # defined on the class and `len(tokenizer)` raised TypeError.
  def __len__(self):
    """Number of entries in the vocabulary, reserved entries included."""
    return len(self.dictonary)

##Dataset

In [69]:
class DatasetUtil(Dataset):
  """Helpers for turning a flat word/tag table into per-sentence lists."""

  def selectColumns(self, df:pd.DataFrame, *args:str) -> list[list]:
    """Return the requested columns of `df` as plain lists, in the order given."""
    return [list(df[columnName]) for columnName in args]

  def reformatData(self, words:list[str], tags:list[str], tokinizer:"Tokinizer"=None) -> tuple[list[list[str]], list[list[str]]]:
    """Group the flat `words` / `tags` streams into sentences.

    A sentence ends at every word equal to "."; that word and its tag stay in
    the sentence they close. Words left over after the last "." form a final
    sentence instead of being silently discarded.

    Args:
      words: flat list of words.
      tags: tag for each word; indexed in parallel with `words`.
      tokinizer: not used; kept so existing callers keep working.

    Returns:
      (sentences, sentenceTags): two parallel lists of lists.
    """
    resultSentance = []
    resultTag = []

    tempSentence = []
    tempTag = []

    for i, word in enumerate(words):
      tempSentence.append(word)
      tempTag.append(tags[i])

      if word == ".":
        resultSentance.append(tempSentence)
        resultTag.append(tempTag)
        tempSentence = []
        tempTag = []

    # Flush a trailing sentence that has no closing ".".
    if tempSentence:
      resultSentance.append(tempSentence)
      resultTag.append(tempTag)

    return resultSentance, resultTag


In [74]:
class CustomDataset(DatasetUtil):
  """Token-id dataset built from the "Word" and "Tag" columns of a CSV file.

  Item `i` is the pair (sentence token ids, tag token ids); both are rows of
  fixed-length tensors produced by the tokenizers' `encode`.
  """

  def __init__(self, csvFilePath:str):
    """Read `csvFilePath` (latin1-encoded), split it into sentences and encode them."""
    self.wordTokinizer=Tokinizer()
    # Tags get their own vocabulary. They used to be pushed through the word
    # tokenizer (this tokenizer was created but never used), so tags competed
    # with words for the same capped vocabulary and a tag first seen after the
    # cap was reached would have been encoded as "UNK".
    self.tagTokinizer=Tokinizer()

    pandasDf=pd.read_csv(csvFilePath, encoding='latin1')
    words, tags=self.selectColumns(pandasDf, "Word", "Tag")

    resultSentance, resultTag = self.reformatData(words, tags, self.wordTokinizer)

    processedSentance, processedTag = self.encode(
      resultSentance, resultTag, self.wordTokinizer, self.tagTokinizer
    )

    self.sentanceTensor=torch.tensor(processedSentance)
    self.tagTensor=torch.tensor(processedTag)

  def encode(self, resultSentance, resultTag, wordTokinizer, tagTokinizer=None):
    """Encode parallel lists of sentences and tag sequences into token-id lists.

    Args:
      resultSentance: list of sentences, each a list of words.
      resultTag: list of tag sequences, parallel to `resultSentance`.
      wordTokinizer: tokenizer used for the sentences.
      tagTokinizer: tokenizer used for the tags; when omitted the word
        tokenizer is used for both, as before this parameter existed.

    Returns:
      (processedSentance, processedTag): two parallel lists of token-id lists.
    """
    if tagTokinizer is None:
      tagTokinizer=wordTokinizer

    processedSentance=[]
    processedTag=[]
    for sentenceWords, sentenceTags in zip(resultSentance, resultTag):
      # The tokenizers take one space-separated string per sequence.
      processedSentance.append(wordTokinizer.encode(" ".join(sentenceWords)))
      processedTag.append(tagTokinizer.encode(" ".join(sentenceTags)))

    return processedSentance, processedTag

  def __getitem__(self, index):
    """Return (sentence token ids, tag token ids) for sample `index`."""
    return self.sentanceTensor[index], self.tagTensor[index]

  def __len__(self):
    """Number of encoded sentences."""
    return len(self.sentanceTensor)


# Build the dataset from the CSV stored under the mounted Drive folder.
dataset = CustomDataset("/content/drive/MyDrive/ner_dataset.csv")

self.wordTokinizer:  2


##DataLoader

In [71]:
class Dataloader():
  """Randomly splits a dataset in two and wraps each part in a shuffled DataLoader."""

  def __init__(self, dataset, trainSize: float):
    """
    Args:
      dataset: sized, indexable dataset to split.
      trainSize: fraction of the samples that goes to the training part.
    """
    self.trainSize=trainSize
    self.dataset=dataset

  def trainTestDataloader(self):
    """Return (train loader, test loader), both shuffled and batched by BATCH_SIZE."""
    trainLoader, testLoader=(
      DataLoader(part,batch_size=BATCH_SIZE,shuffle=True)
      for part in self.splitDataset()
    )
    return trainLoader, testLoader

  def splitDataset(self):
    """Return (train subset, test subset) drawn with `random_split`.

    The training part holds int(trainSize * len(dataset)) samples; the test
    part holds the remainder.
    """
    total=len(self.dataset)
    trainCount=int(self.trainSize * total)
    parts=torch.utils.data.random_split(self.dataset, [trainCount, total - trainCount])

    return parts[0], parts[1]

#Model

In [130]:
class NERModel(nn.Module):
  def __init__(self, vocabalarySize, hiddenSize):
    super().__init__()
    self.sequential=nn.Sequential(
      nn.Embedding(vocabalarySize, hiddenSize),
      nn.LSTM(hiddenSize, hiddenSize),
    )

    self.output=nn.Sequential(
      nn.ReLU(),
      nn.Linear(hiddenSize, vocabalarySize),
      nn.Softmax(1)
    )
  def forward(self, sentance):
    x,_=self.sequential(sentance)
    return self.output(x).argmax(2)

nerModel=NERModel(len(dataset.wordTokinizer.dictonary), 25)


#Optimizer & Loss

In [131]:
optimizer = torch.optim.Adam(nerModel.parameters(), lr=0.001)
loss=nn.CrossEntropyLoss()

#Training

In [None]:
class Trainig():
  def __init__(self, dataset, epochs, model, loss, optimizer, device):
    self.trainDataloader, self.testDataloader = Dataloader(dataset, .8).trainTestDataloader()

    self.epochs = epochs
    self.model = model
    self.loss = loss
    self.optimizer = optimizer
    self.device=device

    self.trainingLoop()

  def trainingLoop(self):
    for epoch in range(self.epochs):
      trainGenerator=self.train()
      trainLoss, trainAccuracy=self.unpackGenerator(trainGenerator)

      testGenerator=self.test()
      testLoss, testAccuracy=self.unpackGenerator(testGenerator)

      print(f"train acc: {trainAccuracy:.2f}, train loss, {trainLoss:.2f} | test acc: {testAccuracy:.2f}, test loss, {testLoss:.2f}")

  def train(self):
    for input, target in self.trainDataloader:
      prediction = self.model(input)

      loss, accuracy=self.getLossAndAccuracy(prediction, target)
      loss.requires_grad = True
      loss.backward()
      self.optimizer.step()

      yield loss, accuracy


  def test(self):
    for input, target in self.testDataloader:
      prediction = self.model(input)
      loss, accuracy=self.getLossAndAccuracy(prediction, target)

      yield loss, accuracy

  def unpackGenerator(self, generator)->torch.tensor:
    generator=next(iter(generator))
    loss, accuracy=generator[0],generator[1]
    return loss, accuracy

  def getLossAndAccuracy(self,prediction,target)->torch.tensor:
    prediction=prediction.float().to(self.device)
    target=target.float().to(self.device)

    prediction_loss=self.loss(prediction,target)
    prediction_acc=self.accuracy(prediction,target)

    return prediction_loss,prediction_acc

  def accuracy(self,predictions,targets)->torch.tensor:
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match."

    num_correct = (predictions == targets).sum().item()

    total_samples = targets.numel()
    accuracy_value = num_correct / total_samples
    return accuracy_value*100

Trainig(dataset, 1000, nerModel, loss, optimizer, device)

train acc: 0.00, train loss, 2800062.00 | test acc: 0.00, test loss, 3175264.50
train acc: 0.00, train loss, 3013414.50 | test acc: 0.00, test loss, 2709776.25
train acc: 0.00, train loss, 3266611.50 | test acc: 0.00, test loss, 2975950.50
train acc: 0.00, train loss, 3485143.50 | test acc: 0.00, test loss, 3632354.25
train acc: 0.00, train loss, 3148631.00 | test acc: 0.00, test loss, 3368162.75
train acc: 0.00, train loss, 3255262.25 | test acc: 0.00, test loss, 3067940.00
train acc: 0.00, train loss, 2773809.25 | test acc: 0.00, test loss, 3085017.75
train acc: 0.00, train loss, 2994151.00 | test acc: 0.00, test loss, 4192210.75
train acc: 0.00, train loss, 2604864.50 | test acc: 0.00, test loss, 3286033.50
train acc: 0.00, train loss, 3206524.00 | test acc: 0.00, test loss, 3260446.50
train acc: 0.00, train loss, 3453101.50 | test acc: 0.00, test loss, 3201768.50
train acc: 0.00, train loss, 2955147.75 | test acc: 0.00, test loss, 3081964.25
train acc: 0.00, train loss, 3353851.00 