#Imports

In [27]:
import torch
import json
import re
from torch import nn
from torch.utils.data import Dataset

In [28]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The vocabulary file and the dataset used later in this notebook are read
# from paths under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
# Bare expression so the notebook cell displays the chosen device.
device

device(type='cpu')

#Data Preparation

##Tokenizer

In [97]:
class Tokinizer():
  """Word-level tokenizer backed by a ``word -> integer id`` dictionary.

  The dictionary always starts with two reserved entries: ``"PAD"`` (id 0),
  used to right-pad every encoding to a fixed length, and ``"UNK"`` (id 1),
  used for words that are not in the dictionary.
  """

  def __init__(self, maxSquenceLength=200):
    """Create a tokenizer whose encodings are ``maxSquenceLength`` ids long."""
    self.maxSquenceLength=maxSquenceLength

    # Reserved ids; new words are appended after these by createDataset().
    self.tokinizerDict={"PAD":0, "UNK":1}

  def encode(self, sentanceList: list[str])->list[int]:
    """Map a sequence of words to a fixed-length list of ids.

    Words missing from the dictionary become the ``"UNK"`` id. Input longer
    than ``maxSquenceLength`` is truncated; shorter input is right-padded
    with the ``"PAD"`` id.
    """
    encodedSentance=[]

    for word in sentanceList:
      # Stop as soon as the fixed length is reached (truncation).
      if len(encodedSentance)>=self.maxSquenceLength:
        break

      if word in self.tokinizerDict:
        encodedSentance.append(self.tokinizerDict[word])
      else:
        encodedSentance.append(self.tokinizerDict["UNK"])

    return self.addPaddingToEncoding(encodedSentance)

  def addPaddingToEncoding(self, encoding: list[int])->list[int]:
    """Return ``encoding`` right-padded with the ``"PAD"`` id.

    The result has ``maxSquenceLength`` entries when ``encoding`` is shorter
    than that; an encoding that is already long enough is returned as a copy
    with nothing appended (a non-positive repeat count yields an empty list).
    """
    paddingLengthRequired=self.maxSquenceLength-len(encoding)
    return encoding+[self.tokinizerDict["PAD"]]*paddingLengthRequired

  def decode(self, encodedSentance: list[int])->str:
    """Translate ids back to words and return them concatenated.

    The words are joined with no separator, and padding ids decode to the
    literal text ``"PAD"``.

    Raises:
      ValueError: if an id has no entry in the dictionary.
    """
    # Built once: the original rebuilt the value list for every token.
    dictKeys=list(self.tokinizerDict.keys())
    dictValues=list(self.tokinizerDict.values())

    decodedWords=[]
    for token in encodedSentance:
      try:
        # list.index compares with ==, so id-like objects that compare equal
        # to the stored ints are still resolved.
        wordPosition=dictValues.index(token)
      except ValueError:
        raise ValueError(f"token {token} was not found in dictionary")
      decodedWords.append(dictKeys[wordPosition])

    return "".join(decodedWords)

  def loadTokinizerDictionary(self, filePath)->None:
    """Replace the current dictionary with the JSON object stored at ``filePath``."""
    # Context manager so the file handle is closed even if parsing fails.
    with open(filePath, encoding="utf-8") as jsonfile:
      self.tokinizerDict=json.load(jsonfile)

  def createDataset(self, dataset: list[str], fileSavePath=None)->None:
    """Extend the dictionary with every unseen word in ``dataset``.

    Each sentence is split on single spaces, so consecutive spaces produce an
    empty-string "word" that is added like any other. New words receive the
    next free id (the current dictionary size). When ``fileSavePath`` is
    given, the resulting dictionary is written there as JSON.
    """
    for sentance in dataset:
      for word in sentance.split(" "):
        if word not in self.tokinizerDict:
          self.tokinizerDict[word]=len(self.tokinizerDict)

    if fileSavePath is not None:
      self._saveDictionary(fileSavePath)

  def _saveDictionary(self, savePath):
    """Write the dictionary to ``savePath`` as JSON."""
    with open(savePath, "w", encoding="utf-8") as outfile:
      json.dump(self.tokinizerDict, outfile)

##Text Cleaner


In [32]:
class TextProcessor():
  """Light-weight text cleaner applied to raw strings."""

  def __init__(self):
    pass

  def processText(self, string:str)->str:
    """Return ``string`` after two regex substitutions, applied in order.

    1. Any single ASCII letter that has whitespace on both sides is replaced,
       together with that whitespace, by one space.
    2. Every remaining run of whitespace is collapsed to one space.
    """
    cleaned = string
    for pattern in (r"\s+[a-zA-Z]\s+", r'\s+'):
      cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned




##Make Dataset

In [103]:
class PosDataset(Dataset):
  """Torch ``Dataset`` of tokenized sentences paired with tokenized labels.

  The data file is a JSON array of objects, each read through its
  ``"sentence"`` and ``"labels"`` keys. Both are encoded with the same
  ``Tokinizer`` and stored as tensors.
  """

  def __init__(self,filePath,tokinizerDictPath="/content/drive/MyDrive/posTagging/tokinizerDict.json"):
    """Build the vocabulary, then load and tokenize the data at ``filePath``.

    Args:
      filePath: path of the JSON data file.
      tokinizerDictPath: where the vocabulary JSON is written and then read
        back; defaults to the location previously hard-coded here.
    """
    self.tokinizer=Tokinizer()

    # Fit the vocabulary on a fixed sample sentence, save it, and reload it
    # from disk.
    self.tokinizer.createDataset(["this is some test data to fit the tokinizer"],tokinizerDictPath)
    self.tokinizer.loadTokinizerDictionary(tokinizerDictPath)

    self.filePath=filePath

    # Every encoding is padded/truncated by the tokenizer, which is what lets
    # the nested lists be turned into rectangular tensors.
    self.datasetSentances=torch.tensor(self.unpackSentances())
    self.datasetLabels=torch.tensor(self.unpackLabels())

  def _loadRecords(self)->list:
    """Parse and return the JSON array stored at ``self.filePath``."""
    # Context manager so the handle is closed instead of leaked.
    with open(self.filePath, encoding="utf-8") as jsonFile:
      return json.load(jsonFile)

  def unpackSentances(self)->list[list[int]]:
    """Return one id list per record, encoded from its ``"sentence"`` value."""
    return [self.tokinizedSentace(record["sentence"]) for record in self._loadRecords()]

  def unpackLabels(self)->list[list[int]]:
    """Return one id list per record, encoded from its ``"labels"`` value."""
    # NOTE(review): labels go through the word vocabulary, so any tag missing
    # from it becomes the UNK id - confirm a separate label vocabulary is not
    # intended.
    return [self.tokinizedLabels(record["labels"]) for record in self._loadRecords()]

  def tokinizedSentace(self, sentance: list[str])->list[int]:
    """Encode one sentence with the dataset's tokenizer.

    Presumably ``sentance`` is already a list of words - verify against the
    data file.
    """
    return self.tokinizer.encode(sentance)

  def tokinizedLabels(self, labels: list[str])->list[int]:
    """Encode one record's labels with the dataset's tokenizer."""
    return self.tokinizer.encode(labels)

  def __getitem__(self,idx):
    """Return the ``(sentence_ids, label_ids)`` pair stored at ``idx``."""
    return self.datasetSentances[idx],self.datasetLabels[idx]

  def __len__(self)->int:
    """Return the number of stored sentences."""
    return len(self.datasetSentances)

# Instantiate the dataset from the JSON file on the mounted Drive.
dataset = PosDataset(
  filePath="/content/drive/MyDrive/posTagging/test.json",
)