In [None]:
import nltk
from nltk.tag.perceptron import PerceptronTagger

# Download the two NLTK resources this notebook relies on, in the same order
# as before, then build the perceptron tagger that is evaluated further down.
for nltkResource in ('averaged_perceptron_tagger', 'universal_tagset'):
    nltk.download(nltkResource)
tagger = PerceptronTagger()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [None]:
# Colab-only step: ask the user to upload the input files. The recorded output
# below shows BrownCorpus.txt and BrownToUniversalTagMap.txt being saved, and
# later cells open both files by exactly those names.
from google.colab import files
uploaded = files.upload()

Saving BrownCorpus.txt to BrownCorpus.txt
Saving BrownToUniversalTagMap.txt to BrownToUniversalTagMap.txt


In [None]:
# Read the uploaded corpus into a list with one entry per line of the file
# (line endings are kept; later cells strip each line before parsing it).
with open("BrownCorpus.txt") as file:
  brownCorpusData = list(file)

In [None]:
class Token:
    """A word paired with its part-of-speech tag.

    The spelling is stored lower-cased, and both the spelling and the tag are
    stored with surrounding whitespace removed.
    """

    def __init__(self, Spelling, POSTag):
        loweredSpelling = Spelling.lower()
        self.Spelling = loweredSpelling.strip()
        self.POSTag = POSTag.strip()

    def GetSpelling(self):
        """Return the stored spelling."""
        return self.Spelling

    def GetPOSTag(self):
        """Return the tag currently stored on this token."""
        return self.POSTag

class TokenData:
    """Thin holder around a single token object."""

    def __init__(self, token):
        # Kept as a plain public attribute: other cells read and reassign
        # `.token` directly.
        self.token = token

    def GetToken(self):
        """Return the token currently held by this wrapper."""
        return self.token

class Sentence:
    """One sentence, represented as a list of TokenData items."""

    def __init__(self):
        # Starts out empty; the parsing cell assigns the filled list to this
        # attribute directly.
        self.TokenDataList = list()

    def GetTokenDataList(self):
        """Return the list of TokenData items (the stored list, not a copy)."""
        return self.TokenDataList

class POSData:
    """Ordered collection of sentences that together form a data set."""

    def __init__(self):
        # Sentences are kept in the order in which they were added.
        self.SentenceList = list()

    def AddSentence(self, sentence):
        """Append `sentence` to the end of the collection."""
        self.SentenceList.append(sentence)

    def GetSentenceList(self):
        """Return the list of sentences (the stored list, not a copy)."""
        return self.SentenceList


Structure the Brown Corpus data into accessible tokens, each with a Spelling and a POSTag.

In [None]:
# Parse every non-blank corpus line into a Sentence of TokenData items.
# Each whitespace-separated item is expected to look like "<spelling>_<tag>".
completeDataSet = POSData()

for line in brownCorpusData:
    line = line.strip()  # Remove leading/trailing whitespace
    if not line:
        continue  # blank lines do not form a sentence

    tokenDataList = []
    for lineSplitItem in line.split():
        # Split on the LAST underscore only: the tag is the final field, so a
        # spelling that itself contains '_' is kept instead of being silently
        # discarded by a plain split('_').
        spelling, separator, posTag = lineSplitItem.rpartition('_')
        if not separator:
            continue  # item has no "<spelling>_<tag>" structure

        token = Token(spelling, posTag)

        # A token with no tag cannot be evaluated (and indexing its tag below
        # would raise IndexError); a token with no spelling disappears when the
        # sentence is later re-split on whitespace, which would shift the
        # predicted tags against the gold tags. Skip both.
        if not token.Spelling or not token.POSTag:
            continue

        # Keep the token unless the second character of its tag is '|'.
        if len(token.POSTag) == 1 or token.POSTag[1] != '|':
            tokenDataList.append(TokenData(token))

    # As before, a non-blank line always yields a sentence, even if every one
    # of its items was filtered out.
    sentence = Sentence()
    sentence.TokenDataList = tokenDataList
    completeDataSet.AddSentence(sentence)

Create a mapping from the Brown tags to the 12 universal tags.

In [None]:
# Read the uploaded tag-mapping file into a list with one entry per line
# (line endings are kept; the parsing cell strips each line).
with open("BrownToUniversalTagMap.txt") as file:
  mapData = list(file)

In [None]:
class TagConversionPair:
  """One mapping entry: a source tag and the tag it should be replaced with."""

  def __init__(self, oldTag, newTag):
    # Both values are stored exactly as passed in.
    self.oldTag, self.newTag = oldTag, newTag

  def GetOldTag(self):
    """Return the tag to be replaced."""
    return self.oldTag

  def GetNewTag(self):
    """Return the replacement tag."""
    return self.newTag


class ConversionInstruction:
  """Ordered list of tag-conversion pairs."""

  def __init__(self):
    # Pairs are kept in insertion order.
    self.TagConversionPairList = list()

  def AddTagConversionPair(self, TagPair):
    """Append `TagPair` to the end of the list."""
    self.TagConversionPairList.append(TagPair)

  def GetTagConversionPair(self):
    """Return the full list of pairs (the stored list, not a copy)."""
    return self.TagConversionPairList

In [None]:
# Build the conversion set from the mapping file. Each non-blank line is
# expected to be "<old tag><TAB><new tag>".
completeConversionSet = ConversionInstruction()

for lineNumber, line in enumerate(mapData, start=1):
    line = line.strip()  # Remove leading/trailing whitespace
    if not line:
        continue  # ignore blank lines

    lineSplit = line.split('\t')
    if len(lineSplit) < 2:
        # Report the offending line instead of failing with a bare IndexError.
        raise ValueError(
            "BrownToUniversalTagMap.txt line %d is not '<old tag><TAB><new tag>': %r"
            % (lineNumber, line)
        )

    # Token strips whitespace from its tag, so strip the mapping fields as
    # well; otherwise a padded entry such as "NN \tNOUN" could never match.
    oldTag = lineSplit[0].strip()
    newTag = lineSplit[1].strip()
    tagPair = TagConversionPair(oldTag, newTag)
    completeConversionSet.AddTagConversionPair(tagPair)

Before translating the old tags, split the data into a training set and a test set. Then use completeConversionSet to convert the POSTags of the test set to the 12 universal tags.

In [None]:
import random

# Fixed seed so that the train/test split, and therefore the accuracy reported
# at the end of the notebook, is the same on every Restart & Run All.
SPLIT_SEED = 42

# Work on a copy so that shuffling does not reorder completeDataSet's own list.
sentences = list(completeDataSet.GetSentenceList())

totalSentences = len(sentences)
trainSize = int(0.8 * totalSentences)
testSize = totalSentences - trainSize

# Shuffle with a dedicated seeded generator: this randomizes the split without
# depending on (or disturbing) the global `random` state.
random.Random(SPLIT_SEED).shuffle(sentences)

trainSet = sentences[:trainSize]
testSet = sentences[trainSize:]

# NOTE: despite the labels, these print each split's share of all sentences
# (a fraction between 0 and 1), not an absolute count.
print("Training set size:", len(trainSet)/(len(trainSet)+len(testSet)))
print("Test set size:", len(testSet)/(len(trainSet)+len(testSet)))


Training set size: 0.7999929332367542
Test set size: 0.20000706676324576


Next, the testSet POSTags are converted to universal tags so that they can be compared with the tags produced by the PerceptronTagger in the following step.

In [None]:
def convertTags(token, conversionSet):
    """Replace the token's tag using the first matching conversion pair.

    The pairs of `conversionSet` are scanned in order; the first one whose old
    tag equals the token's current tag supplies the new tag, which is written
    to `token.POSTag`. If no pair matches, the token is left unchanged.

    Returns the same token object that was passed in.
    """
    currentTag = token.GetPOSTag()
    firstMatch = next(
        (pair for pair in conversionSet.GetTagConversionPair()
         if currentTag == pair.GetOldTag()),
        None,
    )
    if firstMatch is not None:
        token.POSTag = firstMatch.GetNewTag()
    return token

# Rewrite the tag of every token in the test set via completeConversionSet.
# convertTags updates the token it is given and hands it back, so the test set
# is modified in place.
for sentence in testSet:
    testTokenData = sentence.GetTokenDataList()
    for tokenData in testTokenData:
        convertedToken = convertTags(tokenData.token, completeConversionSet)
        tokenData.token = convertedToken

Now run the PerceptronTagger over the testSet and convert its Penn Treebank tags to universal tags.

In [None]:
# Rebuild each test sentence as plain text: every spelling followed by a space.
listOfSentences = [
    "".join(tokenData.token.GetSpelling() + ' ' for tokenData in sentence.GetTokenDataList())
    for sentence in testSet
]

# Tag each sentence, then map every tag from the 'en-ptb' tagset to the
# 'universal' tagset. The result holds one list of (word, tag) pairs per sentence.
listOfPerceptronTags = []
for sentenceText in listOfSentences:
    pennTagged = tagger.tag(sentenceText.split())
    listOfPerceptronTags.append(
        [(word, nltk.tag.mapping.map_tag('en-ptb', 'universal', pennTag))
         for word, pennTag in pennTagged]
    )

In [None]:
# Flatten the predicted tags: one entry per tagged word, in sentence order.
perceptronTags = [pair[1] for sentence in listOfPerceptronTags for pair in sentence]

# Flatten the gold tags of the test set in the same sentence/token order.
testSetTags = [
  tokenData.GetToken().GetPOSTag()
  for testSentence in testSet
  for tokenData in testSentence.GetTokenDataList()
]

totalTags = len(perceptronTags)

# The comparison below is positional, so it only means something when both
# sequences have the same length. A mismatch would otherwise score shifted
# tags silently (or fail with an IndexError), so fail loudly instead.
if totalTags != len(testSetTags):
  raise ValueError(
    "Tag sequences are misaligned: %d predicted tags vs %d gold tags"
    % (totalTags, len(testSetTags))
  )
if totalTags == 0:
  raise ValueError("No tags to evaluate: the test set contains no tokens")

# Count the positions where the predicted tag equals the gold tag.
correctCounter = sum(
  1 for predictedTag, goldTag in zip(perceptronTags, testSetTags)
  if predictedTag == goldTag
)

# Accuracy = share of tokens whose predicted tag matches the gold tag.
print(correctCounter/totalTags)

0.9041564250758432
