### Part-of-Speech Tagging Rule-Based and Statistical-based

In [11]:
import nltk
import re
nltk.download('punkt')
nltk.download('treebank')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker') #The maxent_ne_chunker contains two pre-trained English named entity chunkers trained on an ACE corpus (perhaps ACE ACE 2004 Multilingual Training Corpus?)
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\flore\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\flore\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\flore\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\flore\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [12]:
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger #important for POS tagging


# Part 1: Rule-based and Statistical Approaches for Part-of-Speech Tagging

# Rule-based POS Tagger
def rule_based_pos_tagger(sentence):
    # Define your rules here
    rules = [
          (re.compile(r'\bThe\b'), 'DT'),
          (re.compile(r'\bcat\b'), 'NN'),
          (re.compile(r'\bis\b'), 'VB'),
          (re.compile(r'\bsitting\b'), 'VB'),
          (re.compile(r'\bon\b'), 'IN'),
          (re.compile(r'\bthe\b'), 'DT'),
          (re.compile(r'\bmat\b'), 'NN'),
      ]
    tagged_sentence = []
    words = word_tokenize(sentence)
    for word in words:
        for pattern, tag in rules:
            if pattern.match(word):
                tagged_sentence.append((word, tag))
                break
        else:
            tagged_sentence.append((word, 'UNKNOWN'))
    return tagged_sentence

# Statistical POS Tagger
def statistical_pos_tagger(sentence):
    # Train your model on a labeled corpus (e.g., treebank)
    train_data = treebank.tagged_sents()[:3000]
    # Train your statistical model here

    # Split data into training and testing sets
    train_size = int(len(train_data) * 0.8)
    train_set = train_data[:train_size]
    test_set = train_data[train_size:]

    # Create taggers
    default_tagger = DefaultTagger('NN')  # Default tagger assigns 'NN' to all words
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)  # Unigram tagger using training set
    bigram_tagger = BigramTagger(train_set, backoff=unigram_tagger)  # Bigram tagger using training set and fallback to unigram tagger

    # Evaluate on test set
    accuracy = bigram_tagger.accuracy(test_set)
    print("Accuracy:", accuracy)


    # Apply the trained model to tag the sentence
    tagged_sentence  = bigram_tagger.tag(word_tokenize(sentence))
    #tagged_sentence = nltk.pos_tag(words)
    #tagged_sentence.append(tagged_sentence)
    return tagged_sentence

In [13]:
# Part 1: Rule-based and Statistical Approaches for Part-of-Speech Tagging
sample_sentence = "The cat is sitting on the mat."

# Rule-based POS Tagging
rule_based_tags = rule_based_pos_tagger(sample_sentence)
print("Rule-based POS Tags:")
print(rule_based_tags)

# Statistical POS Tagging
statistical_tags = statistical_pos_tagger(sample_sentence)
print("Statistical POS Tags:")
print(statistical_tags)

Rule-based POS Tags:
[('The', 'DT'), ('cat', 'NN'), ('is', 'VB'), ('sitting', 'VB'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'), ('.', 'UNKNOWN')]
Accuracy: 0.8748033560566335
Statistical POS Tags:
[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'), ('.', '.')]


In [14]:
sample_sentence = "The cat is sitting on the mat."

tagged_sentence = nltk.pos_tag(word_tokenize(sample_sentence))
print(tagged_sentence)

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'), ('.', '.')]


In [15]:
(r'\b\w+s\b|\b\w+es\b', 'NN'),     # Nouns ending ```

(('\\b\\w+s\\b|\\b\\w+es\\b', 'NN'),)

In [16]:
def updated_rule_based_pos_tagger(sentence): # Here is pdated Rule-based POS Tagger
    
    # Definition of the Rules
    
    rules = [     
          (re.compile(r'\bThe\b'), 'DT'),
          (re.compile(r'\bquick\b'), 'JJ'),
          (re.compile(r'\bbrown\b'), 'JJ'),
          (re.compile(r'\bfox\b'), 'NN'),
          (re.compile(r'\bjumps\b'), 'VBZ'),
          (re.compile(r'\bover\b'), 'IN'),
          (re.compile(r'\bthe\b'), 'DT'),
          (re.compile(r'\blazy\b'), 'JJ'),
          (re.compile(r'\bdog\b'), 'NN'),
          (re.compile(r'\bwhile\b'), 'IN'),
          (re.compile(r'\bit\'s\b'), 'NN'), 
          (re.compile(r'\braining\b'), 'VBG'),
          (re.compile(r'\bheavily\b'), 'RB')
      ]
    tagged_sentence = []
    words = word_tokenize(sentence)
    for word in words:
        for pattern, tag in rules:
            if pattern.match(word):
                tagged_sentence.append((word, tag))
                break
        else:
            tagged_sentence.append((word, 'UNKNOWN'))
    return tagged_sentence

def updated_statistical_pos_tagger(sentence): # Let's updated Statistical POS Tagger
    
    # Let's train the model on a labeled corpus
    train_data = treebank.tagged_sents()[:3000]

    # Split data into training and testing sets
    train_size = int(len(train_data) * 0.8)
    train_set = train_data[:train_size]
    test_set = train_data[train_size:]

    # Create taggers
    default_tagger = DefaultTagger('NN')  # Default tagger assigns 'NN' to all words
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)  # Unigram tagger using training set
    bigram_tagger = BigramTagger(train_set, backoff=unigram_tagger)  # Bigram tagger using training set and fallback to unigram tagger

    # Evaluate on test set
    accuracy = bigram_tagger.accuracy(test_set)
    print("Accuracy:", accuracy)


    # Apply the trained model to tag the sentence
    tagged_sentence  = bigram_tagger.tag(word_tokenize(sentence))
    return tagged_sentence

# Updated Rule-based POS Tagger
updated_rule_based_tags = updated_rule_based_pos_tagger("The quick brown fox jumps over the lazy dog while it's raining heavily.")
print("Updated Rule-based POS Tags:")
print(updated_rule_based_tags)

# Updated Statistical POS Tagger
updated_statistical_tags = updated_statistical_pos_tagger("The quick brown fox jumps over the lazy dog while it's raining heavily.")
print("Updated Statistical POS Tags:")
print(updated_statistical_tags)

Updated Rule-based POS Tags:
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('while', 'IN'), ('it', 'UNKNOWN'), ("'s", 'UNKNOWN'), ('raining', 'VBG'), ('heavily', 'RB'), ('.', 'UNKNOWN')]
Accuracy: 0.8748033560566335
Updated Statistical POS Tags:
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'NN'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'NN'), ('dog', 'NN'), ('while', 'IN'), ('it', 'PRP'), ("'s", 'VBZ'), ('raining', 'NN'), ('heavily', 'RB'), ('.', '.')]
