#Brown dataset


In [1]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
import pandas as pd


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [2]:
from sklearn.model_selection import train_test_split

sentences = brown.tagged_sents()
train_data, test_data = train_test_split(sentences, test_size=0.5, random_state=42)

sentences

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [3]:
from collections import defaultdict

def calculate_emission_prob(train_data):
    emission_prob = defaultdict(lambda: defaultdict(int))
    total_tags = defaultdict(int)

    for sentence in train_data:
        for word, tag in sentence:
            emission_prob[word][tag] += 1
            total_tags[tag] += 1

    for word in emission_prob:
        for tag in emission_prob[word]:
            emission_prob[word][tag] /= total_tags[tag]

    return emission_prob

emission_prob = calculate_emission_prob(train_data)


In [4]:
def calculate_transition_prob(train_data):
    transition_prob = defaultdict(lambda: defaultdict(int))
    total_tags = defaultdict(int)

    for sentence in train_data:
        prev_tag = None
        for _, tag in sentence:
            if prev_tag:
                transition_prob[prev_tag][tag] += 1
                total_tags[prev_tag] += 1
            prev_tag = tag

    for prev_tag in transition_prob:
        for tag in transition_prob[prev_tag]:
            transition_prob[prev_tag][tag] /= total_tags[prev_tag]

    return transition_prob

transition_prob = calculate_transition_prob(train_data)


In [5]:
def print_emission_table(emission_prob):
    print("Emission Probability Table:")
    df = pd.DataFrame.from_dict(emission_prob, orient='index').fillna(0)
    print(df)

def print_transition_table(transition_prob):
    print("Transition Probability Table:")
    df = pd.DataFrame.from_dict(transition_prob, orient='index').fillna(0)
    print(df)

print_emission_table(emission_prob)
print("\n")
print_transition_table(transition_prob)

Emission Probability Table:
                 BER  BER-TL  PPSS  PPSS-HL   QL   CS  CS-HL   IN   RB   JJ  \
Are         0.014229     1.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
are         0.983104     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
art         0.001779     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
Art         0.000445     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
ah          0.000445     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
...              ...     ...   ...      ...  ...  ...    ...  ...  ...  ...   
Steiners    0.000000     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
there'd     0.000000     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
contendere  0.000000     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
forisque    0.000000     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   
buncha      0.000000     0.0   0.0      0.0  0.0  0.0    0.0  0.0  0.0  0.0   

            ...  VB+IN 

#OWN DATASET

In [6]:
dataset = [
    "mary jane can see Will",
    "Spot will see mary",
    "will jane spot mary",
    "mary will pat Spot"
]

word_pos_dict = {
    "mary": "NOUN",
    "jane": "NOUN",
    "can": "MODIFIER",
    "see": "VERB",
    "Will": "NOUN",
    "will": "MODIFIER",
    "Spot": "NOUN",
    "spot": "VERB",
    "pat": "VERB"
}

In [7]:
from collections import defaultdict

# Create a dictionary to store total counts of POS tags for each word
word_pos_counts = defaultdict(lambda: defaultdict(int))

# Iterate through each sentence in the dataset
for sentence in dataset:
    words = sentence.split()
    for word in words:
        pos_tag = word_pos_dict[word]
        word_pos_counts[word][pos_tag] += 1

# Print the total counts of POS tags for each word
for word, pos_counts in word_pos_counts.items():
    print(f"{word}: {pos_counts}")


mary: defaultdict(<class 'int'>, {'NOUN': 4})
jane: defaultdict(<class 'int'>, {'NOUN': 2})
can: defaultdict(<class 'int'>, {'MODIFIER': 1})
see: defaultdict(<class 'int'>, {'VERB': 2})
Will: defaultdict(<class 'int'>, {'NOUN': 1})
Spot: defaultdict(<class 'int'>, {'NOUN': 2})
will: defaultdict(<class 'int'>, {'MODIFIER': 3})
spot: defaultdict(<class 'int'>, {'VERB': 1})
pat: defaultdict(<class 'int'>, {'VERB': 1})


In [12]:
transition_prob = defaultdict(lambda: defaultdict(int))
start_tag_count = defaultdict(int)
end_tag_count = defaultdict(int)

for sentence in dataset:
    words = sentence.split()
    prev_tag = "<s>"
    start_tag_count[prev_tag] += 1
    for word in words:
        tag = word_pos_dict[word]
        transition_prob[prev_tag][tag] += 1
        prev_tag = tag
    end_tag_count[prev_tag] += 1
    transition_prob[prev_tag]["</s>"] += 1

for tag, count in start_tag_count.items():
    transition_prob["<s>"][tag] = count / len(dataset)

for tag, count in end_tag_count.items():
    transition_prob[tag]["</s>"] = count / len(dataset)

transition_df = pd.DataFrame.from_dict(transition_prob, orient='index').fillna(0)
transition_df = transition_df.drop('<s>', axis=1)
transition_df = transition_df.div(transition_df.sum(axis=1), axis=0)

print("Transition Probability Table:")
print(transition_df)

emission_prob = defaultdict(lambda: defaultdict(float))
word_pos_count = defaultdict(lambda: defaultdict(int))

for sentence in dataset:
    words = sentence.split()
    for word in words:
        tag = word_pos_dict[word]
        word_pos_count[word][tag] += 1

pos_count = defaultdict(int)
for word, tag_count_dict in word_pos_count.items():
    for tag, count in tag_count_dict.items():
        pos_count[tag] += count

for word, tag_count_dict in word_pos_count.items():
    for tag, count in tag_count_dict.items():
        emission_prob[word][tag] = 1.0 * count / pos_count[tag]

emission_df = pd.DataFrame.from_dict(emission_prob, orient='index').fillna(0)

print("\nEmission Probability Table:")
print(emission_df)


Transition Probability Table:
              NOUN  MODIFIER      </s>      VERB
<s>       0.750000      0.25  0.000000  0.000000
NOUN      0.166667      0.50  0.166667  0.166667
MODIFIER  0.250000      0.00  0.000000  0.750000
VERB      1.000000      0.00  0.000000  0.000000

Emission Probability Table:
          NOUN  MODIFIER  VERB
mary  0.444444      0.00  0.00
jane  0.222222      0.00  0.00
Will  0.111111      0.00  0.00
Spot  0.222222      0.00  0.00
can   0.000000      0.25  0.00
will  0.000000      0.75  0.00
see   0.000000      0.00  0.50
spot  0.000000      0.00  0.25
pat   0.000000      0.00  0.25
