In [1]:
# Importing the required libraries 
import nltk
import numpy as np
import itertools

# Downloading and importing the brown corpus
nltk.download('brown')
from nltk.corpus import brown

#Getting the tagged sentences
sent_tag = brown.tagged_sents()
brown_sent_tag=[]
for s in sent_tag:
    s.insert(0,('##','##'))
    s.append(('&&','&&'))
    brown_sent_tag.append(s)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
#Splitting the data for train and test
split_num = int(len(brown_sent_tag)*0.9)
train_data = brown_sent_tag[0:split_num]
test_data = brown_sent_tag[split_num:]

In [3]:
# MAking a dict() variable to store TAG , word and its count in the following fashion:
# {'TAG' : {word : 'count of word with TAG'}  }

train_word_tag = {}
for data in train_data:
    #print(data)
    for word, tag in data:
        #print(word,tag)
        #print('------')
        word = word.lower()
        if tag in train_word_tag:
            if word in train_word_tag[tag]:
                train_word_tag[tag][word] +=1
            else:
                train_word_tag[tag][word]=1
        else:
            train_word_tag[tag] ={word:1}

In [4]:
# Making the emission probability matrix

# Making a nested disctionary to in the following fashion:
# { Tag : {word: emission probability form that tag' s state} }

# This is the variable to store the dictionaty
train_emission_prob_matrix = {}
for tag in train_word_tag.keys():
    
    # creating the blank nested dictionary for {word : emission prob}
    train_emission_prob_matrix[tag] = {}
    
    # calculating number of word with that tag
    count = sum(train_word_tag[tag].values())
    
    for word in train_word_tag[tag].keys():
        
        # Calculating the emmission prob of each word for that state or Tag
        train_emission_prob_matrix[tag][word] = train_word_tag[tag][word]/count
#train_emission_prob_matrix

In [14]:
# calculating sequence of tags as bigram in the following fashion
# {TAG of 1st word of a given bigram i.e. (T1) :{Tag of 2nd word of a given bigram i.e (T2) : count of (T1 followed by T2) }}

bigram_of_tag = {}

for sentence in train_data:
    
    # sentence consist of a Tagged-sentence so to get the bigram of that
    generated_bigrams = list(nltk.bigrams(sentence))
    
    for bigram1 ,bigram2 in generated_bigrams:
        
        ## checking if tag of first word of bigram exist 
            ## if yes then check if the tag of 2nd word of the bigram exixst
                    ## If es then increament its value by 1
            ## else assign the value to the new tag as 1
        ## else create the new entry with value 1 
        if bigram1[1] in bigram_of_tag:
            if bigram2[1] in bigram_of_tag[bigram1[1]]:
                bigram_of_tag[bigram1[1]][bigram2[1]] += 1
            else:
                bigram_of_tag[bigram1[1]][bigram2[1]] = 1
        else:
            bigram_of_tag[bigram1[1]] = { bigram2[1] : 1}
print(bigram_of_tag)

{'##': {'AT': 7874, '``': 3082, 'PPS': 5026, 'NN-HL': 430, 'WRB': 755, 'NN': 1324, 'RB': 3433, 'NNS-HL': 150, 'IN': 4521, 'VB-HL': 16, 'CD': 695, '(': 244, 'NP': 3335, 'NN-TL': 398, 'PP$': 905, 'PPSS': 2623, 'NNS': 924, 'NP$': 250, 'AT-HL': 114, 'CS': 2067, 'CC': 2481, 'NP-HL': 170, '--': 213, 'AP': 446, 'EX': 723, 'VBG': 541, 'PPSS+BEM': 44, 'DT': 1547, 'JJ': 996, 'ABN': 179, 'DTS': 403, 'VBZ-HL': 27, 'VBN': 266, 'VBD-HL': 3, 'ABX': 82, 'VBG-HL': 66, 'MD*-HL': 1, 'AP-HL': 19, 'JJ-HL': 181, 'QL': 258, ')': 166, 'JJ-TL': 97, 'NP-TL': 57, 'MD': 121, 'DTI': 234, 'PN': 231, 'NR': 61, 'RBR': 61, 'PPSS+MD': 91, 'NN-TL-HL': 45, 'CD-HL': 215, 'WDT': 307, 'VBN-TL-HL': 4, 'TO': 227, 'MD-HL': 3, 'QLP': 1, 'VBG-TL': 7, 'NNS-TL': 18, "'": 19, 'NP-TL-HL': 2, 'DT+BEZ': 42, 'PPSS+HV': 53, 'VBD': 45, '*': 130, 'PPS+BEZ': 104, 'NN$-HL': 18, 'PPS+MD': 28, ',': 11, '.': 40, 'RB-HL': 14, 'FW-AT-TL': 2, 'PPS+HVZ': 15, 'UH': 108, 'VB': 775, 'JJT': 13, 'PN+HVZ': 1, 'PPSS+BER': 35, 'VBN-HL': 28, 'ABN-HL': 2, '

In [17]:
prob_of_bigram_tag = {}

for tag in bigram_of_tag.keys():
    
    # creating a nested dictioonary for each tag
    prob_of_bigram_tag[tag] = {}
    
    # summing up all the occurance of the tag
    count = sum(bigram_of_tag[tag].values())
    
    for tag2 in bigram_of_tag[tag].keys():
        prob_of_bigram_tag[tag][tag2] = bigram_of_tag[tag][tag2]/count
prob_of_bigram_tag 

{'##': {'AT': 0.15257915746231057,
  '``': 0.0597217377824284,
  'PPS': 0.09739177615006007,
  'NN-HL': 0.008332364453745689,
  'WRB': 0.014630081773437198,
  'NN': 0.025655931480835562,
  'RB': 0.06652327248769523,
  'NNS-HL': 0.0029066387629345424,
  'IN': 0.08760609231484712,
  'VB-HL': 0.0003100414680463512,
  'CD': 0.01346742626826338,
  '(': 0.004728132387706856,
  'NP': 0.06462426849591132,
  'NN-TL': 0.007712281517652986,
  'PP$': 0.01753672053637174,
  'PPSS': 0.0508274231678487,
  'NNS': 0.017904894779676783,
  'NP$': 0.004844397938224237,
  'AT-HL': 0.002209045459830252,
  'CS': 0.04005348215323799,
  'CC': 0.04807580513893733,
  'NP-HL': 0.0032941905979924813,
  '--': 0.004127427043367051,
  'AP': 0.008642405921792039,
  'EX': 0.014009998837344495,
  'VBG': 0.01048327713831725,
  'PPSS+BEM': 0.0008526140371274658,
  'DT': 0.02997713444173158,
  'JJ': 0.019300081385885362,
  'ABN': 0.003468588923768554,
  'DTS': 0.007809169476417471,
  'VBZ-HL': 0.0005231949773282176,
  'VBN

In [55]:
# Now I will find all possible tag which was given to a word in both test and train data

possible_tags_of_word = {}

for data in train_data:
    for word, tag in data:
        
        # lowering the the capital letters
        word = word.lower()
        
        # checking if the word in already in the dictionary
            ## if yes the check if that is already in the list
                ## if yes then continue
                ## if not then append it to the list
            ## if not the create a new nested dict with {word:[tag]}  
            
        if word in possible_tags_of_word:
            if tag not in possible_tags_of_word[word]:
                possible_tags_of_word[word].append(tag)                
        else:
            temp=[]
            temp.append(tag)
            possible_tags_of_word[word] = temp
#print(possible_tags_of_word)

# Now doing the same thing for the test data
for data in test_data:
    for word, tag in data:
        
        # lowering the the capital letters
        word = word.lower()
        
        # checking if the word in already in the dictionary
            ## if yes the check if that is already in the list
                ## if yes then continue
                ## if not then append it to the list
            ## if not the create a new nested dict with {word:[tag]}  
            
        if word in possible_tags_of_word:
            if tag not in possible_tags_of_word[word]:
                possible_tags_of_word[word].append(tag)                
        else:
            temp=[]
            temp.append(tag)
            possible_tags_of_word[word] = temp
#print(possible_tags_of_word)

In [56]:
# pre processing the test data
# Removing the tags from the test data set

# The following structure is used
## test_word=[[word1, word2,word3...],
##                     [word1, word2, word3....]]   where each row is one sentence.

## in similar fashion the tags are stored
## test_tags = [[tag1, tag2, tag3 ,.......],
##                      [tag1, tag2, tag3,.......]] where each row i s one sentence 

test_words = []
test_tags = []

for data in test_data:
    temp_words = []
    temp_tags =[]
    for word, tag in data:
        temp_words.append(word.lower())
        temp_tags.append(tag)
    test_words.append(temp_words)
    test_tags.append(temp_tags)
test_tags

[['##', 'PPS', 'BEDZ', 'RB', 'CD', 'NNS', 'JJ', '.', '&&'],
 ['##', '``', 'DT', 'JJ', 'NN', ',', 'PP$', 'NN', '.', '.', '&&'],
 ['##',
  'RB',
  ',',
  'PPSS',
  'BER',
  'QL',
  'JJ',
  'TO',
  'VB',
  'PPO',
  'RB',
  'PPL',
  "''",
  ',',
  'PPS',
  'VBD',
  ',',
  'VBG',
  'IN',
  'NN',
  '.',
  '&&'],
 ['##',
  'NP',
  'VBD',
  'PP$',
  'NN',
  'IN',
  'AT',
  'NN',
  ',',
  'CC',
  'AT',
  'NN',
  'VBD',
  'PPO',
  'CS',
  'AT',
  'JJ',
  'NN',
  '.',
  '&&'],
 ['##',
  '``',
  'NP',
  'BEZ',
  'TO',
  'BE',
  'PP$',
  'NN',
  'NN',
  ',',
  'NP',
  '.',
  '&&'],
 ['##',
  'PPSS',
  'VB',
  'AT',
  'NN',
  'IN',
  'AT',
  'NN',
  'VBN',
  'IN',
  'PPO',
  'PPS',
  'BEZ',
  'AT',
  'QL',
  'JJ',
  'NN',
  ',',
  'QL',
  'JJ',
  'IN',
  'AP',
  'NNS',
  '.',
  '&&'],
 ['##',
  'PPS',
  'VBZ',
  'AT',
  'JJR',
  'NN',
  'CS',
  'RB',
  'VBG',
  'RB',
  'IN',
  'AT',
  'NP',
  'NN',
  "''",
  '.',
  '&&'],
 ['##', '``', 'QL', 'RB', ',', 'PP$', 'NN', '.', '&&'],
 ['##', 'PP$', 'NN', '

In [59]:
# implementing the VIterbi Algorithm

predicted_tags = []

for sentence in test_words:
    values = {}
    for i in range(len(sentence)):
        
        word = sentence[i]
        
        if i == 0:
            values[i] = {}
            tags = possible_tags_of_word[word]
            for tag in tags:
                if tag in values[i]:
                    values[i][tag] = ['##', prob_of_bigram_tag['##']*train_emission_prob_matrix[tag][word]]
                else:
                    values[i][tag] = ['##', 0.0001]
        if i>0:
            values[i] = {}
            previous_states = list(values[i-1].keys())
            current_state = possible_tags_of_word[word]
            
            for state in current_state:
                temp =[]
                
                for pt in previous_states:
                    try :
                        temp.append(value[i-1][pt][1]*prob_of_bigram_tag[pt][state]*train_emission_prob_matrix[state][word])
                    except:
                        temp.append(values[i-1][pt][1]*0.0001)
                max_temp_index = temp.index(max(temp))
                best_pt = previous_states[max_temp_index]
                values[i][tag] = [best_pt, max(temp)]
                
            pred_tags = []
            total_steps_num = values.keys()
            
            last_step_num = max(total_steps_num)
            for bs in range(len(total_steps_num)):
                step_num = last_step_num - bs
                if step_num == last_step_num:
                    pred_tags.append('&&')
                    pred_tags.append(values[step_num]['&&'][0])
                if step_num<last_step_num and step_num>0:
                    pred_tags.append(storing_values[step_num][pred_tags[len(pred_tags)-1]][0])
                predicted_tags.append(list(reversed(pred_tags)))

{1: {'IN': ['##', 0.0001], 'IN-HL': ['##', 0.0001], 'IN-TL': ['##', 0.0001]},
 2: {'WDT': ['IN', 1e-08],
  'WDT-HL': ['IN', 1e-08],
  'WPS-TL': ['IN', 1e-08],
  'WPO-TL': ['IN', 1e-08],
  'WDT-NC': ['IN', 1e-08],
  'WPS': ['IN', 1e-08]},
 3: {'PPSS': ['WDT', 1e-12],
  'NN': ['WDT', 1e-12],
  'PPSS-NC': ['WDT', 1e-12],
  'NN-TL': ['WDT', 1e-12],
  'NP': ['WDT', 1e-12],
  'NIL': ['WDT', 1e-12],
  'PPSS-HL': ['WDT', 1e-12]},
 4: {'BEDZ': ['PPSS', 1e-16],
  'BEDZ-HL': ['PPSS', 1e-16],
  'BEDZ-NC': ['PPSS', 1e-16]},
 5: {'JJ': ['BEDZ', 1.0000000000000001e-20]},
 6: {'TO': ['JJ', 1.0000000000000001e-24],
  'IN': ['JJ', 1.0000000000000001e-24],
  'IN-HL': ['JJ', 1.0000000000000001e-24],
  'TO-HL': ['JJ', 1.0000000000000001e-24],
  'IN-TL': ['JJ', 1.0000000000000001e-24],
  'TO-TL': ['JJ', 1.0000000000000001e-24],
  'NPS': ['JJ', 1.0000000000000001e-24],
  'NIL': ['JJ', 1.0000000000000001e-24],
  'QL': ['JJ', 1.0000000000000001e-24],
  'TO-NC': ['JJ', 1.0000000000000001e-24],
  'IN-NC': ['JJ',

In [52]:
values

{1: {'PPS': ['##', 0.0001],
  'PPS-TL': ['##', 0.0001],
  'PPS-NC': ['##', 0.0001],
  'NIL': ['##', 0.0001],
  'PPS-HL': ['##', 0.0001]},
 2: {'PPS-HL': ['PPS', 1e-08]}}