In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive can be read below.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import random
import ast

import os
# Walk the project folder on Drive and print the full path of every file found.
for folder, _subfolders, names in os.walk('/content/drive/MyDrive/NLP'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

/content/drive/MyDrive/NLP/sample_submission.csv
/content/drive/MyDrive/NLP/test_small.csv
/content/drive/MyDrive/NLP/train.csv
/content/drive/MyDrive/NLP/sample_submission_finalll.csv
/content/drive/MyDrive/NLP/sample_submissionfinal001.csv
/content/drive/MyDrive/NLP/sample_submissionfinal002.csv
/content/drive/MyDrive/NLP/sample_submissionfinal003.csv


In [None]:
# Load the training data. Every 'tagged_sentence' cell stores the text form of a
# list of (token, tag) tuples; ast.literal_eval turns it back into that list.
df = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')

# Iterate the column itself rather than df.iterrows(): iterrows builds a Series
# for every row, and a bare generator gives tqdm no length to show progress against.
data = [ast.literal_eval(raw_sentence) for raw_sentence in tqdm(df['tagged_sentence'])]

47340it [00:14, 3174.82it/s]


In [None]:
# Preview the first five parsed sentences; each is a list of (token, tag) tuples.
print(data[:5])

[[('The', 'AT'), ('jury', 'NN'), ('further', 'RB'), ('said', 'VB'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NN'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN'), ('Executive', 'JJ'), ('Committee', 'NN'), (',', ','), ('which', 'WD'), ('had', 'HV'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VB'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NN'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN'), ('of', 'IN'), ('Atlanta', 'NP'), ("''", "''"), ('for', 'IN'), ('the', 'AT'), ('manner', 'NN'), ('in', 'IN'), ('which', 'WD'), ('the', 'AT'), ('election', 'NN'), ('was', 'BE'), ('conducted', 'VB'), ('.', '.')], [('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'), ('jury', 'NN'), ('had', 'HV'), ('been', 'BE'), ('charged', 'VB'), ('by', 'IN'), ('Fulton', 'NP'), ('Superior', 'JJ'), ('Court', 'NN'), ('Judge', 'NN'), ('Durwood', 'NP'), ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'), ('r

In [None]:
# Load the test data. NOTE: this rebinds `df`, so the training frame is no longer
# reachable under that name after this cell runs.
df = pd.read_csv('/content/drive/MyDrive/NLP/test_small.csv')

# Map each sentence id to its parsed 'untagged_sentence' (stored as text in the CSV).
# Zipping the two columns avoids the per-row Series that df.iterrows() would build;
# total= lets tqdm show a real progress bar.
test_data = {
    row_id: ast.literal_eval(raw_sentence)
    for row_id, raw_sentence in tqdm(zip(df['id'], df['untagged_sentence']), total=len(df))
}

4000it [00:00, 8523.22it/s]


In [None]:
def display_data(sentence_index):
    '''
        Print one training sentence, one "token > tag" line per token.

        Input : 'sentence_index' (int) -> position of the sentence in the global 'data' list
        Output: None (everything is written to stdout)
    '''
    chosen = data[sentence_index]
    print("TOKEN -> TAG")
    print('...')
    for word, label in chosen:
        print(word, label, sep=' > ')
# Pick a random position in `data` and print that sentence.
# No random seed is set in the visible cells, so the sentence differs between runs.
sentence_index = random.choice(range(len(data)))
display_data(sentence_index)

TOKEN -> TAG
...
Though > CS
it > PP
may > MD
exist > VB
in > IN
either > CC
literate > JJ
or > CC
illiterate > JJ
societies > NN
, > ,
it > PP
assumes > VB
a > AT
role > NN
of > IN
true > JJ
cultural > JJ
importance > NN
only > RB
in > IN
the > AT
latter > AP
. > .


In [None]:
# cell to collect every tag seen in the training data and count how often each occurs
from collections import Counter
distinct_tags = []
word_tags = []
def store_tags():
    '''
        Append one (tag, token) pair per training token to the global 'word_tags',
        wrapping every sentence in ('START','START') and ('END','END') markers.
        Calling it again appends the whole corpus a second time.
    '''
    for sent in data:
        word_tags.append(('START','START'))
        word_tags.extend((tag, words) for words, tag in sent)
        word_tags.append(('END','END'))

store_tags()

# Tag of every stored pair, sentence markers included.
tags = [pair_tag for pair_tag, _ in word_tags]
distinct_tags = list(set(tags))

# tag -> number of occurrences (the START/END markers are counted as tags too)
count_tags = dict(Counter(tags))

In [None]:
# Flatten the per-sentence lists of 'data' into one long list of (token, tag) tuples
tagged_words_list = []
for sentence in data:
    tagged_words_list.extend(sentence)

len(tagged_words_list)

957849

In [None]:
# Collect the token (first element) of every tuple in tagged_words_list
tokens = []
for pair in tagged_words_list:
    tokens.append(pair[0])

# Show the first 10 tokens
print(tokens[:10])


['The',
 'jury',
 'further',
 'said',
 'in',
 'term-end',
 'presentments',
 'that',
 'the',
 'City']

In [None]:
#vocabulary
# Distinct tokens exactly as they appear in the training data (no case folding is applied).
V = set(tokens)
print(len(V))

51208


In [None]:
# number of tags
# T is a set; later cells call list(T) to fix the row/column order of the transition
# matrix, so the order is whatever this set yields within the current session.
T = set([pair[1] for pair in tagged_words_list])
len(T)

49

In [None]:
####EMISSION PROBABILITIES

In [None]:
# computing P(w/t) and storing in T x V matrix
# NOTE(review): only the all-zero (len(T), len(V)) matrix is allocated here; none of
# the cells visible in this notebook fill or read w_given_t -- the taggers call
# word_given_tag() instead. Confirm whether it is still needed.
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [None]:
# compute word given tag: Emission Probability

# One-slot cache: (bag object, its length when indexed, pair counts, tag counts).
_EMISSION_INDEX = None


def _emission_counts(bag):
    """Return ({(word, tag): n}, {tag: n}) for `bag`, re-counting only when the bag changes."""
    global _EMISSION_INDEX
    cached = _EMISSION_INDEX
    # Rebuild when a different list is passed or the same list has grown/shrunk.
    if cached is None or cached[0] is not bag or cached[1] != len(bag):
        pair_counts = {}
        tag_counts = {}
        for w, t in bag:
            pair_counts[(w, t)] = pair_counts.get((w, t), 0) + 1
            tag_counts[t] = tag_counts.get(t, 0) + 1
        cached = _EMISSION_INDEX = (bag, len(bag), pair_counts, tag_counts)
    return cached[2], cached[3]


def word_given_tag(word, tag, tagged_words_list=None):
    """Return the two counts needed for the emission probability P(word | tag).

    Parameters
    ----------
    word : token to look up.
    tag : tag to look up.
    tagged_words_list : list of (word, tag) pairs. Optional: when omitted, the
        module-level ``tagged_words_list`` is used, which is what the two-argument
        calls elsewhere in this notebook rely on.

    Returns
    -------
    (count_w_given_tag, count_tag) : how many pairs equal (word, tag), and how many
        pairs carry `tag`. Both are 0 for an unseen tag, so callers must guard the
        division themselves.

    Notes
    -----
    The counts for a bag are computed in a single pass and cached, so repeated
    look-ups no longer rescan the whole corpus. The cache is refreshed when another
    list is passed or the list's length changes; replacing items in place without
    changing the length is not detected.
    """
    if tagged_words_list is None:
        # The parameter shadows the global of the same name, hence the explicit lookup.
        tagged_words_list = globals()['tagged_words_list']

    pair_counts, tag_counts = _emission_counts(tagged_words_list)
    return pair_counts.get((word, tag), 0), tag_counts.get(tag, 0)


In [None]:
# Sanity check of the emission counts; each call returns (count of word with tag, count of tag).
# The training pairs are passed explicitly because word_given_tag takes the bag as its third argument.
print(word_given_tag('city', 'NN', tagged_words_list))
print(word_given_tag('further', 'VB', tagged_words_list))

(212, 187719)
(6, 95801)


In [None]:
# # compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, tagged_words_list=None):
    """Return the two counts needed for the transition probability P(t2 | t1).

    Parameters
    ----------
    t2 : the following tag.
    t1 : the preceding tag.
    tagged_words_list : list of (word, tag) pairs. Optional: when omitted, the
        module-level ``tagged_words_list`` is used, matching the two-argument call
        made when the transition matrix is built.

    Returns
    -------
    (count_t2_t1, count_t1) : number of positions where t1 is immediately followed
        by t2, and the total number of occurrences of t1. Adjacency is taken over
        consecutive entries of the list as given, so if the list is a flattened
        corpus, pairs that straddle two sentences are counted as well.
    """
    if tagged_words_list is None:
        # The parameter shadows the global of the same name, hence the explicit lookup.
        tagged_words_list = globals()['tagged_words_list']

    tag_sequence = [pair[1] for pair in tagged_words_list]
    count_t1 = tag_sequence.count(t1)

    # Pair every tag with its successor instead of indexing by position.
    count_t2_t1 = sum(
        1
        for prev_tag, next_tag in zip(tag_sequence, tag_sequence[1:])
        if prev_tag == t1 and next_tag == t2
    )

    return count_t2_t1, count_t1


In [None]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)
#
# All tag bigrams are counted in ONE pass over the corpus. The earlier per-cell
# helper call rescanned the full tag list twice for each of the len(T)**2 cells.
tag_order = list(T)
tag_position = {tag_name: pos for pos, tag_name in enumerate(tag_order)}
tag_sequence = [pair[1] for pair in tagged_words_list]

transition_counts = np.zeros((len(tag_order), len(tag_order)), dtype='float64')
tag_totals = np.zeros(len(tag_order), dtype='float64')

# Denominator: every occurrence of a tag, including the very last token of the corpus.
for tag_name in tag_sequence:
    tag_totals[tag_position[tag_name]] += 1

# Numerator: every adjacent (t1, t2) pair in the flattened tag sequence.
for prev_tag, next_tag in zip(tag_sequence, tag_sequence[1:]):
    transition_counts[tag_position[prev_tag], tag_position[next_tag]] += 1

# Row-normalise: row i is divided by the number of occurrences of tag i.
tags_matrix = (transition_counts / tag_totals[:, None]).astype('float32')

In [None]:
# convert the matrix to a dataframe df
# Rows are the previous tag and columns the next tag, both labelled in list(T) order,
# so tags_df.loc[t1, t2] reads the transition value stored for t1 -> t2.
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [None]:
# Display the full tag-to-tag transition table.
tags_df

In [None]:
# Adding tag occurrence probability weights

In [None]:
# Build a list of (tag, occurrence probability) tuples from the training data.
#
# Occurrences are counted in a single pass and each DISTINCT tag gets exactly one
# entry. Looping over the global `tags` list instead would visit every tag token
# (plus the START/END markers) and rescan the whole corpus for each of them.
tag_occurrence_counts = {}
for _word, tag_label in tagged_words_list:
    tag_occurrence_counts[tag_label] = tag_occurrence_counts.get(tag_label, 0) + 1

# Total number of tagged tokens in the training data
total_tags = len(tagged_words_list)

# One (tag, probability) tuple per distinct tag, in order of first appearance
tag_probabilities = [
    (current_tag, occurrences / total_tags)
    for current_tag, occurrences in tag_occurrence_counts.items()
]


In [None]:
def vanilla_viterbi(input_words, training_bag=tagged_words_list):
    """Tag `input_words` one word at a time using transition * emission scores.

    For every word, each candidate tag is scored as
    P(tag | previous predicted tag) * P(word | tag) and the best-scoring tag is kept.
    Only the single best tag per position is carried forward (there is no
    backtracking over alternative paths). The first word uses the '.' row of the
    global `tags_df` as its "previous tag".

    Parameters
    ----------
    input_words : sequence of tokens to tag.
    training_bag : list of (word, tag) pairs used both for the candidate tag set and
        for the emission counts.

    Returns
    -------
    list of (word, predicted_tag) tuples, in input order.

    Notes
    -----
    A word unseen in `training_bag` scores 0 for every tag, so it receives whichever
    tag comes first in the internal tag list.
    """
    # List to store predicted states for each word in the input sequence
    predicted_states = []

    # Extract unique POS tags from the training data
    tag_set = list(set([pair[1] for pair in training_bag]))

    for index, word in enumerate(input_words):
        # The sentence start is modelled as following a '.' tag.
        previous_tag = '.' if index == 0 else predicted_states[-1]

        probabilities = []
        for tag in tag_set:
            transition_probability = tags_df.loc[previous_tag, tag]

            # Look the counts up once, from the same bag the tag set came from.
            word_count, tag_count = word_given_tag(word, tag, training_bag)
            emission_prob = word_count / tag_count

            probabilities.append(emission_prob * transition_probability)

        # Keep the tag with the highest state probability
        max_prob = max(probabilities)
        predicted_states.append(tag_set[probabilities.index(max_prob)])

    # Combine input words with their predicted POS tags and return the result
    return list(zip(input_words, predicted_states))


In [None]:
## First modification of Viterbi: handling unknown words
# A word never seen in training has an emission probability of zero for every tag,
# so its tag is chosen from the transition probabilities alone (weighted by how often each tag occurs).

In [None]:
def Viterbi_1(words, tagged_words_list=None, tags_df=None, word_given_tag=None, tag_probabilities=None):
    """Tag `words` one at a time, with a fallback for words unseen in training.

    Each candidate tag is scored as P(tag | previous predicted tag) * P(word | tag)
    and the best one is kept. When every score is 0 (e.g. the word never occurred
    in training), the tag is chosen by transition probability weighted by the tag's
    overall occurrence probability instead. Only the single best tag per position
    is carried forward. The first word uses the '.' row of `tags_df` as its
    "previous tag".

    Parameters
    ----------
    words : sequence of tokens to tag.
    tagged_words_list : list of (word, tag) training pairs.
    tags_df : DataFrame of transition values, indexed [previous_tag, next_tag].
    word_given_tag : callable(word, tag, tagged_words_list) -> (count_word_tag, count_tag).
    tag_probabilities : iterable of (tag, probability) pairs; the first entry for a
        tag is the one used.

    Any of the last four arguments may be omitted, in which case the module-level
    object of the same name is used. This keeps single-argument calls such as
    ``Viterbi_1(sentence)`` working.

    Returns
    -------
    list of (word, predicted_tag) tuples, in input order.
    """
    # Parameters shadow the globals of the same name, hence the explicit lookups.
    scope = globals()
    if tagged_words_list is None:
        tagged_words_list = scope['tagged_words_list']
    if tags_df is None:
        tags_df = scope['tags_df']
    if word_given_tag is None:
        word_given_tag = scope['word_given_tag']
    if tag_probabilities is None:
        tag_probabilities = scope['tag_probabilities']

    # Get the unique POS tags from the training data
    T = list(set([pair[1] for pair in tagged_words_list]))

    # tag -> occurrence probability, built once instead of scanning the list per tag.
    tag_weight = {}
    for listed_tag, listed_probability in tag_probabilities:
        tag_weight.setdefault(listed_tag, listed_probability)

    # Predicted POS tag for each word processed so far
    state = []

    for key, word in enumerate(words):
        # The sentence start is modelled as following a '.' tag.
        previous_tag = '.' if key == 0 else state[-1]

        p = []             # state probabilities (emission * transition) per tag
        p_transition = []  # transition probabilities weighted by tag occurrence, per tag

        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # Fetch the counts once; a tag with no occurrences contributes 0.
            word_count, tag_count = word_given_tag(word, tag, tagged_words_list)
            emission_p = word_count / tag_count if tag_count else 0.0
            p.append(emission_p * transition_p)

            # Weight the transition by how common the tag is overall.
            if tag in tag_weight:
                transition_p = transition_p * tag_weight[tag]
            p_transition.append(transition_p)

        # Choose the POS tag with the maximum state probability
        pmax = max(p)
        state_max = T[p.index(pmax)]

        # All scores zero (unknown word): fall back to the weighted transitions
        if pmax == 0:
            pmax = max(p_transition)
            state_max = T[p_transition.index(pmax)]

        state.append(state_max)

    # Pair every word with its predicted POS tag
    return list(zip(words, state))


In [None]:
def memm_features(sentence, index):
    """Build the feature dictionary describing the token at `sentence[index]`.

    Parameters
    ----------
    sentence : sequence of token strings.
    index : position of the token to describe.

    Returns
    -------
    dict with the token itself, position flags, shape flags, 2- and 3-character
    prefixes/suffixes, the neighbouring tokens ('' at the sentence edges) and the
    token length.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # True only when the first character is an upper-case letter. Comparing
        # word[0].upper() with word[0] is also true for digits and punctuation,
        # and word[0] fails on an empty token.
        'is_capitalized': word[:1].isupper(),
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'prefix_2': word[:2],  # Prefix of length 2
        'prefix_3': word[:3],  # Prefix of length 3
        'suffix_2': word[-2:],  # Suffix of length 2
        'suffix_3': word[-3:],  # Suffix of length 3
        'prev_word': '' if index == 0 else sentence[index - 1],  # Previous word
        'next_word': '' if index == last_index else sentence[index + 1],  # Next word
        'contains_digits': any(char.isdigit() for char in word),  # Any digit anywhere in the word
        'word_length': len(word),  # Length of the word
        'is_alphanumeric': word.isalnum(),  # Only letters and digits
    }


In [None]:
def viterbi_memm(sequence, hidden_states, model_weights):
    """Run the Viterbi recursion over `sequence` and return the best state path.

    Parameters
    ----------
    sequence : the observations; only its length is used by the scoring below.
    hidden_states : iterable of candidate states (re-iterated at every step).
    model_weights : dict mapping a state to its weight; states missing from the
        dict get 1e-10.

    Returns
    -------
    (max_probability, path) : the score of the best path and the list of states on
        it, one per element of `sequence`. An empty `sequence` yields (1.0, []).

    Notes
    -----
    NOTE(review): every step multiplies by ``model_weights[current_state]`` only, so
    the score depends neither on the observation nor on which state precedes which.
    Confirm whether per-observation feature scores were meant to be used here.
    """
    # Nothing to tag: empty path, with the empty product as its score.
    if len(sequence) == 0:
        return (1.0, [])

    dynamic_table = [{}]  # dynamic_table[t][state] = best score of a path ending in state at step t
    optimal_path = {}     # optimal_path[state] = best path found so far that ends in state

    # Initialization at time step 0
    for state in hidden_states:
        dynamic_table[0][state] = model_weights.get(state, 1e-10)
        optimal_path[state] = [state]

    # Recursion step
    for time_step in range(1, len(sequence)):
        dynamic_table.append({})
        new_optimal_path = {}
        for current_state in hidden_states:
            # Best predecessor for current_state; ties on score fall back to comparing the states.
            (probability, previous_state) = max(
                [
                    (dynamic_table[time_step - 1][prev_state] * model_weights.get(current_state, 1e-10), prev_state)
                    for prev_state in hidden_states
                ]
            )
            dynamic_table[time_step][current_state] = probability
            new_optimal_path[current_state] = optimal_path[previous_state] + [current_state]
        optimal_path = new_optimal_path

    # Termination step
    (max_probability, final_state) = max(
        [(dynamic_table[len(sequence) - 1][state], state) for state in hidden_states]
    )
    return (max_probability, optimal_path[final_state])



In [None]:
# Accumulator for the tag predictions: two parallel lists, one entry per sentence.
# NOTE ---> every tagged sentence must be stored under the same 'id' as the 'untagged_sentence' it was predicted from (test data)
submission = dict(id=[], tagged_sentence=[])

def store_submission(sent_id, tagged_sentence):
    '''
        Append one prediction (sentence id and its tagged tokens) to the global 'submission'.
    '''
    for column, value in (('id', sent_id), ('tagged_sentence', tagged_sentence)):
        submission[column].append(value)

def clear_submission():
    '''
        Rebind the global 'submission' to a fresh, empty accumulator.
    '''
    global submission
    submission = dict(id=[], tagged_sentence=[])

In [None]:
def hmm_tagger_util(sent_id, untagged_sentence):
    '''
        Tag one sentence with Viterbi_1 and record the result in the submission.

        Input : 'sent_id' -> id under which the prediction is stored
                'untagged_sentence' (list) -> tokens of the sentence to tag
        Output: None (the prediction is appended through store_submission)
    '''
    # Viterbi_1 declares the training pairs, transition table, emission function and
    # tag probabilities as parameters, so they are passed explicitly here.
    tagged_sentence = Viterbi_1(
        untagged_sentence, tagged_words_list, tags_df, word_given_tag, tag_probabilities
    )
    store_submission(sent_id, tagged_sentence)

In [None]:
# Tag every test sentence and store each prediction under its sentence id.
for sent_id, sent in tqdm(list(test_data.items())):
    hmm_tagger_util(sent_id, sent)

100%|██████████| 4000/4000 [3:12:37<00:00,  2.89s/it]


In [None]:
# Write the collected predictions to a CSV in the current working directory;
# index=False keeps the DataFrame's row index out of the file.
pd.DataFrame(submission).to_csv('sample_submissionfinal1.csv', index = False)