<a href="https://colab.research.google.com/github/EngineerBear8000/ML-HMM-project/blob/main/q1and2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
def parse_data(file_path):
    sentences = []
    current_sentence = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line == '\n':  # Empty line separates sentences
                    if current_sentence:  # Add the sentence to the list
                        sentences.append(current_sentence)
                        current_sentence = []
                else:
                    # print(line.strip().split(' '))
                    # try to split line into token and tag, if no split then just use line as token
                    if ' ' not in line:
                        current_sentence.append((line.strip(), None))
                    else:
                        token, tag = line.strip().split(' ')
                        current_sentence.append((token, tag))
    except FileNotFoundError:
        print(f"File {file_path} not found.")

    if current_sentence:  # Add the last sentence to the list
        sentences.append(current_sentence)

    return sentences

In [27]:
#implement smoothing function

def smooth_data(data):
    # Count the occurrences of each word
    word_counts = {}
    for sentence in data:
        for token, _ in sentence:
            if token not in word_counts:
                word_counts[token] = 0 #if its the first time we see that word, then add a new entry for it
            word_counts[token] += 1

    # Replace words that appear less than 3 times with #UNK#
    smoothed_data = []
    for sentence in data:
        smoothed_sentence = []
        for token, tag in sentence:
            if word_counts[token] < 3:
                smoothed_sentence.append(('#UNK#', tag))
            else:
                smoothed_sentence.append((token, tag))
        smoothed_data.append(smoothed_sentence)

    return smoothed_data

smoothed_data = smooth_data(labelled_data)

In [4]:
#check if movie is in labelled_data

for sentence in smoothed_data:
    for token, _ in sentence:
        if token == 'movie':
            print(sentence)

[('Producers', 'B-NP'), ('Don', 'I-NP'), ('#UNK#', 'I-NP'), ('and', 'I-NP'), ('Jerry', 'I-NP'), ('#UNK#', 'I-NP'), (',', 'O'), ('who', 'B-NP'), ('#UNK#', 'B-VP'), ('``', 'O'), ('#UNK#', 'B-NP'), ("''", 'O'), ('through', 'B-PP'), ('several', 'B-NP'), ('#UNK#', 'I-NP'), ('and', 'O'), ('ultimately', 'B-VP'), ('produced', 'I-VP'), ('the', 'B-NP'), ('movie', 'I-NP'), (',', 'O'), ('#UNK#', 'B-VP'), ('when', 'B-ADVP'), ('Messrs.', 'B-NP'), ('Guber', 'I-NP'), ('and', 'I-NP'), ('Peters', 'I-NP'), ('take', 'B-VP'), ('credit', 'B-NP'), ('for', 'B-PP'), ('the', 'B-NP'), ('film', 'I-NP'), ('.', 'O')]
[('We', 'B-NP'), ('are', 'B-VP'), ('the', 'B-NP'), ('producers', 'I-NP'), ('of', 'B-PP'), ('that', 'B-NP'), ('movie', 'I-NP'), ('.', 'O')]
[('Mr.', 'B-NP'), ('Guber', 'I-NP'), ('got', 'B-VP'), ('his', 'B-NP'), ('start', 'I-NP'), ('in', 'B-PP'), ('the', 'B-NP'), ('movie', 'I-NP'), ('business', 'I-NP'), ('at', 'B-PP'), ('Columbia', 'B-NP'), ('two', 'B-NP'), ('decades', 'I-NP'), ('ago', 'B-ADVP'), ('.', '

In [30]:
def get_unique_labels(data):

    # takes in a list of lists each inner list represents a sentence with tokens and their corresponding tags,

    unique_labels = set()

    # Iterate over each sentence in the data
    for sentence in data:
        if isinstance(sentence, list):  # List of lists or numpy array
            for token_tag in sentence:
                label = token_tag[1]  # Extract the tag from the tuple
                unique_labels.add(label)
        elif isinstance(sentence, dict):  # Dictionary
            for label in sentence.values():
                unique_labels.add(label)

    return unique_labels

unique_tags = get_unique_labels(labelled_data) # Tags list
print(unique_tags)

{'B-LST', 'I-VP', 'I-NP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-NP', 'B-PP', 'B-UCP', 'B-INTJ', 'I-SBAR', 'I-PP', 'B-CONJP', 'I-INTJ', 'I-UCP', 'O', 'B-VP', 'I-CONJP', 'B-PRT', 'B-ADJP', 'B-SBAR'}


In [11]:
# Initialize the emission probabilities dictionary
e = {}
for tag in unique_tags:
    for observation in set([token for sentence in smoothed_data for token, _ in sentence]):
        e[(observation, tag)] = 0

# Count the emission probabilities
tag_counts = {}
for tag in unique_tags:
    tag_counts[tag] = 0

for sentence in smoothed_data:
    for token, tag in sentence:
        e[(token , tag)] += 1
        tag_counts[tag] += 1

# Normalize the emission probabilities
for pair in e:
    # print(pair)
    observation, tag = pair
    e[pair] /= tag_counts[tag]

for pair in e:
  if e[pair] != 0:
    print(pair, e[pair])

('Third', 'B-LST') 0.09090909090909091
('2', 'B-LST') 0.18181818181818182
('#UNK#', 'B-LST') 0.09090909090909091
('1', 'B-LST') 0.09090909090909091
('First', 'B-LST') 0.09090909090909091
('4', 'B-LST') 0.09090909090909091
('Second', 'B-LST') 0.09090909090909091
('b', 'B-LST') 0.09090909090909091
('a', 'B-LST') 0.09090909090909091
('3', 'B-LST') 0.09090909090909091
('narrowly', 'I-VP') 9.84348853233586e-05
('guide', 'I-VP') 0.0001968697706467172
('convert', 'I-VP') 0.0006890441972635102
('mentioned', 'I-VP') 0.0001968697706467172
('achieved', 'I-VP') 0.0002953046559700758
('returning', 'I-VP') 0.0001968697706467172
('distribute', 'I-VP') 0.0002953046559700758
('placed', 'I-VP') 0.0003937395412934344
('position', 'I-VP') 9.84348853233586e-05
('contested', 'I-VP') 9.84348853233586e-05
('jointly', 'I-VP') 0.0003937395412934344
('look', 'I-VP') 0.0010827837385569445
('enacted', 'I-VP') 0.000492174426616793
('pull', 'I-VP') 9.84348853233586e-05
('soon', 'I-VP') 0.000492174426616793
('indicat

In [13]:
#check if 'movie' is in e
'movie' in [pair[0] for pair in e]

True

In [14]:
test_data = parse_data('dev.in')

In [15]:
known_words = set(observation for observation, tag in e.keys())

def predict_tags(data, e, unique_tags, known_words):
    predictions = []
    for sentence in data:
        sentence_predictions = []
        for token, _ in sentence:

            if token not in known_words:
                # print(f"Unknown word: {token}")
                token = '#UNK#'
            token_predictions = {}
            for tag in unique_tags:
                # print(token)
                # print(tag)
                if (token, tag) in e:
                    token_predictions[tag] = e[(token, tag)]
            predicted_tag = max(token_predictions, key=token_predictions.get)
            sentence_predictions.append((token, predicted_tag))
        predictions.append(sentence_predictions)
    return predictions

y_pred = predict_tags(test_data, e, unique_tags, known_words)

In [16]:
y_pred

[[('HBO', 'B-NP'),
  ('has', 'B-VP'),
  ('close', 'B-ADJP'),
  ('to', 'B-PP'),
  ('24', 'I-NP'),
  ('million', 'I-NP'),
  ('subscribers', 'I-NP'),
  ('to', 'B-PP'),
  ('its', 'B-NP'),
  ('HBO', 'B-NP'),
  ('and', 'I-UCP'),
  ('#UNK#', 'B-INTJ'),
  ('networks', 'I-NP'),
  (',', 'O'),
  ('while', 'B-SBAR'),
  ('Showtime', 'B-NP'),
  ('and', 'I-UCP'),
  ('its', 'B-NP'),
  ('#UNK#', 'B-INTJ'),
  ('service', 'I-NP'),
  (',', 'O'),
  ('The', 'B-NP'),
  ('#UNK#', 'B-INTJ'),
  ('#UNK#', 'B-INTJ'),
  (',', 'O'),
  ('have', 'I-VP'),
  ('only', 'I-CONJP'),
  ('about', 'B-PP'),
  ('10', 'I-ADVP'),
  ('million', 'I-NP'),
  (',', 'O'),
  ('according', 'B-PP'),
  ('to', 'B-PP'),
  ('Paul', 'B-NP'),
  ('#UNK#', 'B-INTJ'),
  ('Associates', 'I-NP'),
  (',', 'O'),
  ('a', 'B-LST'),
  ('#UNK#', 'B-INTJ'),
  (',', 'O'),
  ('Calif.', 'B-NP'),
  (',', 'O'),
  ('research', 'I-NP'),
  ('firm', 'I-NP'),
  ('.', 'O')],
 [('#UNK#', 'B-INTJ'),
  ('#UNK#', 'B-INTJ'),
  ('#UNK#', 'B-INTJ'),
  ('after', 'B-SBAR'),
  

In [17]:
# write a function to output the prediction results into a file with no file extension
def save_results(y_pred):
    # Open a new file in write mode
    with open('dev.p2.out', 'w') as f:
        for sentence_predictions in y_pred:
            # Write each token and its predicted tag to the file
            for token, predicted_tag in sentence_predictions:
                f.write(f"{token} {predicted_tag}\n")
save_results(y_pred)

In [18]:
# write a function to output the prediction results into a file with no file extension
def save_results(y_pred, output_filename): # Added output_filename parameter
    # Open a new file in write mode
    with open(output_filename, 'w') as f:
        for sentence_predictions in y_pred:
            # Write each token and its predicted tag for the current sentence
            for token, predicted_tag in sentence_predictions:
                # Handle potential None tags if they somehow sneak in, replace with 'O' or similar default
                tag_to_write = predicted_tag if predicted_tag is not None else 'O'
                f.write(f"{token} {tag_to_write}\n")
            # Add a blank line after each sentence
            f.write("\n")

# Example usage when calling the function:
save_results(y_pred, 'dev.p2.out')

Q2

In [34]:
def parse_data2(file_path):
    sentences = []
    current_sentence = []

    try:
      with open(file_path, 'r') as file:
          for line in file:
              if line == '\n':  # Empty line separates sentences
                  if current_sentence:  # Add the sentence to the list
                      current_sentence = ['START'] + current_sentence + ['STOP'] # Add START and STOP to current_sentence before adding it to the list
                      sentences.append(current_sentence)
                      current_sentence = []
              else:
                  # try to split line into token and tag, if no split then just use line as token
                  if ' ' not in line:
                      current_sentence.append((line.strip(), None))
                  else:
                      token, tag = line.strip().split(' ')
                      current_sentence.append(tag) # Only the tag is needed for transmission
    except FileNotFoundError:
        print(f"File {file_path} not found.")

    if current_sentence:  # Add the last sentence to the list
        sentences.append(current_sentence)

    return sentences

labels = parse_data2('train')


In [20]:
from collections import defaultdict
def transitionCount(data):
  # Calculate the number of transitions for each tag in list labels
  transitions = defaultdict(lambda: defaultdict(int))
  tag_counts = defaultdict(int)
  for sentence in data:
    for i in range(1, len(sentence)):
      prev_tag = sentence[i-1]
      current_tag = sentence[i]
      transitions[prev_tag][current_tag] += 1
      tag_counts[prev_tag] += 1
  # Calculate the probabilities
  transition_probs = defaultdict(dict)
  for prev_tag in transitions:
      total = tag_counts[prev_tag] # Count(y(i-1))
      for current_tag in transitions[prev_tag]:
          prob = transitions[prev_tag][current_tag] / total # q = Count(y(i-1),y(i)) / Count(y(i-1))
          transition_probs[prev_tag][current_tag] = prob
  return transition_probs

transition_probs = transitionCount(labels) # transition_probs stores transition probabilities
display(transition_probs)

defaultdict(dict,
            {'START': {'B-NP': 0.6480490669450607,
              'B-PP': 0.1087041628604985,
              'O': 0.14185045021532036,
              'B-ADVP': 0.05428683283309409,
              'B-ADJP': 0.003262429857758058,
              'B-SBAR': 0.02257601461568576,
              'B-CONJP': 0.00026099438862064463,
              'B-VP': 0.018661098786376094,
              'B-INTJ': 0.0013049719431032234,
              'B-LST': 0.0010439775544825785},
             'B-NP': {'I-NP': 0.6847056336539478,
              'B-VP': 0.13030335059718845,
              'O': 0.08096395729838284,
              'B-PP': 0.05800655321847585,
              'B-NP': 0.028897579537046823,
              'B-ADVP': 0.009808688299334109,
              'B-ADJP': 0.0032131909946094494,
              'B-SBAR': 0.00340344572455343,
              'B-PRT': 0.00035937004544974105,
              'STOP': 0.00023253355882042067,
              'B-UCP': 2.1139414438220062e-05,
              'B-CONJP': 8.4

In [21]:
def q(prev_tag, curr_tag,transition_probs):
  return transition_probs[prev_tag][curr_tag]

In [58]:
def viterbi(x, tags, transition_probs, emission_probs):
  n = len(x)
  dp = [{} for _ in range(n)]
  backptr = [{} for _ in range(n)]

  # Initialization for 1st word
  for tag in tags:
    trans_p = transition_probs['START'].get(tag, 1e-6) # .get(key, default)
    emit_p = emission_probs.get((x[0], tag), 1e-6)
    dp[0][tag] = trans_p * emit_p
    backptr[0][tag] = None

  # Recursion for words 1 to n-1
  for i in range(1, n):
        for current_tag in tags:
            max_prob = 0
            best_prev = None
            emit_p = emission_probs.get((x[i], current_tag), 1e-6)

            for prev_tag in tags:
                trans_p = transition_probs.get(prev_tag, {}).get(current_tag, 1e-6)
                prob = dp[i-1][prev_tag] * trans_p * emit_p

                if prob > max_prob:
                    max_prob = prob
                    best_prev = prev_tag

            dp[i][current_tag] = max_prob
            backptr[i][current_tag] = best_prev

  # Termination: Find the best last tag
  max_final_prob = 0
  best_last_tag = None
  for tag in tags:
      if dp[-1][tag] > max_final_prob:
          max_final_prob = dp[-1][tag]
          best_last_tag = tag

  # Backtrack to find the full sequence of tags
  best_tags = [best_last_tag]
  for i in range(n-1, 0, -1):
      best_tags.insert(0, backptr[i][best_tags[0]])

  return best_tags

In [53]:
def parse_data3(file_path):
    sentences = []
    current_sentence = []

    try:
      with open(file_path, 'r') as file:
          for line in file:
              if line == '\n':  # Empty line separates sentences
                  if current_sentence:  # Add the sentence to the list
                      current_sentence = ['START'] + current_sentence + ['STOP'] # Add START and STOP to current_sentence before adding it to the list
                      sentences.append(current_sentence)
                      current_sentence = []
              else:
                  if ' ' not in line:
                      current_sentence.append(line.strip())
                  else:
                      token = line.strip().split(' ')
                      current_sentence.append(token) # Only the tag is needed for transmission
    except FileNotFoundError:
        print(f"File {file_path} not found.")

    if current_sentence:  # Add the last sentence to the list
        sentences.append(current_sentence)

    return sentences

test_data = parse_data3('dev.in')
print(test_data)

[['START', 'HBO', 'has', 'close', 'to', '24', 'million', 'subscribers', 'to', 'its', 'HBO', 'and', 'Cinemax', 'networks', ',', 'while', 'Showtime', 'and', 'its', 'sister', 'service', ',', 'The', 'Movie', 'Channel', ',', 'have', 'only', 'about', '10', 'million', ',', 'according', 'to', 'Paul', 'Kagan', 'Associates', ',', 'a', 'Carmel', ',', 'Calif.', ',', 'research', 'firm', '.', 'STOP'], ['START', 'WASHINGTON', 'LIES', 'LOW', 'after', 'the', 'stock', 'market', "'s", 'roller-coaster', 'ride', '.', 'STOP'], ['START', 'This', 'may', 'seem', 'to', 'be', 'a', 'preposterous', 'and', 'utterly', 'futile', 'effort', 'in', 'Africa', '.', 'STOP'], ['START', 'American', 'Express', 'Bank', 'earnings', 'fell', '50', '%', 'to', '$', '21.3', 'million', 'from', '$', '42.5', 'million', 'despite', 'a', '29', '%', 'revenue', 'gain', '.', 'STOP'], ['START', 'Californians', ',', 'meanwhile', ',', 'tried', 'to', 'cope', 'with', 'still-limited', 'services', ',', 'blocked', 'roadways', 'and', 'water', 'shortag

In [64]:
# Run algorithm on dev.in and write results to dev.p2.out
predicted_tags = []
for sentence in test_data:
  predicted_tags.append(viterbi(sentence, unique_tags, transition_probs, e))

# Write to dev.p2.out
with open('dev.p2.out', 'w') as f:
    for sentence, tags in zip(test_data, predicted_tags):
        for word, tag in zip(sentence, tags):
            f.write(f"{word} {tag}\n")
        f.write("\n")

In [65]:
with open('dev.p2.out', 'r') as f:
    content = f.read()
    print(content)

START B-PP
HBO B-NP
has B-VP
close I-VP
to B-PP
24 B-NP
million I-NP
subscribers I-NP
to B-PP
its B-NP
HBO I-NP
and O
Cinemax B-NP
networks I-NP
, O
while B-SBAR
Showtime B-NP
and O
its B-NP
sister I-NP
service I-NP
, O
The B-NP
Movie I-NP
Channel I-NP
, O
have B-VP
only I-VP
about B-PP
10 B-NP
million I-NP
, O
according B-PP
to B-PP
Paul B-NP
Kagan I-NP
Associates I-NP
, O
a B-NP
Carmel I-NP
, O
Calif. B-NP
, O
research B-NP
firm I-NP
. O
STOP B-NP

START B-NP
WASHINGTON I-NP
LIES I-NP
LOW I-NP
after B-PP
the B-NP
stock I-NP
market I-NP
's B-NP
roller-coaster I-NP
ride I-NP
. O
STOP B-NP

START B-PP
This B-NP
may B-VP
seem I-VP
to I-VP
be I-VP
a B-NP
preposterous I-NP
and O
utterly B-NP
futile I-NP
effort I-NP
in I-NP
Africa I-NP
. O
STOP B-NP

START B-NP
American I-NP
Express I-NP
Bank I-NP
earnings I-NP
fell B-VP
50 B-NP
% I-NP
to B-PP
$ B-NP
21.3 I-NP
million I-NP
from B-PP
$ B-NP
42.5 I-NP
million I-NP
despite B-PP
a B-NP
29 I-NP
% I-NP
revenue I-NP
gain I-NP
. O
STOP B-NP

START 

**Results of evalResult.py:**

Entity in gold data: 3, Entity in prediction: 33

Correct Entity : 1

Entity  precision: 0.0303, Entity  recall: 0.3333, Entity  F: 0.0556

Correct Sentiment : 0
Sentiment  precision: 0.0000,
Sentiment  recall: 0.0000,
Sentiment  F: 0.0000