In [1]:
def parse_data(file_path):
    sentences = []
    current_sentence = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line == '\n':  # Empty line separates sentences
                    if current_sentence:  # Add the sentence to the list
                        current_sentence.append(('','STOP'))
                        current_sentence.insert(0, ('','START'))
                        sentences.append(current_sentence)
                        current_sentence = []
                else:
                    # print(line.strip().split(' ')) 
                    # try to split line into token and tag, if no split then just use line as token
                    if ' ' not in line:
                        current_sentence.append((line.strip(), None))
                    else:
                        token, tag = line.strip().split(' ')
                        current_sentence.append((token, tag))
    except FileNotFoundError:
        print(f"File {file_path} not found.")

    if current_sentence:  # Add the last sentence to the list
        sentences.append(current_sentence)

    return sentences

labelled_data = parse_data('train')


In [2]:
labelled_data

[[('', 'START'),
  ('Municipal', 'B-NP'),
  ('bonds', 'I-NP'),
  ('are', 'B-VP'),
  ('generally', 'B-ADVP'),
  ('a', 'B-ADJP'),
  ('bit', 'I-ADJP'),
  ('safer', 'I-ADJP'),
  ('than', 'B-PP'),
  ('corporate', 'B-NP'),
  ('bonds', 'I-NP'),
  ('in', 'B-PP'),
  ('a', 'B-NP'),
  ('recession', 'I-NP'),
  (',', 'O'),
  ('but', 'O'),
  ('not', 'B-ADJP'),
  ('as', 'I-ADJP'),
  ('safe', 'I-ADJP'),
  ('as', 'B-PP'),
  ('bonds', 'B-NP'),
  ('issued', 'B-VP'),
  ('by', 'B-PP'),
  ('the', 'B-NP'),
  ('federal', 'I-NP'),
  ('government', 'I-NP'),
  ('.', 'O'),
  ('', 'STOP')],
 [('', 'START'),
  ('He', 'B-NP'),
  ('added', 'B-VP'),
  ('that', 'B-SBAR'),
  ('the', 'B-NP'),
  ('cost', 'I-NP'),
  ('for', 'B-PP'),
  ('stress-related', 'B-NP'),
  ('compensation', 'I-NP'),
  ('claims', 'I-NP'),
  ('is', 'B-VP'),
  ('about', 'B-NP'),
  ('twice', 'I-NP'),
  ('the', 'I-NP'),
  ('average', 'I-NP'),
  ('for', 'B-PP'),
  ('all', 'B-NP'),
  ('injury', 'I-NP'),
  ('claims', 'I-NP'),
  ('.', 'O'),
  ('', 'STOP')],


In [42]:
#implement smoothing function

def smooth_data(data):
    # Count the occurrences of each word
    word_counts = {}
    for sentence in data:
        for token, _ in sentence:
            if token not in word_counts:
                word_counts[token] = 0 #if its the first time we see that word, then add a new entry for it
            word_counts[token] += 1 
    
    # Replace words that appear less than 3 times with #UNK#
    smoothed_data = []
    for sentence in data:
        smoothed_sentence = []
        for token, tag in sentence:
            if word_counts[token] < 3:
                smoothed_sentence.append(('#UNK#', tag))
            else:
                smoothed_sentence.append((token, tag))
        smoothed_data.append(smoothed_sentence)
    
    return smoothed_data

smoothed_data = smooth_data(labelled_data)

In [43]:

#check if movie is in labelled_data

for sentence in smoothed_data:
    for token, _ in sentence:
        if token == 'movie':
            print(sentence)

[('', 'START'), ('Producers', 'B-NP'), ('Don', 'I-NP'), ('#UNK#', 'I-NP'), ('and', 'I-NP'), ('Jerry', 'I-NP'), ('#UNK#', 'I-NP'), (',', 'O'), ('who', 'B-NP'), ('#UNK#', 'B-VP'), ('``', 'O'), ('#UNK#', 'B-NP'), ("''", 'O'), ('through', 'B-PP'), ('several', 'B-NP'), ('#UNK#', 'I-NP'), ('and', 'O'), ('ultimately', 'B-VP'), ('produced', 'I-VP'), ('the', 'B-NP'), ('movie', 'I-NP'), (',', 'O'), ('#UNK#', 'B-VP'), ('when', 'B-ADVP'), ('Messrs.', 'B-NP'), ('Guber', 'I-NP'), ('and', 'I-NP'), ('Peters', 'I-NP'), ('take', 'B-VP'), ('credit', 'B-NP'), ('for', 'B-PP'), ('the', 'B-NP'), ('film', 'I-NP'), ('.', 'O'), ('', 'STOP')]
[('', 'START'), ('We', 'B-NP'), ('are', 'B-VP'), ('the', 'B-NP'), ('producers', 'I-NP'), ('of', 'B-PP'), ('that', 'B-NP'), ('movie', 'I-NP'), ('.', 'O'), ('', 'STOP')]
[('', 'START'), ('Mr.', 'B-NP'), ('Guber', 'I-NP'), ('got', 'B-VP'), ('his', 'B-NP'), ('start', 'I-NP'), ('in', 'B-PP'), ('the', 'B-NP'), ('movie', 'I-NP'), ('business', 'I-NP'), ('at', 'B-PP'), ('Columbia', 

In [44]:
def get_unique_labels(data):
   
    # takes in a list of lists each inner list represents a sentence with tokens and their corresponding tags,
    
    unique_labels = set()

    # Iterate over each sentence in the data
    for sentence in data:
        if isinstance(sentence, list):  # List of lists or numpy array
            for token_tag in sentence:
                label = token_tag[1]  # Extract the tag from the tuple
                unique_labels.add(label)
        elif isinstance(sentence, dict):  # Dictionary
            for label in sentence.values():
                unique_labels.add(label)

    return unique_labels

unique_tags = get_unique_labels(labelled_data)
print(unique_tags) 

{'I-ADJP', 'O', 'I-PP', 'B-VP', 'I-INTJ', 'B-PRT', 'I-CONJP', 'I-NP', 'B-LST', 'B-UCP', 'B-INTJ', 'B-ADJP', 'I-UCP', 'B-NP', 'B-SBAR', 'B-PP', 'B-CONJP', 'I-VP', 'I-ADVP', 'I-SBAR', 'STOP', 'START', 'B-ADVP'}


In [45]:


# Initialize the emission probabilities dictionary
e = {}
for tag in unique_tags:
    for observation in set([token for sentence in smoothed_data for token, _ in sentence]):
        e[(observation, tag)] = 0

# Count the emission probabilities
tag_counts = {}
for tag in unique_tags:
    tag_counts[tag] = 0

for sentence in smoothed_data:
    for token, tag in sentence:
        e[(token , tag)] += 1
        tag_counts[tag] += 1

# Normalize the emission probabilities
for pair in e:
    # print(pair)
    observation, tag = pair
    e[pair] /= tag_counts[tag]

In [46]:
#check if 'movie' is in e
'movie' in [pair[0] for pair in e]

True

In [47]:
test_data = parse_data('dev.in')

In [None]:

known_words = set(observation for observation, tag in e.keys())

def predict_tags(data, e, unique_tags, known_words): 
    predictions = []
    for sentence in data:
        sentence_predictions = []
        for token, _ in sentence:
            
            if token not in known_words:
                # print(f"Unknown word: {token}") 
                token = '#UNK#'
            token_predictions = {}
            for tag in unique_tags:
                # print(token)
                # print(tag)
                if (token, tag) in e:
                    token_predictions[tag] = e[(token, tag)]                
            predicted_tag = max(token_predictions, key=token_predictions.get)
            if predicted_tag != 'START' and predicted_tag != 'STOP': #strip out all of the start and end tags
                sentence_predictions.append((token, predicted_tag))
        predictions.append(sentence_predictions)
    return predictions

y_pred = predict_tags(test_data, e, unique_tags, known_words)

In [49]:
y_pred

[[('HBO', 'B-NP'),
  ('has', 'B-VP'),
  ('close', 'B-ADJP'),
  ('to', 'B-PP'),
  ('24', 'I-NP'),
  ('million', 'I-NP'),
  ('subscribers', 'I-NP'),
  ('to', 'B-PP'),
  ('its', 'B-NP'),
  ('HBO', 'B-NP'),
  ('and', 'I-UCP'),
  ('#UNK#', 'B-INTJ'),
  ('networks', 'I-NP'),
  (',', 'O'),
  ('while', 'B-SBAR'),
  ('Showtime', 'B-NP'),
  ('and', 'I-UCP'),
  ('its', 'B-NP'),
  ('#UNK#', 'B-INTJ'),
  ('service', 'I-NP'),
  (',', 'O'),
  ('The', 'B-NP'),
  ('#UNK#', 'B-INTJ'),
  ('#UNK#', 'B-INTJ'),
  (',', 'O'),
  ('have', 'I-VP'),
  ('only', 'I-CONJP'),
  ('about', 'B-PP'),
  ('10', 'I-ADVP'),
  ('million', 'I-NP'),
  (',', 'O'),
  ('according', 'B-PP'),
  ('to', 'B-PP'),
  ('Paul', 'B-NP'),
  ('#UNK#', 'B-INTJ'),
  ('Associates', 'I-NP'),
  (',', 'O'),
  ('a', 'B-LST'),
  ('#UNK#', 'B-INTJ'),
  (',', 'O'),
  ('Calif.', 'B-NP'),
  (',', 'O'),
  ('research', 'I-NP'),
  ('firm', 'I-NP'),
  ('.', 'O')],
 [('#UNK#', 'B-INTJ'),
  ('#UNK#', 'B-INTJ'),
  ('#UNK#', 'B-INTJ'),
  ('after', 'B-SBAR'),
  

In [50]:
# write a function to output the prediction results into a file with no file extension
def save_results(y_pred, output_filename): # Added output_filename parameter
    # Open a new file in write mode
    with open(output_filename, 'w') as f:
        for sentence_predictions in y_pred:
            # Write each token and its predicted tag for the current sentence
            for token, predicted_tag in sentence_predictions:
                # Handle potential None tags if they somehow sneak in, replace with 'O' or similar default
                tag_to_write = predicted_tag if predicted_tag is not None else 'O'
                f.write(f"{token} {tag_to_write}\n")
            # Add a blank line after each sentence
            f.write("\n")

# Example usage when calling the function:
save_results(y_pred, 'dev_startstop.p2.out')

#Entity in gold data: 13179
#Entity in prediction: 19406

#Correct Entity : 9152
Entity  precision: 0.4716
Entity  recall: 0.6944
Entity  F: 0.5617

#Correct Sentiment : 7644
Sentiment  precision: 0.3939
Sentiment  recall: 0.5800
Sentiment  F: 0.4692