### Word Sense Disambiguation (WSD)

##### Import libraries

In [1]:
import logging
import re
import sys
import math
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import scipy
from nltk.metrics import ConfusionMatrix

import os
import datetime

##### Set up logger

In [2]:
logging.basicConfig(
    filename='decision-list-log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()

#### decision-list.py

##### Algorithm:

1. Read the XML files (TRAIN_XML and TEST_XML) and define the output file paths (OUTPUT and OUTPUT_ANSWERS).
2. Define a set of feature functions in the feature_set() function. These functions extract specific patterns from a line and associate them with a sense. Additional feature functions can be added as needed.
3. Implement a probability() function that calculates the probability of an attribute occurring in the sense_text and other_text.
4. Read the training XML file (TRAIN_XML) using BeautifulSoup and create an attribute list by identifying patterns from line-train.xml and associating attributes with sense. This is done by iterating over the instances in the XML file and extracting the relevant text based on the sense.
5. Read the testing XML file (TEST_XML) using BeautifulSoup and initialize variables for counting the occurrences of "phone" and "product" senses.
6. Calculate the probability for each attribute in the attribute list by calling the probability() function. The probabilities are written to the OUTPUT file.
7. Sort the attribute_list based on their probability scores to create a decision list.
8. Determine the default sense based on the counts of "phone" and "product" senses in the training data.
9. Iterate over the instances in the testing XML file and extract the context. For each context, perform a search on a particular attribute in the context by iterating over the attribute_list and checking if the attribute matches the context.
10. If a match is found, assign the associated sense to the instance. If no match is found, assign the default sense.
11. Write the assigned senses to the OUTPUT_ANSWERS file in the required format.
12. Print the assigned senses in the same format as the gold standard file.

In [3]:
# Run %%writefile decision-list.py to write and save decision-list.py

# Input files
TRAIN_XML = r"C:\Users\muge\Github\line-data\line-train.xml"
TEST_XML = r"C:\Users\muge\Dropbox\GMU\AIT 526\Module 7\line-data\line-test.xml"
OUTPUT = r"C:\Users\muge\Dropbox\GMU\AIT 526\Module 7\line-data\my-decision-list.txt"
OUTPUT_ANSWERS = r"C:\Users\muge\Dropbox\GMU\AIT 526\Module 7\line-data\my-line-answers.txt"

def feature_set():
    # Define feature functions

    # Example:
    def vote_feature(line):
        return bool(re.search(r'vote', line)), 'phone'
    yield vote_feature

    def growth_feature(line):
        return bool(re.search(r'growth', line)), 'phone'
    yield growth_feature

    # Add more feature functions as needed

def probability(attribute, sense_text, other_text):
    count1 = 0
    count2 = 0

    for line in sense_text:
        if line is not None and attribute(line)[0]:
            count1 += 1

    for line in other_text:
        if line is not None and attribute(line)[0]:
            count2 += 1

    total_count = count1 + count2
    prob1 = count1 / total_count
    prob2 = count2 / total_count

    try:
        ratio = math.log10(prob1 / prob2)
    except ZeroDivisionError:
        ratio = 1

    with open(OUTPUT, 'a+') as output:
        output.write(f'{attribute.__name__}\t{ratio}\t{sense_text[0][1]}\n')

    logger.info(f'Attribute: {attribute.__name__}, Ratio: {ratio}, Sense: {sense_text[0][1]}')

    return ratio

if __name__ == '__main__':
    logger.info("Starting the decision-list.py script")

    with open(TRAIN_XML) as f:
        data = f.read()

    parser = BeautifulSoup(data, 'xml')

    attribute_list = []

    # Create an attribute list by identifying patterns from line-train.xml and associate attributes with sense
    for feature in feature_set():
        attribute_list.append(feature)

    textsense1 = []
    textsense2 = []

    # Read the XML file into training argument
    for instance in parser.find_all('instance'):
        if instance.answer['senseid'] == 'phone':
            for tag in instance.find_all('s'):
                string = tag.string
                textsense1.append(string)
        else:
            for tag in instance.find_all('s'):
                string = tag.string
                textsense2.append(string)

    # Calculate probability for each attribute in the corpus
    for attribute in attribute_list:
        probability(attribute, textsense1, textsense2)

    # Sort the attributes accordingly based on their probability scores and create a decision list
    attribute_list.sort(key=lambda attribute: probability(attribute, textsense1, textsense2))

    phone_count = len(parser.find_all(senseid="phone"))
    product_count = len(parser.find_all(senseid="product"))
    default_sense = 'phone' if phone_count > product_count else 'product'

    with open(TEST_XML) as y:
        data = y.read()

    parser = BeautifulSoup(data, 'xml')

    # Read the XML file into testing argument
    for instance in parser.find_all('instance'):
        context = tuple(
            tag.string for tag in instance.find_all('s')
            if tag.string is not None
        )

        sense = None

        # Read the XML file based on the tag instance and read the context
        # Perform search on a particular attribute in the context
        for line in context:
            for attribute in attribute_list:
                if attribute(line)[0]:
                    sense = attribute(line)[1]
                    break

        # If a particular attribute is matched in the attribute list, assign the associated sense
        # If no match is found, assign the default sense (Phone)
        if sense is None:
            sense = default_sense

        id_num = instance['id']
        with open(OUTPUT_ANSWERS, 'a+') as output_check:
            output_check.write(f'<answer instance="{id_num}" senseid="{sense}"/>\n')

        # Print the output in the same standard as the gold standard file
        print(f'<answer instance="{id_num}" senseid="{sense}"/>')

    logger.info("Finished running the decision-list.py script")

<answer instance="line-n.w8_059:8174:" senseid="product"/>
<answer instance="line-n.w7_098:12684:" senseid="product"/>
<answer instance="line-n.w8_106:13309:" senseid="product"/>
<answer instance="line-n.w9_40:10187:" senseid="product"/>
<answer instance="line-n.w9_16:217:" senseid="product"/>
<answer instance="line-n.w8_119:16927:" senseid="product"/>
<answer instance="line-n.w8_008:13756:" senseid="product"/>
<answer instance="line-n.w8_041:15186:" senseid="product"/>
<answer instance="line-n.art7} aphb 05601797:" senseid="product"/>
<answer instance="line-n.w8_119:2964:" senseid="product"/>
<answer instance="line-n.w7_040:13652:" senseid="product"/>
<answer instance="line-n.w7_122:2194:" senseid="product"/>
<answer instance="line-n.art7} aphb 45903907:" senseid="product"/>
<answer instance="line-n.art7} aphb 43602625:" senseid="product"/>
<answer instance="line-n.w8_034:3995:" senseid="product"/>
<answer instance="line-n.w8_139:696:" senseid="product"/>
<answer instance="line-n.art7

#### scorer.py

##### Algorithm:

1. Set up logging and file paths.
2. Read the output file's content and add it to the f1 list.
3. Divide each line of f1 at:" and add the key-value pairs that result to the var1 list.
4. Make an empty dictionary called predicted and fill it with the key-value pairs from var1.
5. After removing the values from the predicted dictionary, create the predicted_list.
6. Open the key file and add the information to the f2 list.
7. Divide each line of f2 at:" and add the key-value pairs that result to the var2 list.
8. Create the key-value pairs from var2 and place them in the empty dictionary observed.
9. After removing the values from the observed dictionary, create the observed_list.
10. Using the observed_list and predicted_list, create a ConfusionMatrix object (cm).
11. Set the counter variable x to zero.
12. Go through the lists of predictions and observations repeatedly, increasing x for each pair that matches.
13. Divide x by the length of the predicted_list, then multiply the result by 100 to find the accuracy.
14. Print the confusion matrix and the accuracy matrix (cm).
15. Update the logging to reflect the script's completion.

In [4]:
# Run %%writefile scorer.py to write and save scorer.py

# Configure logging
logging.basicConfig(filename='scorer-log.txt', level=logging.INFO)
logger = logging.getLogger()

def main():
    logger.info("Starting the scorer.py script")

    output_file = r"C:\Users\muge\Dropbox\GMU\AIT 526\Module 7\line-data\my-line-answers.txt"
    key_file = r"C:\Users\muge\Dropbox\GMU\AIT 526\Module 7\line-data\line-answers.txt"

    # Read and process the predicted answers
    with open(output_file) as file:
        f1 = [line.rstrip('\n') for line in file]
        var1 = [i.split(':"', 1) for i in f1 if ':"' in i]  # Ensure split is valid
        predicted = {}

    for a in range(len(var1)):
        key = var1[a][0]
        value = var1[a][1]
        predicted[key] = value

    predicted_list = [predicted[v] for v in predicted if v in predicted]

    # Read and process the observed answers
    with open(key_file) as myf1:
        f2 = [line.rstrip('\n') for line in myf1]
        var2 = [i.split(':"', 1) for i in f2 if ':"' in i]  # Ensure split is valid
        observed = {}

    for a in range(len(var2)):
        key = var2[a][0]
        value = var2[a][1]
        observed[key] = value

    observed_list = [observed[v] for v in observed if v in observed]

    # Debugging output
    logger.info(f"Length of predicted_list: {len(predicted_list)}")
    logger.info(f"Length of observed_list: {len(observed_list)}")

    if len(predicted_list) != len(observed_list):
        logger.error("Lists have different lengths.")
        print("Error: Lists have different lengths.")
        return

    # Calculate and print accuracy and confusion matrix
    cm = ConfusionMatrix(observed_list, predicted_list)
    correct = sum(1 for i in range(len(predicted_list)) if predicted_list[i] == observed_list[i])
    accuracy = (correct / len(predicted_list)) * 100

    print('Accuracy of the classifier is:', accuracy, '\n\nConfusion Matrix: ', str(cm))
    logger.info("Finished running the scorer.py script")

if __name__ == '__main__':
    main()

Accuracy of the classifier is: 42.857142857142854 

Confusion Matrix:                       |       |
                     |     s |
                     |     e |
                     |  s  n |
                     |  e  s |
                     |  n  e |
                     |  s  i |
                     |  e  d |
                     |  i  = |
                     |  d  " |
                     |  =  p |
                     |  "  r |
                     |  p  o |
                     |  h  d |
                     |  o  u |
                     |  n  c |
                     |  e  t |
                     |  "  " |
                     |  /  / |
                     |  >  > |
---------------------+-------+
   senseid="phone"/> | <.>72 |
 senseid="product"/> |  .<54>|
---------------------+-------+
(row = reference; col = test)



##### Create a log list

In [5]:
# Function to log the script execution details
def log_execution(script_path):
    # Get the directory of the script
    script_dir = os.path.dirname(script_path)

    # Create the log file path
    log_file_path = os.path.join(script_dir, "decision-list-log.txt")

    # Get the current date and time
    current_time = datetime.datetime.now()

    # Create a log message
    log_message = f"Script executed on {current_time}\n"

    # Write the log message to the file
    with open(log_file_path, "a") as log_file:
        log_file.write(log_message)

    print("Script execution logged successfully.")

if __name__ == "__main__":
    # Check if the script filename is provided as an argument
    if len(sys.argv) > 1:
        script_file = sys.argv[1]

        # Get the absolute path of the script file
        script_path = os.path.abspath(script_file)

        # Log the script execution
        log_execution(script_path)
    else:
        print("decision-list-log.txt")

Script execution logged successfully.
