In [4]:
# Import statements
import numpy as np
import pandas as pd
import csv
import xml.etree.ElementTree as ET
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split




In [9]:
import xml.etree.ElementTree as ET
import csv

# Convert the training XML into a CSV of lowercased context windows: one row
# per <tok>, holding the three preceding word forms, the token itself, the
# three following word forms, and the token's class label.
tree = ET.parse('../Data/train.xml')
root = tree.getroot()

with open('train_orth_context.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Prev3", "Prev2", "Prev1", "Current", "Next1", "Next2", "Next3", "Class"])

    # Each <chunk> is processed on its own, so a window never reaches into a
    # neighbouring chunk; positions outside the chunk are written as "_".
    for chunk in root.findall('.//chunk'):
        orths = []
        ctags = []
        for tok in chunk.findall('tok'):
            # findtext() returns None when the element is absent and "" when it
            # is present but empty; `or "_"` maps both to the placeholder. The
            # previous `elem.text if elem is not None` form raised
            # AttributeError on an empty element, whose .text is None.
            orth_text = tok.findtext('orth') or "_"
            # The class is the part of the first <ctag> (searched at any depth
            # under the token) that precedes the first ":".
            ctag_text = (tok.findtext('.//ctag') or "_").split(":")[0]

            orths.append(orth_text.lower())
            ctags.append(ctag_text.lower())

        # Padding both ends with three placeholders turns the window for token
        # i into the plain slice padded[i:i + 7] (Prev3 .. Next3), replacing
        # the per-column bounds checks.
        padding = ["_"] * 3
        padded = padding + orths + padding
        for i, current_class in enumerate(ctags):
            writer.writerow(padded[i:i + 7] + [current_class])

print("CSV file created as 'train_orth_context.csv'.")


CSV file created as 'train_orth_context.csv'.


In [10]:
import xml.etree.ElementTree as ET
import csv

# Convert the test XML into a CSV of lowercased context windows: one row per
# <tok>, holding the three preceding word forms, the token itself, the three
# following word forms, and the token's class label.
tree = ET.parse('../Data/test-1-1.xml')
root = tree.getroot()

with open('test_orth_context.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Prev3", "Prev2", "Prev1", "Current", "Next1", "Next2", "Next3", "Class"])

    # Each <chunk> is processed on its own, so a window never reaches into a
    # neighbouring chunk; positions outside the chunk are written as "_".
    for chunk in root.findall('.//chunk'):
        orths = []
        ctags = []
        for tok in chunk.findall('tok'):
            # findtext() returns None when the element is absent and "" when it
            # is present but empty; `or "_"` maps both to the placeholder. The
            # previous `elem.text if elem is not None` form raised
            # AttributeError on an empty element, whose .text is None.
            orth_text = tok.findtext('orth') or "_"
            # The class is the part of the first <ctag> (searched at any depth
            # under the token) that precedes the first ":".
            ctag_text = (tok.findtext('.//ctag') or "_").split(":")[0]

            orths.append(orth_text.lower())
            ctags.append(ctag_text.lower())

        # Padding both ends with three placeholders turns the window for token
        # i into the plain slice padded[i:i + 7] (Prev3 .. Next3), replacing
        # the per-column bounds checks.
        padding = ["_"] * 3
        padded = padding + orths + padding
        for i, current_class in enumerate(ctags):
            writer.writerow(padded[i:i + 7] + [current_class])

print("CSV file created as 'test_orth_context.csv'.")


CSV file created as 'test_orth_context.csv'.


In [25]:
import xml.etree.ElementTree as ET
import csv

# Convert the training XML into a CSV with a narrower context window plus the
# base form: one row per <tok>, holding the two preceding word forms, the token
# itself, the two following word forms, the class label, and the token's base
# form. All values are lowercased.
tree = ET.parse('../Data/train.xml')
root = tree.getroot()

with open('improved_train_orth_context.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Prev2", "Prev1", "Current", "Next1", "Next2", "Class", "Base"])

    # Each <chunk> is processed on its own, so a window never reaches into a
    # neighbouring chunk; positions outside the chunk are written as "_".
    for chunk in root.findall('.//chunk'):
        orths = []
        ctags = []
        bases = []
        for tok in chunk.findall('tok'):
            # findtext() returns None when the element is absent and "" when it
            # is present but empty; `or "_"` maps both to the placeholder. The
            # previous `elem.text if elem is not None` form raised
            # AttributeError on an empty element, whose .text is None.
            orth_text = tok.findtext('orth') or "_"
            # The class is the part of the first <ctag> (searched at any depth
            # under the token) that precedes the first ":".
            ctag_text = (tok.findtext('.//ctag') or "_").split(":")[0]
            # The base form is the first <base> found at any depth under the token.
            base_text = tok.findtext('.//base') or "_"

            orths.append(orth_text.lower())
            ctags.append(ctag_text.lower())
            bases.append(base_text.lower())

        # Padding both ends with two placeholders turns the window for token i
        # into the plain slice padded[i:i + 5] (Prev2 .. Next2), replacing the
        # per-column bounds checks.
        padding = ["_"] * 2
        padded = padding + orths + padding
        for i, (current_class, current_base) in enumerate(zip(ctags, bases)):
            writer.writerow(padded[i:i + 5] + [current_class, current_base])

print("CSV file created as 'improved_train_orth_context.csv'.")


CSV file created as 'improved_train_orth_context.csv'.


In [27]:
import xml.etree.ElementTree as ET
import csv

# Convert the test XML into a CSV with a narrower context window plus the base
# form: one row per <tok>, holding the two preceding word forms, the token
# itself, the two following word forms, the class label, and the token's base
# form. All values are lowercased.
tree = ET.parse('../Data/test-1-1.xml')
root = tree.getroot()

with open('improved_test_orth_context.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Prev2", "Prev1", "Current", "Next1", "Next2", "Class", "Base"])

    # Each <chunk> is processed on its own, so a window never reaches into a
    # neighbouring chunk; positions outside the chunk are written as "_".
    for chunk in root.findall('.//chunk'):
        orths = []
        ctags = []
        bases = []
        for tok in chunk.findall('tok'):
            # findtext() returns None when the element is absent and "" when it
            # is present but empty; `or "_"` maps both to the placeholder. The
            # previous `elem.text if elem is not None` form raised
            # AttributeError on an empty element, whose .text is None.
            orth_text = tok.findtext('orth') or "_"
            # The class is the part of the first <ctag> (searched at any depth
            # under the token) that precedes the first ":".
            ctag_text = (tok.findtext('.//ctag') or "_").split(":")[0]
            # The base form is the first <base> found at any depth under the token.
            base_text = tok.findtext('.//base') or "_"

            orths.append(orth_text.lower())
            ctags.append(ctag_text.lower())
            bases.append(base_text.lower())

        # Padding both ends with two placeholders turns the window for token i
        # into the plain slice padded[i:i + 5] (Prev2 .. Next2), replacing the
        # per-column bounds checks.
        padding = ["_"] * 2
        padded = padding + orths + padding
        for i, (current_class, current_base) in enumerate(zip(ctags, bases)):
            writer.writerow(padded[i:i + 5] + [current_class, current_base])

print("CSV file created as 'improved_test_orth_context.csv'.")


CSV file created as 'improved_test_orth_context.csv'.
