# Notebook for CCG Generation

## Defining a lexicon

In [1]:
class Lexicon:
    def __init__(self):
        # Dictionary to store lexical entries, where the key is a word
        # and the value is a list of categories (with optional semantics).
        self.entries = {}


    def add_entry(self, word, category, semantics=None):
        # Add a lexical entry to the lexicon.
        if word not in self.entries:
            self.entries[word] = []
        self.entries[word].append({'category': category, 'semantics': semantics})

    def get_categories(self, word):
        # Retrieve all categories for a given word.
        return self.entries.get(word, [])

    def __str__(self):
        # Display all entries in the lexicon.
        lexicon_str = "Lexicon:\n"
        for word, categories in self.entries.items():
            lexicon_str += f"{word}:\n"
            for entry in categories:
                semantics = entry['semantics'] if entry['semantics'] else 'None'
                lexicon_str += f"  - Category: {entry['category']}, Semantics: {semantics}\n"
        return lexicon_str

In [None]:
# Create an instance of the Lexicon
lexicon = Lexicon()

# Add entries
lexicon.add_entry("John", "NP", semantics="john")
lexicon.add_entry("Mary", "NP", semantics="mary")
lexicon.add_entry("Jack", "NP", semantics="jack")
lexicon.add_entry("Bill", "NP", semantics="bill")
lexicon.add_entry("likes", "(S\\NP)/NP", semantics="λx.λy.likes(y, x)")
lexicon.add_entry("runs", "S\\NP", semantics="λx.runs(x)")
lexicon.add_entry("the", "NP/N", semantics="λx.x")
lexicon.add_entry("dog", "N", semantics="dog")
lexicon.add_entry("cat", "N", semantics="cat")
lexicon.add_entry("brown", "N/N", semantics="λx.brown(x)")
lexicon.add_entry("big", "N/N", semantics="λx.big(x)")
lexicon.add_entry("and", "CONJ", semantics="λx.λy.and(x, y)")
lexicon.add_entry("would", "(S\\NP)/VP", semantics="λx.λy.will(y, x)")
lexicon.add_entry("prefer", "VP/NP", semantics="λx.prefer(x)")

# Handle ambiguous entries (e.g., "saw" as both a verb and noun)
lexicon.add_entry("saw", "(S\\NP)/NP", semantics="λx.λy.saw(y, x)")
lexicon.add_entry("saw", "N", semantics="saw")

In [None]:
# Retrieve categories for a specific word
word = "likes"
print(f"Categories for '{word}':")
for entry in lexicon.get_categories(word):
    print(f"  - Category: {entry['category']}, Semantics: {entry['semantics']}")

# Print the entire lexicon
print(lexicon)

## Generating sentences (including trees) without semantic constraints

In [2]:
import random

class SentenceGenerator: 
    #A class is a blueprint for creating objects (instances), 
    # which can hold data and have functions (methods) associated with them.
    def __init__(self, lexicon): 
        # Constructor method for the class: special method in Python automatically called when an 
        # object (an instance of the class) is created.
        self.lexicon = lexicon 
        # This line stores the lexicon parameter passed to the constructor as an instance attribute.

    def generate_sentence(self, target_category="S"):
        # Generate a sentence that matches the target category (default is 'S' for a sentence).
        # Defines a method named generate_sentence: 
        # a member function of the SentenceGenerator class, 
        # meaning it operates on instances of that class.
        sentence, remaining_category = self.build_sentence(target_category) 
        # This line calls the build_sentence method: result is expected to be a tuple.
        
        # If we couldn't completely match the category, return None
        if remaining_category is None: #This part handles the outcome of the sentence generation.
            return " ".join(sentence)
        else:
            return None

    def generate_multiple_sentences(self, target_category="S", count=5):
        # Generate multiple unique sentences that match the target category.
        sentences = set() #This is a set that will hold the unique sentences generated.
        attempts = 0 #This counter keeps track of the number of attempts made to generate a sentence.
        max_attempts = count * 5  # Allow multiple attempts to find unique sentences
        
        # Ensures loop continues until desired number of unique sentences has been generated or 
        # max number of attempts is reached.
        while len(sentences) < count and attempts < max_attempts:
            attempts += 1
            sentence = self.generate_sentence(target_category)
            if sentence and sentence not in sentences:
                sentences.add(sentence)
        
        return list(sentences)

    def build_sentence(self, target_category, depth=0, max_depth=10, used_conjunction=False, used_adjectives=None):
        # Initialize the set to track used adjectives if it's not passed in
        if used_adjectives is None:
            used_adjectives = set()

        if depth > max_depth:
            print(f"Exceeded max recursion depth: {depth} for target {target_category}")
            return [], None, None  # Include tree in return values


        if target_category == "S":
            # Randomly decide decomposition type with a 50/50 chance
            if random.choice([True, False]):
                # Decompose S into NP and S\NP
                print("Decomposing S into NP and S\\NP")
                np_sentence, np_category, np_tree = self.build_sentence("NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                if np_category is None:  # NP found
                    print(f"NP successfully resolved: {np_sentence}")
                    snp_sentence, snp_category, snp_tree = self.build_sentence("S\\NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                    if snp_category is None:  # S\NP found
                        print(f"Successfully decomposed S: NP='{np_sentence}' and S\\NP='{snp_sentence}'")
                        tree = {
                            "node": "S",
                            "children": [np_tree, snp_tree]
                        }
                        return np_sentence + snp_sentence, None, tree
                    else:
                        print(f"Failed to resolve S\\NP after matching NP='{np_sentence}'")
                else:
                    print(f"Failed to resolve NP for S decomposition")
            else:
                # Decompose S into S/NP and NP
                print("Decomposing S into S/NP and NP")
                snp_sentence, snp_category, snp_tree = self.build_sentence("S/NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                if snp_category is None:  # S/NP found
                    print(f"S/NP successfully resolved: {snp_sentence}")
                    np_sentence, np_category, np_tree = self.build_sentence("NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                    if np_category is None:  # NP found
                        print(f"Successfully decomposed S: S/NP='{snp_sentence}' and NP='{np_sentence}'")
                        tree = {
                            "node": "S",
                            "children": [snp_tree, np_tree]
                        }
                        return snp_sentence + np_sentence, None, tree
                    else:
                        print(f"Failed to resolve NP after matching S/NP='{snp_sentence}'")
                else:
                    print(f"Failed to resolve S/NP for S decomposition")
        
        if target_category == "S:DCL":
            # Randomly decide decomposition type with a 50/50 chance
            if random.choice([True, False]):
                # Decompose S:DCL into NP and S:DCL\NP
                print("Decomposing S:DCL into NP and S:DCL\\NP")
                np_sentence, np_category, np_tree = self.build_sentence("NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                if np_category is None:  # NP found
                    print(f"NP successfully resolved: {np_sentence}")
                    snp_sentence, snp_category, snp_tree = self.build_sentence("S:DCL\\NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                    if snp_category is None:  # S:DCL\NP found
                        print(f"Successfully decomposed S:DCL: NP='{np_sentence}' and S:DCL\\NP='{snp_sentence}'")
                        tree = {
                            "node": "S:DCL",
                            "children": [np_tree, snp_tree]
                        }
                        return np_sentence + snp_sentence, None, tree
                    else:
                        print(f"Failed to resolve S:DCL\\NP after matching NP='{np_sentence}'")
                else:
                    print(f"Failed to resolve NP for S:DCL decomposition")
            else:
                # Decompose S:DCL into S:DCL/NP and NP
                print("Decomposing S:DCL into S:DCL/NP and NP")
                snp_sentence, snp_category, snp_tree = self.build_sentence("S:DCL/NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                if snp_category is None:  # S:DCL/NP found
                    print(f"S:DCL/NP successfully resolved: {snp_sentence}")
                    np_sentence, np_category, np_tree = self.build_sentence("NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                    if np_category is None:  # NP found
                        print(f"Successfully decomposed S:DCL: S:DCL/NP='{snp_sentence}' and NP='{np_sentence}'")
                        tree = {
                            "node": "S:DCL",
                            "children": [snp_tree, np_tree]
                        }
                        return snp_sentence + np_sentence, None, tree
                    else:
                        print(f"Failed to resolve NP after matching S:DCL/NP='{snp_sentence}'")
                else:
                    print(f"Failed to resolve S:DCL/NP for S:DCL decomposition")
        

        lexicon_keys = list(self.lexicon.entries.keys())
        random.shuffle(lexicon_keys)

        for word in lexicon_keys:
            entries = self.lexicon.entries[word]
            for entry in entries:
                category = entry['category']
                print(f"Trying word '{word}' with category '{category}' to match '{target_category}'")

                # Skip repeated adjectives
                if category == "N/N" and word in used_adjectives:
                    print(f"Skipping repeated adjective '{word}'")
                    continue

                # Direct match:
                if self.normalize_category(category) == self.normalize_category(target_category):
                    print(f"Direct match found: '{word}' for category '{target_category}'")
                    if category == "N/N":
                        used_adjectives.add(word)  # Mark this adjective as used
                    tree = {"node": target_category, "children": [word]}
                    return [word], None, tree

                # Conjunction
                if category == "CONJ" and target_category in ["NP", "S", "(S\\NP)/NP", "S\\NP"] and not used_conjunction:
                    print(f"Trying conjunction '{word}' for category '{target_category}'")
                    left_sentence, remaining_category, left_tree = self.build_sentence(target_category, depth + 1, max_depth, True, used_adjectives)
                    if remaining_category is None:
                        print(f"Conjunction partly successful for '{word}'")
                        right_sentence, remaining_category, right_tree = self.build_sentence(target_category, depth + 1, max_depth, True, used_adjectives)
                        if remaining_category is None:
                            print(f"Conjunction successful for '{word}'")
                            tree = {
                                "node": target_category,
                                "children": [left_tree, {"node": "CONJ", "children": [word]}, right_tree]
                            }
                            return left_sentence + [word] + right_sentence, None, tree

                # Forward application: (A/B) + B → A
                if self.is_forward_function(category):
                    left_category, right_category = map(self.normalize_category, self.split_category(category))
                    if self.normalize_category(target_category) == left_category:
                        print(f"Attempting forward application: '{word}' as ({left_category}/{right_category})")
                        if category == "N/N":
                            used_adjectives.add(word)  # Mark this adjective as used
                        right_sentence, remaining_category, right_tree = self.build_sentence(right_category, depth + 1, max_depth, used_conjunction, used_adjectives)
                        if remaining_category is None:
                            print(f"Forward application successful for '{word}'")
                            tree = {
                                "node": left_category,
                                "children": [{"node": f"({left_category}/{right_category})", "children": [word]}, right_tree]
                            }
                            return [word] + right_sentence, None, tree

                # Backward application: B + (A\B) → A
                if self.is_backward_function(category):
                    left_category, right_category = map(self.normalize_category, self.split_category(category))
                    if self.normalize_category(target_category) == left_category:
                        print(f"Attempting backward application: '{word}' as ({left_category}\\{right_category})")
                        left_sentence, remaining_category, left_tree = self.build_sentence(right_category, depth + 1, max_depth, used_conjunction, used_adjectives)
                        if remaining_category is None:
                            print(f"Backward application successful for '{word}'")
                            tree = {
                                "node": left_category,
                                "children": [left_tree, {"node": f"({left_category}\\{right_category})", "children": [word]}]
                            }
                            return left_sentence + [word], None, tree
                        
                # Type raising and forward composition in one for NP (→ S/(S\NP)) + (S\NP)/NP → S/NP
                # for declarative sentences
                if target_category == "S:DCL/NP" and category == "NP":
                    print(f"Attempting type raising and forward composition for '{word}' to obtain S:DCL/NP")
                    right_sentence, remaining_category, right_tree = self.build_sentence("(S:DCL\\NP)/NP", depth + 1, max_depth, used_conjunction, used_adjectives)
                    if remaining_category is None:
                        print(f"Type raising and forward composition successful for '{word}'")
                        tree = {
                            "node": "S:DCL/NP",
                            "children": [{"node": "NP", "children": [word]}, right_tree]
                        }
                        return [word] + right_sentence, None, tree
                
                # # Forward composition: (A/B) + (B/C) → A/C (not necessary?)
                # if self.is_forward_function(category) and self.is_forward_function(target_category):
                #     left_category, right_category = map(self.normalize_category, self.split_category(category))
                #     left_target_category, right_target_category = self.split_category(target_category)
                #     if left_target_category == left_category:
                #         print(f"Attempting forward composition: '{word}' as ({left_category}/{right_category})")
                #         right_sentence, remaining_category, right_tree = self.build_sentence(f"{right_category}/{right_target_category}", depth + 1, max_depth, used_conjunction, used_adjectives)
                #         if remaining_category is None:
                #             print(f"Forward composition successful for '{word}'")
                #             tree = {
                #                 "node": target_category,
                #                 "children": [{"node": f"({left_category}/{right_category})", "children": [word]}, right_tree]
                #             }
                #             return [word] + right_sentence, None, tree
                #     elif right_target_category == right_category:
                #         print(f"Attempting forward composition: '{word}' as ({left_category}/{right_category})")
                #         left_sentence, remaining_category, left_tree = self.build_sentence(f"{left_target_category}/{left_category}", depth + 1, max_depth, used_conjunction, used_adjectives)
                #         if remaining_category is None:
                #             print(f"Forward composition successful for '{word}'")
                #             tree = {
                #                 "node": target_category,
                #                 "children": [left_tree, {"node": f"({left_category}/{right_category})", "children": [word]}]
                #             }
                #             return left_sentence + [word], None, tree
                        
                # # Backward composition: (B\C) + (A\B) → A\C (not necessary?)
                # if self.is_backward_function(category) and self.is_backward_function(target_category):
                #     left_category, right_category = map(self.normalize_category, self.split_category(category))
                #     left_target_category, right_target_category = self.split_category(target_category)
                #     if left_target_category == left_category:
                #         print(f"Attempting backward composition: '{word}' as ({left_category}\\{right_category})")
                #         left_sentence, remaining_category, left_tree = self.build_sentence(f"{right_category}\\{right_target_category}", depth + 1, max_depth, used_conjunction, used_adjectives)
                #         if remaining_category is None:
                #             print(f"Backward composition successful for '{word}'")
                #             tree = {
                #                 "node": target_category,
                #                 "children": [left_tree, {"node": f"({left_category}/{right_category})", "children": [word]}]
                #             }
                #             return left_sentence + [word], None, tree
                #     elif right_target_category == right_category:
                #         print(f"Attempting backward composition: '{word}' as ({left_category}\\{right_category})")
                #         right_sentence, remaining_category, right_tree = self.build_sentence(f"{left_target_category}\\{left_category}", depth + 1, max_depth, used_conjunction, used_adjectives)
                #         if remaining_category is None:
                #             print(f"Backward composition successful for '{word}'")
                #             tree = {
                #                 "node": target_category,
                #                 "children": [{"node": f"({left_category}/{right_category})", "children": [word]}, right_tree]
                #             }
                #             return [word] + right_sentence, None, tree


        print(f"No match found for target '{target_category}' at depth {depth}")
        return [], target_category, None


    def is_forward_function(self, category):
        # Check if the category is a forward function (A/B).
        return "/" in category

    def is_backward_function(self, category):
        # Check if the category is a backward function (A\B).
        return "\\" in category

    def split_category(self, function_category):
        # Split a function category (A/B or A\B) into its components A and B.
        if "/" in function_category:
            return function_category.split("/", 1)
        elif "\\" in function_category:
            return function_category.split("\\", 1)
        return None, None
    
    def normalize_category(self, category):
        # Remove unnecessary parentheses from a category.
        while category.startswith("(") and category.endswith(")"):
            category = category[1:-1]
        return category

In [3]:
import anytree
from anytree import Node, RenderTree

# New code for visualizing the tree
def build_visual_tree(tree, parent=None):
    """
    Recursively convert the JSON-like tree into an anytree.Node tree.
    """
    if isinstance(tree, dict):
        node = Node(tree['node'], parent=parent)
        for child in tree.get('children', []):
            build_visual_tree(child, parent=node)
        return node
    elif isinstance(tree, str):  # Leaf node (word)
        return Node(tree, parent=parent)

In [None]:
import json

generator = SentenceGenerator(lexicon)

# Generate multiple sentences and store them in a matrix
num_sentences = 5  # Adjust the number of sentences as needed
matrix = []  # Initialize the matrix to store sentences and trees

# Generate sentences and trees
for _ in range(num_sentences):
    sentence, _, tree = generator.build_sentence("S")  # Generate sentence and its tree
    matrix.append([" ".join(sentence), tree])  # Append as a row in the matrix

# Print the matrix with ASCII tree visualizations
print("Generated Sentences and Trees:")
for i, (sentence, tree) in enumerate(matrix):
    print(f"Sentence {i + 1}: {sentence}")
    ascii_tree_root = build_visual_tree(tree)  # Convert tree to anytree format
    print("Tree Visualization:")
    for pre, fill, node in RenderTree(ascii_tree_root):
        print(f"{pre}{node.name}")
    print("-" * 50)

# Save the matrix to a JSON file
with open("sentences_and_trees.json", "w") as f:
    json.dump(matrix, f, indent=2)

print("\nMatrix saved to 'sentences_and_trees.json'")

## Extracting lexicon from data and generating sentences

In [None]:
import json
import os
import re

# Function to normalize categories
def normalize_category(category):
    """
    Normalize syntactic categories:
    - Replace S:DCL with S, preserving structure in compound categories.
    """
    # Replace all occurrences of S:DCL with S, even in nested categories
    normalized = re.sub(r"\bS:DCL\b", "S", category)
    return normalized

# Function to extract words and categories from multiple files and populate the Lexicon
def populate_lexicon_from_folder(folder_path, lexicon):
    """
    Extract words and syntactic categories from all relevant files in a folder and populate the lexicon.
    Ensures that syntactic types (categories) are capitalized and S:DCL is treated as S.
    """
    pattern = re.compile(r"t\(([^,]+),\s*'([^']+)',\s*\[.*?\]\)")

    # List all relevant files in the folder
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith("en.parse.tags"):  # Process only files with this extension
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")
                with open(file_path, 'r') as f:
                    for line in f:
                        matches = pattern.findall(line)
                        for match in matches:
                            syntactic_type, word = match
                            # Capitalize the syntactic type and normalize S:DCL to S
                            #normalized_type = normalize_category(syntactic_type.upper())
                            lexicon.add_entry(word, syntactic_type.upper())
                            #print(f"Added entry: Word='{word}', Category='{normalized_type}'")

# Example usage

# Create an instance of Lexicon
lexicon2 = Lexicon()

# Path to the p00 folder
## Update the path to the p00 folder on your system!
folder_path = "/Users/rubenreijerse/repos/ccg-lm/p00"

# Populate the lexicon from all files in the folder
populate_lexicon_from_folder(folder_path, lexicon2)

# Print the populated lexicon
print(lexicon2)


Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d2508/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d2153/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d1948/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d0856/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d0869/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d0055/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d3420/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d1383/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d0867/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d3418/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d3285/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d0858/en.parse.tags
Processing file: /Users/rubenreijerse/repos/ccg-lm/p00/d2303/en.parse.tags
Processing file: /Users/r

## PROBLEM: At the moment only capable to generate S:DCL sentences

CCG S:.. categories:

S:DCL (full declarative sentence)\
S:Q (full question sentence)\
S:B (bare infinitive clause? Don't know if full sentence)\
S:ADJ (some sort of adjectival sentence like 'John is busy' - but busy is S:ADJ\\NP?)\
S:NG ?\
S:PSS (passive sentence - but doesn't seem full: acquired is 'S:PSS\NP')\
S:TO ?\
S:PT ?\

In [None]:
#Generate one sentence

generator2 = SentenceGenerator(lexicon2)

sentence2, _, tree2 = generator2.build_sentence("S:DCL")

sentence2join = " ".join(sentence2)
print(f"Sentence: {sentence2join}")
ascii_tree_root = build_visual_tree(tree2)  # Convert tree to anytree format
print("Tree Visualization:")
for pre, fill, node in RenderTree(ascii_tree_root):
    print(f"{pre}{node.name}")

In [6]:
# Generate multiple sentences and store them in a matrix

generator2 = SentenceGenerator(lexicon2)

num_sentences2 = 10  # Adjust the number of sentences as needed
matrix2 = []  # Initialize the matrix to store sentences and trees

# Generate sentences and trees
for _ in range(num_sentences2):
    sentence, _, tree = generator2.build_sentence("S:DCL")  # Generate sentence and its tree
    matrix2.append([" ".join(sentence), tree])  # Append as a row in the matrix

# Print the matrix with ASCII tree visualizations
print("Generated Sentences and Trees:")
for i, (sentence, tree) in enumerate(matrix2):
    print(f"Sentence {i + 1}: {sentence}")
    ascii_tree_root = build_visual_tree(tree)  # Convert tree to anytree format
    print("Tree Visualization:")
    for pre, fill, node in RenderTree(ascii_tree_root):
        print(f"{pre}{node.name}")
    print("-" * 50)

# Save the matrix to a JSON file
with open("sentences_and_trees2.json", "w") as f:
    json.dump(matrix2, f, indent=2)

print("\nMatrix saved to 'sentences_and_trees2.json'")

Decomposing S:DCL into NP and S:DCL\NP
Trying word 'invited' with category '((S:DCL\NP)/PP)/NP' to match 'NP'
Trying word 'mow' with category '(S:B\NP)/NP' to match 'NP'
Trying word 'escaped' with category '(S:DCL\NP)/PP' to match 'NP'
Trying word 'whom' with category 'S:WQ/(S:Q/NP)' to match 'NP'
Trying word 'did' with category '(S:DCL\NP)/(S:B\NP)' to match 'NP'
Trying word 'did' with category '(S:DCL\NP)/(S:B\NP)' to match 'NP'
Trying word 'did' with category '(S:DCL\NP)/(S:B\NP)' to match 'NP'
Trying word 'did' with category '(S:Q/(S:B\NP))/NP' to match 'NP'
Trying word 'did' with category '(S:DCL\NP)/(S:B\NP)' to match 'NP'
Trying word 'did' with category '(S:Q/(S:B\NP))/NP' to match 'NP'
Trying word 'did' with category '(S:Q/(S:B\NP))/NP' to match 'NP'
Trying word 'did' with category '(S:DCL\NP)/(S:B\NP)' to match 'NP'
Trying word 'did' with category '(S:DCL\NP)/(S:B\NP)' to match 'NP'
Trying word 'door' with category 'N' to match 'NP'
Trying word 'door' with category 'N' to matc