In [2]:
### Global utils
class bcolors:
    """Named ANSI escape sequences for styling text printed to a terminal.

    Every attribute is a plain string. A styled fragment is produced by
    writing one of the codes, then the text, then ``ENDC``.
    """

    # Escape sequences with codes in the 9x range, named by intended use.
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Code 0: written after a styled fragment to end the styling.
    ENDC = '\033[0m'
    # Text attributes rather than colors.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

## Es6

### Write a program in a programming language of your choice that, given a string T, constructs the grammar produced by the RE-PAIR algorithm.
### Also implement the construction of the grammar in Chomsky normal form.
### Also compute the size of the grammar in both cases.


In [6]:
import string


def repair_enc(T: str, n_char=0):
    """Compress ``T`` with the RE-PAIR algorithm.

    At every step the most frequent pair of adjacent symbols is replaced by a
    fresh non-terminal (a single uppercase ASCII letter) until no pair occurs
    more than once. Progress is printed at every step.

    Args:
        T: Text to compress. Every character is treated as one symbol.
        n_char: Index into ``string.ascii_uppercase`` of the first letter that
            may be used as a non-terminal. Letters that already occur in ``T``
            are skipped so a new rule can never shadow an existing symbol.

    Returns:
        A tuple ``(encoded, substitutions)`` where ``encoded`` is the final
        sequence and ``substitutions`` maps each created non-terminal to the
        two-symbol string it replaced, in creation order.

    Raises:
        IndexError: If a new non-terminal is needed but no unused uppercase
            letter is left.
        RuntimeError: If the text is still compressible after the maximum
            number of steps. With a 26-letter pool the letters run out first,
            so this is only a safety net.
    """
    alphabet = string.ascii_uppercase
    max_steps = 100

    # Non-terminal -> replaced pair, in creation order.
    substitutions = {}
    # Symbols that may not be handed out as a new non-terminal: everything
    # already present in the input plus every non-terminal created so far.
    reserved = set(T)

    for step in range(1, max_steps + 1):
        pair_freq = {}

        # Count each distinct adjacent pair once. str.count counts
        # non-overlapping occurrences, which matches what str.replace below
        # will actually substitute.
        for i in range(len(T) - 1):
            pair = T[i] + T[i + 1]
            if pair not in pair_freq:
                pair_freq[pair] = T.count(pair)

        # Print the frequency of all pairs in this step
        print(f"{bcolors.OKGREEN}Step {step}{bcolors.ENDC} - Pair Frequencies: {pair_freq}")

        # Ties are resolved in favour of the pair seen first in the text.
        # `default=None` covers inputs shorter than two symbols.
        max_pair = max(pair_freq, key=pair_freq.get, default=None)

        # Stop when there is no pair at all or no pair repeats.
        if max_pair is None or pair_freq[max_pair] <= 1:
            print(f"{bcolors.WARNING}No more frequent pairs. Terminating at Step {step}.{bcolors.ENDC}")
            break

        # Pick the next uppercase letter that is not already a symbol.
        while n_char < len(alphabet) and alphabet[n_char] in reserved:
            n_char += 1
        if n_char >= len(alphabet):
            raise IndexError("no unused uppercase letter left for a new non-terminal")
        NT = alphabet[n_char]
        reserved.add(NT)

        # Print the pair being replaced and the corresponding Non-Terminal
        print(f"{bcolors.OKCYAN}Step {step}{bcolors.ENDC} - Replacing Pair '{max_pair}' with Non-Terminal '{NT}'")

        # Replace the most frequent pair with the non-terminal symbol in the string
        T = T.replace(max_pair, NT)

        # Print the updated string after replacement
        print(f"{bcolors.OKBLUE}Step {step}{bcolors.ENDC} - Updated String: {T}\n")

        substitutions[NT] = max_pair
        n_char += 1
    else:
        # Reached only when the loop was never stopped by `break`.
        raise RuntimeError("Error, max iteration reached")

    return T, substitutions


In [7]:
# Demo: plain RE-PAIR on a short sample text.
x = "aaabbbaabaabab"
print("RE-PAIR Encoding:", x)

T, G = repair_enc(x)
keys = G.keys()

# One line per production rule, in the order the rules were created.
for k in keys:
    print(f"| {k}->{G[k]}")

print(f"Output code:{T}")
print()

RE-PAIR Encoding: aaabbbaabaabab
[92mStep 1[0m - Pair Frequencies: {'aa': 3, 'ab': 4, 'bb': 1, 'ba': 3}
[96mStep 1[0m - Replacing Pair 'ab' with Non-Terminal 'A'
[94mStep 1[0m - Updated String: aaAbbaAaAA

[92mStep 2[0m - Pair Frequencies: {'aa': 1, 'aA': 3, 'Ab': 1, 'bb': 1, 'ba': 1, 'Aa': 1, 'AA': 1}
[96mStep 2[0m - Replacing Pair 'aA' with Non-Terminal 'B'
[94mStep 2[0m - Updated String: aBbbBBA

[92mStep 3[0m - Pair Frequencies: {'aB': 1, 'Bb': 1, 'bb': 1, 'bB': 1, 'BB': 1, 'BA': 1}
[93mNo more frequent pairs. Terminating at Step 3.[0m
| A->ab
| B->aA
Output code:aBbbBBA



In [16]:
def repair_cnf(T: str):
    """Run RE-PAIR on ``T`` after lifting every terminal to a non-terminal.

    Each distinct character of the (whitespace-stripped) input first gets its
    own rule ``X -> terminal``. The rewritten text is then compressed with
    ``repair_enc``, so every rule in the returned grammar has either a single
    terminal or exactly two non-terminals on its right-hand side. The start
    rule is printed as the residual sequence and is not binarized.

    Args:
        T: Input text. Leading and trailing whitespace is ignored.

    Returns:
        A tuple ``(encoded, grammar)`` where ``encoded`` is the residual
        sequence of non-terminals and ``grammar`` maps each non-terminal to
        its right-hand side (terminal rules first, then pair rules).

    Raises:
        IndexError: If the text has more distinct symbols than there are
            uppercase ASCII letters.
    """
    print(f"Initial Input String: {T}")

    # Strip once and use the stripped text for both the symbol inventory and
    # the encoding, so edge whitespace cannot survive as an untranslated symbol.
    T = T.strip()
    unique_symbols = sorted(set(T))
    if len(unique_symbols) > len(string.ascii_uppercase):
        raise IndexError("more distinct symbols than available non-terminal letters")

    # Assign one uppercase letter per terminal, in sorted order.
    NT_symbols = dict(zip(string.ascii_uppercase, unique_symbols))
    n_char = len(NT_symbols)

    # Translate all terminals in a single pass. Chained str.replace calls
    # would re-substitute a letter produced by an earlier replacement when
    # the input itself contains uppercase letters (e.g. "1A").
    T = T.translate(str.maketrans({s: NT for NT, s in NT_symbols.items()}))

    print(f"Replaced symbols with Non-Terminal ones: {T}")
    print(f"Initial Non-Terminal Symbols: {NT_symbols}")

    # Use the RE-PAIR algorithm, starting with 'n_char' as the index for new Non-Terminals
    T, repair_dict = repair_enc(T, n_char=n_char)

    # Print the RE-PAIR dictionary and the resulting string
    print(f"RE-PAIR Dictionary: {repair_dict}")
    print(f"String after RE-PAIR encoding: {T}")

    # Merge the initial Non-Terminal substitutions with the RE-PAIR substitutions
    cnf_grammar = {**NT_symbols, **repair_dict}

    # Print the final grammar. The start symbol derives the residual
    # sequence; it derives the empty string only when the input was empty.
    print("Final CNF Grammar:")
    if "S" not in cnf_grammar:
        start_rhs = T if T else "\u03B5"
        print(f"  S -> {start_rhs}")
    for key, value in cnf_grammar.items():
        print(f"  {key} -> {value}")

    return T, cnf_grammar

In [17]:
# Demo: RE-PAIR with every terminal lifted to its own non-terminal first.
x = "aababbbabaababbbabbabb"
print("RE-PAIR Encoding with CNF Grammar:", x)

T, G_cnf = repair_cnf(x)
keys = G_cnf.keys()

print(f"Output code: {T}")

RE-PAIR Encoding with CNF Grammar: aababbbabaababbbabbabb
Initial Input String: aababbbabaababbbabbabb
Replaced symbols with Non-Terminal ones: AABABBBABAABABBBABBABB
Initial Non-Terminal Symbols: {'A': 'a', 'B': 'b'}
[92mStep 1[0m - Pair Frequencies: {'AA': 2, 'AB': 7, 'BA': 6, 'BB': 4}
[96mStep 1[0m - Replacing Pair 'AB' with Non-Terminal 'C'
[94mStep 1[0m - Updated String: ACCBBCACCBBCBCB

[92mStep 2[0m - Pair Frequencies: {'AC': 2, 'CC': 2, 'CB': 4, 'BB': 2, 'BC': 3, 'CA': 1}
[96mStep 2[0m - Replacing Pair 'CB' with Non-Terminal 'D'
[94mStep 2[0m - Updated String: ACDBCACDBDD

[92mStep 3[0m - Pair Frequencies: {'AC': 2, 'CD': 2, 'DB': 2, 'BC': 1, 'CA': 1, 'BD': 1, 'DD': 1}
[96mStep 3[0m - Replacing Pair 'AC' with Non-Terminal 'E'
[94mStep 3[0m - Updated String: EDBCEDBDD

[92mStep 4[0m - Pair Frequencies: {'ED': 2, 'DB': 2, 'BC': 1, 'CE': 1, 'BD': 1, 'DD': 1}
[96mStep 4[0m - Replacing Pair 'ED' with Non-Terminal 'F'
[94mStep 4[0m - Updated String: FBCFBDD

