In [7]:
import random
import math
import numpy as np
from tqdm import tqdm

# Number of bits in the flattened accessibility matrix (81 = 9 x 9).
# NOTE(review): assumed from the length-81 strings generated in this notebook; confirm.
size = 81
# Filler character that generate_answer appends to each answer so it is as
# long as the question (accessibility bits + proposition bits).
PAD = "2"

# start by generating the accessibility matrix
def generate_accessibility_matrix(length: int, prob=0.5):
    """Return a random string of ``length`` characters, each "0" or "1".

    Every character is drawn independently with one ``random.random()``
    call: it is "1" when the draw is below ``prob`` and "0" otherwise.

    Args:
        length: Number of characters to produce.
        prob: Threshold the draw is compared against (default 0.5).

    Returns:
        The generated bit string.
    """
    return "".join(
        "1" if random.random() < prob else "0"
        for _ in range(length)
    )

def generate_propositions(length: int, prob=0.5):
    """Return a random string of ``length`` truth-value characters.

    One ``random.random()`` draw is made per character, in order; a draw
    below ``prob`` yields "1", anything else yields "0".

    Args:
        length: Number of characters to produce.
        prob: Threshold the draw is compared against (default 0.5).

    Returns:
        The generated bit string.
    """
    # Lazy generator: draws are consumed one at a time, in sequence.
    draws = (random.random() for _ in range(length))
    return "".join(["1" if draw < prob else "0" for draw in draws])

def generate_answer(accessibility: str, propositions: str) -> str:
    """Compute the padded answer string for one accessibility/proposition pair.

    ``accessibility`` is read as an n x n matrix flattened row by row and
    ``propositions`` as a column of n values. Output character ``i`` is "1"
    when the product of matrix row ``i`` with the proposition column is
    positive (i.e. row ``i`` has a non-zero entry in some column whose
    proposition value is non-zero) and "0" otherwise.

    The n answer characters are then right-padded with ``PAD`` so that the
    result is ``len(accessibility) + len(propositions)`` characters long.

    Args:
        accessibility: Digit string whose length is a perfect square (n * n).
        propositions: Digit string of length n.

    Returns:
        The answer bits followed by the padding.

    Raises:
        ValueError: If ``len(accessibility)`` is not a perfect square, if
            ``len(propositions)`` does not equal its square root, or if a
            character cannot be parsed by ``int``.
    """
    # Round the float root, then verify exactly with integers so that
    # non-square lengths are rejected here rather than inside numpy's reshape.
    dim = int(round(math.sqrt(len(accessibility))))
    if dim * dim != len(accessibility):
        raise ValueError(
            "accessibility length must be a perfect square, got {}".format(
                len(accessibility)
            )
        )
    if len(propositions) != dim:
        raise ValueError(
            "propositions length must be {} for a {}x{} accessibility matrix, "
            "got {}".format(dim, dim, dim, len(propositions))
        )

    # reshape uses C (row-major) order: the first `dim` characters are row 0.
    matrix = np.array([int(bit) for bit in accessibility]).reshape((dim, dim))
    truth = np.array([int(bit) for bit in propositions]).reshape((dim, 1))

    # A positive product means the row and the proposition column overlap.
    reachable = matrix.dot(truth) > 0
    answer = "".join("1" if bit else "0" for bit in reachable.flatten())

    # Pad so the answer has the same length as accessibility + propositions.
    pad_count = len(accessibility) + len(propositions) - len(answer)
    return answer + PAD * pad_count

# Quick look at each generator's output. Sizes come from the module-level
# `size` constant (81 bits -> 9 propositions) instead of repeated literals.
print(generate_accessibility_matrix(size))
print(generate_propositions(int(math.sqrt(size))))
print(generate_answer(generate_accessibility_matrix(size), generate_propositions(int(math.sqrt(size)))))

111001100110001100011100100110110100110111000011111110101111110011100110011001100
111111010
111111111222222222222222222222222222222222222222222222222222222222222222222222222222222222


In [8]:
def generate_kripke_row(length: int) -> str:
    """Generate one comma-separated dataset row.

    The row has four fields: a source label, the question (accessibility
    string immediately followed by the proposition string), the padded answer
    from ``generate_answer``, and a fixed placeholder rating of "90".

    Args:
        length: Number of characters in the accessibility string; the
            proposition string gets ``int(sqrt(length))`` characters.

    Returns:
        The row as a single string without a trailing newline.
    """
    dim = int(math.sqrt(length))
    accessibility = generate_accessibility_matrix(length)
    propositions = generate_propositions(dim)
    answer = generate_answer(accessibility, propositions)

    # Build the label from the actual sizes so it stays truthful for any
    # `length`; for length=81 this is still exactly "81-9-9-generation".
    # NOTE(review): the three numbers are presumably accessibility bits,
    # proposition bits and answer bits; confirm the intended meaning.
    source = "{}-{}-{}-generation".format(len(accessibility), len(propositions), dim)

    # "90" is only a dummy rating for now.
    return ",".join([source, accessibility + propositions, answer, "90"])

# Preview one row, sized by the module-level `size` constant rather than a
# repeated literal 81.
print(generate_kripke_row(size))

81-9-9-generation,100010000010011011110011100010101100000010001001010100001111111100010000011110010000101101,011111101222222222222222222222222222222222222222222222222222222222222222222222222222222222,90


In [10]:
def build_kripke_dataset(rows: int, length: int, path: str = "kripke_dataset.csv") -> None:
    """Write a CSV file of generated Kripke rows.

    The file starts with the header ``source,question,answer,rating`` and is
    followed by ``rows`` lines, each produced by ``generate_kripke_row``.
    An existing file at ``path`` is overwritten.

    Args:
        rows: Number of data rows to write.
        length: Accessibility-string length passed to ``generate_kripke_row``.
        path: Output file path; defaults to "kripke_dataset.csv" in the
            current working directory.

    Returns:
        None. The result is the file written to ``path``.
    """
    with open(path, "w") as f:
        # No spaces after the commas: data rows are written without them, so a
        # padded header would give CSV readers columns named " question" etc.
        f.write("source,question,answer,rating\n")
        for _ in tqdm(range(rows), desc="Generating Kripke rows"):
            f.write(generate_kripke_row(length) + "\n")

# Generate the 8000-row dataset file, sized by the module-level `size`
# constant rather than a repeated literal 81.
build_kripke_dataset(8000, size)


Generating Kripke rows: 100%|██████████| 8000/8000 [00:00<00:00, 40515.58it/s]
