In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import Counter
import os

In [8]:
# Locate the project root. The notebook's working directory is taken to be a
# folder such as 'notebooks/' that sits directly under the root.
current_dir = Path(".").resolve()
BASE_DIR = current_dir.parent
print(BASE_DIR)

# The dataset lives at <project root>/data/dataset.json.
data_path = BASE_DIR.joinpath("data", "dataset.json")

# Parse the whole JSON document into memory.
with data_path.open(mode='r', encoding='utf-8') as f:
    data = json.load(f)

D:\AdminStorage\Documents\trns-ai-2025


In [9]:
# Tabulate the loaded JSON records so they can be analysed column-wise.
df = pd.DataFrame(data)

# Dataset size: the number of rows in the frame.
num_problems = df.shape[0]
print(f"Number of problems: {num_problems}")

Number of problems: 481


In [10]:
# Per-problem tallies of logical constructs in the FOL premises.
# Each list holds one integer per row of df, in row order.

# Quantifiers are recognised only as the first character of a premise.
universal_counts = [
    sum(formula.startswith('∀') for formula in premises)
    for premises in df['premises-FOL']
]
existential_counts = [
    sum(formula.startswith('∃') for formula in premises)
    for premises in df['premises-FOL']
]

# Implication and negation symbols are recognised anywhere in a premise;
# a premise counts once no matter how often the symbol occurs in it.
implication_counts = [
    sum('→' in formula for formula in premises)
    for premises in df['premises-FOL']
]
negation_counts = [
    sum('¬' in formula for formula in premises)
    for premises in df['premises-FOL']
]

# Report the average number of matching premises per problem.
print("\nLogical constructs in premises:")
print(f"  Universal statements (∀): {np.mean(universal_counts):.2f} per example")
print(f"  Existential statements (∃): {np.mean(existential_counts):.2f} per example")
print(f"  Implications (→): {np.mean(implication_counts):.2f} per example")
print(f"  Negations (¬): {np.mean(negation_counts):.2f} per example")


Logical constructs in premises:
  Universal statements (∀): 3.43 per example
  Existential statements (∃): 0.64 per example
  Implications (→): 7.06 per example
  Negations (¬): 2.65 per example


In [11]:
# Display name -> predicate deciding whether one FOL premise string counts
# towards that construct. Quantifiers are matched only as the first character
# of the premise; '→' and '¬' are matched anywhere in it.
_CONSTRUCT_TESTS = {
    'Universal': lambda premise: premise.startswith('∀'),
    'Existential': lambda premise: premise.startswith('∃'),
    'Implication': lambda premise: '→' in premise,
    'Negation': lambda premise: '¬' in premise,
}


def _count_constructs(premises):
    """Return {construct name: number of premises matching it} for one problem."""
    return {
        name: sum(1 for premise in premises if matches(premise))
        for name, matches in _CONSTRUCT_TESTS.items()
    }


def create_visualizations(df, output_dir='.'):
    """Save three summary charts of the dataset as PNG files.

    Files written (each figure is closed after saving, nothing is shown inline):
      * premises_needed.png               - histogram of premise-set sizes
      * logical_constructs.png            - total count of each logical construct
      * logical_constructs_by_example.png - per-example count of each construct

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide two columns:
        'idx'          - per row, an iterable of premise sets; only the length
                         of each set is used (presumably the premises needed to
                         answer each question - confirm against the dataset).
        'premises-FOL' - per row, an iterable of FOL premise strings.
    output_dir : str or path-like, default '.'
        Directory that receives the PNG files; created if it does not exist.
        The default keeps the files in the current working directory.

    Returns
    -------
    None
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # --- Premises needed per question -------------------------------------
    premise_counts = [
        len(premise_set)
        for idx_list in df['idx']
        for premise_set in idx_list
    ]
    # Keep at least one bin so that a dataset without any premise sets yields
    # an empty chart instead of a ValueError from max() on an empty list.
    largest = max(premise_counts, default=0)

    plt.figure(figsize=(8, 6))
    plt.hist(premise_counts, bins=range(1, max(largest, 1) + 2), alpha=0.7)
    plt.title('Number of Premises Needed per Question')
    plt.xlabel('Number of Premises')
    plt.ylabel('Frequency')
    plt.savefig(out_dir / 'premises_needed.png')
    plt.close()

    # Count every construct once per example; both remaining charts are
    # derived from these per-example tallies.
    per_example = [_count_constructs(premises) for premises in df['premises-FOL']]
    construct_names = list(_CONSTRUCT_TESTS)

    # --- Logical construct distribution (dataset totals) -------------------
    totals = [
        sum(counts[name] for counts in per_example)
        for name in construct_names
    ]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=construct_names, y=totals)
    plt.title('Total Count of Logical Constructs in Dataset')
    plt.ylabel('Count')
    plt.savefig(out_dir / 'logical_constructs.png')
    plt.close()

    # --- Line chart: distribution per example ------------------------------
    example_numbers = list(range(1, len(per_example) + 1))  # 1-based x axis

    plt.figure(figsize=(12, 6))
    for name in construct_names:
        plt.plot(example_numbers,
                 [counts[name] for counts in per_example],
                 marker='o', label=name)

    plt.title('Logical Constructs per Example')
    plt.xlabel('Example Number')
    plt.ylabel('Count')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.savefig(out_dir / 'logical_constructs_by_example.png')
    plt.close()


# Render the charts for the loaded dataset. Nothing is displayed inline: each
# figure is written to a PNG file in the current working directory and closed.
create_visualizations(df)


In [14]:
# json is already imported in the first cell; the import is kept here so this
# cell can also be run on its own.
import json

# Re-read the raw dataset. When the notebook runs from a folder directly under
# the project root, this relative path is the same file as data_path above.
input_path = "../data/dataset.json"
with open(input_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Pair each natural-language premise with the FOL premise at the same position.
# zip() stops at the shorter list, so remember every problem whose two lists
# differ in length instead of dropping the unpaired premises without a trace.
mappings = []
length_mismatches = []  # (problem position, NL premise count, FOL premise count)
for position, item in enumerate(data):
    nl_premises = item.get("premises-NL", [])
    fol_premises = item.get("premises-FOL", [])
    if len(nl_premises) != len(fol_premises):
        length_mismatches.append((position, len(nl_premises), len(fol_premises)))
    mappings.extend(
        {"premise": nl, "fol": fol}
        for nl, fol in zip(nl_premises, fol_premises)
    )

# Surface alignment problems; the exported pairs are the same ones zip()
# produced before, only the report is new.
if length_mismatches:
    print(f"Warning: {len(length_mismatches)} problem(s) have a different number "
          "of NL and FOL premises; unpaired premises were skipped.")
    for position, nl_count, fol_count in length_mismatches:
        print(f"  problem {position}: {nl_count} NL vs {fol_count} FOL")

# Write the extracted pairs as pretty-printed JSON, keeping the logical
# symbols as literal characters rather than \u escapes.
output_path = "../data/train.json"
with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(mappings, file, indent=2, ensure_ascii=False)
