In [1]:
import os
import json
import pandas as pd

### Data Cleaning

Add the premise text from the CT JSON files to each training sample

In [2]:
def _load_ct_section(ct_json_folder, trial_id, section_id):
    """Return one section of a CT JSON file, or None when it is unavailable.

    Looks for ``<ct_json_folder>/<trial_id>.json``. Returns None when that file
    does not exist or when the loaded object has no ``section_id`` key.
    """
    ct_file_path = os.path.join(ct_json_folder, f"{trial_id}.json")
    if not os.path.exists(ct_file_path):
        return None
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(ct_file_path, 'r', encoding='utf-8') as ct_file:
        return json.load(ct_file).get(section_id, None)


def transform_train_data(train_json_path, ct_json_folder):
    """Attach premise text from the CT JSON files to every training sample.

    Args:
        train_json_path: Path to a JSON file mapping sample IDs to sample
            dicts. Each sample must contain 'Type', 'Section_id',
            'Primary_id', 'Statement', 'Label' and 'Primary_evidence_index';
            'Secondary_id' and 'Secondary_evidence_index' are optional.
        ct_json_folder: Folder holding one ``<id>.json`` file per
            primary/secondary ID.

    Returns:
        A dict keyed by sample ID, in the order of the input file. Each value
        holds the sample's fields plus 'Primary_premise' and
        'Secondary_premise': the value stored under the sample's 'Section_id'
        in the matching CT file, or None when the file or section is missing.
        'Secondary_premise' is also None when the sample has no (or an empty)
        'Secondary_id'.

    Raises:
        KeyError: If a sample lacks one of the required fields.
    """
    with open(train_json_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    result_dict = {}
    for sample_id, value in train_data.items():
        # Read every required field up front so a malformed sample fails
        # before any CT file is touched.
        sample_type = value['Type']
        section_id = value['Section_id']
        primary_id = value['Primary_id']
        statement = value['Statement']
        label = value['Label']
        primary_evidence_index = value['Primary_evidence_index']

        # Optional fields default to None when absent.
        secondary_id = value.get('Secondary_id', None)
        secondary_evidence_index = value.get('Secondary_evidence_index', None)

        primary_premise = _load_ct_section(ct_json_folder, primary_id, section_id)
        # A missing or empty secondary ID means there is nothing to look up.
        secondary_premise = (
            _load_ct_section(ct_json_folder, secondary_id, section_id)
            if secondary_id
            else None
        )

        # Key order is kept stable because it determines the layout of the
        # JSON file written from this dict.
        result_dict[sample_id] = {
            'Type': sample_type,
            'Section_id': section_id,
            'Primary_id': primary_id,
            'Secondary_id': secondary_id,
            'Primary_premise': primary_premise,
            'Secondary_premise': secondary_premise,
            'Statement': statement,
            'Label': label,
            'Primary_evidence_index': primary_evidence_index,
            'Secondary_evidence_index': secondary_evidence_index,
        }

    return result_dict

In [3]:
# Input locations: the training samples and the folder of per-ID CT JSON files
train_json_path = '/home/rambod/MyDocuments/Complete_dataset/train.json'
ct_json_folder = '/home/rambod/MyDocuments/Complete_dataset/CT json'

# Build the premise-augmented sample dictionary
resulting_data = transform_train_data(
    train_json_path=train_json_path,
    ct_json_folder=ct_json_folder,
)

Save the JSON file

In [4]:
# Serialize the premise-augmented samples as pretty-printed JSON (4-space
# indent) and write them to a file in the current working directory
with open('data_with_premise.json', 'w') as json_file:
    json_file.write(json.dumps(resulting_data, indent=4))

# Confirm that the file was written
print("JSON file 'data_with_premise.json' has been created.")

JSON file 'data_with_premise.json' has been created.


Add special tokens to the Primary_premise, Secondary_premise and Statement

In [5]:
def add_special_tokens(input_file, output_file):
    """Wrap premise and statement text in marker tags and save the result.

    Reads a JSON object of records from ``input_file``. In each record:

    * 'Primary_premise' and 'Secondary_premise', when present and not None,
      are joined with single spaces and wrapped in
      ``<primary_text>…</primary_text>`` / ``<secondary_text>…</secondary_text>``.
    * 'Statement', when present and not None, is wrapped in
      ``<statement_text>…</statement_text>`` without joining.

    The modified records are written to ``output_file`` as JSON with a
    4-space indent, and a confirmation message is printed.
    """
    with open(input_file, 'r') as source:
        data = json.load(source)

    # (field, opening tag, closing tag, join the value with spaces first?)
    markup = (
        ('Primary_premise', "<primary_text>", "</primary_text>", True),
        ('Secondary_premise', "<secondary_text>", "</secondary_text>", True),
        ('Statement', "<statement_text>", "</statement_text>", False),
    )

    for record in data.values():
        for field, opening, closing, needs_join in markup:
            if field not in record:
                continue
            content = record[field]
            if content is None:
                # Absent premises/statements stay None.
                continue
            text = ' '.join(content) if needs_join else content
            record[field] = opening + text + closing

    with open(output_file, 'w') as target:
        json.dump(data, target, indent=4)

    print(f"Modified JSON file '{output_file}' has been created.")

Save JSON file with special tokens

In [6]:
# NOTE(review): the earlier save cell writes 'data_with_premise.json' relative
# to the working directory; this absolute path matches it only if the notebook
# is run from /home/rambod/T5Model — confirm.
input_path = "/home/rambod/T5Model/data_with_premise.json"
output_path = "/home/rambod/T5Model/data_with_special_tokens.json"

# Tag the premises and statement, writing the result to a new file
add_special_tokens(input_file=input_path, output_file=output_path)

Modified JSON file '/home/rambod/T5Model/data_with_special_tokens.json' has been created.


Add a "Context" field to each sample

In [7]:
def add_context_column(input_file_path, output_file_path):
    """Add a 'Context' entry to every record of a JSON file and save the result.

    'Context' is the concatenation, in this order, of the record's
    'Primary_premise', 'Secondary_premise' and 'Statement' values. A value
    that is None (or a key that is absent) contributes nothing, so records
    whose primary premise could not be found still get a context instead of
    raising ``TypeError`` on ``None + str``.

    Args:
        input_file_path: Path to a JSON file holding an object of records.
        output_file_path: Path the updated records are written to, as JSON
            with a 4-space indent.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    for value in data.values():
        parts = (
            value.get('Primary_premise'),
            value.get('Secondary_premise'),
            value.get('Statement'),
        )
        # Skip missing pieces rather than failing on None.
        value['Context'] = ''.join(part for part in parts if part is not None)

    with open(output_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

Save JSON file with context

In [8]:
# Source: the tagged samples; destination: the same samples plus 'Context'
input_file_path = '/home/rambod/T5Model/data_with_special_tokens.json'
output_file_path = '/home/rambod/T5Model/data_with_context.json'

add_context_column(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)