In [1]:
#import
import pandas as pd
from pprint import pprint
import json
import csv
import random

In [2]:
#mount to googledrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#file path
root = '/content/drive/My Drive/Swinburne/TIP/data3/'


In [4]:
input_file_path = root + 'cleaned_ARC_CommonSense_SciQ.jsonl'
output_file_path = root + 'final_ARC_CommonSense_SciQ.csv'

In [5]:
import os
# Check and count the number of lines in the input file
try:
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        line_count = sum(1 for line in infile)
    print(f"Total number of questions in {input_file_path}: {line_count}")
except FileNotFoundError:
    print(f"File not found: {input_file_path}")
except Exception as e:
    print(f"An error occurred while processing {input_file_path}: {str(e)}")


Total number of questions in /content/drive/My Drive/Swinburne/TIP/data3/cleaned_ARC_CommonSense_SciQ.jsonl: 16366


In [9]:

# Open the input file and create a CSV writer for the output file
with open(input_file_path, 'r', encoding='utf-8') as infile, \
     open(output_file_path, 'w', newline='', encoding='utf-8') as outfile:

    # Initialize CSV writer
    header = ['id', 'prompt', 'A', 'B', 'C', 'D', 'answer']
    csv_writer = csv.DictWriter(outfile, fieldnames=header)
    csv_writer.writeheader()

    # Process each line in the input file
    for line_number, line in enumerate(infile, start=1):
        try:
            # Parse JSON line
            json_obj = json.loads(line.strip())

            # Extract data
            id_ = json_obj.get('id', f'UnknownID_{line_number}')
            prompt = json_obj.get('question', {}).get('stem', '')
            choices = {choice['label']: choice['text'] for choice in json_obj.get('question', {}).get('choices', [])}
            answer = json_obj.get('answerKey', '')

            # Check if all choices are present
            if not all(label in choices for label in ['A', 'B', 'C', 'D']):
                print(f"Skipping line {line_number}: Not all choices are present.")
                continue

            # Write to CSV
            csv_writer.writerow({
                'id': id_,
                'prompt': prompt,
                'A': f"A. {choices['A']}",
                'B': f"B. {choices['B']}",
                'C': f"C. {choices['C']}",
                'D': f"D. {choices['D']}",
                'answer': answer
            })
        except json.JSONDecodeError:
            print(f"Error decoding JSON on line {line_number}.")
        except Exception as e:
            print(f"An error occurred on line {line_number}: {str(e)}")


Skipping line 5: Not all choices are present.
Skipping line 20: Not all choices are present.
Skipping line 25: Not all choices are present.
Skipping line 32: Not all choices are present.
Skipping line 34: Not all choices are present.
Skipping line 74: Not all choices are present.
Skipping line 129: Not all choices are present.
Skipping line 166: Not all choices are present.
Skipping line 167: Not all choices are present.
Skipping line 169: Not all choices are present.
Skipping line 183: Not all choices are present.
Skipping line 191: Not all choices are present.
Skipping line 202: Not all choices are present.
Skipping line 228: Not all choices are present.
Skipping line 236: Not all choices are present.
Skipping line 276: Not all choices are present.
Skipping line 295: Not all choices are present.
Skipping line 333: Not all choices are present.
Skipping line 355: Not all choices are present.
Skipping line 356: Not all choices are present.
Skipping line 419: Not all choices are present.

In [10]:
try:
    with open(output_file_path, 'r', encoding='utf-8') as infile:
        line_count = sum(1 for line in infile)
    print(f"Total number of questions in {output_file_path}: {line_count}")
except FileNotFoundError:
    print(f"File not found: {output_file_path}")
except Exception as e:
    print(f"An error occurred while processing {output_file_path}: {str(e)}")

Total number of questions in /content/drive/My Drive/Swinburne/TIP/data3/final_ARC_CommonSense_SciQ.csv: 16266


In [11]:
# with open(output_file_path, 'r') as f:
#     lines = f.readlines()
#     count = len(lines)

# print(f"There are {count} items in the final file.")


There are 16266 items in the final file.
