# In [7]:
import csv
import json
import os
from collections import Counter

# Dataset location: <DATA_DIR>/<LANGUAGE>/dev.jsonl
DATA_DIR = '../benchmark_data'
LANGUAGE = 'italian'

# Parse the JSON Lines file: every non-blank line is decoded as one JSON
# document; blank / whitespace-only lines are skipped.
file = os.path.join(DATA_DIR, LANGUAGE, 'dev.jsonl')
with open(file, mode='r', encoding='utf-8') as f:
    # filter(str.strip, ...) keeps only lines that are non-empty once
    # stripped; map() then decodes each surviving line lazily, in file order.
    data = list(map(json.loads, filter(str.strip, f)))

# Tally how many records carry each output_type value. Counter is a dict
# subclass that supplies the "start at zero" default itself and keeps keys in
# first-seen order, so it is written by json.dump exactly like the hand-built
# dict it replaces. Records without an "output_type" key are counted under None.
output_type_count = Counter()

# Rows destined for the CSV export, one dict per input record.
csv_data = []

for item in data:
    output_type = item.get("output_type")
    output_type_count[output_type] += 1

    # Keep only the columns written to the CSV; a field missing from the
    # record becomes None.
    # NOTE(review): csv stringifies non-string values with str(), so if
    # "targets" holds a list it will be written as its Python repr — confirm
    # that is the intended serialization.
    csv_data.append({
        "id": item.get("id"),
        "question": item.get("question"),
        "targets": item.get("targets"),
        "output_type": output_type
    })

# Write the per-output_type counts next to the source data as indented JSON.
statistics_file_path = os.path.join(DATA_DIR, LANGUAGE, 'output_type_statistics.json')
# exist_ok=True replaces the exists()-then-makedirs() check, which could raise
# FileExistsError if the directory appeared between the two calls.
os.makedirs(os.path.dirname(statistics_file_path), exist_ok=True)
with open(statistics_file_path, 'w', encoding='utf-8') as stats_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
    # escapes.
    json.dump(output_type_count, stats_file, ensure_ascii=False, indent=4)

# Write the extracted rows to a CSV file next to the source data.
csv_file_path = os.path.join(DATA_DIR, LANGUAGE, 'extracted_data.csv')
# exist_ok=True replaces the exists()-then-makedirs() check, which could raise
# FileExistsError if the directory appeared between the two calls.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
# newline='' lets the csv module control line endings itself, as the csv
# documentation recommends for files passed to a writer.
with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["id", "question", "targets", "output_type"])
    writer.writeheader()
    writer.writerows(csv_data)