# Convert CSV to JSON Lines for prediction

In [21]:
import pandas as pd
import json
import re

# 1 Load the source CSV into a DataFrame
csv_file_path = "GM_reviews_all.csv"  # Replace with your actual CSV file path
df = pd.read_csv(csv_file_path, encoding='latin1')

# 2 Make sure every row has an 'id'; when the column is absent, number rows 1..N
if 'id' not in df:
    df['id'] = range(1, df.shape[0] + 1)

# 3 Preprocess: Tokenize sentences while preserving punctuation
def preprocess_text(sentence):
    """
    Tokenize the sentence into words while preserving punctuation.

    Each run of ASCII letters becomes one token, and each run of any other
    non-whitespace characters (digits, punctuation, non-ASCII letters, ...)
    becomes one token. Whitespace only separates tokens and is discarded.

    Args:
        sentence: Text to tokenize. A missing value (``None`` or a float NaN,
            which is what pandas produces for an empty CSV cell) yields an
            empty list; any other non-string value is converted with ``str``.

    Returns:
        list[str]: The tokens in their original order.
    """
    # NaN is the only float that is not equal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []
    return re.findall(r"[A-Za-z]+|[^A-Za-z\s]+", str(sentence))

# Build one record per CSV row: tokenized text, no spans yet, and a string id
json_data = [
    {
        "text": preprocess_text(row["split_sentence"]),
        "spans": [],  # Empty spans for now
        "id": str(row["id"]),  # Convert id to string format
    }
    for _, row in df.iterrows()
]

# 4 Save as JSON-Lines format (one JSON object per line)
output_jsonl_path = "GM_reviews_all.json"

with open(output_jsonl_path, 'w') as outfile:
    outfile.writelines(json.dumps(record) + '\n' for record in json_data)

print(f"4: JSON-Lines file saved -> {output_jsonl_path}")


4: JSON-Lines file saved -> GM_reviews_all.json


# Convert prediction file to CSV for annotation

In [4]:
import json
import csv
import os

def add_ids_for_predicted(inputid_file, inputpre_file, output_file):
    """
    Copy the 'id' of each record in one JSON-Lines file onto the record at the
    same position in another JSON-Lines file.

    Records are paired purely by order: the n-th non-blank line of
    ``inputpre_file`` receives the 'id' of the n-th non-blank line of
    ``inputid_file``. Blank lines in either file are ignored.

    Args:
        inputid_file: Path to a JSON-Lines file whose records have an 'id' key.
        inputpre_file: Path to the JSON-Lines file that should receive the IDs.
        output_file: Path of the JSON-Lines file to write (overwritten).

    Notes:
        If the two files hold a different number of records, records beyond
        the last available ID are written without an 'id' and a warning is
        printed, because positional pairing may then be misaligned.
    """
    with open(inputid_file, 'r', encoding='utf-8') as id_source:
        ids = [json.loads(line)['id'] for line in id_source if line.strip()]

    record_count = 0
    with open(inputpre_file, 'r', encoding='utf-8') as pred_source, \
            open(output_file, 'w', encoding='utf-8') as pred_with_ids:
        # Filter blank lines before enumerating so positions count records only.
        records = (json.loads(line) for line in pred_source if line.strip())
        for i, data in enumerate(records):
            if i < len(ids):
                data['id'] = ids[i]
            pred_with_ids.write(json.dumps(data, ensure_ascii=False) + '\n')
            record_count = i + 1

    if record_count != len(ids):
        print(
            f"Warning: {record_count} prediction records but {len(ids)} IDs; "
            "IDs were assigned by position and may be misaligned"
        )
    print(f"ID added to {output_file}")


def convert_pred_to_csv(json_file, output_csv_path):
    """
    Lay out a JSON-Lines prediction file as a CSV grid for manual annotation.

    Every row is padded to the length of the longest token list. The file
    starts with one row of empty cells, then for each record it contains:

      1. a row with one token per column,
      2. one row per predicted span, with the entity type written in every
         column from ``start`` up to but not including ``end``,
      3. a row ``#ID, <id>``,
      4. an empty separator row.

    Args:
        json_file: Path to a JSON-Lines file. Each record needs 'tokens'
            (a list) and 'predicts' (a list of ``[start, end, type]``);
            'id' is written as an empty cell when absent. Blank lines are
            ignored.
        output_csv_path: Path of the CSV file to write (overwritten).

    Raises:
        IndexError: If a span reaches past the widest token row.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # default=0 lets an empty prediction file produce an empty grid instead of
    # raising ValueError from max() on an empty sequence.
    max_tokens_length = max((len(item["tokens"]) for item in data), default=0)

    def pad(cells):
        # Right-pad with empty cells; never truncates when cells is already wider.
        return cells + [""] * (max_tokens_length - len(cells))

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([""] * max_tokens_length)

        for item in data:
            writer.writerow(pad(list(item["tokens"])))

            for span in item["predicts"]:
                start, end, entity_type = span[0], span[1], span[2]
                row = [""] * max_tokens_length
                # Explicit index loop so an out-of-range span fails loudly
                # instead of silently widening the row.
                for i in range(start, end):
                    row[i] = entity_type
                writer.writerow(row)

            # A record may lack 'id' when the ID file was shorter than the
            # prediction file; keep the row structure with an empty ID cell.
            writer.writerow(pad(["#ID", item.get("id", "")]))
            writer.writerow([])

    print(f"CSV saved to {output_csv_path}")

# File paths for the two-step conversion below.
inputid_json = "GM_reviews_all.json"  # JSON file that carries the IDs
inputpre_json = "pred_modern.json"  # JSON file that needs IDs added
output_json = "GM_reviews_all_pred0321.json"  # JSON output file with IDs added
output_csv_path = "GM_reviews_all_pred0321.csv"  # Converted CSV file

# Step 1: attach IDs to the predictions; step 2: turn the result into a CSV.
add_ids_for_predicted(inputid_json, inputpre_json, output_json)
convert_pred_to_csv(output_json, output_csv_path)



ID added to GM_reviews_all_pred0321.json
CSV saved to GM_reviews_all_pred0321.csv


# Check annotation data

In [15]:
import csv

file_path = "20250110check_modified.csv"
filtered_rows_corrected = []

with open(file_path, 'r', newline='', encoding='gbk') as csv_file:
    for line_number, row in enumerate(csv.reader(csv_file), start=1):
        # Only rows holding at least two non-blank cells are examined.
        if sum(1 for cell in row if cell.strip()) <= 1:
            continue

        row_str = " ".join(row)
        tokens = row_str.split()

        # Flag the row as soon as the text between two neighbouring tokens
        # contains a space. Positions come from str.find, i.e. the FIRST
        # occurrence of each token in the joined string.
        # NOTE(review): cells are joined with a single space, so this test
        # looks true for almost any row with two or more distinct tokens
        # (the saved output flags '#ID' rows too) - confirm what condition
        # was actually meant to be detected.
        for left, right in zip(tokens, tokens[1:]):
            gap_start = row_str.find(left) + len(left)
            gap_end = row_str.find(right)
            if " " in row_str[gap_start:gap_end]:
                filtered_rows_corrected.append((line_number, row_str))
                break


# Summary shown as the cell output: total flagged rows and the first five.
num_filtered_corrected_rows = len(filtered_rows_corrected)
sample_filtered_corrected_rows = filtered_rows_corrected[:5]

num_filtered_corrected_rows, sample_filtered_corrected_rows

(21406,
 [(1,
   'there are also mountain bikers who take a specially constructed path that is filled with sharp turns on undulating terrain .                                                        '),
  (5,
   '#ID 25140                                                                           '),
  (7,
   'a place where you sees many athletes jogging , running and doing exercise here .                                                              '),
  (11,
   '#ID 27484                                                                           '),
  (13,
   'well - maintained with sufficient signage and facilities for collection and use of hot spring water 10 / 10 recommend                                                         ')])

# Convert annotation data for training

In [2]:
import csv
import json
import pandas as pd
import re

def csv_to_custom_json(input_csv_path, output_json_path, encoding='latin1'):
    """
    Converts an annotation CSV into JSON-Lines, one sample per line, where each
    sample contains the text tokens, the annotated entity spans, and a sample ID.

    Expected CSV layout per sample (rows made only of empty cells are ignored):
      1. a token row, one token per column;
      2. zero or more span rows; each yields one span running from the first
         to the last non-empty column of that row (end exclusive), typed with
         the value of the last non-empty cell;
      3. a row whose first cell is '#ID' and whose second cell is the sample ID.

    Only samples that have at least one span are written. Rows after the last
    '#ID' row do not form a sample and are dropped.

    Example:
    Input CSV:
        hello,world,this,is,a,test
        ,,,,,B-ENTITY
        #ID,sample_1

    Output JSON:
        {"text": ["hello", "world", "this", "is", "a", "test"], "id": "sample_1", "spans": [{"start": 5, "end": 6, "type": "B-ENTITY"}]}

    Args:
        input_csv_path: Path of the annotation CSV to read.
        output_json_path: Path of the JSON-Lines file to write (UTF-8, overwritten).
        encoding: Text encoding of the input CSV. Defaults to 'latin1', which
            never fails to decode but garbles non-ASCII text if the file was
            saved in another encoding.
            NOTE(review): the check cell reads this same file as 'gbk' -
            confirm which encoding is correct.

    Raises:
        IndexError: If a '#ID' row has no second cell.
    """
    samples = []
    text = None   # token row of the sample being assembled; None until it is read
    spans = []

    with open(input_csv_path, 'r', newline='', encoding=encoding) as csv_file:
        for row in csv.reader(csv_file):
            if not any(row):
                continue  # separator row or padding-only row

            if text is None:
                # Strip trailing padding only. Interior empty cells must stay,
                # because span start/end are CSV column indices and dropping a
                # cell would shift every token after it.
                last_filled = max(idx for idx, cell in enumerate(row) if cell)
                text = row[:last_filled + 1]
                continue

            if row[0] == '#ID':
                # Key order (text, id, spans) is the order written to the output.
                samples.append({'text': text, 'id': row[1], 'spans': spans})
                text = None
                spans = []
                continue

            # Span row: any(row) above guarantees at least one non-empty cell.
            filled = [idx for idx, cell in enumerate(row) if cell != '']
            spans.append({
                "start": filled[0],
                "end": filled[-1] + 1,
                "type": row[filled[-1]],
            })

    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        for sample in samples:
            if sample['spans']:
                json_file.write(json.dumps(sample, ensure_ascii=False) + '\n')
                
# Annotated CSV to convert, and the output file that receives one JSON object per line.
input_csv_path = '20250110check_modified.csv'
output_json_path = '20250110check_modified.json' 
csv_to_custom_json(input_csv_path, output_json_path)