# In [None]:
import json
import random

# File paths
# NOTE: despite the .json extension, the file is read as JSON Lines
# (one JSON object per line).
file_path = 'train.json'

# Set random seed for reproducibility
random.seed(42)

# System prompts that identify which task a record belongs to. Records are
# matched by exact string equality, so these literals must stay byte-for-byte
# identical to the dataset (including the missing space in "text.Only").
FINANCIAL_EXPERT_PROMPT = (
    'You are a financial expert. Your task is to provide accurate and '
    'insightful answers to finance-related questions based on the given '
    'conversation context.'
)
SENTIMENT_EXPERT_PROMPT = (
    'You are a financial sentiment analysis expert. Your task is to analyze '
    'the sentiment expressed in the given financial text.Only reply with '
    'positive, neutral, or negative.'
)

# Store data for two system types separately
system1_data = []  # First system: Financial expert
system2_data = []  # Second system: Sentiment analysis expert

print("Reading and categorizing data...")

# Read and categorize data
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads('') would raise, so skip them.
            continue

        json_obj = json.loads(line)

        # Records without a "system" field belong to neither task.
        if "system" not in json_obj:
            continue
        system_content = json_obj["system"]

        if system_content == FINANCIAL_EXPERT_PROMPT:
            system1_data.append(json_obj)
        elif system_content == SENTIMENT_EXPERT_PROMPT:
            system2_data.append(json_obj)
        # Any other system prompt is ignored.

print("Original data statistics:")
print(f"First system data count: {len(system1_data)}")
print(f"Second system data count: {len(system2_data)}")

# Sizes used at each stage of the split, named once so the sampling calls
# and the statistics printed below cannot drift apart.
SYSTEM1_SAMPLE_SIZE = 6000
SYSTEM2_SAMPLE_SIZE = 8000
SYSTEM1_TRAIN_SIZE = 4000
SYSTEM2_TRAIN_SIZE = 6000


def _split_train_test(sample, train_size):
    """Split *sample* into a random training part and the remaining test part.

    The split is done by position rather than by value. Comparing records
    with ``item not in train`` is quadratic and, when the data contains
    duplicate records, drops every copy equal to a training record, so the
    test part silently ends up smaller than intended.

    Args:
        sample: List of records to split.
        train_size: Number of records to draw for the training part.

    Returns:
        A ``(train, test)`` tuple. ``train`` holds ``train_size`` records in
        the order they were drawn; ``test`` holds the remaining records in
        their original order.

    Raises:
        ValueError: If ``train_size`` is larger than ``len(sample)``
            (raised by ``random.sample``).
    """
    # Sampling positions from range(len(sample)) picks the same elements, in
    # the same order, as sampling the list itself would in CPython (the draw
    # depends only on the population length and k), so results for a given
    # seed stay comparable with earlier runs.
    train_positions = random.sample(range(len(sample)), train_size)
    chosen = set(train_positions)
    train = [sample[pos] for pos in train_positions]
    test = [item for pos, item in enumerate(sample) if pos not in chosen]
    return train, test


# Fail early with a clear message if a category has too few records.
for _label, _pool, _needed in (
    ("first", system1_data, SYSTEM1_SAMPLE_SIZE),
    ("second", system2_data, SYSTEM2_SAMPLE_SIZE),
):
    if len(_pool) < _needed:
        raise ValueError(
            f"Not enough {_label} system records to sample: "
            f"need {_needed}, found {len(_pool)}"
        )

# Step 1: Initial sampling from original data
# Randomly sample 6000 from 38757 records of first system
# Randomly sample 8000 from 28628 records of second system
first_sample_system1 = random.sample(system1_data, SYSTEM1_SAMPLE_SIZE)
first_sample_system2 = random.sample(system2_data, SYSTEM2_SAMPLE_SIZE)

print("\nFirst sampling results:")
print(f"Sampled from first system: {len(first_sample_system1)}")
print(f"Sampled from second system: {len(first_sample_system2)}")

# Step 2: Further sampling for training set from first sample
# Sample 4000 from 6000 as part of training set
# Sample 6000 from 8000 as part of training set
# Step 3: Remaining data as test set
# Remaining from first system: 6000 - 4000 = 2000
# Remaining from second system: 8000 - 6000 = 2000
# NOTE: exact duplicate records in the source data can still land on both
# sides of the split; deduplicate upstream if that matters.
train_system1, test_system1 = _split_train_test(
    first_sample_system1, SYSTEM1_TRAIN_SIZE
)
train_system2, test_system2 = _split_train_test(
    first_sample_system2, SYSTEM2_TRAIN_SIZE
)

# Combine into final training set (4000 + 6000 = 10000)
train_data = train_system1 + train_system2

# Combine test set (2000 + 2000 = 4000)
test_data = test_system1 + test_system2

# Only the training set is shuffled; the test set stays grouped by task.
random.shuffle(train_data)

print("\nFinal data split results:")
print("=" * 50)
print(f"Training set total: {len(train_data)}")
print(f"  - First system: {len(train_system1)}")
print(f"  - Second system: {len(train_system2)}")

print(f"\nTest set total: {len(test_data)}")
print(f"  - First system: {len(test_system1)}")
print(f"  - Second system: {len(test_system2)}")

print("\nUnused data remaining:")
print(f"  - First system remaining: {len(system1_data) - SYSTEM1_SAMPLE_SIZE}")
print(f"  - Second system remaining: {len(system2_data) - SYSTEM2_SAMPLE_SIZE}")

# Write every split as JSON Lines: one JSON object per line, with non-ASCII
# characters kept as-is (ensure_ascii=False). The per-task test files are
# optional extras alongside the combined train and test files.
outputs = [
    ('train_final.jsonl', train_data),   # training set
    ('test_final.jsonl', test_data),     # combined test set
    ('test_task1.jsonl', test_system1),  # test set for task 1 only
    ('test_task2.jsonl', test_system2),  # test set for task 2 only
]
for out_path, records in outputs:
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Summary of what was written (the counts are the intended sizes, not measured).
for summary_line in (
    "\nFiles saved:",
    "- train_final.jsonl: Training set (10,000)",
    "- test_final.jsonl: Test set (4,000)",
    "- test_task1.jsonl: Task 1 test set (2,000)",
    "- test_task2.jsonl: Task 2 test set (2,000)",
):
    print(summary_line)

# Validate data integrity: the training and test sets together should
# account for every record drawn in the initial sampling (6000 + 8000).
combined_total = len(train_data) + len(test_data)
split_is_correct = combined_total == 14000

print("\nValidation information:")
print(f"Training set + test set total: {combined_total}")
print(f"Total sampled from original data: {6000 + 8000}")
print(f"Data split correctness: {split_is_correct}")