In [1]:
import os
import json
import random
import shutil
from datetime import datetime
from collections import defaultdict
import pandas as pd

In [2]:
def stratified_sampling_and_copy(source_dir, target_dir, sample_size=4):
    # Ensure target directory exists
    os.makedirs(target_dir, exist_ok=True)

    # Dictionary to hold files grouped by (year, importance)
    grouped_files = defaultdict(list)

    for file_name in os.listdir(source_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(source_dir, file_name)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                # Ensure required keys are present
                if "judgementdate" in data and "importance" in data:
                    # Convert judgementdate to a datetime object
                    judgement_date = datetime.strptime(data["judgementdate"], "%d/%m/%Y").date()
                    year = judgement_date.year

                    # Filter based on the given conditions
                    if 2018 <= year <= 2024 and int(data["importance"]) in [2, 3, 4]:
                        # Group by (year, importance)
                        grouped_files[(year, data["importance"])].append(file_path)
            except (json.JSONDecodeError, ValueError) as e:
                print(f"Error reading file {file_name}: {e}")

    # Collect all eligible files
    all_filtered_files = [file for files in grouped_files.values() for file in files]

    # Stratified sampling
    sampled_files = []
    files_per_group = max(1, sample_size // len(grouped_files))

    for group, files in grouped_files.items():
        sampled_files.extend(random.sample(files, min(files_per_group, len(files))))

    # Add additional files to meet the target sample size
    if len(sampled_files) < sample_size:
        remaining_files = list(set(all_filtered_files) - set(sampled_files))
        additional_samples = random.sample(remaining_files, min(sample_size - len(sampled_files), len(remaining_files)))
        sampled_files.extend(additional_samples)

    # Shuffle the final sampled files
    random.shuffle(sampled_files)
    # Copy sampled files to the target directory
    for file_path in sampled_files:
    #    shutil.copy(file_path, target_dir)
        print(f"Copied {os.path.basename(file_path)} to {target_dir}")

    print(f"Successfully copied {len(sampled_files)} files to {target_dir}.")

In [39]:

# Source and target directories
source_directory = "../../../ECHR/echr-processed/"
target_directory = "../../kc_classification_data/125_notkc_2018_2024/"

# Execute the function
stratified_sampling_and_copy(source_directory, target_directory)


Copied 001-201432.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-199515.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-212148.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-184504.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-217805.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-213217.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-189593.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-223298.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-194451.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-180316.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-217804.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-222789.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-203180.json to ../../kc_classification_data/125_notkc_2018_2024/
Copied 001-2

In [None]:
! rm ../../kc_classification_data/125_notkc_2018_2024/001-202345.json

In [3]:
def collect_data(source_dir, target_importance=2, size=125):
    # Dictionary to hold files grouped by (year, importance)
    grouped_files = {'date':[], 'file_path':[]}

    for file_name in os.listdir(source_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(source_dir, file_name)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                # Ensure required keys are present
                if "judgementdate" in data and "importance" in data and "facts" in data and "law" in data:
                    # Convert judgementdate to a datetime object
                    importance = int(data["importance"])
                    judgement_date = datetime.strptime(data["judgementdate"], "%d/%m/%Y").date()
                    file_id = file_path.split('/')[-1]

                    # Filter based on the given conditions
                    if importance == target_importance:
                        # Group by (year, importance)
                        grouped_files['date'].append(judgement_date)
                        grouped_files['file_path'].append(file_id)

            except (json.JSONDecodeError, ValueError) as e:
                print(f"Error reading file {file_name}: {e}")
    
    grouped_files_df = pd.DataFrame.from_dict(grouped_files)
    grouped_files_df = grouped_files_df.sort_values(by='date', ascending=False)
    grouped_files_df = grouped_files_df.head(size)
    return grouped_files_df

In [4]:
source_directory = "../../../ECHR/echr-processed/"
df1 = collect_data(source_directory, target_importance=1)
df2 = collect_data(source_directory, target_importance=2)
df3 = collect_data(source_directory, target_importance=3)
df4 = collect_data(source_directory, target_importance=4)

In [82]:
df1.to_csv('../../kc_classification_data/pre_cutoff_data/df_1.csv', index=False)
df2.to_csv('../../kc_classification_data/pre_cutoff_data/df_2.csv', index=False)
df3.to_csv('../../kc_classification_data/pre_cutoff_data/df_3.csv', index=False)
#df4.to_csv('../../kc_classification_data/pre_cutoff_data/df_4.csv', index=False)

In [5]:
source_directory = "../../../ECHR/echr-processed/"
df4 = collect_data(source_directory, target_importance=4)
df4.to_csv('../../kc_classification_data/pre_cutoff_data/df_4.csv', index=False)