Template notebook for merging JSON files, applying a specific process to clean duplicates

In [8]:
import pandas as pd
import os
import copy

Define the folder path containing the JSON files and list all JSON files in the directory (files whose names start with `variables_` are skipped).

In [3]:
# Directory that holds the JSON files to merge
folder_path = './data/json/'


def _is_mergeable_json(file_name):
    """Return True for '*.json' names that do not start with 'variables_'."""
    return file_name.endswith('.json') and not file_name.startswith('variables_')


# Keep only the JSON files of interest (the name `csv_files` is historical)
csv_files = [file_name for file_name in os.listdir(folder_path) if _is_mergeable_json(file_name)]
print('Files found:')
csv_files

Files found:


['after_end_calculation_cycle_6.json',
 'after_Read_ISON_Data_P_2.json',
 'before_clear_Prometheus_data_3.json',
 'before_Read_ISON_Data_P_1.json',
 'before_simulation_4.json',
 'before_start_calculation_cycle_5.json']

Reorder the files in the list if their order matters! Here they are sorted by the number at the end of the file name, highest first.

In [5]:
def _order_suffix(file_name):
    """Return the integer that follows the last '_' in the file's base name.

    Example: 'before_simulation_4.json' -> 4.
    Raises ValueError if the text after the last '_' is not an integer.
    """
    stem = os.path.splitext(file_name)[0]
    return int(stem.rsplit('_', 1)[-1])


# Highest-numbered file first. Sorting on the parsed suffix (instead of using a
# single character as a negative list index) also works with multi-digit
# numbers, gaps in the numbering, and repeated numbers.
ordered_files = sorted(csv_files, key=_order_suffix, reverse=True)
ordered_files

['after_end_calculation_cycle_6.json',
 'before_start_calculation_cycle_5.json',
 'before_simulation_4.json',
 'before_clear_Prometheus_data_3.json',
 'after_Read_ISON_Data_P_2.json',
 'before_Read_ISON_Data_P_1.json']

Iterate through each JSON file, read it into a DataFrame, and add a `created_place` column holding the file name (without extension). Only the `name` and `created_place` columns are kept.

In [6]:
def _read_names(file_name):
    """Load one JSON file and return its 'name' column tagged with the file's base name."""
    frame = pd.read_json(os.path.join(folder_path, file_name))
    # The base name (extension stripped) records which file each row came from
    frame['created_place'] = os.path.splitext(file_name)[0]
    return frame[['name', 'created_place']]


# One two-column DataFrame per file, in the order given by `ordered_files`
df_list = [_read_names(file_name) for file_name in ordered_files]

# Row count of each loaded DataFrame
[frame.shape[0] for frame in df_list]

[2021, 1885, 1147, 824, 572, 48]

Join files in particular order

In [9]:
# Trial run of the merge rule on the last two entries of df_list

# Left join on 'name': every row of df_list[4] is kept, and matching rows of
# df_list[5] contribute a second 'created_place' column (suffix '_y')
merged = df_list[4].merge(df_list[5], on='name', how='left')

# Prefer the right-hand value ('created_place_y') when the name matched;
# fall back to the left-hand value ('created_place_x') otherwise
merged = merged.assign(
    created_place=merged['created_place_y'].combine_first(merged['created_place_x'])
)

# Reduce to the two columns of interest
result = merged.loc[:, ['name', 'created_place']]

# Show rows 40 onward of the first 60 sorted rows
(
    result
    .sort_values(by=['created_place', 'name'], ascending=[False, True])
    .head(60)
    .reset_index(drop=True)
    .loc[40:]
)

Unnamed: 0,name,created_place
40,pname_SPAN_bin_data,before_Read_ISON_Data_P_1
41,ppdir,before_Read_ISON_Data_P_1
42,ppindex,before_Read_ISON_Data_P_1
43,r2d,before_Read_ISON_Data_P_1
44,r2m,before_Read_ISON_Data_P_1
45,ro_gps_aux,before_Read_ISON_Data_P_1
46,unalignment,before_Read_ISON_Data_P_1
47,AC1,after_Read_ISON_Data_P_2
48,AC2,after_Read_ISON_Data_P_2
49,AC3,after_Read_ISON_Data_P_2


Apply the merge tested above to all DataFrames, in list order.

In [10]:
def _merge_created_place(frames):
    """Fold a list of ('name', 'created_place') DataFrames into one.

    The first frame defines the set of rows. Every following frame is
    left-joined on 'name'; where a name matches, its 'created_place' replaces
    the value accumulated so far. The last frame in the list that contains a
    name therefore determines that name's final 'created_place'.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Each frame must have the columns 'name' and 'created_place'.

    Returns
    -------
    pandas.DataFrame
        Columns 'name' and 'created_place'; empty (with those columns) when
        `frames` is empty. The input frames are not modified.
    """
    if not frames:
        return pd.DataFrame(columns=['name', 'created_place'])

    # Copy so the result never aliases the caller's first frame
    merged = frames[0].copy()

    for frame in frames[1:]:
        # A repeated name on the right side would multiply rows in a left join.
        # Assumes all rows of one frame share the same 'created_place' (as the
        # loading cell builds them), so dropping repeats loses no information.
        right = frame.drop_duplicates(subset='name')
        merged = pd.merge(merged, right, on='name', how='left', suffixes=('_x', '_y'))
        merged['created_place'] = merged['created_place_y'].combine_first(merged['created_place_x'])
        # Drop the suffixed columns so the next merge starts from two columns again
        merged = merged[['name', 'created_place']]

    return merged


# pd.merge returns new objects, so df_list is left untouched without a deep copy
df_merged = _merge_created_place(df_list)

Display the final merged DataFrame with columns: name, created_place.

In [11]:
# Preview the first rows of the merged DataFrame
df_merged.head()

Unnamed: 0,name,created_place
0,A,before_start_calculation_cycle_5
1,AB,before_clear_Prometheus_data_3
2,AC1,after_Read_ISON_Data_P_2
3,AC2,after_Read_ISON_Data_P_2
4,AC3,after_Read_ISON_Data_P_2


In [12]:
# (rows, columns) of the merged DataFrame
df_merged.shape

(2021, 2)

In [13]:
# Number of names attributed to each 'created_place' value
df_merged['created_place'].value_counts() # append .head(20) to show only the top entries

created_place
before_start_calculation_cycle_5    738
after_Read_ISON_Data_P_2            523
before_simulation_4                 325
before_clear_Prometheus_data_3      251
after_end_calculation_cycle_6       136
before_Read_ISON_Data_P_1            48
Name: count, dtype: int64

Save final data frame

In [14]:
# Write the merged table as a ';'-separated CSV, without the index column
df_merged.to_csv('data/merge_json.csv', index=False, sep=';')