In [None]:
import pandas as pd
import os
import glob


# Copy every per-collection mortality CSV from the source checkout into the
# raw-data folder, tagging each row with the object id and area taken from
# the file name and turning the date index into a proper 'Date' column.

# Define input and output folders
input_folder = '../gitHub/Data/MortalityCollections/'
output_folder = '../data/raw/mortality_collections/'

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Loop over all CSV files in input folder. glob returns paths in arbitrary
# order, so sort them to make the processing order and log reproducible.
for filepath in sorted(glob.glob(os.path.join(input_folder, '*.csv'))):
    filename = os.path.basename(filepath)

    # Split the file name (without extension) into '_'-separated parts.
    # splitext removes only the trailing extension, whereas
    # str.replace('.csv', '') would also strip a '.csv' occurring inside
    # the name.
    stem, _ext = os.path.splitext(filename)
    parts = stem.split('_')
    if len(parts) < 4:
        print(f"Skipping malformed filename: {filename}")
        continue

    # The first two parts carry the metadata stored with each row
    objId = parts[0]
    area = parts[1]

    # Load DataFrame with first column as index (the date)
    df = pd.read_csv(filepath, sep=',', header=0, index_col=0)

    # Add metadata columns
    df['objId'] = objId
    df['area'] = area

    # Move date index into a column called 'Date'. Name the index explicitly
    # first: reset_index() only produces a column called "index" when the
    # index is unnamed, so renaming "index" afterwards silently does nothing
    # (and df['Date'] below raises KeyError) whenever the first CSV column
    # has a header.
    df.index.name = 'Date'
    df = df.reset_index()

    # Convert 'Date' to datetime; unparseable values become NaT
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    # Save to new location under the same file name
    df.to_csv(os.path.join(output_folder, filename), sep=',', index=False)

    print(f"Processed: {filename}")

In [None]:
# Merge every per-collection CSV in the raw folder into one table and store
# it both as a CSV and as a pickle.

# Define the folder containing CSV files
input_folder = '../data/raw/mortality_collections/'

# Output locations for the merged table
merged_csv_path = '../data/raw/merged_mortality_collections.csv'
merged_pkl_path = '../data/pkl/merged_mortality_collections.pkl'

# Find all CSV files in the folder. glob returns paths in arbitrary order,
# so sort them to keep the row order of the merged table reproducible.
csv_files = sorted(glob.glob(os.path.join(input_folder, '*.csv')))

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list; fail early with a message that names the folder instead.
if not csv_files:
    raise ValueError(f"No CSV files found in {input_folder}")

# List to store individual DataFrames
df_list = []

# Read and collect all CSVs
# NOTE(review): read_csv is called without parse_dates, so a 'Date' column is
# loaded as plain strings and is pickled as such below - confirm whether
# downstream code expects datetimes.
for file in csv_files:
    df = pd.read_csv(file)
    df_list.append(df)

# Concatenate all DataFrames, renumbering rows from 0
merged_df = pd.concat(df_list, ignore_index=True)

# Make sure both output folders exist before writing
for out_path in (merged_csv_path, merged_pkl_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Save to a single CSV file
merged_df.to_csv(merged_csv_path, index=False)

# Save to pickle
merged_df.to_pickle(merged_pkl_path)


print(f"Merged {len(csv_files)} files into one CSV.")