# This code merges the given CSV datasets and can be extended to combine additional datasets with them.

In [None]:
import pandas as pd

# --- 1. Define File Names ---
# All three raw CSV inputs sit in the same folder, so the folder is spelled
# out once and each file path is derived from it.
raw_data_dir = '/workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data'
file_umsatz = f'{raw_data_dir}/umsatzdaten_gekuerzt.csv'
file_kiwo = f'{raw_data_dir}/kiwo.csv'
file_wetter = f'{raw_data_dir}/wetter.csv'

# Load the three CSV inputs, outer-join them on their 'Datum' column, print a
# short preview, and write the combined table to a new CSV file. Any failure
# is reported with print() instead of being raised.
try:
    # --- 2. Load CSVs, Parse 'Datum', and Index on It ---
    # parse_dates=['Datum'] makes pandas read the join column as dates
    # rather than plain strings. Each frame is indexed on 'Datum' right away
    # because joining a list of DataFrames (step 4) works on the index only.
    print(f"Loading {file_umsatz}...")
    df_umsatz = pd.read_csv(file_umsatz, parse_dates=['Datum']).set_index('Datum')

    print(f"Loading {file_kiwo}...")
    df_kiwo = pd.read_csv(file_kiwo, parse_dates=['Datum']).set_index('Datum')

    print(f"Loading {file_wetter}...")
    df_wetter = pd.read_csv(file_wetter, parse_dates=['Datum']).set_index('Datum')

    print("All files loaded and indexed on 'Datum'.")

    # --- 3. Perform the Outer Join ---
    # df_umsatz is the left frame; the other two are joined onto it in one
    # call. how='outer' keeps every date that appears in any of the three
    # files; where a file has no row for a date, its columns are NaN.
    other_dfs = [df_kiwo, df_wetter]

    print("Performing outer join...")
    final_df = df_umsatz.join(other_dfs, how='outer')

    # --- 4. Clean Up ---
    # Turn the 'Datum' index back into a regular column so it is written to
    # the CSV as ordinary data.
    final_df = final_df.reset_index()

    # Sort chronologically. mergesort is stable, so rows that share a date
    # keep the order the join produced (the default quicksort gives no such
    # guarantee, which would make the output order non-deterministic).
    final_df = final_df.sort_values(by='Datum', kind='mergesort')

    # --- 5. Display Results and Save to File ---
    print("\n--- Join Complete ---")
    print("First 5 rows of the combined data:")
    print(final_df.head())

    print("\nLast 5 rows of the combined data:")
    print(final_df.tail())

    print(f"\nTotal rows in combined file: {len(final_df)}")

    # Relative file name: the result is written to the current working
    # directory. index=False drops the row index from the output.
    output_filename = 'combined_data_outer.csv'
    final_df.to_csv(output_filename, index=False)

    print(f"\nSuccessfully saved combined data to '{output_filename}'")

except FileNotFoundError as e:
    print("\nError: File not found.")
    print(f"Details: {e}")
    # The inputs are absolute paths, so list the exact locations that were
    # expected rather than pointing at the script's own directory.
    print("Please make sure all CSV files exist at the following paths:")
    for expected_path in (file_umsatz, file_kiwo, file_wetter):
        print(f"  {expected_path}")
except Exception as e:
    # Top-level boundary of the script: report the problem and carry on
    # instead of raising.
    print(f"\nAn unexpected error occurred: {e}")