## Load the Combined Dataset

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# Directory that holds the processed CSV files, relative to this notebook
processed_data_dir = '../processed_data'

# Load the combined dataset; 'Datum' is parsed into datetime values so that
# min()/max() and later comparisons operate on timestamps, not strings
file_path = os.path.join(processed_data_dir, 'combined_data_final_imputed.csv')
df = pd.read_csv(file_path, parse_dates=['Datum'])

print("Combined dataset loaded successfully")
print(f"Shape: {df.shape}")
print(f"Date range: {df['Datum'].min()} to {df['Datum'].max()}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
df.info()

Combined dataset loaded successfully
Shape: (11211, 48)
Date range: 2013-07-01 00:00:00 to 2019-07-30 00:00:00

First few rows:
       Datum         id  Warengruppe      Umsatz  KielerWoche  Bewoelkung  \
0 2013-07-01  1307011.0          1.0  148.828353          0.0         6.0   
1 2013-07-01  1307013.0          3.0  201.198426          0.0         6.0   
2 2013-07-01  1307014.0          4.0   65.890169          0.0         6.0   
3 2013-07-01  1307015.0          5.0  317.475875          0.0         6.0   
4 2013-07-01  1307012.0          2.0  535.856285          0.0         6.0   

   Temperatur  Windgeschwindigkeit  Wettercode  Niederschlag  ...  W_Cat_4  \
0     17.8375                 15.0        20.0           0.3  ...        1   
1     17.8375                 15.0        20.0           0.3  ...        1   
2     17.8375                 15.0        20.0           0.3  ...        1   
3     17.8375                 15.0        20.0           0.3  ...        1   
4     17.8375      

## Define Date Ranges and Split the Dataset

In [5]:
# Chronological split boundaries; both ends of every range are inclusive
train_start, train_end = pd.Timestamp('2013-07-01'), pd.Timestamp('2017-07-31')
val_start, val_end = pd.Timestamp('2017-08-01'), pd.Timestamp('2018-07-31')
test_start, test_end = pd.Timestamp('2018-08-01'), pd.Timestamp('2019-07-31')


def _rows_in_window(frame, first_day, last_day):
    """Return an independent copy of the rows whose 'Datum' lies in [first_day, last_day]."""
    in_window = frame['Datum'].between(first_day, last_day)
    return frame[in_window].copy()


# Cut the combined frame into the three date windows
df_train = _rows_in_window(df, train_start, train_end)
df_validation = _rows_in_window(df, val_start, val_end)
df_test = _rows_in_window(df, test_start, test_end)

# Report the configured range and the resulting row count of every split
print("Dataset splits created successfully!\n")
print(f"Training set: {train_start.date()} to {train_end.date()}")
print(f"  Rows: {len(df_train)}")
print(f"\nValidation set: {val_start.date()} to {val_end.date()}")
print(f"  Rows: {len(df_validation)}")
print(f"\nTest set: {test_start.date()} to {test_end.date()}")
print(f"  Rows: {len(df_test)}")
print(f"\nTotal rows in all splits: {len(df_train) + len(df_validation) + len(df_test)}")

Dataset splits created successfully!

Training set: 2013-07-01 to 2017-07-31
  Rows: 7523

Validation set: 2017-08-01 to 2018-07-31
  Rows: 1849

Test set: 2018-08-01 to 2019-07-31
  Rows: 1839

Total rows in all splits: 11211


## Verify Data Integrity

In [6]:
# For each split, report how many rows carry an id and a sales value, how many
# distinct ids exist, and whether the date ranges of adjacent splits overlap
print("=" * 60)
print("DATA INTEGRITY VERIFICATION")
print("=" * 60)

# The same three splits are inspected, in this order, by the checks below
splits = [('Training', df_train), ('Validation', df_validation), ('Test', df_test)]

# 1. Count the rows that have an id, a sales value (Umsatz), and both
print("\n1. Missing Values Analysis:")
print("-" * 60)

for split_name, split_df in splits:
    # Boolean masks are built once and reused for all three counts
    has_id = split_df['id'].notna()
    has_umsatz = split_df['Umsatz'].notna()
    rows_with_id = has_id.sum()
    rows_with_umsatz = has_umsatz.sum()
    print(f"\n{split_name} Set:")
    print(f"  Total rows: {len(split_df)}")
    print(f"  Rows with id: {rows_with_id}")
    print(f"  Rows with Umsatz: {rows_with_umsatz}")
    # Series.sum() counts the True entries natively; the builtin sum() would
    # walk the Series element by element in Python
    print(f"  Rows with both id and Umsatz: {(has_id & has_umsatz).sum()}")

# 2. Count distinct non-null ids and show the smallest and largest id
print("\n2. Unique IDs Analysis:")
print("-" * 60)

for split_name, split_df in splits:
    unique_ids = split_df['id'].dropna().unique()
    print(f"\n{split_name} Set:")
    print(f"  Unique IDs: {len(unique_ids)}")
    print(f"  ID range: {split_df['id'].min():.0f} to {split_df['id'].max():.0f}")

# 3. Show the date span actually present in each split
print("\n3. Date Range Verification:")
print("-" * 60)
print(f"Training: {df_train['Datum'].min().date()} to {df_train['Datum'].max().date()}")
print(f"Validation: {df_validation['Datum'].min().date()} to {df_validation['Datum'].max().date()}")
print(f"Test: {df_test['Datum'].min().date()} to {df_test['Datum'].max().date()}")

# Adjacent splits are disjoint when the earlier one ends strictly before the
# later one begins
if df_train['Datum'].max() < df_validation['Datum'].min():
    print("\n✓ No overlap between Training and Validation")
else:
    print("\n✗ WARNING: Overlap between Training and Validation!")

if df_validation['Datum'].max() < df_test['Datum'].min():
    print("✓ No overlap between Validation and Test")
else:
    print("✗ WARNING: Overlap between Validation and Test!")

DATA INTEGRITY VERIFICATION

1. Missing Values Analysis:
------------------------------------------------------------

Training Set:
  Total rows: 7523
  Rows with id: 7493
  Rows with Umsatz: 7493
  Rows with both id and Umsatz: 7493

Validation Set:
  Total rows: 1849
  Rows with id: 1841
  Rows with Umsatz: 1841
  Rows with both id and Umsatz: 1841

Test Set:
  Total rows: 1839
  Rows with id: 1830
  Rows with Umsatz: 0
  Rows with both id and Umsatz: 0

2. Unique IDs Analysis:
------------------------------------------------------------

Training Set:
  Unique IDs: 7493
  ID range: 1307011 to 1707315

Validation Set:
  Unique IDs: 1841
  ID range: 1708011 to 1807315

Test Set:
  Unique IDs: 1830
  ID range: 1808011 to 1907305

3. Date Range Verification:
------------------------------------------------------------
Training: 2013-07-01 to 2017-07-31
Validation: 2017-08-01 to 2018-07-31
Test: 2018-08-01 to 2019-07-30

✓ No overlap between Training and Validation
✓ No overlap between 

In [None]:
# TODO: drop the rows without 'id' or 'Umsatz' before saving the splits.
# Not implemented yet: this cell contains no code and has not been executed.

## Save the Split Datasets

In [7]:
# Make sure the target directory exists before anything is written to it
os.makedirs(processed_data_dir, exist_ok=True)

# Destination CSV paths inside the processed_data directory, one per split
output_train, output_val, output_test = (
    os.path.join(processed_data_dir, csv_name)
    for csv_name in (
        'train_set_imputed.csv',
        'validation_set_imputed.csv',
        'test_set_imputed.csv',
    )
)

# Write every split to its CSV file, leaving out the DataFrame index
for split_frame, csv_path in (
    (df_train, output_train),
    (df_validation, output_val),
    (df_test, output_test),
):
    split_frame.to_csv(csv_path, index=False)

# Confirm what was written and how many rows each file received
print("Datasets saved successfully!")
print(f"\n✓ {output_train} ({len(df_train)} rows)")
print(f"✓ {output_val} ({len(df_validation)} rows)")
print(f"✓ {output_test} ({len(df_test)} rows)")
print(f"\nTotal: {len(df_train) + len(df_validation) + len(df_test)} rows saved")

Datasets saved successfully!

✓ ../processed_data/train_set_imputed.csv (7523 rows)
✓ ../processed_data/validation_set_imputed.csv (1849 rows)
✓ ../processed_data/test_set_imputed.csv (1839 rows)

Total: 11211 rows saved


## Summary Statistics for Each Split

In [8]:
# Print summary statistics (sales, temperature, product groups, holidays)
# for each of the three splits
print("=" * 80)
print("SPLIT SUMMARY STATISTICS")
print("=" * 80)

# Pair each split with the boundary timestamps it was cut with, so the section
# header reflects the configured ranges instead of repeating date literals
# that could go stale when the boundaries are changed
summary_splits = [
    ('Training', df_train, train_start, train_end),
    ('Validation', df_validation, val_start, val_end),
    ('Test', df_test, test_start, test_end),
]

for split_name, split_df, range_start, range_end in summary_splits:
    print(f"\n{split_name} Set ({range_start.date()} to {range_end.date()}):")
    print("-" * 80)
    # Non-null counts show how much of the split has a sales / temperature value
    print(f"Rows with sales data (Umsatz): {split_df['Umsatz'].notna().sum()}")
    print(f"Rows with weather data: {split_df['Temperatur'].notna().sum()}")
    print(f"Unique product groups (Warengruppe): {split_df['Warengruppe'].nunique()}")
    print("\nUmsatz statistics:")
    print(split_df['Umsatz'].describe())
    print("\nTemperatur statistics:")
    print(split_df['Temperatur'].describe())
    # Sum of the 'is_holiday' column over the rows of this split
    print(f"\nHolidays in this period: {split_df['is_holiday'].sum():.0f}")

SPLIT SUMMARY STATISTICS

Training Set (2013-07-01 to 2017-07-31):
--------------------------------------------------------------------------------
Rows with sales data (Umsatz): 7493
Rows with weather data: 7523
Unique product groups (Warengruppe): 6

Umsatz statistics:
count    7493.000000
mean      209.338996
std       147.769192
min        12.937383
25%        96.774910
50%       162.622977
75%       283.910218
max      1879.461831
Name: Umsatz, dtype: float64

Temperatur statistics:
count    7523.000000
mean       12.067209
std         7.019900
min        -8.475000
25%         6.625000
50%        11.500000
75%        17.775000
max        31.437500
Name: Temperatur, dtype: float64

Holidays in this period: 205

Validation Set (2017-08-01 to 2018-07-31):
--------------------------------------------------------------------------------
Rows with sales data (Umsatz): 1841
Rows with weather data: 1849
Unique product groups (Warengruppe): 6

Umsatz statistics:
count    1841.000000
mean  