Handling of Missing values in our data set

1. Overall evaluation of our data set

In [3]:
# Missing values analysis and date continuity check
import pandas as pd
from pathlib import Path

# Path to processed CSV
csv_path = Path("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_outer_with_test.csv")

# Load data
try:
    df = pd.read_csv(csv_path)
except Exception as e:
    raise RuntimeError(f"Failed to read CSV at {csv_path}: {e}")

print(f"File: {csv_path}")
print(f"Shape (rows, cols): {df.shape}")

# 1) Missing values per column
missing_counts = df.isna().sum().sort_values(ascending=False)
missing_pct = (df.isna().mean() * 100).sort_values(ascending=False)

missing_summary = (
    pd.DataFrame({
        "missing_count": missing_counts,
        "missing_pct": missing_pct.round(2)
    })
    .sort_values(by=["missing_count", "missing_pct"], ascending=False)
)

print("\nMissing values per column:")
print(missing_summary)

# 2) Check continuity of the 'Datum' column
print("\nDatum continuity check:")
if "Datum" not in df.columns:
    print("Column 'Datum' not found in dataset.")
else:
    def _parse_datum(series: pd.Series) -> pd.Series:
        d = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
        # Fallback: try dayfirst if many NaT
        if d.isna().mean() > 0.2:
            d = pd.to_datetime(series, errors="coerce", dayfirst=True)
        return d

    dates = _parse_datum(df["Datum"])  # Datetime series with NaT for unparsable values

    nat_count = dates.isna().sum()
    total = len(dates)
    print(f"Total rows: {total}")
    print(f"Unparseable/missing dates (NaT): {nat_count} ({(nat_count/total*100):.2f}%)")

    if nat_count == total:
        print("All 'Datum' values are NaT; cannot assess continuity.")
    else:
        # Work with unique non-NaT dates
        unique_dates = pd.DatetimeIndex(dates.dropna().unique()).sort_values()
        min_date = unique_dates.min()
        max_date = unique_dates.max()

        expected_range = pd.date_range(start=min_date, end=max_date, freq="D")
        missing_dates = expected_range.difference(unique_dates)

        duplicates_count = total - dates.nunique() - nat_count

        print(f"Date range: {min_date.date()} → {max_date.date()}")
        print(f"Unique non-NaT dates: {len(unique_dates)}")
        print(f"Expected daily dates in range: {len(expected_range)}")
        print(f"Missing dates in range: {len(missing_dates)}")
        print(f"Duplicate date rows (excluding NaT): {duplicates_count}")

        if len(missing_dates) == 0:
            print("Dates are continuous with daily frequency (no gaps).")
        else:
            # Show a small sample of missing dates
            sample_n = min(15, len(missing_dates))
            print(f"Example missing dates (up to {sample_n}):")
            print(pd.Series(missing_dates[:sample_n]).dt.date.tolist())

File: /home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_outer_with_test.csv
Shape (rows, cols): (11782, 11)

Missing values per column:
                     missing_count  missing_pct
KielerWoche                  11496        97.57
Wettercode                    2997        25.44
Umsatz                        2448        20.78
id                             618         5.25
Warengruppe                    618         5.25
Bewoelkung                     311         2.64
Temperatur                     256         2.17
Windgeschwindigkeit            256         2.17
Niederschlag                     3         0.03
Datum                            0         0.00
is_holiday                       0         0.00

Datum continuity check:
Total rows: 11782
Unparseable/missing dates (NaT): 0 (0.00%)
Date range: 2012-01-01 → 2019-12-31
Unique non-NaT dates: 2792
Expected daily dates in range: 2922
Missing dates in range: 130
Duplicate date rows (excluding NaT)

  d = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)


2. Comparision of merged data set with raw data, to ensure no lines were droped

In [4]:
# Compare raw data files to the merged dataset for row coverage
import pandas as pd
from pathlib import Path
from pandas.api.types import is_numeric_dtype

# Ensure merged dataframe is available; if not, load it
try:
    df_merged
except NameError:
    merged_path = Path("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_outer_with_test.csv")
    df_merged = pd.read_csv(merged_path)

raw_dir = Path("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/raw_data")
raw_files = [
    "umsatzdaten_gekuerzt.csv",
    "wetter.csv",
    "Niederschlag.csv",
    "kiwo.csv",
    "test.csv",
    "combined_data_outer.csv",
]

print("Merged dataset shape:", df_merged.shape)

# Helper to normalize 'Datum' to YYYY-MM-DD strings

def normalize_datum(s: pd.Series) -> pd.Series:
    d = pd.to_datetime(s, errors="coerce")
    if d.isna().mean() > 0.2:
        d = pd.to_datetime(s, errors="coerce", dayfirst=True)
    return d.dt.strftime("%Y-%m-%d")

# Helper to align dtypes for merging on common columns

def align_common_columns(raw_df: pd.DataFrame, merged_df: pd.DataFrame, common_cols: list) -> tuple[pd.DataFrame, pd.DataFrame]:
    raw_aligned = raw_df.copy()
    merged_aligned = merged_df[common_cols].copy()
    for col in common_cols:
        if col == "Datum":
            raw_aligned[col] = normalize_datum(raw_aligned[col]) if col in raw_aligned.columns else raw_aligned[col]
            merged_aligned[col] = normalize_datum(merged_aligned[col])
        else:
            # Prefer numeric alignment if both look numeric; else cast to string
            if col in raw_aligned.columns and is_numeric_dtype(raw_aligned[col]) and is_numeric_dtype(merged_aligned[col]):
                # Keep numeric
                continue
            else:
                raw_aligned[col] = raw_aligned[col].astype(str)
                merged_aligned[col] = merged_aligned[col].astype(str)
    return raw_aligned, merged_aligned

results = []
for fname in raw_files:
    path = raw_dir / fname
    try:
        raw_df = pd.read_csv(path)
    except Exception as e:
        print(f"\n[ERROR] Failed to read {fname}: {e}")
        continue

    common_cols = [c for c in raw_df.columns if c in df_merged.columns]
    print(f"\nFile: {fname}")
    print(f"Raw shape: {raw_df.shape}")
    print(f"Common columns used for comparison ({len(common_cols)}): {common_cols}")

    if len(common_cols) == 0:
        print("No common columns with merged dataset; cannot compare coverage.")
        results.append({"file": fname, "raw_rows": len(raw_df), "covered_rows": None, "coverage_pct": None, "missing_rows": None})
        continue

    raw_aligned, merged_aligned = align_common_columns(raw_df, df_merged, common_cols)

    # Left merge to detect which raw rows have matches in the merged dataset
    merged_check = raw_aligned.merge(merged_aligned.drop_duplicates(), on=common_cols, how="left", indicator=True)
    left_only = merged_check["_merge"].eq("left_only").sum()
    covered = len(raw_df) - left_only
    coverage_pct = (covered / len(raw_df) * 100) if len(raw_df) else 0.0

    print(f"Covered raw rows in merged: {covered}/{len(raw_df)} ({coverage_pct:.2f}%)")
    print(f"Missing raw rows not found in merged: {left_only}")

    # Show up to 5 examples of missing rows (by common columns)
    if left_only > 0:
        missing_examples = (
            merged_check.loc[merged_check["_merge"].eq("left_only"), common_cols]
            .head(5)
        )
        print("Example missing rows (up to 5):")
        print(missing_examples)

    # Compare missingness for common columns (raw vs merged on matched subset)
    matched_mask = merged_check["_merge"].eq("both")
    matched_raw = raw_aligned.loc[matched_mask, common_cols]
    matched_merged = merged_aligned.loc[
        merged_aligned.merge(raw_aligned[common_cols].drop_duplicates(), on=common_cols, how="inner").index,
        common_cols,
    ]

    miss_compare = []
    for col in common_cols:
        raw_miss = raw_df[col].isna().mean() * 100
        matched_miss = matched_merged[col].isna().mean() * 100
        miss_compare.append({"column": col, "raw_missing_pct": round(raw_miss, 2), "merged_missing_pct_on_matches": round(matched_miss, 2)})

    miss_df = pd.DataFrame(miss_compare).sort_values("column")
    print("Missingness comparison (raw vs merged on matched rows):")
    print(miss_df)

    results.append({
        "file": fname,
        "raw_rows": len(raw_df),
        "covered_rows": covered,
        "coverage_pct": round(coverage_pct, 2),
        "missing_rows": left_only,
    })

# Summary across raw files
summary_df = pd.DataFrame(results)
print("\nCoverage summary across raw files:")
print(summary_df)


Merged dataset shape: (11782, 11)

File: umsatzdaten_gekuerzt.csv
Raw shape: (9334, 4)
Common columns used for comparison (4): ['id', 'Datum', 'Warengruppe', 'Umsatz']
Covered raw rows in merged: 9332/9334 (99.98%)
Missing raw rows not found in merged: 2
Example missing rows (up to 5):
          id       Datum  Warengruppe     Umsatz
221  1402111  2014-02-11            1  93.605003
755  1508091  2015-08-09            1  99.384497
Missingness comparison (raw vs merged on matched rows):
        column  raw_missing_pct  merged_missing_pct_on_matches
1        Datum              0.0                           0.00
3       Umsatz              0.0                           4.85
2  Warengruppe              0.0                           4.85
0           id              0.0                           4.85

File: wetter.csv
Raw shape: (2601, 5)
Common columns used for comparison (5): ['Datum', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit', 'Wettercode']
Covered raw rows in merged: 2601/2601 (10

3. Continuity check of the relevant time window

In [5]:
# Check continuity for a specific window: 2013-07-01 to 2019-07-30
import pandas as pd

start_date = pd.Timestamp('2013-07-01')
end_date = pd.Timestamp('2019-07-30')

# Ensure we have a parsed date Series named `dates`
try:
    dates
except NameError:
    if 'df' not in globals():
        from pathlib import Path
        csv_path = Path("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_outer_with_test.csv")
        df = pd.read_csv(csv_path)
    d = pd.to_datetime(df["Datum"], errors="coerce")
    if d.isna().mean() > 0.2:
        d = pd.to_datetime(df["Datum"], errors="coerce", dayfirst=True)
    dates = d

mask = dates.between(start_date, end_date, inclusive="both")
window_dates = pd.DatetimeIndex(dates[mask].dropna().unique()).sort_values()
expected_window = pd.date_range(start=start_date, end=end_date, freq='D')
missing_in_window = expected_window.difference(window_dates)

print(f"Window: {start_date.date()} -> {end_date.date()}")
print(f"Days expected in window: {len(expected_window)}")
print(f"Unique non-NaT dates in window: {len(window_dates)}")
print(f"Missing dates in window: {len(missing_in_window)}")
if len(missing_in_window) == 0:
    print("Dates are continuous in the specified window.")
else:
    sample_n = min(20, len(missing_in_window))
    print(f"Example missing dates (up to {sample_n}):")
    print(pd.Series(missing_in_window[:sample_n]).dt.date.tolist())

Window: 2013-07-01 -> 2019-07-30
Days expected in window: 2221
Unique non-NaT dates in window: 2221
Missing dates in window: 0
Dates are continuous in the specified window.


4. Imputation of missing values in the raw data columns
4.1 loading relevant data from combined data outer with test.csv

In [6]:
# Load data from combined_data_outer_with_test.csv for the specified date range
import pandas as pd
from pathlib import Path

# Path to the CSV file
csv_path = Path("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_outer_with_test.csv")

# Load the full dataset
df_full = pd.read_csv(csv_path)

# Parse the Datum column
df_full['Datum'] = pd.to_datetime(df_full['Datum'], errors='coerce')

# Filter for the date range: 2013-07-01 to 2019-07-30
start_date = pd.Timestamp('2013-07-01')
end_date = pd.Timestamp('2019-07-30')

df_filtered = df_full[(df_full['Datum'] >= start_date) & (df_full['Datum'] <= end_date)].copy()

print(f"Loaded data shape: {df_filtered.shape}")
print(f"Date range: {df_filtered['Datum'].min()} to {df_filtered['Datum'].max()}")
print(f"\nFirst few rows:")
print(df_filtered.head())


Loaded data shape: (11211, 11)
Date range: 2013-07-01 00:00:00 to 2019-07-30 00:00:00

First few rows:
         Datum         id  Warengruppe      Umsatz  KielerWoche  Bewoelkung  \
417 2013-07-01  1307011.0          1.0  148.828353          NaN         6.0   
418 2013-07-01  1307013.0          3.0  201.198426          NaN         6.0   
419 2013-07-01  1307014.0          4.0   65.890169          NaN         6.0   
420 2013-07-01  1307015.0          5.0  317.475875          NaN         6.0   
421 2013-07-01  1307012.0          2.0  535.856285          NaN         6.0   

     Temperatur  Windgeschwindigkeit  Wettercode  Niederschlag  is_holiday  
417     17.8375                 15.0        20.0           0.3           0  
418     17.8375                 15.0        20.0           0.3           0  
419     17.8375                 15.0        20.0           0.3           0  
420     17.8375                 15.0        20.0           0.3           0  
421     17.8375                 15.0 

4.2 Handling of missing values in "Temperatur" column

In [7]:
# Impute missing values in Temperatur column using forward fill then backward fill
# Create indicator column for imputed values

# Create the indicator column first (1 for missing, 0 for real values)
df_filtered['Temperatur_imputed'] = df_filtered['Temperatur'].isna().astype(int)

# Count missing values before imputation
missing_before = df_filtered['Temperatur'].isna().sum()
print(f"Missing values in Temperatur before imputation: {missing_before}")
print(f"Percentage missing: {(missing_before / len(df_filtered) * 100):.2f}%")

# Impute using forward fill first, then backward fill for any remaining NaN
df_filtered['Temperatur'] = df_filtered['Temperatur'].fillna(method='ffill').fillna(method='bfill')

# Verify imputation
missing_after = df_filtered['Temperatur'].isna().sum()
print(f"\nMissing values in Temperatur after imputation: {missing_after}")
print(f"Total values imputed: {df_filtered['Temperatur_imputed'].sum()}")

# Show summary statistics
print(f"\nTemperatur_imputed column value counts:")
print(df_filtered['Temperatur_imputed'].value_counts().sort_index())

# Show a few examples of imputed rows
if df_filtered['Temperatur_imputed'].sum() > 0:
    print(f"\nExample rows where Temperatur was imputed:")
    print(df_filtered[df_filtered['Temperatur_imputed'] == 1][['Datum', 'Temperatur', 'Temperatur_imputed']].head(10))


Missing values in Temperatur before imputation: 81
Percentage missing: 0.72%

Missing values in Temperatur after imputation: 0
Total values imputed: 81

Temperatur_imputed column value counts:
Temperatur_imputed
0    11130
1       81
Name: count, dtype: int64

Example rows where Temperatur was imputed:
          Datum  Temperatur  Temperatur_imputed
6788 2016-12-11      9.5000                   1
6789 2016-12-11      9.5000                   1
6790 2016-12-11      9.5000                   1
6791 2016-12-11      9.5000                   1
6792 2016-12-11      9.5000                   1
6793 2016-12-11      9.5000                   1
8260 2017-10-04     14.2625                   1
8261 2017-10-04     14.2625                   1
8262 2017-10-04     14.2625                   1
8263 2017-10-04     14.2625                   1


  df_filtered['Temperatur'] = df_filtered['Temperatur'].fillna(method='ffill').fillna(method='bfill')


4.3 Handling of missing values in "Bewoelkung" column

In [8]:
# Impute missing values in Bewoelkung column using forward fill then backward fill
# Create indicator column for imputed values

# Create the indicator column first (1 for missing, 0 for real values)
df_filtered['Bewoelkung_imputed'] = df_filtered['Bewoelkung'].isna().astype(int)

# Count missing values before imputation
missing_before = df_filtered['Bewoelkung'].isna().sum()
print(f"Missing values in Bewoelkung before imputation: {missing_before}")
print(f"Percentage missing: {(missing_before / len(df_filtered) * 100):.2f}%")

# Impute using forward fill first, then backward fill for any remaining NaN
df_filtered['Bewoelkung'] = df_filtered['Bewoelkung'].fillna(method='ffill').fillna(method='bfill')

# Verify imputation
missing_after = df_filtered['Bewoelkung'].isna().sum()
print(f"\nMissing values in Bewoelkung after imputation: {missing_after}")
print(f"Total values imputed: {df_filtered['Bewoelkung_imputed'].sum()}")

# Show summary statistics
print(f"\nBewoelkung_imputed column value counts:")
print(df_filtered['Bewoelkung_imputed'].value_counts().sort_index())

# Show a few examples of imputed rows
if df_filtered['Bewoelkung_imputed'].sum() > 0:
    print(f"\nExample rows where Bewoelkung was imputed:")
    print(df_filtered[df_filtered['Bewoelkung_imputed'] == 1][['Datum', 'Bewoelkung', 'Bewoelkung_imputed']].head(10))


Missing values in Bewoelkung before imputation: 136
Percentage missing: 1.21%

Missing values in Bewoelkung after imputation: 0
Total values imputed: 136

Bewoelkung_imputed column value counts:
Bewoelkung_imputed
0    11075
1      136
Name: count, dtype: int64

Example rows where Bewoelkung was imputed:
          Datum  Bewoelkung  Bewoelkung_imputed
6788 2016-12-11         7.0                   1
6789 2016-12-11         7.0                   1
6790 2016-12-11         7.0                   1
6791 2016-12-11         7.0                   1
6792 2016-12-11         7.0                   1
6793 2016-12-11         7.0                   1
8260 2017-10-04         5.0                   1
8261 2017-10-04         5.0                   1
8262 2017-10-04         5.0                   1
8263 2017-10-04         5.0                   1


  df_filtered['Bewoelkung'] = df_filtered['Bewoelkung'].fillna(method='ffill').fillna(method='bfill')


4.4 Handling of missing values in "Windgeschwindigkeit" column

In [9]:
# Impute missing values in Windgeschwindigkeit column using forward fill then backward fill
# Create indicator column for imputed values

# Create the indicator column first (1 for missing, 0 for real values)
df_filtered['Windgeschw_imputed'] = df_filtered['Windgeschwindigkeit'].isna().astype(int)

# Count missing values before imputation
missing_before = df_filtered['Windgeschwindigkeit'].isna().sum()
print(f"Missing values in Windgeschwindigkeit before imputation: {missing_before}")
print(f"Percentage missing: {(missing_before / len(df_filtered) * 100):.2f}%")

# Impute using forward fill first, then backward fill for any remaining NaN
df_filtered['Windgeschwindigkeit'] = df_filtered['Windgeschwindigkeit'].fillna(method='ffill').fillna(method='bfill')

# Verify imputation
missing_after = df_filtered['Windgeschwindigkeit'].isna().sum()
print(f"\nMissing values in Windgeschwindigkeit after imputation: {missing_after}")
print(f"Total values imputed: {df_filtered['Windgeschw_imputed'].sum()}")

# Show summary statistics
print(f"\nWindgeschw_imputed column value counts:")
print(df_filtered['Windgeschw_imputed'].value_counts().sort_index())

# Show a few examples of imputed rows
if df_filtered['Windgeschw_imputed'].sum() > 0:
    print(f"\nExample rows where Windgeschwindigkeit was imputed:")
    print(df_filtered[df_filtered['Windgeschw_imputed'] == 1][['Datum', 'Windgeschwindigkeit', 'Windgeschw_imputed']].head(10))


Missing values in Windgeschwindigkeit before imputation: 81
Percentage missing: 0.72%

Missing values in Windgeschwindigkeit after imputation: 0
Total values imputed: 81

Windgeschw_imputed column value counts:
Windgeschw_imputed
0    11130
1       81
Name: count, dtype: int64

Example rows where Windgeschwindigkeit was imputed:
          Datum  Windgeschwindigkeit  Windgeschw_imputed
6788 2016-12-11                 10.0                   1
6789 2016-12-11                 10.0                   1
6790 2016-12-11                 10.0                   1
6791 2016-12-11                 10.0                   1
6792 2016-12-11                 10.0                   1
6793 2016-12-11                 10.0                   1
8260 2017-10-04                 14.0                   1
8261 2017-10-04                 14.0                   1
8262 2017-10-04                 14.0                   1
8263 2017-10-04                 14.0                   1


  df_filtered['Windgeschwindigkeit'] = df_filtered['Windgeschwindigkeit'].fillna(method='ffill').fillna(method='bfill')


4.5 Handling of missing values in "Niederschlag" column

In [10]:
# Impute missing values in Niederschlag column using forward fill then backward fill
# Create indicator column for imputed values

# Create the indicator column first (1 for missing, 0 for real values)
df_filtered['Niederschlag_imputed'] = df_filtered['Niederschlag'].isna().astype(int)

# Count missing values before imputation
missing_before = df_filtered['Niederschlag'].isna().sum()
print(f"Missing values in Niederschlag before imputation: {missing_before}")
print(f"Percentage missing: {(missing_before / len(df_filtered) * 100):.2f}%")

# Impute using forward fill first, then backward fill for any remaining NaN
df_filtered['Niederschlag'] = df_filtered['Niederschlag'].fillna(method='ffill').fillna(method='bfill')

# Verify imputation
missing_after = df_filtered['Niederschlag'].isna().sum()
print(f"\nMissing values in Niederschlag after imputation: {missing_after}")
print(f"Total values imputed: {df_filtered['Niederschlag_imputed'].sum()}")

# Show summary statistics
print(f"\nNiederschlag_imputed column value counts:")
print(df_filtered['Niederschlag_imputed'].value_counts().sort_index())

# Show a few examples of imputed rows
if df_filtered['Niederschlag_imputed'].sum() > 0:
    print(f"\nExample rows where Niederschlag was imputed:")
    print(df_filtered[df_filtered['Niederschlag_imputed'] == 1][['Datum', 'Niederschlag', 'Niederschlag_imputed']].head(10))


Missing values in Niederschlag before imputation: 0
Percentage missing: 0.00%

Missing values in Niederschlag after imputation: 0
Total values imputed: 0

Niederschlag_imputed column value counts:
Niederschlag_imputed
0    11211
Name: count, dtype: int64


  df_filtered['Niederschlag'] = df_filtered['Niederschlag'].fillna(method='ffill').fillna(method='bfill')


4.6 Handling of missing values in "Kieler Woche" column

In [11]:
# Fill missing values in Kieler Woche column with 0
# Keep existing 1.0 values unchanged

# Count missing values before filling
missing_before = df_filtered['KielerWoche'].isna().sum()
print(f"Missing values in KielerWoche before filling: {missing_before}")
print(f"Percentage missing: {(missing_before / len(df_filtered) * 100):.2f}%")

# Fill all non-1.0 values (including NaN) with 0
df_filtered['KielerWoche'] = df_filtered['KielerWoche'].fillna(0)

# Verify filling
missing_after = df_filtered['KielerWoche'].isna().sum()
print(f"\nMissing values in KielerWoche after filling: {missing_after}")

# Show value distribution
print(f"\nKielerWoche column value counts:")
print(df_filtered['KielerWoche'].value_counts().sort_index())

# Show some examples
print(f"\nFirst few rows of KielerWoche column:")
print(df_filtered[['Datum', 'KielerWoche']].head(20))


Missing values in KielerWoche before filling: 10943
Percentage missing: 97.61%

Missing values in KielerWoche after filling: 0

KielerWoche column value counts:
KielerWoche
0.0    10943
1.0      268
Name: count, dtype: int64

First few rows of KielerWoche column:
         Datum  KielerWoche
417 2013-07-01          0.0
418 2013-07-01          0.0
419 2013-07-01          0.0
420 2013-07-01          0.0
421 2013-07-01          0.0
422 2013-07-02          0.0
423 2013-07-02          0.0
424 2013-07-02          0.0
425 2013-07-02          0.0
426 2013-07-02          0.0
427 2013-07-03          0.0
428 2013-07-03          0.0
429 2013-07-03          0.0
430 2013-07-03          0.0
431 2013-07-03          0.0
432 2013-07-04          0.0
433 2013-07-04          0.0
434 2013-07-04          0.0
435 2013-07-04          0.0
436 2013-07-04          0.0


4.7 Handling of missing values in "Wettercode" column

In [12]:
# Impute missing values in Wettercode column using forward fill then backward fill
# Create indicator column for imputed values
# Note: Wettercode is categorical, so forward/backward fill maintains temporal consistency

# Create the indicator column first (1 for missing, 0 for real values)
df_filtered['Wettercode_imputed'] = df_filtered['Wettercode'].isna().astype(int)

# Count missing values before imputation
missing_before = df_filtered['Wettercode'].isna().sum()
print(f"Missing values in Wettercode before imputation: {missing_before}")
print(f"Percentage missing: {(missing_before / len(df_filtered) * 100):.2f}%")

# Impute using forward fill first, then backward fill for any remaining NaN
df_filtered['Wettercode'] = df_filtered['Wettercode'].fillna(method='ffill').fillna(method='bfill')

# Verify imputation
missing_after = df_filtered['Wettercode'].isna().sum()
print(f"\nMissing values in Wettercode after imputation: {missing_after}")
print(f"Total values imputed: {df_filtered['Wettercode_imputed'].sum()}")

# Show summary statistics
print(f"\nWettercode_imputed column value counts:")
print(df_filtered['Wettercode_imputed'].value_counts().sort_index())

# Show a few examples of imputed rows
if df_filtered['Wettercode_imputed'].sum() > 0:
    print(f"\nExample rows where Wettercode was imputed:")
    print(df_filtered[df_filtered['Wettercode_imputed'] == 1][['Datum', 'Wettercode', 'Wettercode_imputed']].head(10))


Missing values in Wettercode before imputation: 2671
Percentage missing: 23.82%

Missing values in Wettercode after imputation: 0
Total values imputed: 2671

Wettercode_imputed column value counts:
Wettercode_imputed
0    8540
1    2671
Name: count, dtype: int64

Example rows where Wettercode was imputed:
         Datum  Wettercode  Wettercode_imputed
422 2013-07-02        20.0                   1
423 2013-07-02        20.0                   1
424 2013-07-02        20.0                   1
425 2013-07-02        20.0                   1
426 2013-07-02        20.0                   1
437 2013-07-05        20.0                   1
438 2013-07-05        20.0                   1
439 2013-07-05        20.0                   1
440 2013-07-05        20.0                   1
441 2013-07-05        20.0                   1


  df_filtered['Wettercode'] = df_filtered['Wettercode'].fillna(method='ffill').fillna(method='bfill')


5 Creating output file

In [13]:
# Save the imputed dataframe to CSV
output_path = Path("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_imputed.csv")

# Save to CSV
df_filtered.to_csv(output_path, index=False)

print(f"Data successfully saved to: {output_path}")
print(f"Shape of saved data: {df_filtered.shape}")
print(f"\nColumns in saved file:")
print(df_filtered.columns.tolist())


Data successfully saved to: /home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_imputed.csv
Shape of saved data: (11211, 16)

Columns in saved file:
['Datum', 'id', 'Warengruppe', 'Umsatz', 'KielerWoche', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit', 'Wettercode', 'Niederschlag', 'is_holiday', 'Temperatur_imputed', 'Bewoelkung_imputed', 'Windgeschw_imputed', 'Niederschlag_imputed', 'Wettercode_imputed']
