### 1) Load (typed) + quick inspect

In [7]:
import py7zr
from pathlib import Path

# Folder that holds the raw .7z archives; each archive is unpacked in place.
raw_dir = Path("data/raw")

# Visit every .7z file directly inside raw_dir and extract all of its members
# into that same folder. The context manager closes the archive afterwards.
for archive_path in raw_dir.glob("*.7z"):
    with py7zr.SevenZipFile(archive_path, mode="r") as seven_zip:
        seven_zip.extractall(path=raw_dir)


### Load and inspect shapes/types and missing values:

In [11]:

import pandas as pd, numpy as np
from pathlib import Path

# Location of the raw training file, relative to the project root. This is the
# same `data/raw` folder the archives are extracted into, so the notebook no
# longer depends on one user's absolute Windows path.
PATH = Path("data/raw") / "train.csv"

# Read only the columns used downstream and downcast the numeric ones at load
# time to keep the frame small. `onpromotion` is left untyped here because it
# contains missing values and is normalised in the next section.
USECOLS = ["date", "store_nbr", "item_nbr", "unit_sales", "onpromotion"]
DTYPES = {"store_nbr": "int16", "item_nbr": "int32", "unit_sales": "float32"}

df = pd.read_csv(PATH, usecols=USECOLS, parse_dates=["date"], dtype=DTYPES, low_memory=False)

print("Shape:", df.shape)                 # → rows, cols
print(df.head(3))                         # → first rows look sane?
df.info()                                 # → dtypes are as expected (info() prints itself and
                                          #   returns None, so wrapping it in print() adds "None")
print("Date range:", df["date"].min(), "→", df["date"].max())

# Per-column count of missing values, largest first.
na = df.isna().sum().sort_values(ascending=False)
print("Missing values (top):\n", na.head(10))   # → `onpromotion` will have many NaNs (expected)


Shape: (125497040, 5)
        date  store_nbr  item_nbr  unit_sales onpromotion
0 2013-01-01         25    103665         7.0         NaN
1 2013-01-01         25    105574         1.0         NaN
2 2013-01-01         25    105575         2.0         NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125497040 entries, 0 to 125497039
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   store_nbr    int16         
 2   item_nbr     int32         
 3   unit_sales   float32       
 4   onpromotion  object        
dtypes: datetime64[ns](1), float32(1), int16(1), int32(1), object(1)
memory usage: 3.0+ GB
None
Date range: 2013-01-01 00:00:00 → 2017-08-15 00:00:00
Missing values (top):
 onpromotion    21657651
date                  0
store_nbr             0
item_nbr              0
unit_sales            0
dtype: int64


### 2) Standardize Schema

In [12]:

# Lookup that normalises every accepted spelling of the promotion flag
# (real booleans, their string forms, and 0/1) to a plain Python bool.
promo_lookup = {True: True, False: False, "True": True, "False": False, 1: True, 0: False}

promo_flags = df["onpromotion"].map(promo_lookup)  # values absent from the lookup become NaN
promo_flags = promo_flags.astype("boolean")        # nullable boolean to avoid FutureWarning
promo_flags = promo_flags.fillna(False)            # NA = not on promotion
df["onpromotion"] = promo_flags.astype(bool)       # final dtype: bool
del promo_flags                                    # do not keep the intermediate series alive

# Rows without a sales figure cannot be used; drop them.
df = df.dropna(subset=["unit_sales"])

print("onpromotion dtype:", df["onpromotion"].dtype)    # Output: bool
print("onpromotion uniques:", df["onpromotion"].unique())  # Output: [True False]
print("Remaining NA counts:\n", df.isna().sum())


onpromotion dtype: bool
onpromotion uniques: [False  True]
Remaining NA counts:
 date           0
store_nbr      0
item_nbr       0
unit_sales     0
onpromotion    0
dtype: int64


### 3) Cleaning:

In [13]:


# One row per (date, store, item) is the target grain of the cleaned table.
key_cols = ["date", "store_nbr", "item_nbr"]

# How each value column is combined when several rows share the same key:
# sales are summed, and the two flags are True if any contributing row was True.
agg_spec = {
    "unit_sales": ("unit_sales", "sum"),
    "onpromotion": ("onpromotion", "max"),
    "was_return": ("was_return", "any"),
}

# Flag negative sales before aggregating so the information survives the sum.
df["was_return"] = df["unit_sales"].lt(0)

dup_before = df.duplicated(subset=key_cols).sum()
print("Duplicate keys before groupby:", dup_before)

# Collapse to one row per key; kept as a single chain so no reference to the
# pre-aggregation frame outlives this statement.
df = df.groupby(key_cols, as_index=False).agg(**agg_spec)

dup_after = df.duplicated(subset=key_cols).sum()
print("Duplicate keys after groupby:", dup_after)


Duplicate keys before groupby: 0
Duplicate keys after groupby: 0


### 4) Clip negatives, cap extreme outliers, and apply log1p transformation:

In [14]:

# Step 1: negative sales are floored at zero.
clipped = df["unit_sales"].clip(lower=0).astype("float32")

# Step 2: the 99.9th percentile of the clipped values is the upper cap.
cap = clipped.quantile(0.999)
capped = np.minimum(clipped, cap).astype("float32")

# Step 3: attach the derived columns in the order clipped → capped → log1p.
df["unit_sales_clipped"] = clipped
df["unit_sales_capped"] = capped
df["unit_sales_log1p"] = np.log1p(capped).astype("float32")
del clipped, capped  # the frame owns the data now; drop the extra references

# Summary of the clipped (not yet capped) distribution, including the tail.
print(df["unit_sales_clipped"].describe(percentiles=[0.99,0.999]))


count    1.254970e+08
mean     8.556009e+00
std      2.352696e+01
min      0.000000e+00
50%      4.000000e+00
99%      7.100000e+01
99.9%    2.127619e+02
max      8.944000e+04
Name: unit_sales_clipped, dtype: float64


### 5) Validation (no null keys, unique keys, clipped ≥ 0):

In [15]:


# Evaluate each invariant once, then report it and assert on the same value.
key_cols = ["date", "store_nbr", "item_nbr"]

key_null_counts = df[key_cols].isna().sum()            # per-key-column null count
n_duplicate_keys = df.duplicated(subset=key_cols).sum()  # rows repeating an earlier key
min_clipped = df["unit_sales_clipped"].min()           # smallest clipped sales value

print("Nulls in keys:\n", key_null_counts)
print("Duplicate keys:", n_duplicate_keys)
print("Min clipped:", min_clipped)

# Invariants: no null keys, unique keys, clipped sales are non-negative.
assert key_null_counts.sum() == 0
assert n_duplicate_keys == 0
assert min_clipped >= 0.0
print("Validation passed")


Nulls in keys:
 date         0
store_nbr    0
item_nbr     0
dtype: int64
Duplicate keys: 0
Min clipped: 0.0
Validation passed


### 6)  Save to Parquet (partitioned):

In [16]:


import pyarrow as pa, pyarrow.parquet as pq

import shutil

out_dir = Path("data/processed/train_clean_parquet")

# By default write_to_dataset adds freshly named files and leaves files from
# earlier runs in place, so re-running this cell would store every row twice.
# Start from an empty dataset directory to keep the cell idempotent.
# NOTE: this deletes everything under out_dir; it must hold only this dataset.
if out_dir.exists():
    shutil.rmtree(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

# Calendar features derived from the date; year and month also serve as the
# partition columns below.
df["dow"]   = df["date"].dt.dayofweek.astype("int8")
df["year"]  = df["date"].dt.year.astype("int16")
df["month"] = df["date"].dt.month.astype("int8")

# Write one sub-directory per (year, month), zstd-compressed with dictionary
# encoding enabled; the pandas index is not stored.
pq.write_to_dataset(
    pa.Table.from_pandas(df, preserve_index=False),
    root_path=str(out_dir),
    partition_cols=["year","month"],
    compression="zstd",
    use_dictionary=True
)
print("Saved →", out_dir)


Saved → data\processed\train_clean_parquet
