In [15]:
# 1. Import core libraries
import pandas as pd
from pathlib import Path

# 2. Define the path to the Excel file
DATA_PATH = Path("data_train.xlsx")  # relative path

# 3. Load the Excel file into a DataFrame
df = pd.read_excel(DATA_PATH)

# 4. Define the list of columns to standardize (only dates, without time)
date_cols = ['END_LOAD_DATE', 'START_DELIVERY_DATE', 'END_DELIVERY_DATE']

# START_LOAD_DATE gets exactly the same treatment as the columns above, so it
# is handled in the same loop; date_cols itself is left unchanged in case
# other cells rely on its contents.
all_date_cols = ['START_LOAD_DATE', *date_cols]

# 5./6. Convert values in every date column to date format (without time).
#    errors='coerce' turns unparseable entries into NaT instead of raising;
#    .dt.date drops the time component, leaving Python date objects.
for col in all_date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce').dt.date

# 7. Display up to 30 random samples to verify the results.
#    DataFrame.sample raises ValueError when asked for more rows than the
#    frame holds (sampling without replacement), so cap the request.
print(df[all_date_cols].sample(n=min(30, len(df))))

      START_LOAD_DATE END_LOAD_DATE START_DELIVERY_DATE END_DELIVERY_DATE
9291       2022-12-15    2022-12-15          2022-12-15        2022-12-15
29481      2023-08-30    2023-08-30          2023-08-30        2023-08-30
252        2020-06-22    2020-06-22          2020-06-23        2020-06-23
28309      2023-08-18    2023-08-18          2023-08-21        2023-08-21
13721      2023-02-21    2023-02-21          2023-02-22        2023-02-22
21939      2023-05-31    2023-05-31          2023-05-31        2023-05-31
23000      2023-06-10    2023-06-10          2023-06-11        2023-06-11
25839      2023-07-11    2023-07-11          2023-07-12        2023-07-12
6527       2022-11-04    2022-11-04          2022-11-07        2022-11-07
43347      2024-02-07    2024-02-07          2024-02-08        2024-02-08
988        2021-03-12    2021-03-12          2021-03-15        2021-03-15
31874      2023-09-29    2023-09-29          2023-10-02        2023-10-02
38548      2023-12-08    2023-12-08   

In [None]:
# Write the cleaned DataFrame to a CSV file in the working directory.
# index=False keeps the row index out of the output file.
df.to_csv(
    path_or_buf="cleaned_data.csv",
    index=False,
)