In [1]:
# Step 1: Load libraries
import pandas as pd
import os

In [2]:
# Step 2: Load cleaned traffic data
# Location of the aggregated, cleaned traffic CSV (relative to this notebook).
TRAFFIC_DATA_PATH = "../data/processed/cleaned_traffic_data_aggregated.csv"

print(f"--- Loading Cleaned Traffic Data from: {TRAFFIC_DATA_PATH} ---")

try:
    df_traffic = pd.read_csv(TRAFFIC_DATA_PATH)

    # Parse 'Datetime' as day/month/year hour:minute. Values that do not match
    # the format become NaT (errors='coerce') instead of raising, so they can
    # be counted and removed below. Later cells rely on the datetime dtype.
    df_traffic['Datetime'] = pd.to_datetime(
        df_traffic['Datetime'],
        format='%d/%m/%Y %H:%M',
        errors='coerce',
    )

    # Discard rows whose timestamp could not be parsed and report how many.
    rows_before = len(df_traffic)
    df_traffic = df_traffic.dropna(subset=['Datetime'])
    rows_dropped = rows_before - len(df_traffic)
    if rows_dropped > 0:
        print(f"Warning: Dropped {rows_dropped} rows due to invalid Datetime format.")

    print("Traffic data loaded successfully.")

    # Quick look at the loaded frame: first rows, then schema/dtypes.
    print("\nDataFrame Head:")
    display(df_traffic.head())

    print("\nDataFrame Info:")
    df_traffic.info()

except FileNotFoundError:
    # Report the missing path, then re-raise so execution stops here.
    print(f"Error: File not found at {TRAFFIC_DATA_PATH}")
    raise
except Exception as err:
    # Any other failure is reported with its type and re-raised.
    print(f"An error occurred while loading the traffic data: {type(err).__name__} - {err}")
    raise

--- Loading Cleaned Traffic Data from: ../data/processed/cleaned_traffic_data_aggregated.csv ---
Traffic data loaded successfully.

DataFrame Head:


Unnamed: 0,Datetime,Motorcycle,Car,Truck_Lorry,Bus
0,2024-01-01 00:00:00,77,291,0,0
1,2024-01-01 01:00:00,102,291,0,0
2,2024-01-01 02:00:00,161,383,0,0
3,2024-01-01 03:00:00,90,287,0,0
4,2024-01-01 04:00:00,113,301,1,0



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8781 entries, 0 to 8780
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Datetime     8781 non-null   datetime64[ns]
 1   Motorcycle   8781 non-null   int64         
 2   Car          8781 non-null   int64         
 3   Truck_Lorry  8781 non-null   int64         
 4   Bus          8781 non-null   int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 343.1 KB


In [3]:
# Step 3: Load holiday data
# Location of the raw holiday CSV (relative to this notebook).
HOLIDAY_DATA_PATH = "../data/raw/Malaysia Holiday.csv"

print(f"\n--- Loading and Processing Holiday Data from: {HOLIDAY_DATA_PATH} ---")

try:
    holiday_df = pd.read_csv(HOLIDAY_DATA_PATH)
    print(f"Loaded holiday data: {len(holiday_df)} rows.")

    # Parse 'Date' as day/month/year; unparseable values become NaT.
    holiday_df['Date'] = pd.to_datetime(
        holiday_df['Date'],
        format='%d/%m/%Y',
        errors='coerce',
    )

    # Remove rows whose date could not be parsed and report how many.
    rows_before = len(holiday_df)
    holiday_df = holiday_df.dropna(subset=['Date'])
    rows_dropped = rows_before - len(holiday_df)
    if rows_dropped > 0:
         print(f"Warning: Dropped {rows_dropped} rows from holiday data due to invalid date format.")

    # Keep only holidays whose description mentions "National" or "Melaka"
    # (case-insensitive). Casting to str first turns missing descriptions
    # into plain text so the substring search never sees NaN.
    holiday_df['Description'] = holiday_df['Description'].astype(str)
    description = holiday_df['Description']
    mentions_national = description.str.contains("National", case=False, na=False)
    mentions_melaka = description.str.contains("Melaka", case=False, na=False)
    # .copy() gives an independent frame rather than a view of holiday_df.
    relevant_holidays = holiday_df[mentions_national | mentions_melaka].copy()

    # A set of the matching dates: de-duplicated and cheap to test membership on.
    holiday_dates = set(relevant_holidays['Date'])
    print(f"Found {len(holiday_dates)} relevant National or Melaka holidays.")
except FileNotFoundError:
    # Report the missing path, then re-raise so execution stops here.
    print(f"Error: Holiday file not found at {HOLIDAY_DATA_PATH}")
    raise
except Exception as err:
    # Any other failure is reported with its type and re-raised.
    print(f"An error occurred during holiday processing: {type(err).__name__} - {err}")
    raise


--- Loading and Processing Holiday Data from: ../data/raw/Malaysia Holiday.csv ---
Loaded holiday data: 54 rows.
Found 20 relevant National or Melaka holidays.


In [4]:
# Step 4: Create Is_Holiday column
def add_holiday_flag(traffic, holiday_dates):
    """Return a copy of ``traffic`` with 'Date' and 'Is_Holiday' columns added.

    Parameters
    ----------
    traffic : pandas.DataFrame
        Frame with a datetime64 'Datetime' column.
    holiday_dates : set or list-like of pandas.Timestamp
        Midnight timestamps of the days that count as holidays.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) where 'Date' is 'Datetime'
        truncated to midnight and 'Is_Holiday' is 1 when 'Date' is in
        ``holiday_dates`` and 0 otherwise.
    """
    # normalize() truncates each timestamp to midnight while keeping the
    # datetime64 dtype, so no round trip through Python date objects (and no
    # format string) is needed.
    day = traffic['Datetime'].dt.normalize()
    # Vectorised membership test; astype(int) maps True/False to 1/0.
    return traffic.assign(Date=day, Is_Holiday=day.isin(holiday_dates).astype(int))


# Only run when the previous cells have produced their outputs.
if 'df_traffic' in locals() and not df_traffic.empty and 'holiday_dates' in locals():
    print(f"\n--- Adding 'is_holiday' column to Traffic Data ---")

    # Re-running this cell simply overwrites the two derived columns.
    df_traffic = add_holiday_flag(df_traffic, holiday_dates)
    print("Created 'Date' column from 'Datetime'.")
    print("'Is_Holiday' column created.")

    # --- Verification ---
    # Count the distinct calendar days that ended up flagged as holidays.
    holidays_found_in_traffic = df_traffic.loc[df_traffic['Is_Holiday'] == 1, 'Date'].nunique()
    print(f"Marked {holidays_found_in_traffic} unique dates as holidays in the traffic data.")

    # The last rows are shown, so the label says "tail" to match.
    print("\nTraffic data with 'Is_Holiday' column (tail):")
    display(df_traffic.tail(5))

    print("\nExample of a holiday found (if any):")
    display(df_traffic.loc[df_traffic['Is_Holiday'] == 1, ['Datetime', 'Date', 'Is_Holiday']].head(5))

else:
     print("\n'df_traffic' DataFrame or 'holiday_dates' not found. Run previous steps first.")


--- Adding 'is_holiday' column to Traffic Data ---
Created 'Date' column from 'Datetime'.
'Is_Holiday' column created.
Marked 20 unique dates as holidays in the traffic data.

Traffic data with 'Is_Holiday' column (head):


Unnamed: 0,Datetime,Motorcycle,Car,Truck_Lorry,Bus,Date,Is_Holiday
8776,2024-12-31 19:00:00,408,1674,18,4,2024-12-31,0
8777,2024-12-31 20:00:00,362,1738,17,3,2024-12-31,0
8778,2024-12-31 21:00:00,306,1377,13,3,2024-12-31,0
8779,2024-12-31 22:00:00,348,1533,16,3,2024-12-31,0
8780,2024-12-31 23:00:00,326,1410,16,3,2024-12-31,0



Example of a holiday found (if any):


Unnamed: 0,Datetime,Date,Is_Holiday
0,2024-01-01 00:00:00,2024-01-01,1
1,2024-01-01 01:00:00,2024-01-01,1
2,2024-01-01 02:00:00,2024-01-01,1
3,2024-01-01 03:00:00,2024-01-01,1
4,2024-01-01 04:00:00,2024-01-01,1


In [6]:
# Step 5: Add Is_Weekend column
def add_weekend_flag(traffic):
    """Return a copy of ``traffic`` with 'Day_Of_Week' and 'Is_Weekend' added.

    Parameters
    ----------
    traffic : pandas.DataFrame
        Frame with a 'Datetime' column. If the column is not already a
        datetime dtype it is parsed with ``errors='coerce'`` and rows that
        fail to parse are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) where 'Day_Of_Week' is the
        pandas weekday number (Monday=0 ... Sunday=6) and 'Is_Weekend' is 1
        for Saturday/Sunday and 0 otherwise.
    """
    saturday = 5  # pandas weekday numbering: Monday=0 ... Saturday=5, Sunday=6

    stamps = traffic['Datetime']
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        # Defensive re-parse for frames whose 'Datetime' is still text.
        traffic = traffic.assign(
            Datetime=pd.to_datetime(stamps, errors='coerce')
        ).dropna(subset=['Datetime'])
        stamps = traffic['Datetime']

    weekday = stamps.dt.dayofweek
    # Vectorised comparison instead of a per-row apply; astype(int) maps
    # True/False to 1/0, matching how 'Is_Holiday' is encoded.
    return traffic.assign(Day_Of_Week=weekday, Is_Weekend=(weekday >= saturday).astype(int))


# Only run when the traffic frame exists and has timestamps to work with.
if 'df_traffic' in locals() and not df_traffic.empty and 'Datetime' in df_traffic.columns:
    print("\n--- Creating 'Is_Weekend' column ---")

    # Re-running this cell simply overwrites the two derived columns.
    df_traffic = add_weekend_flag(df_traffic)

    print("'Is_Weekend' column created.")

    # --- Verification ---
    print("\nWeekend vs Weekday counts:")
    print(df_traffic['Is_Weekend'].value_counts()) # 0 = Weekday, 1 = Weekend

    print("\nTraffic data with 'Is_Weekend' column (head):")
    # 'Date' and 'Is_Holiday' come from the holiday step; show them only if
    # that step has run so this cell does not fail with a KeyError.
    preview_columns = [
        col
        for col in ['Datetime', 'Date', 'Is_Holiday', 'Day_Of_Week', 'Is_Weekend']
        if col in df_traffic.columns
    ]
    display(df_traffic[preview_columns].head())

    # 'Day_Of_Week' is kept in the frame; if it is not wanted downstream it
    # can be removed with df_traffic.drop(columns=['Day_Of_Week']).

else:
     print("\n'df_traffic' DataFrame not found or 'Datetime' column missing. Run previous steps first.")


--- Creating 'Is_Weekend' column ---
'Is_Weekend' column created.

Weekend vs Weekday counts:
Is_Weekend
0    6288
1    2493
Name: count, dtype: int64

Traffic data with 'Is_Weekend' column (head):


Unnamed: 0,Datetime,Date,Is_Holiday,Day_Of_Week,Is_Weekend
0,2024-01-01 00:00:00,2024-01-01,1,0,0
1,2024-01-01 01:00:00,2024-01-01,1,0,0
2,2024-01-01 02:00:00,2024-01-01,1,0,0
3,2024-01-01 03:00:00,2024-01-01,1,0,0
4,2024-01-01 04:00:00,2024-01-01,1,0,0


In [7]:
# Step 6: Add Hour and Month columns
# Bail out with a hint when the traffic frame (or its timestamp column) is not
# available; otherwise derive the two calendar features from 'Datetime'.
if 'df_traffic' not in locals() or df_traffic.empty or 'Datetime' not in df_traffic.columns:
     print("\n'df_traffic' DataFrame not found or 'Datetime' column missing. Run previous steps first.")
else:
    print("\n--- Adding Hour, and Month columns ---")

    # Defensive re-parse: if 'Datetime' is not a datetime dtype, coerce it and
    # drop the rows that could not be converted.
    already_datetime = pd.api.types.is_datetime64_any_dtype(df_traffic['Datetime'])
    if not already_datetime:
        df_traffic['Datetime'] = pd.to_datetime(df_traffic['Datetime'], errors='coerce')
        df_traffic.dropna(subset=['Datetime'], inplace=True)

    timestamps = df_traffic['Datetime']

    # Hour of day, 0-23.
    df_traffic['Hour'] = timestamps.dt.hour

    # Calendar month, 1-12.
    df_traffic['Month'] = timestamps.dt.month

    print("Added 'Hour', and 'Month' columns.")


--- Adding Hour, and Month columns ---
Added 'Hour', and 'Month' columns.


In [8]:
# Step 7: Save dataframe
# Write the enriched traffic data (holiday, weekend, hour and month features)
# to a new CSV next to the cleaned input.
if 'df_traffic' in locals() and not df_traffic.empty:
    UPDATED_TRAFFIC_DATA_PATH = "../data/processed/cleaned_traffic_data_aggregated_with_holidays.csv"
    try:
        # index=False: the row index carries no information worth persisting.
        df_traffic.to_csv(UPDATED_TRAFFIC_DATA_PATH, index=False)
    except Exception as e:
        # Report and re-raise, as the load cells do, so a failed save is not
        # mistaken for a successful run.
        print(f"Error saving file: {e}")
        raise
    else:
        print(f"\nFinal data including holidays saved to: {UPDATED_TRAFFIC_DATA_PATH}")
else:
    print("\nNo final DataFrame ('df_traffic') to save.")


Final data including holidays saved to: ../data/processed/cleaned_traffic_data_aggregated_with_holidays.csv
